diff --git a/.gitlab/ci/build.gitlab-ci.yml b/.gitlab/ci/build.gitlab-ci.yml
index a8b6595731f3ba8ecc14fffcd306ada52bb52616..c5d22e779753ee0fbfa0bcbd828f85639ace8b9f 100644
--- a/.gitlab/ci/build.gitlab-ci.yml
+++ b/.gitlab/ci/build.gitlab-ci.yml
@@ -1,5 +1,6 @@
 include:
-  - remote: 'https://gitlab.eclipse.org/eclipse/aidge/gitlab_shared_files/-/raw/main/.gitlab/ci/shared_script.gitlab-ci.yml'
+  #- remote: 'https://gitlab.eclipse.org/eclipse/aidge/gitlab_shared_files/-/raw/main/.gitlab/ci/shared_script.gitlab-ci.yml'
+  - remote: 'https://gitlab.eclipse.org/hrouis/gitlab_shared_files/-/raw/test_hro/.gitlab/ci/shared_script.gitlab-ci.yml'
 
 build:ubuntu_cpp:
   stage: build
diff --git a/include/aidge/backend/cuda.hpp b/include/aidge/backend/cuda.hpp
index 6b5a4d639d58476a20325716361f38de28df0d24..da62b81022550a79d63fa1f20aa9429753e5ab6c 100644
--- a/include/aidge/backend/cuda.hpp
+++ b/include/aidge/backend/cuda.hpp
@@ -13,13 +13,20 @@
 #define AIDGE_BACKEND_CUDA_IMPORTS_H_
 
 #include "aidge/backend/cuda/data/TensorImpl.hpp"
+#include "aidge/backend/cuda/operator/AddImpl.hpp"
 #include "aidge/backend/cuda/operator/AvgPoolingImpl.hpp"
+#include "aidge/backend/cuda/operator/BatchNormImpl.hpp"
 #include "aidge/backend/cuda/operator/ConvImpl.hpp"
 #include "aidge/backend/cuda/operator/FCImpl.hpp"
+#include "aidge/backend/cuda/operator/GlobalAveragePoolingImpl.hpp"
 #include "aidge/backend/cuda/operator/MaxPoolingImpl.hpp"
+#include "aidge/backend/cuda/operator/PadImpl.hpp"
 #include "aidge/backend/cuda/operator/ReLUImpl.hpp"
 #include "aidge/backend/cuda/operator/ShiftMaxImpl.hpp"
 #include "aidge/backend/cuda/operator/ShiftGELUImpl.hpp"
+#include "aidge/backend/cuda/operator/ReshapeImpl.hpp"
+#include "aidge/backend/cuda/operator/SigmoidImpl.hpp"
+#include "aidge/backend/cuda/operator/SubImpl.hpp"
+#include "aidge/backend/cuda/operator/TanhImpl.hpp"
 
-
-#endif /* AIDGE_BACKEND_CUDA_IMPORTS_H_ */
\ No newline at end of file
+#endif /* AIDGE_BACKEND_CUDA_IMPORTS_H_ */
diff --git a/include/aidge/backend/cuda/data/TensorImpl.hpp b/include/aidge/backend/cuda/data/TensorImpl.hpp
index f083a8ba9f68ed53929db95b3fd6604f31548e21..96045781647f93f0627ca0853a0cdaa66a08af83 100644
--- a/include/aidge/backend/cuda/data/TensorImpl.hpp
+++ b/include/aidge/backend/cuda/data/TensorImpl.hpp
@@ -85,14 +85,20 @@ public:
     // native interface
     const future_std::span<T>& data() const { return mData; }
 
+    inline std::size_t capacity() const noexcept override { return mData.size(); }
+
     std::size_t scalarSize() const noexcept override { return sizeof(T); }
 
+    void zeros() override final {
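+        // cudaMemset fills every byte of the buffer with the given value, so it is only valid for zero-initialization.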
+        CHECK_CUDA_STATUS(cudaMemset(rawPtr(), 0, mNbElts * sizeof(T)));
+    }
+
     void copy(const void *src, NbElts_t length, NbElts_t offset = 0) override {
-        AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "copy length is above capacity");
+        AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "TensorImpl_cuda<{}>::copy(): copy length ({}) is above capacity ({})", typeid(T).name(), length, mNbElts);
         const T* srcT = static_cast<const T *>(src);
         T* dstT = static_cast<T *>(rawPtr(offset));
 
-        AIDGE_ASSERT(dstT < srcT || dstT >= srcT + length, "overlapping copy is not supported");
+        AIDGE_ASSERT(dstT < srcT || dstT >= srcT + length, "TensorImpl_cuda<{}>::copy(): overlapping copy is not supported", typeid(T).name());
         CHECK_CUDA_STATUS(cudaMemcpy(dstT, srcT, length * sizeof(T), cudaMemcpyDeviceToDevice));
     }
 
@@ -101,7 +107,7 @@ public:
             return;
         }
 
-        AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "copy length is above capacity");
+        AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "TensorImpl_cuda<{}>::copyCast(): copy length ({}) is above capacity ({})", typeid(T).name(), length, mNbElts);
         switch (srcDt) {
         case DataType::Float64:
             thrust_copy(static_cast<const double*>(src),
@@ -159,23 +165,23 @@ public:
                         length);
             break;
         default:
-            AIDGE_THROW_OR_ABORT(std::runtime_error, "Unsupported data type.");
+            AIDGE_THROW_OR_ABORT(std::runtime_error, "TensorImpl_cuda<{}>::copyCast(): unsupported data type {}.", typeid(T).name(), srcDt);
             break;
         }
     }
 
     void copyFromDevice(const void *src, const std::pair<std::string, DeviceIdx_t>& device, NbElts_t length, NbElts_t offset = 0) override {
-        AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "copy length is above capacity");
+        AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "TensorImpl_cuda<{}>::copyFromDevice(): copy length ({}) is above capacity ({})", typeid(T).name(), length, mNbElts);
         CHECK_CUDA_STATUS(cudaMemcpy(rawPtr(offset), src, length * sizeof(T), cudaMemcpyDeviceToDevice));
     }
 
     void copyFromHost(const void *src, NbElts_t length, NbElts_t offset = 0) override {
-        AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "copy length is above capacity");
+        AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "TensorImpl_cuda<{}>::copyFromHost(): copy length ({}) is above capacity ({})", typeid(T).name(), length, mNbElts);
         CHECK_CUDA_STATUS(cudaMemcpy(rawPtr(offset), src, length * sizeof(T), cudaMemcpyHostToDevice));
     }
 
     void copyToHost(void *dst, NbElts_t length, NbElts_t offset = 0) const override {
-        AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "copy length is above capacity");
+        AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "TensorImpl_cuda<{}>::copyToHost(): copy length ({}) is above capacity ({})", typeid(T).name(), length, mNbElts);
         CHECK_CUDA_STATUS(cudaMemcpy(dst, rawPtr(offset), length * sizeof(T), cudaMemcpyDeviceToHost));
     }
 
@@ -185,7 +191,7 @@ public:
     };
 
     const void *rawPtr(NbElts_t offset = 0) const override {
-        AIDGE_ASSERT(mData.size() >= mNbElts, "accessing uninitialized const rawPtr");
+        AIDGE_ASSERT(mData.size() >= mNbElts, "TensorImpl_cuda<{}>::rawPtr(): accessing uninitialized const rawPtr", typeid(T).name());
         return (mData.data() + offset);
     };
 
@@ -220,7 +226,7 @@ public:
     }
 
     void setRawPtr(void *ptr, NbElts_t length) override final {
-        AIDGE_ASSERT(length >= mNbElts, "trying to set raw pointer of insufficient capacity");
+        AIDGE_ASSERT(length >= mNbElts, "TensorImpl_cuda<{}>::setRawPtr(): trying to set raw pointer (length: {}) of insufficient capacity (required: {})", typeid(T).name(), length, mNbElts);
         mData = future_std::span<T>(static_cast<T *>(ptr), length);
         mDataOwner.reset();
     };
@@ -231,7 +237,7 @@ private:
     void lazyInit() {
         if (mData.size() < mNbElts) {
             // Need more data, a re-allocation will occur
-            AIDGE_ASSERT(mData.empty() || mDataOwner != nullptr, "trying to enlarge non-owned data");
+            AIDGE_ASSERT(mData.empty() || mDataOwner != nullptr, "TensorImpl_cuda<{}>: trying to enlarge non-owned data", typeid(T).name());
             mDataOwner.reset(cudaAlloc(mNbElts));
             mData = future_std::span<T>(mDataOwner.get(), mNbElts);
         }
diff --git a/include/aidge/backend/cuda/operator/AddImpl.hpp b/include/aidge/backend/cuda/operator/AddImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..cd1819753cd00a325443d9c9c992f3d2347bb377
--- /dev/null
+++ b/include/aidge/backend/cuda/operator/AddImpl.hpp
@@ -0,0 +1,56 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_BACKEND_CUDA_OPERATOR_ADDIMPL_H_
+#define AIDGE_BACKEND_CUDA_OPERATOR_ADDIMPL_H_
+
+#include <array>
+#include <memory>
+#include <tuple>
+#include <vector>
+
+#include <cudnn.h>
+
+#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/operator/Add.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+
+namespace Aidge {
+class AddImpl_cuda : public OperatorImpl {
+public:
+    AddImpl_cuda(const Add_Op &op) : OperatorImpl(op, "cuda") {}
+
+    static std::unique_ptr<AddImpl_cuda> create(const Add_Op &op) {
+        return std::make_unique<AddImpl_cuda>(op);
+    }
+
+public:
+    void forward();
+    void backward();
+    // ~AddImpl_cuda();
+private:
+    template <class T> void forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides);
+    template <class T> void backward_(const Tensor& outGrad, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides);
+};
+
+namespace {
+// add cuda backend to Add_Op implementation registry
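+// (the Registrar's constructor runs during static initialization and records the factory under the "cuda" key)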
+static Registrar<Add_Op> registrarAddImpl_cuda("cuda", Aidge::AddImpl_cuda::create);
+}  // namespace
+}  // namespace Aidge
+
+#endif /* AIDGE_BACKEND_CUDA_OPERATOR_ADDIMPL_H_ */
diff --git a/include/aidge/backend/cuda/operator/AvgPoolingImpl.hpp b/include/aidge/backend/cuda/operator/AvgPoolingImpl.hpp
index 43a6bd57c0c6431705abe73d3f3c175046d72dc9..540ec574f9b5fbcea8b8f28e390cbe05f1e0fa8e 100644
--- a/include/aidge/backend/cuda/operator/AvgPoolingImpl.hpp
+++ b/include/aidge/backend/cuda/operator/AvgPoolingImpl.hpp
@@ -33,7 +33,7 @@ private:
     // CuDNN specific variables
     cudnnPoolingDescriptor_t mAvgPoolingDesc = nullptr;
     cudnnPoolingMode_t mMode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
-    std::shared_ptr<Tensor> mInputFallback;
+    std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback;
 
 public:
     AvgPoolingImpl_cuda(const AvgPooling_Op<DIM> &op) : OperatorImpl(op, "cuda") {}
@@ -44,10 +44,12 @@ public:
 
 public:
     void forward();
+    void backward();
     ~AvgPoolingImpl_cuda();
 
 private:
     template <class T> void forward_(const Tensor& input);
+    template <class T> void backward_(const Tensor& output_grad);
 };
 
 namespace {
diff --git a/include/aidge/backend/cuda/operator/BatchNormImpl.hpp b/include/aidge/backend/cuda/operator/BatchNormImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..3451d07f289371202570434f96546344c0c4fb26
--- /dev/null
+++ b/include/aidge/backend/cuda/operator/BatchNormImpl.hpp
@@ -0,0 +1,61 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_BACKEND_CUDA_OPERATOR_BATCHNORMIMPL_H_
+#define AIDGE_BACKEND_CUDA_OPERATOR_BATCHNORMIMPL_H_
+
+#include <array>
+#include <memory>
+#include <tuple>
+#include <vector>
+
+#include <cudnn.h>
+
+#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/operator/BatchNorm.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+
+namespace Aidge {
+template <DimIdx_t DIM>
+class BatchNormImpl_cuda : public OperatorImpl {
+private:
+    // CuDNN specific variables
+    cudnnTensorDescriptor_t mBNDesc = nullptr;
+    cudnnBatchNormMode_t mMode;
+    double mEpsilon;
+
+public:
+    BatchNormImpl_cuda(const BatchNorm_Op<DIM> &op) : OperatorImpl(op, "cuda") {}
+
+    static std::unique_ptr<BatchNormImpl_cuda> create(const BatchNorm_Op<DIM> &op) {
+        return std::make_unique<BatchNormImpl_cuda>(op);
+    }
+
+public:
+    void forward();
+    void backward();
+    ~BatchNormImpl_cuda();
+
+private:
+    template <class T> void forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2, const Tensor& input3, const Tensor& input4);
+    template <class T> void backward_(const Tensor& input0, const Tensor& input1, const Tensor& input2);
+};
+
+namespace {
+// add cuda backend to BatchNorm_Op<2> implementation registry
+static Registrar<BatchNorm_Op<2>> registrarBatchNormImpl_cuda("cuda", Aidge::BatchNormImpl_cuda<2>::create);
+}  // namespace
+}  // namespace Aidge
+
+#endif /* AIDGE_BACKEND_CUDA_OPERATOR_BATCHNORMIMPL_H_ */
diff --git a/include/aidge/backend/cuda/operator/ConvImpl.hpp b/include/aidge/backend/cuda/operator/ConvImpl.hpp
index 8c591927ce0e52daeff447726c114ce3ae4d0103..0722048f7cf021104a9694a621b1c0dad00ce423 100644
--- a/include/aidge/backend/cuda/operator/ConvImpl.hpp
+++ b/include/aidge/backend/cuda/operator/ConvImpl.hpp
@@ -21,11 +21,13 @@
 
 #include "aidge/backend/OperatorImpl.hpp"
 #include "aidge/operator/Conv.hpp"
+#include "aidge/operator/ConvDepthWise.hpp"
 #include "aidge/utils/Registrar.hpp"
 #include "aidge/utils/Types.h"
 
 #include "aidge/backend/cuda/utils/CudaUtils.hpp"
 
 namespace Aidge {
 template <DimIdx_t DIM>
 class ConvImpl_cuda : public OperatorImpl {
@@ -42,14 +44,19 @@ private:
     std::shared_ptr<Tensor> mInput0Fallback;
     std::shared_ptr<Tensor> mInput1Fallback;
     std::shared_ptr<Tensor> mInput2Fallback;
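+    // Set to true when this implementation wraps a ConvDepthWise_Op (see createDW() below)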
+    bool mDepthWise = false;
 
 public:
-    ConvImpl_cuda(const Conv_Op<DIM> &op) : OperatorImpl(op, "cuda") {}
+    ConvImpl_cuda(const Operator& op, bool depthWise = false) : OperatorImpl(op, "cuda"), mDepthWise(depthWise) {}
 
     static std::unique_ptr<ConvImpl_cuda> create(const Conv_Op<DIM> &op) {
         return std::make_unique<ConvImpl_cuda>(op);
     }
 
+    static std::unique_ptr<ConvImpl_cuda> createDW(const ConvDepthWise_Op<DIM> &op) {
+        return std::make_unique<ConvImpl_cuda>(op, true);
+    }
+
 public:
     void forward();
     void backward();
@@ -61,8 +68,8 @@ private:
 };
 
 namespace {
-// add cuda backend to Conv_Op<2> implementation registry
 static Registrar<Conv_Op<2>> registrarConvImpl_cuda("cuda", Aidge::ConvImpl_cuda<2>::create);
+static Registrar<ConvDepthWise_Op<2>> registrarConvDepthWiseImpl_cuda("cuda", Aidge::ConvImpl_cuda<2>::createDW);
 }  // namespace
 }  // namespace Aidge
 
diff --git a/include/aidge/backend/cuda/operator/FCImpl.hpp b/include/aidge/backend/cuda/operator/FCImpl.hpp
index 3d8a1348d500fc533c7c9b601b09629995f97427..46f7849d1f17aab5496bdbde013ef078ad1f5a7c 100644
--- a/include/aidge/backend/cuda/operator/FCImpl.hpp
+++ b/include/aidge/backend/cuda/operator/FCImpl.hpp
@@ -27,9 +27,6 @@
 #include "aidge/backend/cuda/utils/CudaUtils.hpp"
 
 namespace Aidge {
-class FCImplForward_cuda : public Registrable<FCImplForward_cuda,
-                                                 std::tuple<DataType>,
-                                                 void(std::size_t , std::size_t, std::size_t, bool, const void* , const void* , const void* , void*)> {};
 class FCImpl_cuda : public OperatorImpl {
 private:
     std::shared_ptr<Tensor> mInput0Fallback;
@@ -46,10 +43,12 @@ public:
 
 public:
     void forward();
+    void backward();
     // ~FCImpl_cuda();
 
 private:
-    template <class T> void forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2, bool noBias, std::size_t outChannels);
+    template <class T> void forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2, std::size_t outChannels);
+    template <class T> void backward_(const Tensor& input0, const Tensor& input1, const Tensor& input2, std::size_t outChannels);
 };
 
 namespace {
diff --git a/include/aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp
index 9084e01fc08cb3d00e80fc8cf6246064b20591f2..8d1af8f7c5954c2eae9179926aec433eee34414f 100644
--- a/include/aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp
+++ b/include/aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp
@@ -32,5 +32,14 @@ cublasStatus_t cublasGemm(cublasHandle_t handle,
                           const T *B, int ldb,
                           const T *beta,
                           T *C, int ldc);
+
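+// Typed wrapper over the cuBLAS <t>gemv routines: y = alpha * op(A) * x + beta * y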
+template <class T>
+cublasStatus_t cublasGemv(cublasHandle_t handle, cublasOperation_t trans,
+                          int m, int n,
+                          const T  *alpha,
+                          const T *A, int lda,
+                          const T *x, int incx,
+                          const T *beta,
+                          T *y, int incy);
 }
 #endif /* AIDGE_CUDA_OPERATOR_FCIMPL_FORWARD_KERNEL_H_ */
\ No newline at end of file
diff --git a/include/aidge/backend/cuda/operator/GlobalAveragePoolingImpl.hpp b/include/aidge/backend/cuda/operator/GlobalAveragePoolingImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..6e0fad5c01efb6474f527dee0bfbfdc594788bc6
--- /dev/null
+++ b/include/aidge/backend/cuda/operator/GlobalAveragePoolingImpl.hpp
@@ -0,0 +1,60 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_BACKEND_CUDA_OPERATOR_GLOBALAVERAGEPOOLINGIMPL_H_
+#define AIDGE_BACKEND_CUDA_OPERATOR_GLOBALAVERAGEPOOLINGIMPL_H_
+
+#include <array>
+#include <memory>
+#include <tuple>
+#include <vector>
+
+#include <cudnn.h>
+
+#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/operator/GlobalAveragePooling.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+
+namespace Aidge {
+class GlobalAveragePoolingImpl_cuda : public OperatorImpl {
+private:
+    // CuDNN specific variables
+    cudnnPoolingDescriptor_t mGlobalAveragePoolingDesc = nullptr;
+    cudnnPoolingMode_t mMode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
+    std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback;
+
+public:
+    GlobalAveragePoolingImpl_cuda(const GlobalAveragePooling_Op &op) : OperatorImpl(op, "cuda") {}
+
+    static std::unique_ptr<GlobalAveragePoolingImpl_cuda> create(const GlobalAveragePooling_Op &op) {
+        return std::make_unique<GlobalAveragePoolingImpl_cuda>(op);
+    }
+
+public:
+    void forward();
+    void backward();
+    ~GlobalAveragePoolingImpl_cuda();
+
+private:
+    template <class T> void forward_(const Tensor& input);
+    template <class T> void backward_(const Tensor& output_grad);
+};
+
+namespace {
+// add cuda backend to GlobalAveragePooling_Op implementation registry
+static Registrar<GlobalAveragePooling_Op> registrarGlobalAveragePoolingImpl_cuda("cuda", Aidge::GlobalAveragePoolingImpl_cuda::create);
+}  // namespace
+}  // namespace Aidge
+
+#endif /* AIDGE_BACKEND_CUDA_OPERATOR_GLOBALAVERAGEPOOLINGIMPL_H_ */
diff --git a/include/aidge/backend/cuda/operator/MaxPoolingImpl.hpp b/include/aidge/backend/cuda/operator/MaxPoolingImpl.hpp
index 4d08d7fab3c7cb2baa18838fd872e44a8eccc923..db7f1e376013db52aeb1b27f8cc3ff192c7f0629 100644
--- a/include/aidge/backend/cuda/operator/MaxPoolingImpl.hpp
+++ b/include/aidge/backend/cuda/operator/MaxPoolingImpl.hpp
@@ -33,7 +33,7 @@ private:
     // CuDNN specific variables
     cudnnPoolingDescriptor_t mMaxPoolingDesc = nullptr;
     cudnnPoolingMode_t mMode = CUDNN_POOLING_MAX;
-    std::shared_ptr<Tensor> mInputFallback;
+    std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback;
 
 public:
     MaxPoolingImpl_cuda(const MaxPooling_Op<DIM> &op) : OperatorImpl(op, "cuda") {}
@@ -44,10 +44,12 @@ public:
 
 public:
     void forward();
+    void backward();
     ~MaxPoolingImpl_cuda();
 
 private:
     template <class T> void forward_(const Tensor& input);
+    template <class T> void backward_(const Tensor& output_grad);
 };
 
 namespace {
diff --git a/include/aidge/backend/cuda/operator/PadImpl.hpp b/include/aidge/backend/cuda/operator/PadImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..4452d3408e7b4780c1e5c4ea6553ba0b713df231
--- /dev/null
+++ b/include/aidge/backend/cuda/operator/PadImpl.hpp
@@ -0,0 +1,61 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_BACKEND_CUDA_OPERATOR_PADIMPL_H_
+#define AIDGE_BACKEND_CUDA_OPERATOR_PADIMPL_H_
+
+#include <array>
+#include <memory>
+#include <tuple>
+#include <vector>
+
+#include <cudnn.h>
+
+#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/operator/Pad.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+
+namespace Aidge {
+template <DimIdx_t DIM>
+class PadImpl_cuda : public OperatorImpl {
+private:
+    // CuDNN specific variables
+    std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback;
+    int mLeftPad, mTopPad;
+    double mPadVal;
+    unsigned int mPadType;
+
+public:
+    PadImpl_cuda(const Pad_Op<DIM> &op) : OperatorImpl(op, "cuda") {}
+
+    static std::unique_ptr<PadImpl_cuda> create(const Pad_Op<DIM> &op) {
+        return std::make_unique<PadImpl_cuda>(op);
+    }
+
+public:
+    void forward();
+    void backward();
+
+private:
+    template <class T> void forward_(const Tensor& input);
+    template <class T> void backward_(const Tensor& outGrad);
+};
+
+namespace {
+// add cuda backend to Pad_Op<2> implementation registry
+static Registrar<Pad_Op<2>> registrarPadImpl_cuda("cuda", Aidge::PadImpl_cuda<2>::create);
+}  // namespace
+}  // namespace Aidge
+
+#endif /* AIDGE_BACKEND_CUDA_OPERATOR_PADIMPL_H_ */
diff --git a/include/aidge/backend/cuda/operator/PadImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/PadImpl_CUDA_kernels.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c6a83160da5cf3fea3d3415959c965e16c1eb4ff
--- /dev/null
+++ b/include/aidge/backend/cuda/operator/PadImpl_CUDA_kernels.hpp
@@ -0,0 +1,37 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CUDA_OPERATOR_PADIMPL_FORWARD_KERNEL_H_
+#define AIDGE_CUDA_OPERATOR_PADIMPL_FORWARD_KERNEL_H_
+
+#include "aidge/data/Data.hpp"
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+
+namespace Aidge
+{
+
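+    // GPU padding routine; padType selects the border handling mode and padValue fills constant padding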
+    template <class T>
+    void cudaPadding(const cudaDeviceProp &deviceProp,
+                     unsigned int nbOutputs,
+                     unsigned int outputsWidth,
+                     unsigned int outputsHeight,
+                     unsigned int nbChannels,
+                     unsigned int batchSize,
+                     unsigned int inputWidth,
+                     unsigned int inputHeight,
+                     int leftPad,
+                     int topPad,
+                     unsigned int padType,
+                     T padValue,
+                     const T *input,
+                     T *outputs);
+}
+#endif /* AIDGE_CUDA_OPERATOR_PADIMPL_FORWARD_KERNEL_H_ */
\ No newline at end of file
diff --git a/include/aidge/backend/cuda/operator/ReLUImpl.hpp b/include/aidge/backend/cuda/operator/ReLUImpl.hpp
index 6570662fa5df27e54a9df6f357e918243a71330a..285713f460b9d5b5e868c0c07ab23804f30dd694 100644
--- a/include/aidge/backend/cuda/operator/ReLUImpl.hpp
+++ b/include/aidge/backend/cuda/operator/ReLUImpl.hpp
@@ -36,6 +36,7 @@ private:
         cudnnActivationMode_t mReLUDesc = nullptr;
     #endif
     std::shared_ptr<Tensor> mInputFallback;
+    std::shared_ptr<Tensor> mOutputGradFallback;
 
 public:
     ReLUImpl_cuda(const ReLU_Op &op) : OperatorImpl(op, "cuda") {}
@@ -46,10 +47,12 @@ public:
 
 public:
     void forward();
+    void backward();
     ~ReLUImpl_cuda();
 
 private:
     template <class T> void forward_(const Tensor& input);
+    template <class T> void backward_(const Tensor& output_grad);
 };
 
 namespace {
diff --git a/include/aidge/backend/cuda/operator/ReshapeImpl.hpp b/include/aidge/backend/cuda/operator/ReshapeImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..7b43df680bef115310669f0d55f2f78ef4fe9fa6
--- /dev/null
+++ b/include/aidge/backend/cuda/operator/ReshapeImpl.hpp
@@ -0,0 +1,53 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_BACKEND_CUDA_OPERATOR_RESHAPEIMPL_H_
+#define AIDGE_BACKEND_CUDA_OPERATOR_RESHAPEIMPL_H_
+
+#include <array>
+#include <memory>
+#include <tuple>
+#include <vector>
+
+#include <cudnn.h>
+
+#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/operator/Reshape.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+
+namespace Aidge {
+class ReshapeImpl_cuda : public OperatorImpl {
+private:
+    std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback;
+
+public:
+    ReshapeImpl_cuda(const Reshape_Op &op) : OperatorImpl(op, "cuda") {}
+
+    static std::unique_ptr<ReshapeImpl_cuda> create(const Reshape_Op &op) {
+        return std::make_unique<ReshapeImpl_cuda>(op);
+    }
+
+public:
+    void forward();
+    void backward();
+    ~ReshapeImpl_cuda();
+};
+
+namespace {
+// add cuda backend to Reshape_Op implementation registry
+static Registrar<Reshape_Op> registrarReshapeImpl_cuda("cuda", Aidge::ReshapeImpl_cuda::create);
+}  // namespace
+}  // namespace Aidge
+
+#endif /* AIDGE_BACKEND_CUDA_OPERATOR_RESHAPEIMPL_H_ */
diff --git a/include/aidge/backend/cuda/operator/SigmoidImpl.hpp b/include/aidge/backend/cuda/operator/SigmoidImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..90dbb717732ad788b868fdc95eb55579a5e0b9f6
--- /dev/null
+++ b/include/aidge/backend/cuda/operator/SigmoidImpl.hpp
@@ -0,0 +1,64 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_BACKEND_CUDA_OPERATOR_SIGMOIDIMPL_H_
+#define AIDGE_BACKEND_CUDA_OPERATOR_SIGMOIDIMPL_H_
+
+#include <array>
+#include <memory>
+#include <tuple>
+#include <vector>
+
+#include <cudnn.h>
+
+#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/operator/Sigmoid.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+
+namespace Aidge {
+class SigmoidImpl_cuda : public OperatorImpl {
+private:
+    // CuDNN specific variables
+    #if CUDNN_VERSION >= 5000
+        cudnnActivationDescriptor_t mSigmoidDesc = nullptr;
+    #else
+        cudnnActivationMode_t mSigmoidDesc = nullptr;
+    #endif
+    std::shared_ptr<Tensor> mInputFallback;
+    std::shared_ptr<Tensor> mOutputGradFallback;
+
+public:
+    SigmoidImpl_cuda(const Sigmoid_Op &op) : OperatorImpl(op, "cuda") {}
+
+    static std::unique_ptr<SigmoidImpl_cuda> create(const Sigmoid_Op &op) {
+        return std::make_unique<SigmoidImpl_cuda>(op);
+    }
+
+public:
+    void forward();
+    void backward();
+    ~SigmoidImpl_cuda();
+
+private:
+    template <class T> void forward_(const Tensor& input);
+    template <class T> void backward_(const Tensor& output_grad);
+};
+
+namespace {
+// add cuda backend to Sigmoid_Op implementation registry
+static Registrar<Sigmoid_Op> registrarSigmoidImpl_cuda("cuda", Aidge::SigmoidImpl_cuda::create);
+}  // namespace
+}  // namespace Aidge
+
+#endif /* AIDGE_BACKEND_CUDA_OPERATOR_SIGMOIDIMPL_H_ */
diff --git a/include/aidge/backend/cuda/operator/SubImpl.hpp b/include/aidge/backend/cuda/operator/SubImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..fd1a76692abdf16b9854b90f535f68329ae5877a
--- /dev/null
+++ b/include/aidge/backend/cuda/operator/SubImpl.hpp
@@ -0,0 +1,56 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_BACKEND_CUDA_OPERATOR_SUBIMPL_H_
+#define AIDGE_BACKEND_CUDA_OPERATOR_SUBIMPL_H_
+
+#include <array>
+#include <memory>
+#include <tuple>
+#include <vector>
+
+#include <cudnn.h>
+
+#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/operator/Sub.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+
+namespace Aidge {
+class SubImpl_cuda : public OperatorImpl {
+public:
+    SubImpl_cuda(const Sub_Op &op) : OperatorImpl(op, "cuda") {}
+
+    static std::unique_ptr<SubImpl_cuda> create(const Sub_Op &op) {
+        return std::make_unique<SubImpl_cuda>(op);
+    }
+
+public:
+    void forward();
+    void backward();
+    // ~SubImpl_cuda();
+private:
+    template <class T> void forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides);
+    template <class T> void backward_(const Tensor& outGrad, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides);
+};
+
+namespace {
+// add cuda backend to Sub_Op implementation registry
+static Registrar<Sub_Op> registrarSubImpl_cuda("cuda", Aidge::SubImpl_cuda::create);
+}  // namespace
+}  // namespace Aidge
+
+#endif /* AIDGE_BACKEND_CUDA_OPERATOR_SUBIMPL_H_ */
diff --git a/include/aidge/backend/cuda/operator/TanhImpl.hpp b/include/aidge/backend/cuda/operator/TanhImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..35e879513fee0ec9354edecefd3d53860e54a0b1
--- /dev/null
+++ b/include/aidge/backend/cuda/operator/TanhImpl.hpp
@@ -0,0 +1,64 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_BACKEND_CUDA_OPERATOR_TANHIMPL_H_
+#define AIDGE_BACKEND_CUDA_OPERATOR_TANHIMPL_H_
+
+#include <array>
+#include <memory>
+#include <tuple>
+#include <vector>
+
+#include <cudnn.h>
+
+#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/operator/Tanh.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+
+namespace Aidge {
+class TanhImpl_cuda : public OperatorImpl {
+private:
+    // CuDNN specific variables
+    #if CUDNN_VERSION >= 5000
+        cudnnActivationDescriptor_t mTanhDesc = nullptr;
+    #else
+        cudnnActivationMode_t mTanhDesc = nullptr;
+    #endif
+    std::shared_ptr<Tensor> mInputFallback;
+    std::shared_ptr<Tensor> mOutputGradFallback;
+
+public:
+    TanhImpl_cuda(const Tanh_Op &op) : OperatorImpl(op, "cuda") {}
+
+    static std::unique_ptr<TanhImpl_cuda> create(const Tanh_Op &op) {
+        return std::make_unique<TanhImpl_cuda>(op);
+    }
+
+public:
+    void forward();
+    void backward();
+    ~TanhImpl_cuda();
+
+private:
+    template <class T> void forward_(const Tensor& input);
+    template <class T> void backward_(const Tensor& output_grad);
+};
+
+namespace {
+// add cuda backend to Tanh_Op implementation registry
+static Registrar<Tanh_Op> registrarTanhImpl_cuda("cuda", Aidge::TanhImpl_cuda::create);
+}  // namespace
+}  // namespace Aidge
+
+#endif /* AIDGE_BACKEND_CUDA_OPERATOR_TANHIMPL_H_ */
diff --git a/python_binding/pybind_backend_cuda.cpp b/python_binding/pybind_backend_cuda.cpp
index abd1997389f3574a24e171f6ab26628dcfe40cfd..3d7564459781d6933827aa66b405b03085806467 100644
--- a/python_binding/pybind_backend_cuda.cpp
+++ b/python_binding/pybind_backend_cuda.cpp
@@ -1,9 +1,20 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
 #include <pybind11/pybind11.h>
+
 // Need to call this header to register every impl
 #include "aidge/backend/cuda.hpp"
 
 namespace py = pybind11;
-
 namespace Aidge {
 
 void init_cuda_sys_info(py::module& m);
@@ -15,4 +26,4 @@ void init_Aidge(py::module& m){
 PYBIND11_MODULE(aidge_backend_cuda, m) {
     init_Aidge(m);
 }
-}
+} // namespace Aidge
diff --git a/src/operator/AddImpl.cpp b/src/operator/AddImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..74d89c405530766324407fc42345f237931dc2f4
--- /dev/null
+++ b/src/operator/AddImpl.cpp
@@ -0,0 +1,213 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <algorithm>
+#include <cassert>
+#include <numeric>
+#include <vector>
+
+#include "aidge/backend/cuda/data/TensorImpl.hpp"
+#include "aidge/backend/cuda/operator/AddImpl.hpp"
+#include "aidge/backend/cuda/utils/CudaContext.hpp"
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+#include "aidge/operator/Add.hpp"
+#include "aidge/utils/Types.h"
+
+void Aidge::AddImpl_cuda::forward() {
+    const Add_Op& op = static_cast<const Add_Op&>(mOp);
+    // Check inputs
+    AIDGE_ASSERT(op.getInput(0), "missing input in Add operator");
+    AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run Add forward because the 0-th input has no implementation.");
+    DataType datatypeFirstInput = op.getInput(0)->dataType();
+    for (IOIndex_t i = 1; i < op.nbInputs(); ++i) {
+        AIDGE_ASSERT(op.getInput(i), "missing input in Add operator");
+        AIDGE_ASSERT(op.getInput(i)->hasImpl(), "cannot run Add forward because the {}-th input has no implementation.", i);
+        AIDGE_ASSERT(op.getInput(i)->dataType() == datatypeFirstInput, "Cannot add inputs with different data types.");
+    }
+
+    std::vector<std::shared_ptr<Tensor>> inputFallbacks(op.nbInputs());
+    std::vector<Tensor> inputs(op.nbInputs());
+    std::vector<std::vector<int>> dims(op.nbInputs()); // For broadcasted dims
+    std::vector<std::vector<int>> strides(op.nbInputs()); // For the corresponding strides
+    for (IOIndex_t i = 0; i < op.nbInputs(); ++i) {
+        inputs[i] = op.getInput(i)->refCastFrom(inputFallbacks[i], *op.getOutput(0));
+
+        // Get tensor dims and broadcast them
+        std::copy(inputs[i].dims().begin(), inputs[i].dims().end(), std::back_inserter(dims[i]));
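+        // Left-pad the dims with ones so every input has the same rank as the output (broadcast semantics)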
+        dims[i].insert(dims[i].cbegin(), op.getOutput(0)->nbDims() - dims[i].size(), int(1));
+
+        // Compute the corresponding strides
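+        // (contiguous row-major layout: each stride is the product of the dims that follow it)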
+        std::vector<int> tensorStrides(dims[i].size());
+        int product = 1;
+        for (size_t j = dims[i].size(); j > 0; --j) {
+            tensorStrides[j - 1] = product;
+            product *= dims[i][j - 1];
+        }
+        strides[i] = tensorStrides;
+    }
+
+    switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) {
+        case DataType::Float64:
+            forward_<double>(inputs, dims, strides);
+            break;
+        case DataType::Float32:
+            forward_<float>(inputs, dims, strides);
+            break;
+        case DataType::Float16:
+            forward_<half>(inputs, dims, strides);
+            break;
+        default:
+            AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda");
+    }
+}
+
+template <class T>
+void Aidge::AddImpl_cuda::forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides) {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f;
+    const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f;
+
+    // Create a Tensor descriptor with the broadcasted dims and strides
+    cudnnTensorDescriptor_t tensorDesc;
+    CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&tensorDesc));
+    CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc, CudaContext::data_type<T>::value, inputsDims[0].size(), inputsDims[0].data(), inputsStrides[0].data()));
+    // Add first input
+    CHECK_CUDNN_STATUS(
+        cudnnAddTensor(CudaContext::cudnnHandle(),
+                       &alpha,
+                       tensorDesc,
+                       inputs[0].getImpl()->rawPtr(),
+                       &beta,
+                       std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
+                       std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr())
+    );
+    // Add other inputs if there are any
+    for (size_t i = 1; i < op.nbInputs(); ++i)
+    {
+        CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc, CudaContext::data_type<T>::value, inputsDims[i].size(), inputsDims[i].data(), inputsStrides[i].data()));
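+        // alpha is also passed as the blending factor, so cudnnAddTensor accumulates: output = inputs[i] + output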
+        CHECK_CUDNN_STATUS(
+            cudnnAddTensor(CudaContext::cudnnHandle(),
+                        &alpha,
+                        tensorDesc,
+                        inputs[i].getImpl()->rawPtr(),
+                        &alpha,
+                        std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
+                        std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr())
+        );
+    }
+    CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(tensorDesc));
+}
+
+void Aidge::AddImpl_cuda::backward() {
+    const Add_Op& op = static_cast<const Add_Op&>(mOp);
+    // Check output
+    AIDGE_ASSERT(op.getOutput(0)->grad(), "missing output gradient in Add operator");
+    AIDGE_ASSERT(op.getOutput(0)->grad()->hasImpl(), "cannot run Add backward because the output gradient has no implementation.");
+
+    std::shared_ptr<Tensor> outputGradFallback;
+    const auto& outputGrad = op.getOutput(0)->grad()->refCastFrom(outputGradFallback, *op.getOutput(0)->grad());
+
+    std::vector<std::vector<int>> dims(op.nbInputs()); // For broadcasted dims
+    std::vector<std::vector<int>> strides(op.nbInputs()); // For the corresponding strides
+    for (IOIndex_t i = 0; i < op.nbInputs(); ++i) {
+        std::shared_ptr<Tensor> inputFallback;
+        const Tensor input = op.getInput(i)->refCastFrom(inputFallback, *op.getOutput(0));
+
+        // Get tensor dims and broadcast them
+        std::copy(input.dims().begin(), input.dims().end(), std::back_inserter(dims[i]));
+        dims[i].insert(dims[i].cbegin(), op.getOutput(0)->nbDims() - dims[i].size(), int(1));
+
+        // Compute the corresponding strides
+        std::vector<int> tensorStrides(dims[i].size());
+        int product = 1;
+        for (size_t j = dims[i].size(); j > 0; --j) {
+            tensorStrides[j - 1] = product;
+            product *= dims[i][j - 1];
+        }
+        strides[i] = tensorStrides;
+    }
+
+    switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) {
+        case DataType::Float64:
+            backward_<double>(outputGrad, dims, strides);
+            break;
+        case DataType::Float32:
+            backward_<float>(outputGrad, dims, strides);
+            break;
+        case DataType::Float16:
+            backward_<half>(outputGrad, dims, strides);
+            break;
+        default:
+            AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda");
+    }
+}
+
+template <class T>
+void Aidge::AddImpl_cuda::backward_(const Tensor& outputGrad, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides) {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f;
+    const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f;
+
+    for (std::size_t i = 0; i < inputsDims.size(); i++)
+    {
+        if (op.getInput(i)->size() == op.getOutput(0)->size())
+        {
+            // TODO: Test if we can avoid copy and simply set rawPtr
+            op.getInput(i)->grad()->getImpl()->copy(outputGrad.getImpl()->rawPtr(), op.getInput(i)->grad()->size());
+        }
+        else // In case of broadcasting
+        {
+            // Gradient with respect to input_i: sum outputGrad over the broadcasted dimensions using cudnnReduceTensor
+            cudnnReduceTensorDescriptor_t reduceDesc;
+            CHECK_CUDNN_STATUS(cudnnCreateReduceTensorDescriptor(&reduceDesc));
+            CHECK_CUDNN_STATUS(cudnnSetReduceTensorDescriptor(reduceDesc,
+                                                              CUDNN_REDUCE_TENSOR_ADD,
+                                                              CudaContext::data_type<T>::value,
+                                                              CUDNN_PROPAGATE_NAN,
+                                                              CUDNN_REDUCE_TENSOR_NO_INDICES,
+                                                              CUDNN_32BIT_INDICES));
+
+            cudnnTensorDescriptor_t outputDesc = std::dynamic_pointer_cast<TensorImpl_cuda_>(outputGrad.getImpl())->getCudnnTensorDesc(*op.getOutput(0));
+            // Create a Tensor descriptor with the broadcasted dims and strides
+            cudnnTensorDescriptor_t tensorDesc;
+            CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&tensorDesc));
+            CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc,
+                                                          CudaContext::data_type<T>::value,
+                                                          inputsDims[i].size(),
+                                                          inputsDims[i].data(),
+                                                          inputsStrides[i].data()));
+            size_t workspaceSize;
+            CHECK_CUDNN_STATUS(cudnnGetReductionWorkspaceSize(CudaContext::cudnnHandle(),
+                               reduceDesc,
+                               outputDesc,
+                               tensorDesc,
+                               &workspaceSize));
+
+            float *d_workspace;
+            CHECK_CUDA_STATUS(cudaMalloc(&d_workspace, workspaceSize));
+
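+            // cudnnReduceTensor sums along each dimension whose destination extent is 1 while the source extent is larger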
+            CHECK_CUDNN_STATUS(cudnnReduceTensor(CudaContext::cudnnHandle(),
+                               reduceDesc,
+                               NULL,
+                               0,
+                               d_workspace,
+                               workspaceSize,
+                               &alpha,
+                               outputDesc,
+                               outputGrad.getImpl()->rawPtr(),
+                               &beta,
+                               tensorDesc,
+                               op.getInput(i)->grad()->getImpl()->rawPtr()));
+
+            CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(tensorDesc));
+            CHECK_CUDNN_STATUS(cudnnDestroyReduceTensorDescriptor(reduceDesc));
+            CHECK_CUDA_STATUS(cudaFree(d_workspace));
+        }
+    }
+}
diff --git a/src/operator/AvgPoolingImpl.cpp b/src/operator/AvgPoolingImpl.cpp
index 6692b342c7f745eede689dd79a9a704bbefa9d77..d1270ee4b0a556e1053f3cfde8d71ec5efbee279 100644
--- a/src/operator/AvgPoolingImpl.cpp
+++ b/src/operator/AvgPoolingImpl.cpp
@@ -9,7 +9,6 @@
  *
  ********************************************************************************/
 
-#include <cassert>
 #include <vector>
 
 #include "aidge/backend/cuda/data/TensorImpl.hpp"
@@ -17,22 +16,22 @@
 #include "aidge/backend/cuda/utils/CudaContext.hpp"
 #include "aidge/backend/cuda/utils/CudaUtils.hpp"
 #include "aidge/operator/AvgPooling.hpp"
+#include "aidge/utils/ErrorHandling.hpp"
 #include "aidge/utils/Types.h"
 
 template <Aidge::DimIdx_t DIM>
 void Aidge::AvgPoolingImpl_cuda<DIM>::forward() {
-    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    const AvgPooling_Op<DIM>& op = dynamic_cast<const AvgPooling_Op<DIM>&>(mOp);
 
-    assert(mOp.getRawInput(0) && "missing input #0");
+    AIDGE_ASSERT(mOp.getRawInput(0), "missing input #0");
 
     const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0));
 
     // Lazy-initialize CuDNN AvgPooling descriptor
     if (mAvgPoolingDesc == nullptr) {
-        const AvgPooling_Op<DIM>& avgPoolingOp = static_cast<const AvgPooling_Op<DIM>&>(op);
-        const std::vector<int> strides(avgPoolingOp.template getAttr<AvgPoolingAttr::StrideDims>().begin(), avgPoolingOp.template getAttr<AvgPoolingAttr::StrideDims>().end());
+        const std::vector<int> strides(op.strideDims().begin(), op.strideDims().end());
         const std::vector<int> paddings(DIM, 0);
-        const std::vector<int> window_dims(avgPoolingOp.template getAttr<AvgPoolingAttr::KernelDims>().begin(), avgPoolingOp.template getAttr<AvgPoolingAttr::KernelDims>().end());
+        const std::vector<int> window_dims(op.kernelDims().begin(), op.kernelDims().end());
 
         CHECK_CUDNN_STATUS(cudnnCreatePoolingDescriptor(&mAvgPoolingDesc));
         CHECK_CUDNN_STATUS(
@@ -59,7 +58,7 @@ void Aidge::AvgPoolingImpl_cuda<DIM>::forward() {
 template <Aidge::DimIdx_t DIM>
 template <class T>
 void Aidge::AvgPoolingImpl_cuda<DIM>::forward_(const Tensor& input) {
-    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    const AvgPooling_Op<DIM>& op = dynamic_cast<const AvgPooling_Op<DIM>&>(mOp);
     const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f;
     const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f;
     CHECK_CUDNN_STATUS(
@@ -71,11 +70,53 @@ void Aidge::AvgPoolingImpl_cuda<DIM>::forward_(const Tensor& input) {
             input.getImpl()->rawPtr(),
             &beta,
             std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
-            std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr()
+            op.getOutput(0)->getImpl()->rawPtr()
         )
     );
 }
 
+template <Aidge::DimIdx_t DIM>
+void Aidge::AvgPoolingImpl_cuda<DIM>::backward() {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+
+    AIDGE_ASSERT(mAvgPoolingDesc != nullptr, "AvgPool descriptor must be created during forward!");
+    AIDGE_ASSERT(op.getOutput(0)->grad(), "missing output grad #0");
+
+    const auto& output_grad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getOutput(0)->grad());
+
+    // Do the actual backward computation
+    // Template is only for scaling parameters, which are always in float
+    // except when the operation is performed in double precision.
+    if (op.getInput(0)->grad()->dataType() == DataType::Float64) {
+        backward_<double>(output_grad);
+    }
+    else {
+        backward_<float>(output_grad);
+    }
+}
+
+template <Aidge::DimIdx_t DIM>
+template <class T>
+void Aidge::AvgPoolingImpl_cuda<DIM>::backward_(const Tensor& output_grad) {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+
+    const T alpha = 1.0f;
+    const T beta = 0.0f;
+    CHECK_CUDNN_STATUS(
+        cudnnPoolingBackward(CudaContext::cudnnHandle(),
+                             mAvgPoolingDesc,
+                             &alpha,
+                             std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
+                             std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr(),
+                             std::dynamic_pointer_cast<TensorImpl_cuda_>(output_grad.getImpl())->getCudnnTensorDesc(output_grad),
+                             output_grad.getImpl()->rawPtr(),
+                             std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->getImpl())->getCudnnTensorDesc(*op.getInput(0)),
+                             op.getInput(0)->getImpl()->rawPtr(),
+                             &beta,
+                             std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->grad()->getImpl())->getCudnnTensorDesc(*op.getInput(0)),
+                             op.getInput(0)->grad()->getImpl()->rawPtr()));
+}
+
 template <Aidge::DimIdx_t DIM>
 Aidge::AvgPoolingImpl_cuda<DIM>::~AvgPoolingImpl_cuda() {
     if(mAvgPoolingDesc != nullptr)
diff --git a/src/operator/BatchNormImpl.cpp b/src/operator/BatchNormImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5cf079326a0ea003fb72875bcaebefe847086ecb
--- /dev/null
+++ b/src/operator/BatchNormImpl.cpp
@@ -0,0 +1,220 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <cassert>
+#include <cuda_runtime.h>
+#include <cudnn.h>
+#include <vector>
+
+#include "aidge/backend/cuda/data/TensorImpl.hpp"
+#include "aidge/backend/cuda/operator/BatchNormImpl.hpp"
+#include "aidge/backend/cuda/utils/CudaContext.hpp"
+#include "aidge/operator/BatchNorm.hpp"
+#include "aidge/utils/Types.h"
+
+template <Aidge::DimIdx_t DIM>
+void Aidge::BatchNormImpl_cuda<DIM>::forward() {
+    AIDGE_ASSERT(mOp.getRawInput(0), "missing input #0");
+    AIDGE_ASSERT(mOp.getRawInput(1), "missing input #1");
+    AIDGE_ASSERT(mOp.getRawInput(2), "missing input #2");
+    AIDGE_ASSERT(mOp.getRawInput(3), "missing input #3");
+    AIDGE_ASSERT(mOp.getRawInput(4), "missing input #4");
+
+    std::shared_ptr<Tensor> input0Fallback, input1Fallback, input2Fallback, input3Fallback, input4Fallback;
+    const auto& input0 = std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->refCastFrom(input0Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)));
+    const auto& input1 = std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->refCastFrom(input1Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)));
+    const auto& input2 = std::static_pointer_cast<Tensor>(mOp.getRawInput(2))->refCastFrom(input2Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)));
+    const auto& input3 = std::static_pointer_cast<Tensor>(mOp.getRawInput(3))->refCastFrom(input3Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)));
+    const auto& input4 = std::static_pointer_cast<Tensor>(mOp.getRawInput(4))->refCastFrom(input4Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)));
+
+    if (mBNDesc == nullptr)
+    {
+        const BatchNorm_Op<DIM>& bnOp = static_cast<const BatchNorm_Op<DIM>&>(mOp);
+        mEpsilon = static_cast<double>(bnOp.epsilon());
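+        // CUDNN_BATCHNORM_SPATIAL keeps one mean/variance/scale/bias per channel, shared across N, H and W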
+        mMode = CUDNN_BATCHNORM_SPATIAL;
+
+        // CUDNN_BN_MIN_EPSILON is set to 0.0 since cuDNN 7.5.0
+        if (CUDNN_BN_MIN_EPSILON > 0.0 && mEpsilon < CUDNN_BN_MIN_EPSILON) {
+            mEpsilon = CUDNN_BN_MIN_EPSILON;
+        }
+
+        CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&mBNDesc));
+        CHECK_CUDNN_STATUS(cudnnDeriveBNTensorDescriptor(
+            mBNDesc, std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0), mMode));
+
+        cudnnDataType_t dataType;
+        const unsigned int nbDimsRequested = DIM;
+        std::vector<int> dims(nbDimsRequested);
+        std::vector<int> strides(nbDimsRequested);
+        int nbDims;
+        CHECK_CUDNN_STATUS(cudnnGetTensorNdDescriptor(mBNDesc,
+                                                    nbDimsRequested,
+                                                    &dataType,
+                                                    &nbDims,
+                                                    &dims[0],
+                                                    &strides[0]));
+        dims.resize(nbDims);
+        strides.resize(nbDims);
+    }
+
+    switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) {
+        case DataType::Float64:
+            forward_<double>(input0, input1, input2, input3, input4);
+            break;
+        case DataType::Float32:
+            forward_<float>(input0, input1, input2, input3, input4);
+            break;
+        case DataType::Float16:
+            forward_<half>(input0, input1, input2, input3, input4);
+            break;
+        default:
+            AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda");
+    }
+}
+
+template <Aidge::DimIdx_t DIM>
+template <class T>
+void Aidge::BatchNormImpl_cuda<DIM>::forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2, const Tensor& input3, const Tensor& input4) {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f;
+    const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f;
+
+    cudnnTensorDescriptor_t tensorDesc;
+    // For scale, bias, var and mean, if we have a 1D tensor, the dim should go on the channels
+    if (input1.nbDims() == 1)
+    {
+        CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&tensorDesc));
+        const std::vector<int> dims = {1, static_cast<int>(input1.size()), 1, 1};
+        const std::vector<int> strides = {static_cast<int>(input1.size()), 1, 1, 1};
+        CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc, CudaContext::data_type<T>::value, dims.size(), dims.data(), strides.data()));
+    }
+    else {
+        tensorDesc = std::dynamic_pointer_cast<TensorImpl_cuda_>(input1.getImpl())->getCudnnTensorDesc(input1);
+    }
+    CHECK_CUDNN_STATUS(
+        cudnnBatchNormalizationForwardInference(
+                CudaContext::cudnnHandle(),
+                mMode,
+                &alpha,
+                &beta,
+                std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0),
+                input0.getImpl()->rawPtr(),
+                std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
+                std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr(),
+                tensorDesc,
+                input1.getImpl()->rawPtr(),
+                input2.getImpl()->rawPtr(),
+                input3.getImpl()->rawPtr(),
+                input4.getImpl()->rawPtr(),
+                mEpsilon)
+    );
+    if (input1.nbDims() == 1)
+    {
+        CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(tensorDesc));
+    }
+}
+
+template <Aidge::DimIdx_t DIM>
+void Aidge::BatchNormImpl_cuda<DIM>::backward() {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    AIDGE_ASSERT(mBNDesc != nullptr, "BatchNorm descriptor must be created during forward!");
+    for (IOIndex_t i = 0; i < (op.nbInputs() - 2); ++i) {
+        AIDGE_ASSERT(op.getInput(i), "missing input # {} in BatchNorm operator", i);
+        AIDGE_ASSERT(op.getInput(i)->hasImpl(), "cannot run BatchNorm backward because the {}-th input has no implementation.", i);
+    }
+    AIDGE_ASSERT(op.getOutput(0)->grad(), "missing outputGrad in BatchNorm operator");
+    AIDGE_ASSERT(op.getOutput(0)->grad()->hasImpl(), "cannot run BatchNorm backward because the output grad has no implementation.");
+
+    std::shared_ptr<Tensor> input0Fallback, input1Fallback, input2Fallback, outputGradFallback;
+    const auto& input0 = std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->refCastFrom(input0Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)));
+    const auto& weights = std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->refCastFrom(input1Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)));
+    const auto& bias = std::static_pointer_cast<Tensor>(mOp.getRawInput(2))->refCastFrom(input2Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)));
+    const auto& outputGrad = op.getOutput(0)->grad()->refCastFrom(outputGradFallback, *op.getOutput(0)->grad());
+
+    switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) {
+        case DataType::Float64:
+            backward_<double>(input0, outputGrad, weights);
+            break;
+        case DataType::Float32:
+            backward_<float>(input0, outputGrad, weights);
+            break;
+        case DataType::Float16:
+            backward_<half>(input0, outputGrad, weights);
+            break;
+        default:
+            AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda");
+    }
+}
+
+template <Aidge::DimIdx_t DIM>
+template <class T>
+void Aidge::BatchNormImpl_cuda<DIM>::backward_(const Tensor& input0, const Tensor& outputGrad, const Tensor& weights) {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f;
+    const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f;
+    const typename Cuda::cudnn_scaling_type<T>::type alphaData = 1.0f;
+    const typename Cuda::cudnn_scaling_type<T>::type betaData = 0.0f;
+
+    cudnnTensorDescriptor_t scaleBiasDesc;
+    // For scale, bias, mean and variance: a 1D tensor of size C must be
+    // mapped onto the channel dimension, i.e. wrapped as a 1 x C x 1 x 1 tensor
+    if (weights.nbDims() == 1)
+    {
+        CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&scaleBiasDesc));
+        const std::vector<int> dims = {1, static_cast<int>(weights.size()), 1, 1};
+        const std::vector<int> strides = {static_cast<int>(weights.size()), 1, 1, 1};
+        CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(scaleBiasDesc, CudaContext::data_type<T>::value, dims.size(), dims.data(), strides.data()));
+    }
+    else {
+        scaleBiasDesc = std::dynamic_pointer_cast<TensorImpl_cuda_>(weights.getImpl())->getCudnnTensorDesc(weights);
+    }
+
+    CHECK_CUDNN_STATUS(
+        cudnnBatchNormalizationBackward(
+                CudaContext::cudnnHandle(),
+                mMode,
+                &alphaData,
+                &betaData,
+                &alpha,
+                &beta,
+                std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0),
+                input0.getImpl()->rawPtr(),
+                std::dynamic_pointer_cast<TensorImpl_cuda_>(outputGrad.getImpl())->getCudnnTensorDesc(outputGrad),
+                outputGrad.getImpl()->rawPtr(),
+                std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->grad()->getImpl())->getCudnnTensorDesc(*op.getInput(0)),
+                op.getInput(0)->grad()->getImpl()->rawPtr(),
+                scaleBiasDesc,
+                weights.getImpl()->rawPtr(),
+                op.getInput(1)->grad()->getImpl()->rawPtr(),
+                op.getInput(2)->grad()->getImpl()->rawPtr(),
+                mEpsilon,
+                nullptr,
+                nullptr) // TODO add savedMean and savedVar?
+    );
+    if (weights.nbDims() == 1)
+    {
+        CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(scaleBiasDesc));
+    }
+}
+
+template <Aidge::DimIdx_t DIM>
+Aidge::BatchNormImpl_cuda<DIM>::~BatchNormImpl_cuda() {
+    if(mBNDesc != nullptr)
+    {
+        cudnnDestroyTensorDescriptor(mBNDesc);
+    }
+}
+
+// Template declarations
+template class Aidge::BatchNormImpl_cuda<2>;
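Note on the 1 x C x 1 x 1 wrapping used in BatchNormImpl_cuda::forward_() and backward_(): cuDNN batch normalization expects the per-channel parameters (scale, bias, mean, variance) as 4D tensors. A minimal standalone sketch of that wrapping, with a hypothetical helper name:

    // Hypothetical helper: wrap a per-channel vector of length C into a
    // 1 x C x 1 x 1 cuDNN descriptor, as the implementation above does inline.
    cudnnTensorDescriptor_t makeChannelDesc(int C, cudnnDataType_t dtype) {
        cudnnTensorDescriptor_t desc;
        CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&desc));
        const int dims[4]    = {1, C, 1, 1};
        const int strides[4] = {C, 1, 1, 1};
        CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(desc, dtype, 4, dims, strides));
        return desc; // caller must call cudnnDestroyTensorDescriptor(desc)
    }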
diff --git a/src/operator/ConvImpl.cpp b/src/operator/ConvImpl.cpp
index c0c32d3bbb758c9403577c84500bfe951e5e1a96..b627f69a289340b42e1de4baa6bb09d1ea2e5e99 100644
--- a/src/operator/ConvImpl.cpp
+++ b/src/operator/ConvImpl.cpp
@@ -9,13 +9,15 @@
  *
  ********************************************************************************/
 
+#include "aidge/backend/cuda/operator/ConvImpl.hpp"
+
 #include <cassert>
 #include <vector>
 
 #include "aidge/backend/cuda/data/TensorImpl.hpp"
-#include "aidge/backend/cuda/operator/ConvImpl.hpp"
 #include "aidge/backend/cuda/utils/CudaUtils.hpp"
 #include "aidge/operator/Conv.hpp"
+#include "aidge/operator/ConvDepthWise.hpp"
 #include "aidge/utils/Types.h"
 
 template <Aidge::DimIdx_t DIM>
@@ -33,19 +35,27 @@ void Aidge::ConvImpl_cuda<DIM>::forward() {
 
     // Lazy-initialize CuDNN convolution descriptor
     if (mConvDesc == nullptr) {
-        const Conv_Op<DIM>& convOp = static_cast<const Conv_Op<DIM>&>(mOp);
-        const std::vector<int> strides(convOp.template getAttr<ConvAttr::StrideDims>().begin(), convOp.template getAttr<ConvAttr::StrideDims>().end());
         const std::vector<int> paddings(DIM, 0);
-        const std::vector<int> upscales(convOp.template getAttr<ConvAttr::DilationDims>().begin(), convOp.template getAttr<ConvAttr::DilationDims>().end());
-
-        CHECK_CUDNN_STATUS(cudnnCreateConvolutionDescriptor(&mConvDesc));
-        CHECK_CUDNN_STATUS(cudnnSetConvolutionNdDescriptor(mConvDesc,
-            DIM,
-            &paddings[0],
-            &strides[0],
-            &upscales[0],
-            CUDNN_CROSS_CORRELATION,
-            DataTypeToCudnn(op.getOutput(0)->dataType())));
+        std::vector<int> strides, upscales;
+        if (mDepthWise) {
+            const ConvDepthWise_Op<DIM>& convDWOp = static_cast<const ConvDepthWise_Op<DIM>&>(mOp);
+            strides = std::vector<int>(convDWOp.strideDims().begin(), convDWOp.strideDims().end());
+            upscales = std::vector<int>(convDWOp.dilationDims().begin(), convDWOp.dilationDims().end());
+        }
+        else {
+            const Conv_Op<DIM>& convOp = static_cast<const Conv_Op<DIM>&>(mOp);
+            strides = std::vector<int>(convOp.strideDims().begin(), convOp.strideDims().end());
+            upscales = std::vector<int>(convOp.dilationDims().begin(), convOp.dilationDims().end());
+        }
+
+        CHECK_CUDNN_STATUS(cudnnCreateConvolutionDescriptor(&mConvDesc));
+        CHECK_CUDNN_STATUS(cudnnSetConvolutionNdDescriptor(mConvDesc,
+            DIM,
+            &paddings[0],
+            &strides[0],
+            &upscales[0],
+            CUDNN_CROSS_CORRELATION,
+            DataTypeToCudnn(op.getOutput(0)->dataType())));
     }
 
     // Lazy-initialize CuDNN filter descriptor
@@ -339,4 +349,5 @@ Aidge::ConvImpl_cuda<DIM>::~ConvImpl_cuda() {
 
 
 // Template declarations
+template class Aidge::ConvImpl_cuda<1>;
 template class Aidge::ConvImpl_cuda<2>;
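Both branches above only differ in where the stride and dilation attributes are read from; the cuDNN descriptor setup itself is shared. For reference, a minimal standalone 2D example of that setup (values illustrative, padding handled by the separate Pad operator):

    // Illustrative 2D convolution descriptor setup, mirroring the shared
    // code path above; stride/dilation values are made up for the example.
    cudnnConvolutionDescriptor_t convDesc;
    CHECK_CUDNN_STATUS(cudnnCreateConvolutionDescriptor(&convDesc));
    const int pads[2]      = {0, 0};
    const int strides[2]   = {2, 2};
    const int dilations[2] = {1, 1};
    CHECK_CUDNN_STATUS(cudnnSetConvolutionNdDescriptor(
        convDesc, 2, pads, strides, dilations,
        CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT));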
diff --git a/src/operator/FCImpl.cpp b/src/operator/FCImpl.cpp
index 8b60f7fd6aa41f206b2c6eaa5d8f8daa1bd81374..9948ee1356ad4fedb5d830016ae66ca69a033e38 100644
--- a/src/operator/FCImpl.cpp
+++ b/src/operator/FCImpl.cpp
@@ -24,27 +24,26 @@
 #include "aidge/utils/Types.h"
 
 void Aidge::FCImpl_cuda::forward() {
-    assert(mOp.getRawInput(0) && "missing input #0");
-    assert(mOp.getRawInput(1) && "missing input #1");
-    assert(mOp.getRawInput(2) && "missing input #2");
+    AIDGE_ASSERT(mOp.getRawInput(0), "missing input #0");
+    AIDGE_ASSERT(mOp.getRawInput(1), "missing input #1");
 
     const auto& fcOp = static_cast<const FC_Op&>(mOp);
-    bool noBias = fcOp.template getAttr<FCAttr::NoBias>();
-    std::size_t outChannels = static_cast<std::size_t>(fcOp.template getAttr<FCAttr::OutChannels>());
+    std::size_t outChannels = fcOp.outChannels();
 
     const auto& input0 = fcOp.getInput(0)->refCastFrom(mInput0Fallback, *fcOp.getOutput(0));
     const auto& input1 = fcOp.getInput(1)->refCastFrom(mInput1Fallback, *fcOp.getOutput(0));
-    const auto& input2 = fcOp.getInput(2)->refCastFrom(mInput2Fallback, *fcOp.getOutput(0));
+    const auto& input2 = (fcOp.getInput(2)) ? fcOp.getInput(2)->refCastFrom(mInput2Fallback, *fcOp.getOutput(0)) : Tensor();
 
     switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) {
         case DataType::Float64:
-            forward_<double>(input0, input1, input2, noBias, outChannels);
+            forward_<double>(input0, input1, input2, outChannels);
             break;
         case DataType::Float32:
-            forward_<float>(input0, input1, input2, noBias, outChannels);
+            forward_<float>(input0, input1, input2, outChannels);
             break;
         case DataType::Float16:
-            forward_<half>(input0, input1, input2, noBias, outChannels);
+            forward_<half>(input0, input1, input2, outChannels);
             break;
         default:
             AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda");
@@ -52,7 +51,7 @@ void Aidge::FCImpl_cuda::forward() {
 }
 
 template<class T>
-void Aidge::FCImpl_cuda::forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2, bool noBias, std::size_t outChannels)
+void Aidge::FCImpl_cuda::forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2, std::size_t outChannels)
 {
     const T * input = static_cast<const T*>(input0.getImpl()->rawPtr());
     const T * weights = static_cast<const T*>(input1.getImpl()->rawPtr());
@@ -67,8 +66,8 @@ void Aidge::FCImpl_cuda::forward_(const Tensor& input0, const Tensor& input1, co
     int lda = k;  // leading dimension of weights
     int ldb = k;  // leading dimension of input
     int ldc = n;  // leading dimension of output
-    const T alpha = 1.0f;
-    const T beta = 0.0f;
+    const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f;
+    const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f;
     CHECK_CUBLAS_STATUS(cublasGemm(CudaContext::cublasHandle(),
                                     CUBLAS_OP_T,
                                     CUBLAS_OP_N,
@@ -78,13 +77,13 @@ void Aidge::FCImpl_cuda::forward_(const Tensor& input0, const Tensor& input1, co
                                     reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&alpha),
                                     weights,
                                     ldb,
-                                    input, 
+                                    input,
                                     lda,
                                     reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&beta),
                                     output,
                                     ldc));
 
-    if(!noBias){
+    if(!input2.empty()){
         T* onesVector;
         CHECK_CUDA_STATUS(cudaMalloc((void**)&onesVector, m * sizeof(T)));
         // Fill the vector with ones
@@ -114,4 +113,109 @@ void Aidge::FCImpl_cuda::forward_(const Tensor& input0, const Tensor& input1, co
         cudaFree(onesVector);
     }
 
-}
\ No newline at end of file
+}
+
+void Aidge::FCImpl_cuda::backward() {
+    AIDGE_ASSERT(mOp.getRawInput(0), "missing input #0");
+    AIDGE_ASSERT(mOp.getRawInput(1), "missing input #1");
+
+    const auto& fcOp = static_cast<const FC_Op&>(mOp);
+    std::size_t outChannels = fcOp.outChannels();
+
+    const auto& input0 = fcOp.getInput(0)->refCastFrom(mInput0Fallback, *fcOp.getOutput(0));
+    const auto& input1 = fcOp.getInput(1)->refCastFrom(mInput1Fallback, *fcOp.getOutput(0));
+    const auto& input2 = (fcOp.getInput(2)) ? fcOp.getInput(2)->refCastFrom(mInput2Fallback, *fcOp.getOutput(0)) : Tensor();
+
+    switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) {
+        case DataType::Float64:
+            backward_<double>(input0, input1, input2, outChannels);
+            break;
+        case DataType::Float32:
+            backward_<float>(input0, input1, input2, outChannels);
+            break;
+        case DataType::Float16:
+            backward_<half>(input0, input1, input2, outChannels);
+            break;
+        default:
+            AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda");
+    }
+}
+
+template<class T>
+void Aidge::FCImpl_cuda::backward_(const Tensor& input0, const Tensor& input1, const Tensor& input2, std::size_t outChannels)
+{
+    const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f;
+    const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f;
+    const typename Cuda::cudnn_scaling_type<T>::type betaData = 0.0f;
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    const T * input = static_cast<const T*>(input0.getImpl()->rawPtr());
+    const T * weights = static_cast<const T*>(input1.getImpl()->rawPtr());
+    const T * outputGrad = static_cast<const T*>(op.getOutput(0)->grad()->getImpl()->rawPtr());
+    T * weightsGrad = static_cast<T*>(op.getInput(1)->grad()->getImpl()->rawPtr());
+
+    // Flattened, column-major views: input0 is (m x k), weights are (m x n)
+    // and outputGrad is (n x k), with m = nb input features, k = batch size
+    // and n = nb output channels (the row-major shapes are the transposes).
+    int m = input0.dims()[input0.nbDims()-1];
+    int k = input0.size()/m;
+    int n = input1.size()/m;
+
+    // Performing weightsGrad (m x n) = input0 (m x k) * T(outputGrad) (k x n)
+    CHECK_CUBLAS_STATUS(cublasGemm(
+        CudaContext::cublasHandle(),
+        CUBLAS_OP_N,
+        CUBLAS_OP_T,
+        m,
+        n,
+        k,
+        reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&alpha),
+        input,
+        m,
+        outputGrad,
+        n,
+        reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&beta),
+        weightsGrad,
+        m));
+
+    if(!input2.empty()){
+        T * biasGrad = static_cast<T*>(op.getInput(2)->grad()->getImpl()->rawPtr());
+        T* onesVector;
+        CHECK_CUDA_STATUS(cudaMalloc((void**)&onesVector, k * sizeof(T)));
+        // Fill the vector with ones (one entry per batch element)
+        std::vector<T> onesVec(k, T(1.0));
+        CHECK_CUDA_STATUS(cudaMemcpy(onesVector,
+                                    &onesVec[0],
+                                    k * sizeof(T),
+                                    cudaMemcpyHostToDevice));
+        // Performing biasGrad (n) = outputGrad (n x k) * onesVector (k)
+        CHECK_CUBLAS_STATUS(cublasGemv(CudaContext::cublasHandle(),
+                                       CUBLAS_OP_N,
+                                       outChannels,
+                                       k,
+                                       reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&alpha),
+                                       outputGrad,
+                                       outChannels,
+                                       onesVector,
+                                       1,
+                                       reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&beta),
+                                       biasGrad,
+                                       1));
+        cudaFree(onesVector);
+    }
+    // Performing inputGrad (m x k) = weights (m x n) * outputGrad (n x k)
+    CHECK_CUBLAS_STATUS(cublasGemm(
+        CudaContext::cublasHandle(),
+        CUBLAS_OP_N,
+        CUBLAS_OP_N,
+        op.getInput(1)->grad()->size()/outChannels,
+        k,
+        outChannels,
+        reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&alpha),
+        weights,//w
+        op.getInput(1)->grad()->size()/outChannels,
+        outputGrad,//dY
+        outChannels,
+        reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&betaData),
+        static_cast<T*>(op.getInput(0)->grad()->getImpl()->rawPtr()),//dX
+        op.getInput(1)->grad()->size()/outChannels));
+
+}
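As a sanity check for the shapes in the three cuBLAS calls above, a naive row-major CPU reference (a verification sketch, not the backend implementation), with X [batch x inF], W [outC x inF] and dY [batch x outC]:

    // Naive FC backward reference: dW = dY^T * X, dB = column sums of dY,
    // dX = dY * W (all row-major).
    void fcBackwardRef(int batch, int inF, int outC,
                       const float* X, const float* W, const float* dY,
                       float* dW, float* dB, float* dX) {
        for (int o = 0; o < outC; ++o) {
            dB[o] = 0.f;
            for (int b = 0; b < batch; ++b)
                dB[o] += dY[b*outC + o];
            for (int i = 0; i < inF; ++i) {
                float s = 0.f;
                for (int b = 0; b < batch; ++b)
                    s += dY[b*outC + o] * X[b*inF + i];
                dW[o*inF + i] = s;
            }
        }
        for (int b = 0; b < batch; ++b)
            for (int i = 0; i < inF; ++i) {
                float s = 0.f;
                for (int o = 0; o < outC; ++o)
                    s += dY[b*outC + o] * W[o*inF + i];
                dX[b*inF + i] = s;
            }
    }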
diff --git a/src/operator/FCImpl_CUDA_kernels.cu b/src/operator/FCImpl_CUDA_kernels.cu
index 5139ac1d7edf61cf347870e6add2870b2792a0e5..90b7c6bbc9cf0f470a7184e24273aae04da6f3c6 100644
--- a/src/operator/FCImpl_CUDA_kernels.cu
+++ b/src/operator/FCImpl_CUDA_kernels.cu
@@ -73,4 +73,60 @@ cublasStatus_t cublasGemm<double>(cublasHandle_t handle,
 					   beta,
 					   C, ldc);
 }
+
+template <>
+cublasStatus_t cublasGemv<__half>(cublasHandle_t handle, cublasOperation_t trans,
+                                 int m, int n,
+                                 const __half          *alpha,
+                                 const __half          *A, int lda,
+                                 const __half          *x, int incx,
+                                 const __half          *beta,
+                                 __half          *y, int incy)
+{
+    // Using cublasHgemm() because there is no cublasHgemv() yet.
+    // x and y are treated as single-column matrices, so this emulation is
+    // only valid for incx == incy == 1; op(A) is (opRows x opCols).
+    (void)incx; (void)incy;
+    const int opRows = (trans == CUBLAS_OP_N) ? m : n;
+    const int opCols = (trans == CUBLAS_OP_N) ? n : m;
+    return cublasHgemm(handle,
+        trans, CUBLAS_OP_N,
+        opRows, 1, opCols,
+        alpha,
+        A, lda,
+        x, opCols,
+        beta,
+        y, opRows);
+}
+
+template <>
+cublasStatus_t cublasGemv<float>(cublasHandle_t handle, cublasOperation_t trans,
+                                 int m, int n,
+                                 const float          *alpha,
+                                 const float          *A, int lda,
+                                 const float          *x, int incx,
+                                 const float          *beta,
+                                 float          *y, int incy)
+{
+    return cublasSgemv(handle, trans,
+        m, n,
+        alpha,
+        A, lda,
+        x, incx,
+        beta,
+        y, incy);
+}
+
+template <>
+cublasStatus_t cublasGemv<double>(cublasHandle_t handle, cublasOperation_t trans,
+                                 int m, int n,
+                                 const double          *alpha,
+                                 const double          *A, int lda,
+                                 const double          *x, int incx,
+                                 const double          *beta,
+                                 double          *y, int incy)
+{
+    return cublasDgemv(handle, trans,
+        m, n,
+        alpha,
+        A, lda,
+        x, incx,
+        beta,
+        y, incy);
+}
 }
\ No newline at end of file
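These specializations give the FC backward path a single cublasGemv<T>() entry point for every supported precision. An illustrative call site (variable names assumed, not from this patch):

    // Illustrative use of the dispatch template: devA is an m x n column-major
    // device matrix, devX a length-n device vector, devY a length-m device vector.
    const float alpha = 1.0f;
    const float beta  = 0.0f;
    CHECK_CUBLAS_STATUS(cublasGemv<float>(CudaContext::cublasHandle(),
                                          CUBLAS_OP_N, m, n,
                                          &alpha, devA, m, devX, 1,
                                          &beta, devY, 1));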
diff --git a/src/operator/GlobalAveragePoolingImpl.cpp b/src/operator/GlobalAveragePoolingImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8c83d477094d9cce41807d888cca57bd614e9cc6
--- /dev/null
+++ b/src/operator/GlobalAveragePoolingImpl.cpp
@@ -0,0 +1,111 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <cassert>
+#include <vector>
+
+#include "aidge/backend/cuda/data/TensorImpl.hpp"
+#include "aidge/backend/cuda/operator/GlobalAveragePoolingImpl.hpp"
+#include "aidge/backend/cuda/utils/CudaContext.hpp"
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+#include "aidge/operator/GlobalAveragePooling.hpp"
+#include "aidge/utils/Types.h"
+
+void Aidge::GlobalAveragePoolingImpl_cuda::forward() {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+
+    AIDGE_ASSERT(mOp.getRawInput(0), "missing input #0");
+
+    const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0));
+
+    // Lazy-initialize CuDNN GlobalAveragePooling descriptor
+    if (mGlobalAveragePoolingDesc == nullptr) {
+        int poolingDims = 2; // Assuming 2D pooling
+        int windowDims[2] = {static_cast<int>(input.dims().at(2)), static_cast<int>(input.dims().at(3))}; // Pooling window dimensions matching spatial dimensions of input tensor
+        int padding[2] = {0, 0}; // No padding
+        int stride[2] = {1, 1}; // Stride of 1
+        CHECK_CUDNN_STATUS(cudnnCreatePoolingDescriptor(&mGlobalAveragePoolingDesc));
+        CHECK_CUDNN_STATUS(
+            cudnnSetPoolingNdDescriptor(mGlobalAveragePoolingDesc, mMode, CUDNN_NOT_PROPAGATE_NAN, poolingDims, windowDims, padding, stride)
+        );
+    }
+
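+    // Do the actual forward computation
+    // Template is only for scaling parameters, which are always in float
+    // except when the operation is performed in double precision.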
+    if (op.getOutput(0)->dataType() == DataType::Float64) {
+        forward_<double>(input);
+    }
+    else {
+        forward_<float>(input);
+    }
+}
+
+template <class T>
+void Aidge::GlobalAveragePoolingImpl_cuda::forward_(const Tensor& input) {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f;
+    const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f;
+
+    CHECK_CUDNN_STATUS(
+        cudnnPoolingForward(
+            CudaContext::cudnnHandle(),
+            mGlobalAveragePoolingDesc,
+            &alpha,
+            std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input),
+            input.getImpl()->rawPtr(),
+            &beta,
+            std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
+            std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr()
+        )
+    );
+}
+
+void Aidge::GlobalAveragePoolingImpl_cuda::backward() {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+
+    AIDGE_ASSERT(mGlobalAveragePoolingDesc != nullptr, "GlobalAvgPool descriptor must be created during forward!");
+    AIDGE_ASSERT(op.getOutput(0)->grad(), "missing output grad #0");
+
+    const auto& output_grad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getOutput(0)->grad());
+
+    if (op.getOutput(0)->dataType() == DataType::Float64) {
+        backward_<double>(output_grad);
+    }
+    else {
+        backward_<float>(output_grad);
+    }
+}
+
+template <class T>
+void Aidge::GlobalAveragePoolingImpl_cuda::backward_(const Tensor& output_grad) {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+
+    const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f;
+    const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f;
+    CHECK_CUDNN_STATUS(
+        cudnnPoolingBackward(CudaContext::cudnnHandle(),
+                             mGlobalAveragePoolingDesc,
+                             &alpha,
+                             std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
+                             std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr(),
+                             std::dynamic_pointer_cast<TensorImpl_cuda_>(output_grad.getImpl())->getCudnnTensorDesc(output_grad),
+                             output_grad.getImpl()->rawPtr(),
+                             std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->getImpl())->getCudnnTensorDesc(*op.getInput(0)),
+                             op.getInput(0)->getImpl()->rawPtr(),
+                             &beta,
+                             std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->grad()->getImpl())->getCudnnTensorDesc(*op.getInput(0)),
+                             op.getInput(0)->grad()->getImpl()->rawPtr()));
+}
+
+Aidge::GlobalAveragePoolingImpl_cuda::~GlobalAveragePoolingImpl_cuda() {
+    if(mGlobalAveragePoolingDesc != nullptr)
+        cudnnDestroyPoolingDescriptor(mGlobalAveragePoolingDesc);
+}
+
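The descriptor above implements global average pooling as a regular cuDNN average pooling whose window equals the input's spatial dimensions. A naive NCHW CPU equivalent, as a verification sketch:

    // Naive CPU equivalent of the forward call above (NCHW layout).
    void globalAvgPoolRef(int N, int C, int H, int W,
                          const float* in, float* out) {
        for (int n = 0; n < N; ++n)
            for (int c = 0; c < C; ++c) {
                double sum = 0.0;
                const float* map = in + (n*C + c) * H * W;
                for (int i = 0; i < H * W; ++i)
                    sum += map[i];
                out[n*C + c] = static_cast<float>(sum / (H * W));
            }
    }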
diff --git a/src/operator/MaxPoolingImpl.cpp b/src/operator/MaxPoolingImpl.cpp
index de41915e7506cd121f25a6112252ecea92b047d5..39050635102ebebaed8192cb4bb338e2bc31d5e8 100644
--- a/src/operator/MaxPoolingImpl.cpp
+++ b/src/operator/MaxPoolingImpl.cpp
@@ -21,18 +21,17 @@
 
 template <Aidge::DimIdx_t DIM>
 void Aidge::MaxPoolingImpl_cuda<DIM>::forward() {
-    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    const MaxPooling_Op<DIM>& op_ = static_cast<const MaxPooling_Op<DIM>&>(mOp);
 
-    assert(mOp.getRawInput(0) && "missing input #0");
+    AIDGE_ASSERT(mOp.getRawInput(0), "missing input #0");
 
-    const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0));
+    const auto& input = op_.getInput(0)->refCastFrom(mInputFallback, *op_.getOutput(0));
 
     // Lazy-initialize CuDNN MaxPooling descriptor
     if (mMaxPoolingDesc == nullptr) {
-        const MaxPooling_Op<DIM>& maxPoolingOp = static_cast<const MaxPooling_Op<DIM>&>(op);
-        const std::vector<int> strides(maxPoolingOp.template getAttr<MaxPoolingAttr::StrideDims>().begin(), maxPoolingOp.template getAttr<MaxPoolingAttr::StrideDims>().end());
+        const std::vector<int> strides(op_.strideDims().begin(), op_.strideDims().end());
         const std::vector<int> paddings(DIM, 0);
-        const std::vector<int> window_dims(maxPoolingOp.template getAttr<MaxPoolingAttr::KernelDims>().begin(), maxPoolingOp.template getAttr<MaxPoolingAttr::KernelDims>().end());
+        const std::vector<int> window_dims(op_.kernelDims().begin(), op_.kernelDims().end());
 
         CHECK_CUDNN_STATUS(cudnnCreatePoolingDescriptor(&mMaxPoolingDesc));
         CHECK_CUDNN_STATUS(
@@ -45,10 +44,11 @@ void Aidge::MaxPoolingImpl_cuda<DIM>::forward() {
                                         &strides[0]));
     }
 
     // Do the actual forward computation
     // Template is only for scaling parameters, which are always in float
     // excepted when the convolution is performed in double precision.
-    if (op.getOutput(0)->dataType() == DataType::Float64) {
+    if (op_.getOutput(0)->dataType() == DataType::Float64) {
         forward_<double>(input);
     }
     else {
@@ -59,7 +59,7 @@ void Aidge::MaxPoolingImpl_cuda<DIM>::forward() {
 template <Aidge::DimIdx_t DIM>
 template <class T>
 void Aidge::MaxPoolingImpl_cuda<DIM>::forward_(const Tensor& input) {
-    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    const MaxPooling_Op<DIM>& op_ = static_cast<const MaxPooling_Op<DIM>&>(mOp);
     const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f;
     const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f;
     CHECK_CUDNN_STATUS(
@@ -70,12 +70,54 @@ void Aidge::MaxPoolingImpl_cuda<DIM>::forward_(const Tensor& input) {
             std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input),
             input.getImpl()->rawPtr(),
             &beta,
-            std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
-            std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr()
+            std::dynamic_pointer_cast<TensorImpl_cuda_>(op_.getOutput(0)->getImpl())->getCudnnTensorDesc(*op_.getOutput(0)),
+            op_.getOutput(0)->getImpl()->rawPtr()
         )
     );
 }
 
+template <Aidge::DimIdx_t DIM>
+void Aidge::MaxPoolingImpl_cuda<DIM>::backward() {
+    const MaxPooling_Op<DIM>& op_ = static_cast<const MaxPooling_Op<DIM>&>(mOp);
+
+    AIDGE_ASSERT(mMaxPoolingDesc != nullptr, "MaxPool descriptor must be created during forward!");
+    AIDGE_ASSERT(op_.getOutput(0)->grad(), "missing output grad #0");
+
+    const auto& output_grad = op_.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op_.getOutput(0)->grad());
+
+    // Do the actual backward computation
+    // Template is only for scaling parameters, which are always in float
+    // except when the operation is performed in double precision.
+    if (op_.getOutput(0)->dataType() == DataType::Float64) {
+        backward_<double>(output_grad);
+    }
+    else {
+        backward_<float>(output_grad);
+    }
+}
+
+template <Aidge::DimIdx_t DIM>
+template <class T>
+void Aidge::MaxPoolingImpl_cuda<DIM>::backward_(const Tensor& output_grad) {
+    const MaxPooling_Op<DIM>& op_ = static_cast<const MaxPooling_Op<DIM>&>(mOp);
+
+    const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f;
+    const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f;
+    CHECK_CUDNN_STATUS(
+        cudnnPoolingBackward(CudaContext::cudnnHandle(),
+                             mMaxPoolingDesc,
+                             &alpha,
+                             std::dynamic_pointer_cast<TensorImpl_cuda_>(op_.getOutput(0)->getImpl())->getCudnnTensorDesc(*op_.getOutput(0)),
+                             op_.getOutput(0)->getImpl()->rawPtr(),
+                             std::dynamic_pointer_cast<TensorImpl_cuda_>(output_grad.getImpl())->getCudnnTensorDesc(output_grad),
+                             output_grad.getImpl()->rawPtr(),
+                             std::dynamic_pointer_cast<TensorImpl_cuda_>(op_.getInput(0)->getImpl())->getCudnnTensorDesc(*op_.getInput(0)),
+                             op_.getInput(0)->getImpl()->rawPtr(),
+                             &beta,
+                             std::dynamic_pointer_cast<TensorImpl_cuda_>(op_.getInput(0)->grad()->getImpl())->getCudnnTensorDesc(*op_.getInput(0)),
+                             op_.getInput(0)->grad()->getImpl()->rawPtr()));
+}
+
 template <Aidge::DimIdx_t DIM>
 Aidge::MaxPoolingImpl_cuda<DIM>::~MaxPoolingImpl_cuda() {
     if(mMaxPoolingDesc != nullptr)
diff --git a/src/operator/PadImpl.cpp b/src/operator/PadImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3606ba66d002f1467aa65771015cab02c066d5a5
--- /dev/null
+++ b/src/operator/PadImpl.cpp
@@ -0,0 +1,137 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include "aidge/backend/cuda/operator/PadImpl.hpp"
+
+#include <vector>
+
+#include "aidge/backend/cuda/data/TensorImpl.hpp"
+#include "aidge/backend/cuda/operator/PadImpl_CUDA_kernels.hpp"
+#include "aidge/backend/cuda/utils/CudaContext.hpp"
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+#include "aidge/operator/Pad.hpp"
+#include "aidge/utils/ErrorHandling.hpp"
+#include "aidge/utils/Types.h"
+
+template <Aidge::DimIdx_t DIM>
+void Aidge::PadImpl_cuda<DIM>::forward()
+{
+    const Pad_Op<DIM> &op = static_cast<const Pad_Op<DIM> &>(mOp);
+
+    AIDGE_ASSERT(op.getInput(0), "missing input in Pad operator");
+    AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run Pad forward because the input has no implementation.");
+
+    const auto &input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0));
+
+    auto paddingBorders = op.beginEndBorders();
+
+    mLeftPad = paddingBorders[2];
+    mTopPad = paddingBorders[0];
+    mPadVal = op.borderValue();
+    mPadType = static_cast<unsigned int>(op.borderType());
+
+    switch (op.getOutput(0)->dataType())
+    {
+    case DataType::Float64:
+        forward_<double>(input);
+        break;
+    case DataType::Float32:
+        forward_<float>(input);
+        break;
+    case DataType::Float16:
+        forward_<half>(input);
+        break;
+    default:
+        AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda");
+    }
+}
+
+template <Aidge::DimIdx_t DIM>
+template <class T>
+void Aidge::PadImpl_cuda<DIM>::forward_(const Tensor &input)
+{
+    const auto outDims = std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims();
+    const T *inputPtr = static_cast<const T *>(input.getImpl()->rawPtr());
+    T *output = static_cast<T *>(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->rawPtr());
+    Aidge::cudaPadding(CudaContext::getDeviceProp(),
+                       outDims[1],
+                       outDims[3],
+                       outDims[2],
+                       input.dims()[1],
+                       input.dims()[0],
+                       input.dims()[3],
+                       input.dims()[2],
+                       mLeftPad,
+                       mTopPad,
+                       mPadType,
+                       static_cast<T>(mPadVal),
+                       inputPtr,
+                       output);
+}
+
+template <Aidge::DimIdx_t DIM>
+void Aidge::PadImpl_cuda<DIM>::backward()
+{
+    const Pad_Op<DIM> &op = static_cast<const Pad_Op<DIM> &>(mOp);
+
+    AIDGE_ASSERT(op.getOutput(0)->grad(), "missing output gradient in Pad operator");
+    AIDGE_ASSERT(op.getOutput(0)->grad(), "cannot run Pad backward, output gradient has no implementation.");
+
+    const auto &outGrad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getInput(0));
+
+    auto paddingBorders = op.beginEndBorders();
+
+    mLeftPad = paddingBorders[2];
+    mTopPad = paddingBorders[0];
+    mPadVal = op.borderValue();
+    mPadType = static_cast<unsigned int>(op.borderType());
+
+    switch (std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType())
+    {
+    case DataType::Float64:
+        backward_<double>(outGrad);
+        break;
+    case DataType::Float32:
+        backward_<float>(outGrad);
+        break;
+    case DataType::Float16:
+        backward_<half>(outGrad);
+        break;
+    default:
+        AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda");
+    }
+}
+
+template <Aidge::DimIdx_t DIM>
+template <class T>
+void Aidge::PadImpl_cuda<DIM>::backward_(const Tensor &outGrad)
+{
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    const auto inputGradDims = op.getInput(0)->grad()->dims();
+    T *inputGrad = static_cast<T *>(op.getInput(0)->grad()->getImpl()->rawPtr());
+    Aidge::cudaPadding(CudaContext::getDeviceProp(),
+                       inputGradDims[1],
+                       inputGradDims[3],
+                       inputGradDims[2],
+                       outGrad.dims()[1],
+                       outGrad.dims()[0],
+                       outGrad.dims()[3],
+                       outGrad.dims()[2],
+                       -mLeftPad,
+                       -mTopPad,
+                       mPadType,
+                       static_cast<T>(mPadVal),
+                       static_cast<const T *>(outGrad.getImpl()->rawPtr()),
+                       inputGrad);
+}
+
+// Template declarations
+template class Aidge::PadImpl_cuda<2>;
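The backward pass above reuses the forward padding kernel with negated offsets: for constant padding, the gradient of a pad is a crop, so reading the output gradient shifted by the negated pads recovers the input gradient. A 1D sketch of that identity:

    // 1D sketch of pad backward as a crop (constant padding): dOut has
    // n + left + right elements, dIn has n.
    void padBackwardRef1d(int n, int left, const float* dOut, float* dIn) {
        for (int i = 0; i < n; ++i)
            dIn[i] = dOut[i + left];
    }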
diff --git a/src/operator/PadImpl_CUDA_kernels.cu b/src/operator/PadImpl_CUDA_kernels.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a20a4c10a6cb5e783a09868389b8f968bc0f42a3
--- /dev/null
+++ b/src/operator/PadImpl_CUDA_kernels.cu
@@ -0,0 +1,224 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include "aidge/backend/cuda/operator/PadImpl_CUDA_kernels.hpp"
+
+template <typename T>
+__global__ void cudaPadding_kernel(unsigned int nbOutputs,
+                                   unsigned int outputWidth,
+                                   unsigned int outputHeight,
+                                   unsigned int nbChannels,
+                                   unsigned int inputWidth,
+                                   unsigned int inputHeight,
+                                   int leftPad,
+                                   int topPad,
+                                   unsigned int padType,
+                                   T padValue,
+                                   const T *input,
+                                   T *outputs)
+{
+    const unsigned int inputOffset = (blockIdx.z * blockDim.z + threadIdx.z) * nbChannels * inputWidth * inputHeight;
+
+    const unsigned int outputOffset = (blockIdx.z * blockDim.z + threadIdx.z) * nbOutputs * outputWidth * outputHeight;
+
+    // nbCh = nbChannels for propagate
+    //      = nbOutputs for back-propagate
+    const unsigned int nbCh = min(nbChannels, nbOutputs);
+
+    for (unsigned int ch = blockIdx.x; ch < nbCh; ch += gridDim.x)
+    {
+        for (unsigned int oy = threadIdx.y; oy < outputHeight; oy += blockDim.y)
+        {
+            for (unsigned int ox = threadIdx.x; ox < outputWidth; ox += blockDim.x)
+            {
+                T outputValue = padValue;
+
+                if (padType == 0) // Const padding
+                {
+                    int ix = (int)ox - leftPad;
+                    int iy = (int)oy - topPad;
+
+                    if (ix >= 0 && ix < (int)inputWidth && iy >= 0 && iy < (int)inputHeight)
+                    {
+                        outputValue = input[ix +
+                                            iy * inputWidth + ch * inputWidth * inputHeight + inputOffset];
+                    }
+                }
+                else if (padType == 1) // Edge padding
+                {
+                    int ix = max(0, min((int)inputWidth - 1, (int)ox - leftPad));
+                    int iy = max(0, min((int)inputHeight - 1, (int)oy - topPad));
+
+                    outputValue = input[ix +
+                                        iy * inputWidth + ch * inputWidth * inputHeight + inputOffset];
+                }
+                else if (padType == 2) // Reflect padding
+                {
+                    int ix = (int)ox - leftPad;
+                    int iy = (int)oy - topPad;
+
+                    if (ix < 0)
+                        ix = 0 - ix;
+                    if (iy < 0)
+                        iy = 0 - iy;
+                    if (ix >= (int)inputWidth)
+                        ix = 2 * (int)inputWidth - ix - 2;  // mirror around the last column
+                    if (iy >= (int)inputHeight)
+                        iy = 2 * (int)inputHeight - iy - 2;  // mirror around the last row
+
+                    outputValue = input[ix +
+                                        iy * inputWidth + ch * inputWidth * inputHeight + inputOffset];
+                }
+                else if (padType == 3) // Wrap padding
+                {
+                    int ix = (inputWidth + (int)ox - leftPad) % inputWidth;
+                    int iy = (inputHeight + (int)oy - topPad) % inputHeight;
+
+                    outputValue = input[ix +
+                                        iy * inputWidth + ch * inputWidth * inputHeight + inputOffset];
+                }
+                outputs[ox + oy * outputWidth + ch * outputWidth * outputHeight + outputOffset] = outputValue;
+            }
+        }
+    }
+}
+
+template <> // double
+void Aidge::cudaPadding(const cudaDeviceProp &deviceProp,
+                        unsigned int nbOutputs,
+                        unsigned int outputsWidth,
+                        unsigned int outputsHeight,
+                        unsigned int nbChannels,
+                        unsigned int batchSize,
+                        unsigned int inputWidth,
+                        unsigned int inputHeight,
+                        int leftPad,
+                        int topPad,
+                        unsigned int padType,
+                        double padValue,
+                        const double *input,
+                        double *outputs)
+{
+    const unsigned int maxSize = (unsigned int)deviceProp.maxThreadsPerBlock;
+    const unsigned int prefMultiple = (unsigned int)deviceProp.warpSize;
+
+    const unsigned int groupSize = (outputsWidth * outputsHeight < maxSize)
+                                       ? outputsWidth * outputsHeight
+                                       : maxSize;
+
+    const unsigned int reqWidth = (unsigned int)ceilf((float)groupSize / (float)outputsWidth);
+
+    const unsigned int groupWidth = min(prefMultiple, reqWidth);
+    const dim3 blocksPerGrid = {nbChannels, 1, batchSize};
+    const dim3 threadsPerBlocks = {groupWidth, groupSize / groupWidth, 1};
+
+    cudaPadding_kernel<<<blocksPerGrid, threadsPerBlocks>>>(nbOutputs,
+                                                            outputsWidth,
+                                                            outputsHeight,
+                                                            nbChannels,
+                                                            inputWidth,
+                                                            inputHeight,
+                                                            leftPad,
+                                                            topPad,
+                                                            padType,
+                                                            padValue,
+                                                            input,
+                                                            outputs);
+    CHECK_CUDA_STATUS(cudaPeekAtLastError());
+}
+
+template <> // float
+void Aidge::cudaPadding(const cudaDeviceProp &deviceProp,
+                        unsigned int nbOutputs,
+                        unsigned int outputsWidth,
+                        unsigned int outputsHeight,
+                        unsigned int nbChannels,
+                        unsigned int batchSize,
+                        unsigned int inputWidth,
+                        unsigned int inputHeight,
+                        int leftPad,
+                        int topPad,
+                        unsigned int padType,
+                        float padValue,
+                        const float *input,
+                        float *outputs)
+{
+    const unsigned int maxSize = (unsigned int)deviceProp.maxThreadsPerBlock;
+    const unsigned int prefMultiple = (unsigned int)deviceProp.warpSize;
+
+    const unsigned int groupSize = (outputsWidth * outputsHeight < maxSize)
+                                       ? outputsWidth * outputsHeight
+                                       : maxSize;
+
+    const unsigned int reqWidth = (unsigned int)ceilf((float)groupSize / (float)outputsWidth);
+
+    const unsigned int groupWidth = min(prefMultiple, reqWidth);
+    const dim3 blocksPerGrid = {nbChannels, 1, batchSize};
+    const dim3 threadsPerBlocks = {groupWidth, groupSize / groupWidth, 1};
+
+    cudaPadding_kernel<<<blocksPerGrid, threadsPerBlocks>>>(nbOutputs,
+                                                            outputsWidth,
+                                                            outputsHeight,
+                                                            nbChannels,
+                                                            inputWidth,
+                                                            inputHeight,
+                                                            leftPad,
+                                                            topPad,
+                                                            padType,
+                                                            padValue,
+                                                            input,
+                                                            outputs);
+    CHECK_CUDA_STATUS(cudaPeekAtLastError());
+}
+
+template <> // half
+void Aidge::cudaPadding(const cudaDeviceProp &deviceProp,
+                        unsigned int nbOutputs,
+                        unsigned int outputsWidth,
+                        unsigned int outputsHeight,
+                        unsigned int nbChannels,
+                        unsigned int batchSize,
+                        unsigned int inputWidth,
+                        unsigned int inputHeight,
+                        int leftPad,
+                        int topPad,
+                        unsigned int padType,
+                        half padValue,
+                        const half *input,
+                        half *outputs)
+{
+    const unsigned int maxSize = (unsigned int)deviceProp.maxThreadsPerBlock;
+    const unsigned int prefMultiple = (unsigned int)deviceProp.warpSize;
+
+    const unsigned int groupSize = (outputsWidth * outputsHeight < maxSize)
+                                       ? outputsWidth * outputsHeight
+                                       : maxSize;
+
+    const unsigned int reqWidth = (unsigned int)ceilf((float)groupSize / (float)outputsWidth);
+
+    const unsigned int groupWidth = min(prefMultiple, reqWidth);
+    const dim3 blocksPerGrid = {nbChannels, 1, batchSize};
+    const dim3 threadsPerBlocks = {groupWidth, groupSize / groupWidth, 1};
+
+    cudaPadding_kernel<<<blocksPerGrid, threadsPerBlocks>>>(nbOutputs,
+                                                            outputsWidth,
+                                                            outputsHeight,
+                                                            nbChannels,
+                                                            inputWidth,
+                                                            inputHeight,
+                                                            leftPad,
+                                                            topPad,
+                                                            padType,
+                                                            padValue,
+                                                            input,
+                                                            outputs);
+    CHECK_CUDA_STATUS(cudaPeekAtLastError());
+}
\ No newline at end of file
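Worked example of the kernel's border index math, for inputWidth = 4 (valid indices 0..3) and leftPad = 2: reflect maps output x = 0, 1 to input 2, 1, while wrap maps them to input 2, 3. A host-side sketch of the reflect rule (single mirror, so it assumes the pad is smaller than the input width):

    // Host-side sketch of the reflect indexing used in cudaPadding_kernel.
    int reflectIndex(int ix, int width) {
        if (ix < 0)
            ix = -ix;                   // mirror around index 0
        if (ix >= width)
            ix = 2 * width - ix - 2;    // mirror around index width-1
        return ix;
    }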
diff --git a/src/operator/ReLUImpl.cpp b/src/operator/ReLUImpl.cpp
index 0a4eeeb7d8a2a4be94b5ac6b43dbae69cd8e3869..80d52045e832b42a95b6d7448f2016530bb9d1ac 100644
--- a/src/operator/ReLUImpl.cpp
+++ b/src/operator/ReLUImpl.cpp
@@ -1,5 +1,5 @@
 /********************************************************************************
- * Copyright (c) 2023 CEA-List
+ * Copyright (c) 2024 CEA-List
  *
  * This program and the accompanying materials are made available under the
  * terms of the Eclipse Public License 2.0 which is available at
@@ -64,6 +64,55 @@ void Aidge::ReLUImpl_cuda::forward_(const Tensor& input) {
                                std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr()));
 }
 
+void Aidge::ReLUImpl_cuda::backward() {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+
+    AIDGE_ASSERT(op.getOutput(0)->grad(), "missing output grad #0");
+
+    const auto& output_grad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getOutput(0)->grad());
+
+    // Lazy-initialize CuDNN ReLU descriptor
+    if (mReLUDesc == nullptr) {
+		#if CUDNN_VERSION >= 5000
+			CHECK_CUDNN_STATUS(cudnnCreateActivationDescriptor(&mReLUDesc));
+			CHECK_CUDNN_STATUS(cudnnSetActivationDescriptor(
+				mReLUDesc, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, 0.0));
+		#else
+			mReLUDesc = CUDNN_ACTIVATION_RELU;
+		#endif
+    }
+
+    // Do the actual backward computation
+    // Template is only for scaling parameters, which are always in float
+    // except when the operation is performed in double precision.
+    if (op.getInput(0)->grad()->dataType() == DataType::Float64) {
+        backward_<double>(output_grad);
+    }
+    else {
+        backward_<float>(output_grad);
+    }
+}
+
+template <class T>
+void Aidge::ReLUImpl_cuda::backward_(const Tensor& output_grad) {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f;
+    const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f;
+    CHECK_CUDNN_STATUS(
+        cudnnActivationBackward(CudaContext::cudnnHandle(),
+                               mReLUDesc,
+                               &alpha,
+                               std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
+                               std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr(),
+                               std::dynamic_pointer_cast<TensorImpl_cuda_>(output_grad.getImpl())->getCudnnTensorDesc(output_grad),
+                               output_grad.getImpl()->rawPtr(),
+                               std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->getImpl())->getCudnnTensorDesc(*op.getInput(0)),
+                               std::static_pointer_cast<Tensor>(op.getRawInput(0))->getImpl()->rawPtr(),
+                               &beta,
+                               std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->grad()->getImpl())->getCudnnTensorDesc(*op.getInput(0)->grad()),
+                               op.getInput(0)->grad()->getImpl()->rawPtr()));
+}
+
 Aidge::ReLUImpl_cuda::~ReLUImpl_cuda() {
     if (mReLUDesc != nullptr) {
 		#if CUDNN_VERSION >= 5000
diff --git a/src/operator/ReshapeImpl.cpp b/src/operator/ReshapeImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8016a5a9d1dfc26454af2cb03b6fe573820245f5
--- /dev/null
+++ b/src/operator/ReshapeImpl.cpp
@@ -0,0 +1,46 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <vector>
+
+#include "aidge/backend/cuda/data/TensorImpl.hpp"
+#include "aidge/backend/cuda/operator/ReshapeImpl.hpp"
+#include "aidge/backend/cuda/utils/CudaContext.hpp"
+#include "aidge/operator/Reshape.hpp"
+#include "aidge/utils/Types.h"
+
+void Aidge::ReshapeImpl_cuda::forward() {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    AIDGE_ASSERT(mOp.getRawInput(0), "missing input #0");
+
+    const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0));
+
+    std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->setRawPtr(input.getImpl()->rawPtr(), input.getImpl()->size());
+}
+
+void Aidge::ReshapeImpl_cuda::backward() {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    AIDGE_ASSERT(op.getOutput(0)->grad(), "missing output grad #0");
+
+    const auto& output_grad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getOutput(0)->grad());
+
+    std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->grad()->getImpl()->setRawPtr(output_grad.getImpl()->rawPtr(), output_grad.getImpl()->size());
+}
+
+Aidge::ReshapeImpl_cuda::~ReshapeImpl_cuda() {}
+
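Reshape is implemented as a zero-copy alias: setRawPtr() makes the output tensor point at the input's device buffer, so no kernel is launched and both tensors share storage (and likewise for the gradients in backward). In spirit:

    // Zero-copy reshape: both tensors end up viewing one device allocation.
    // void* devBuf = inputImpl->rawPtr();      // owned by the input tensor
    // outputImpl->setRawPtr(devBuf, nbElts);   // no copy, shared storage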
diff --git a/src/operator/SigmoidImpl.cpp b/src/operator/SigmoidImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..386cd9d821b3019cf8f0de2cc757ae514446f1a6
--- /dev/null
+++ b/src/operator/SigmoidImpl.cpp
@@ -0,0 +1,123 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <cassert>
+#include <vector>
+
+#include "aidge/backend/cuda/data/TensorImpl.hpp"
+#include "aidge/backend/cuda/operator/SigmoidImpl.hpp"
+#include "aidge/backend/cuda/utils/CudaContext.hpp"
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+#include "aidge/operator/Sigmoid.hpp"
+#include "aidge/utils/Types.h"
+
+void Aidge::SigmoidImpl_cuda::forward() {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+
+    AIDGE_ASSERT(mOp.getRawInput(0), "missing input #0");
+
+    const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0));
+
+    // Lazy-initialize CuDNN Sigmoid descriptor
+    if (mSigmoidDesc == nullptr) {
+		#if CUDNN_VERSION >= 5000
+			CHECK_CUDNN_STATUS(cudnnCreateActivationDescriptor(&mSigmoidDesc));
+			CHECK_CUDNN_STATUS(cudnnSetActivationDescriptor(
+				mSigmoidDesc, CUDNN_ACTIVATION_SIGMOID, CUDNN_NOT_PROPAGATE_NAN, 0.0));
+		#else
+			mSigmoidDesc = CUDNN_ACTIVATION_SIGMOID;
+		#endif
+    }
+
+    // Do the actual forward computation
+    // Template is only for scaling parameters, which are always in float
+    // except when the operation is performed in double precision.
+    if (op.getOutput(0)->dataType() == DataType::Float64) {
+        forward_<double>(input);
+    }
+    else {
+        forward_<float>(input);
+    }
+}
+
+template <class T>
+void Aidge::SigmoidImpl_cuda::forward_(const Tensor& input) {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f;
+    const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f;
+    CHECK_CUDNN_STATUS(
+        cudnnActivationForward(CudaContext::cudnnHandle(),
+                               mSigmoidDesc,
+                               &alpha,
+							   std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input),
+                               input.getImpl()->rawPtr(),
+                               &beta,
+                               std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
+                               std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr()));
+}
+
+void Aidge::SigmoidImpl_cuda::backward() {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+
+    AIDGE_ASSERT(op.getOutput(0)->grad(), "missing output grad #0");
+
+    const auto& output_grad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getOutput(0)->grad());
+
+    // Lazy-initialize CuDNN Sigmoid descriptor
+    if (mSigmoidDesc == nullptr) {
+		#if CUDNN_VERSION >= 5000
+			CHECK_CUDNN_STATUS(cudnnCreateActivationDescriptor(&mSigmoidDesc));
+			CHECK_CUDNN_STATUS(cudnnSetActivationDescriptor(
+				mSigmoidDesc, CUDNN_ACTIVATION_SIGMOID, CUDNN_NOT_PROPAGATE_NAN, 0.0));
+		#else
+			mSigmoidDesc = CUDNN_ACTIVATION_SIGMOID;
+		#endif
+    }
+
+    // Do the actual backward computation
+    // Template is only for scaling parameters, which are always in float
+    // except when the operation is performed in double precision.
+    if (op.getInput(0)->grad()->dataType() == DataType::Float64) {
+        backward_<double>(output_grad);
+    }
+    else {
+        backward_<float>(output_grad);
+    }
+}
+
+template <class T>
+void Aidge::SigmoidImpl_cuda::backward_(const Tensor& output_grad) {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f;
+    const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f;
+    CHECK_CUDNN_STATUS(
+        cudnnActivationBackward(CudaContext::cudnnHandle(),
+                               mSigmoidDesc,
+                               &alpha,
+                               std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
+                               std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr(),
+                               std::dynamic_pointer_cast<TensorImpl_cuda_>(output_grad.getImpl())->getCudnnTensorDesc(output_grad),
+                               output_grad.getImpl()->rawPtr(),
+                               std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->getImpl())->getCudnnTensorDesc(*op.getInput(0)),
+                               std::static_pointer_cast<Tensor>(op.getRawInput(0))->getImpl()->rawPtr(),
+                               &beta,
+                               std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->grad()->getImpl())->getCudnnTensorDesc(*op.getInput(0)->grad()),
+                               op.getInput(0)->grad()->getImpl()->rawPtr()));
+}
+
+Aidge::SigmoidImpl_cuda::~SigmoidImpl_cuda() {
+    if (mSigmoidDesc != nullptr) {
+        #if CUDNN_VERSION >= 5000
+            cudnnDestroyActivationDescriptor(mSigmoidDesc);
+        #endif
+    }
+}
+
diff --git a/src/operator/SubImpl.cpp b/src/operator/SubImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..adebd2a1bdcede94f159627f67860e7ec60a5d85
--- /dev/null
+++ b/src/operator/SubImpl.cpp
@@ -0,0 +1,219 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <algorithm>
+#include <cassert>
+#include <numeric>
+#include <vector>
+
+#include "aidge/backend/cuda/data/TensorImpl.hpp"
+#include "aidge/backend/cuda/operator/SubImpl.hpp"
+#include "aidge/backend/cuda/utils/CudaContext.hpp"
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+#include "aidge/operator/Sub.hpp"
+#include "aidge/utils/Types.h"
+
+void Aidge::SubImpl_cuda::forward() {
+    const Sub_Op& op = static_cast<const Sub_Op&>(mOp);
+    // Check inputs
+    AIDGE_ASSERT(op.getInput(0), "missing input in Sub operator");
+    AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run Sub forward because the 0-th input has no implementation.");
+    DataType datatypeFirstInput = op.getInput(0)->dataType();
+    for (IOIndex_t i = 1; i < op.nbInputs(); ++i) {
+        AIDGE_ASSERT(op.getInput(i), "missing input in Sub operator");
+        AIDGE_ASSERT(op.getInput(i)->hasImpl(), "cannot run Sub forward because the {}-th input has no implementation.", i);
+        AIDGE_ASSERT(op.getInput(i)->dataType() == datatypeFirstInput, "Cannot subtract inputs with different data types.");
+    }
+
+    std::vector<std::shared_ptr<Tensor>> inputFallbacks(op.nbInputs());
+    std::vector<Tensor> inputs(op.nbInputs());
+    std::vector<std::vector<int>> dims(op.nbInputs()); // For broadcasted dims
+    std::vector<std::vector<int>> strides(op.nbInputs()); // For the corresponding strides
+    for (IOIndex_t i = 0; i < op.nbInputs(); ++i) {
+        inputs[i] = op.getInput(i)->refCastFrom(inputFallbacks[i], *op.getOutput(0));
+
+        // Get tensor dims and broadcast them
+        std::copy(inputs[i].dims().begin(), inputs[i].dims().end(), std::back_inserter(dims[i]));
+        dims[i].insert(dims[i].cbegin(), op.getOutput(0)->nbDims() - dims[i].size(), int(1));
+
+        // Compute the corresponding strides
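+        // e.g. an input of dims {3, 2} broadcast into a 4-D output becomes
+        // dims {1, 1, 3, 2} with strides {6, 6, 2, 1}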
+        std::vector<int> tensorStrides(dims[i].size());
+        int product = 1;
+        for (size_t j = dims[i].size(); j > 0; --j) {
+            tensorStrides[j - 1] = product;
+            product *= dims[i][j - 1];
+        }
+        strides[i] = tensorStrides;
+    }
+
+    switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) {
+        case DataType::Float64:
+            forward_<double>(inputs, dims, strides);
+            break;
+        case DataType::Float32:
+            forward_<float>(inputs, dims, strides);
+            break;
+        case DataType::Float16:
+            forward_<half>(inputs, dims, strides);
+            break;
+        default:
+            AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda");
+    }
+}
+
+template <class T>
+void Aidge::SubImpl_cuda::forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides) {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f;
+    const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f;
+    const typename Cuda::cudnn_scaling_type<T>::type gamma = -1.0f;
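+    // cudnnAddTensor computes C = alpha * A + beta * C, so gamma = -1 turns the
+    // accumulation into a subtraction: output = output - input_i.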
+    // Create a Tensor descriptor with the broadcasted dims and strides
+    cudnnTensorDescriptor_t tensorDesc;
+    CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&tensorDesc));
+    CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc, CudaContext::data_type<T>::value, inputsDims[0].size(), inputsDims[0].data(), inputsStrides[0].data()));
+    // Add first input to the output
+    CHECK_CUDNN_STATUS(
+        cudnnAddTensor(CudaContext::cudnnHandle(),
+                       &alpha,
+                       tensorDesc,
+                       inputs[0].getImpl()->rawPtr(),
+                       &beta,
+                       std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
+                       std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr())
+    );
+    // Subtract the remaining inputs, if any
+    for (size_t i = 1; i < op.nbInputs(); ++i)
+    {
+        CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc, CudaContext::data_type<T>::value, inputsDims[i].size(), inputsDims[i].data(), inputsStrides[i].data()));
+        CHECK_CUDNN_STATUS(
+            cudnnAddTensor(CudaContext::cudnnHandle(),
+                        &gamma,
+                        tensorDesc,
+                        inputs[i].getImpl()->rawPtr(),
+                        &alpha,
+                        std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
+                        std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr())
+        );
+    }
+    CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(tensorDesc));
+}
+
+void Aidge::SubImpl_cuda::backward() {
+    const Sub_Op& op = static_cast<const Sub_Op&>(mOp);
+    // Check output
+    AIDGE_ASSERT(op.getOutput(0)->grad(), "missing output gradient in Sub operator");
+    AIDGE_ASSERT(op.getOutput(0)->grad()->hasImpl(), "cannot run Sub backward because the output gradient has no implementation.");
+
+    std::shared_ptr<Tensor> outputGradFallback;
+    const auto& outputGrad = op.getOutput(0)->grad()->refCastFrom(outputGradFallback, *op.getOutput(0)->grad());
+
+    std::vector<std::vector<int>> dims(op.nbInputs()); // For broadcasted dims
+    std::vector<std::vector<int>> strides(op.nbInputs()); // For the corresponding strides
+    for (IOIndex_t i = 0; i < op.nbInputs(); ++i) {
+        std::shared_ptr<Tensor> inputFallback;
+        const Tensor input = op.getInput(i)->refCastFrom(inputFallback, *op.getOutput(0));
+
+        // Get tensor dims and broadcast them
+        std::copy(input.dims().begin(), input.dims().end(), std::back_inserter(dims[i]));
+        dims[i].insert(dims[i].cbegin(), op.getOutput(0)->nbDims() - dims[i].size(), int(1));
+
+        // Compute the corresponding strides
+        std::vector<int> tensorStrides(dims[i].size());
+        int product = 1;
+        for (size_t j = dims[i].size(); j > 0; --j) {
+            tensorStrides[j - 1] = product;
+            product *= dims[i][j - 1];
+        }
+        strides[i] = tensorStrides;
+    }
+
+    switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) {
+        case DataType::Float64:
+            backward_<double>(outputGrad, dims, strides);
+            break;
+        case DataType::Float32:
+            backward_<float>(outputGrad, dims, strides);
+            break;
+        case DataType::Float16:
+            backward_<half>(outputGrad, dims, strides);
+            break;
+        default:
+            AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda");
+    }
+}
+
+template <class T>
+void Aidge::SubImpl_cuda::backward_(const Tensor& outputGrad, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides) {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f;
+    const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f;
+    const typename Cuda::cudnn_scaling_type<T>::type gamma = -1.0f;
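+    // d(in0 - in1 - ... - inN)/d(in_i) is +1 for i == 0 and -1 otherwise, so the
+    // output gradient is copied with alpha for the first input and gamma for the rest.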
+    for (std::size_t i = 0; i < inputsDims.size(); i++)
+    {
+        if (op.getInput(i)->size() == op.getOutput(0)->size())
+        {
+            CHECK_CUDNN_STATUS(
+            cudnnAddTensor(CudaContext::cudnnHandle(),
+                        i==0 ? &alpha: &gamma,
+                        std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
+                        outputGrad.getImpl()->rawPtr(),
+                        &beta,
+                        std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(i)->getImpl())->getCudnnTensorDesc(*op.getInput(i)),
+                        op.getInput(i)->grad()->getImpl()->rawPtr()));
+        }
+        else // In case of broadcasting
+        {
+            // Gradient with respect to input_i: sum outputGrad over the broadcasted dimensions using cudnnReduceTensor
+            cudnnReduceTensorDescriptor_t reduceDesc;
+            CHECK_CUDNN_STATUS(cudnnCreateReduceTensorDescriptor(&reduceDesc));
+            CHECK_CUDNN_STATUS(cudnnSetReduceTensorDescriptor(reduceDesc,
+                                                              CUDNN_REDUCE_TENSOR_ADD,
+                                                              CudaContext::data_type<T>::value,
+                                                              CUDNN_PROPAGATE_NAN,
+                                                              CUDNN_REDUCE_TENSOR_NO_INDICES,
+                                                              CUDNN_32BIT_INDICES));
+
+            cudnnTensorDescriptor_t outputDesc = std::dynamic_pointer_cast<TensorImpl_cuda_>(outputGrad.getImpl())->getCudnnTensorDesc(*op.getOutput(0));
+            // Create a Tensor descriptor with the broadcasted dims and strides
+            cudnnTensorDescriptor_t tensorDesc;
+            CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&tensorDesc));
+            CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc,
+                                                          CudaContext::data_type<T>::value,
+                                                          inputsDims[i].size(),
+                                                          inputsDims[i].data(),
+                                                          inputsStrides[i].data()));
+            size_t workspaceSize;
+            CHECK_CUDNN_STATUS(cudnnGetReductionWorkspaceSize(CudaContext::cudnnHandle(),
+                               reduceDesc,
+                               outputDesc,
+                               tensorDesc,
+                               &workspaceSize));
+
+            float *d_workspace;
+            CHECK_CUDA_STATUS(cudaMalloc(&d_workspace, workspaceSize));
+
+            CHECK_CUDNN_STATUS(cudnnReduceTensor(CudaContext::cudnnHandle(),
+                               reduceDesc,
+                               NULL,
+                               0,
+                               d_workspace,
+                               workspaceSize,
+                               i==0 ? &alpha: &gamma,
+                               outputDesc,
+                               outputGrad.getImpl()->rawPtr(),
+                               &beta,
+                               tensorDesc,
+                               op.getInput(i)->grad()->getImpl()->rawPtr()));
+
+            CHECK_CUDA_STATUS(cudaFree(d_workspace));
+            CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(tensorDesc));
+            CHECK_CUDNN_STATUS(cudnnDestroyReduceTensorDescriptor(reduceDesc));
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/operator/TanhImpl.cpp b/src/operator/TanhImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..96c0330febba35cfea04bbbac97d9308195d6309
--- /dev/null
+++ b/src/operator/TanhImpl.cpp
@@ -0,0 +1,123 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <cassert>
+#include <vector>
+
+#include "aidge/backend/cuda/data/TensorImpl.hpp"
+#include "aidge/backend/cuda/operator/TanhImpl.hpp"
+#include "aidge/backend/cuda/utils/CudaContext.hpp"
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+#include "aidge/operator/Tanh.hpp"
+#include "aidge/utils/Types.h"
+
+void Aidge::TanhImpl_cuda::forward() {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+
+    assert(mOp.getRawInput(0) && "missing input #0");
+
+    const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0));
+
+    // Lazy-initialize CuDNN Tanh descriptor
+    if (mTanhDesc == nullptr) {
+        #if CUDNN_VERSION >= 5000
+            CHECK_CUDNN_STATUS(cudnnCreateActivationDescriptor(&mTanhDesc));
+            CHECK_CUDNN_STATUS(cudnnSetActivationDescriptor(
+                mTanhDesc, CUDNN_ACTIVATION_TANH, CUDNN_NOT_PROPAGATE_NAN, 0.0));
+        #else
+            mTanhDesc = CUDNN_ACTIVATION_TANH;
+        #endif
+    }
+
+    // Do the actual forward computation
+    // The template parameter is only for the scaling parameters, which are always float
+    // except when the computation is performed in double precision.
+    if (op.getOutput(0)->dataType() == DataType::Float64) {
+        forward_<double>(input);
+    }
+    else {
+        forward_<float>(input);
+    }
+}
+
+template <class T>
+void Aidge::TanhImpl_cuda::forward_(const Tensor& input) {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f;
+    const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f;
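+    // cuDNN computes output = alpha * tanh(input) + beta * output elementwise.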
+    CHECK_CUDNN_STATUS(
+        cudnnActivationForward(CudaContext::cudnnHandle(),
+                               mTanhDesc,
+                               &alpha,
+                               std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input),
+                               input.getImpl()->rawPtr(),
+                               &beta,
+                               std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
+                               std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr()));
+}
+
+void Aidge::TanhImpl_cuda::backward() {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+
+    assert(op.getOutput(0)->grad() && "missing output #0 gradient");
+
+    const auto& output_grad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getOutput(0)->grad());
+
+    // Lazy-initialize CuDNN Tanh descriptor
+    if (mTanhDesc == nullptr) {
+        #if CUDNN_VERSION >= 5000
+            CHECK_CUDNN_STATUS(cudnnCreateActivationDescriptor(&mTanhDesc));
+            CHECK_CUDNN_STATUS(cudnnSetActivationDescriptor(
+                mTanhDesc, CUDNN_ACTIVATION_TANH, CUDNN_NOT_PROPAGATE_NAN, 0.0));
+        #else
+            mTanhDesc = CUDNN_ACTIVATION_TANH;
+        #endif
+    }
+
+    // Do the actual backward computation
+    // The template parameter is only for the scaling parameters, which are always float
+    // except when the computation is performed in double precision.
+    if (op.getInput(0)->grad()->dataType() == DataType::Float64) {
+        backward_<double>(output_grad);
+    }
+    else {
+        backward_<float>(output_grad);
+    }
+}
+
+template <class T>
+void Aidge::TanhImpl_cuda::backward_(const Tensor& output_grad) {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f;
+    const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f;
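+    // For tanh, cuDNN evaluates the gradient from the stored forward output y:
+    // dL/dx = dL/dy * (1 - y * y).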
+    CHECK_CUDNN_STATUS(
+        cudnnActivationBackward(CudaContext::cudnnHandle(),
+                               mTanhDesc,
+                               &alpha,
+                               std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
+                               std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr(),
+                               std::dynamic_pointer_cast<TensorImpl_cuda_>(output_grad.getImpl())->getCudnnTensorDesc(output_grad),
+                               output_grad.getImpl()->rawPtr(),
+                               std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->getImpl())->getCudnnTensorDesc(*op.getInput(0)),
+                               std::static_pointer_cast<Tensor>(op.getRawInput(0))->getImpl()->rawPtr(),
+                               &beta,
+                               std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->grad()->getImpl())->getCudnnTensorDesc(*op.getInput(0)->grad()),
+                               op.getInput(0)->grad()->getImpl()->rawPtr()));
+}
+
+Aidge::TanhImpl_cuda::~TanhImpl_cuda() {
+    if (mTanhDesc != nullptr) {
+        #if CUDNN_VERSION >= 5000
+            cudnnDestroyActivationDescriptor(mTanhDesc);
+        #endif
+    }
+}
+
diff --git a/unit_tests/Test_AddImpl.cpp b/unit_tests/Test_AddImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b8129175d88323c896244e531f1dd52a5cbaa19e
--- /dev/null
+++ b/unit_tests/Test_AddImpl.cpp
@@ -0,0 +1,448 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <chrono>  // std::chrono, to time forward()
+#include <cmath>   // fabs
+#include <numeric> // std::accumulate
+#include <random>  // std::random_device, std::mt19937, std::uniform_real_distribution
+#include <catch2/catch_test_macros.hpp>
+
+#include "aidge/backend/cpu.hpp"
+#include "aidge/backend/cuda.hpp"
+#include "aidge/data/Tensor.hpp"
+#include "aidge/operator/Add.hpp"
+#include "aidge/utils/TensorUtils.hpp"
+
+using namespace Aidge;
+
+TEST_CASE("[gpu/operator] Add(forward)", "[Add][GPU]") {
+    std::shared_ptr<Tensor> input1 = std::make_shared<Tensor>(Array4D<float,3,3,3,2> {
+        {                                       //
+            {                                   //
+                {{20, 47},{21, 48},{22, 49}},   //
+                {{23, 50},{24, 51},{25, 52}},   //
+                {{26, 53},{27, 54},{28, 55}}    //
+            },                                  //
+            {                                   //
+                {{29, 56},{30, 57},{31, 58}},   //
+                {{32, 59},{33, 60},{34, 61}},   //
+                {{35, 62},{36, 63},{37, 64}}    //
+            },                                  //
+            {                                   //
+                {{38, 65},{39, 66},{40, 67}},   //
+                {{41, 68},{42, 69},{43, 70}},   //
+                {{44, 71},{45, 72},{46, 73}}    //
+            }                                   //
+        }                                       //
+    });                                         //
+    input1->setBackend("cuda");
+    SECTION("One input") {
+        std::shared_ptr<Node> myAdd = Add(1);
+        auto op = std::static_pointer_cast<OperatorTensor>(myAdd -> getOperator());
+        op->associateInput(0, input1);
+        op->setBackend("cuda");
+        op->setDataType(DataType::Float32);
+        myAdd->forward();
+
+        float* computedOutput   = new float[input1->size()]();
+        cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * input1->size(), cudaMemcpyDeviceToHost);
+        float* targetOutput   = new float[input1->size()]();
+        cudaMemcpy(targetOutput, input1->getImpl()->rawPtr(), sizeof(float) * input1->size(), cudaMemcpyDeviceToHost);
+
+        for(int i = 0; i < input1->size(); i++){
+            REQUIRE(fabs(computedOutput[i] - targetOutput[i]) < 1e-6);
+        }
+
+        delete[] computedOutput;
+        delete[] targetOutput;
+    }
+
+    SECTION("Two inputs") {
+        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<float,3,3,3,2> {
+            {
+                {
+                    {{40,  94},{42,  96},{44,  98}},
+                    {{46, 100},{48, 102},{50, 104}},
+                    {{52, 106},{54, 108},{56, 110}}
+                },
+                {
+                    {{58, 112},{60, 114},{62, 116}},
+                    {{64, 118},{66, 120},{68, 122}},
+                    {{70, 124},{72, 126},{74, 128}}
+                },
+                {
+                    {{76, 130},{78, 132},{80, 134}},
+                    {{82, 136},{84, 138},{86, 140}},
+                    {{88, 142},{90, 144},{92, 146}}
+                }
+            }
+        });
+
+        std::shared_ptr<Node> myAdd = Add(2);
+        auto op = std::static_pointer_cast<OperatorTensor>(myAdd -> getOperator());
+        op->associateInput(0, input1);
+        op->associateInput(1, input1);
+        op->setBackend("cuda");
+        op->setDataType(DataType::Float32);
+        myAdd->forward();
+
+        float* computedOutput   = new float[input1->size()]();
+        cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * expectedOutput->size(), cudaMemcpyDeviceToHost);
+
+        for(int i = 0; i < expectedOutput->size(); i++){
+            const float targetOutput = *(static_cast<float*>(expectedOutput->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
+        }
+
+        delete[] computedOutput;
+    }
+
+    SECTION("Three inputs") {
+        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<float,3,3,3,2> {
+            {
+                {
+                    {{ 60, 141},{ 63, 144},{ 66, 147}},
+                    {{ 69, 150},{ 72, 153},{ 75, 156}},
+                    {{ 78, 159},{ 81, 162},{ 84, 165}}
+                },
+                {
+                    {{ 87, 168},{ 90, 171},{ 93, 174}},
+                    {{ 96, 177},{ 99, 180},{102, 183}},
+                    {{105, 186},{108, 189},{111, 192}}
+                },
+                {
+                    {{114, 195},{117, 198},{120, 201}},
+                    {{123, 204},{126, 207},{129, 210}},
+                    {{132, 213},{135, 216},{138, 219}}
+                }
+            }
+        });
+
+        std::shared_ptr<Node> myAdd = Add(3);
+        auto op = std::static_pointer_cast<OperatorTensor>(myAdd -> getOperator());
+        op->associateInput(0, input1);
+        op->associateInput(1, input1);
+        op->associateInput(2, input1);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cuda");
+        myAdd->forward();
+
+        float* computedOutput   = new float[input1->size()]();
+        cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * expectedOutput->size(), cudaMemcpyDeviceToHost);
+
+        for(int i = 0; i < expectedOutput->size(); i++){
+            const float targetOutput = *(static_cast<float*>(expectedOutput->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
+        }
+
+        delete[] computedOutput;
+    }
+
+    SECTION("Broadcasting") {
+        std::shared_ptr<Tensor> input_0 = std::make_shared<Tensor>(Array4D<float,3,1,3,2> {
+        {                                       //
+            {                                   //
+                {{0, 1},{2, 3},{4, 5}}          //
+            },                                  //
+            {                                   //
+                {{6, 7},{8, 9},{10, 11}}        //
+            },                                  //
+            {                                   //
+                {{12, 13},{14, 15},{16, 17}}    //
+            }                                   //
+        }                                       //
+        });                                     //
+        std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array4D<float,1,3,3,2> {
+        {                                       //
+            {                                   //
+                {{20, 21},{22, 23},{24, 25}},   //
+                {{26, 27},{28, 29},{30, 31}},   //
+                {{32, 33},{34, 35},{36, 37}}    //
+            }                                   //
+        }                                       //
+        });                                     //
+
+        std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<float,2> {{100,200}});  
+        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<float,3,3,3,2> {
+            {                                               //
+                {                                           //
+                    {{ 120, 222},{ 124, 226},{ 128, 230}},  //
+                    {{ 126, 228},{ 130, 232},{ 134, 236}},  //
+                    {{ 132, 234},{ 136, 238},{ 140, 242}}   //
+                },                                          //
+                {                                           //
+                    {{ 126, 228},{ 130, 232},{ 134, 236}},  //
+                    {{ 132, 234},{ 136, 238},{ 140, 242}},  //
+                    {{ 138, 240},{ 142, 244},{ 146, 248}}   //
+                },                                          //
+                {                                           //
+                    {{ 132, 234},{ 136, 238},{140, 242}},   //
+                    {{ 138, 240},{ 142, 244},{146, 248}},   //
+                    {{ 144, 246},{ 148, 250},{152, 254}}    //
+                }                                           //
+            }                                               //
+        });                                                 //
+        input_0->setBackend("cuda");
+        input_1->setBackend("cuda");
+        input_2->setBackend("cuda");
+        std::shared_ptr<Node> myAdd = Add(3);
+        auto op = std::static_pointer_cast<OperatorTensor>(myAdd -> getOperator());
+        op->associateInput(0, input_0);
+        op->associateInput(1, input_1);
+        op->associateInput(2, input_2);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cuda");
+        myAdd->forward();
+
+        float* computedOutput   = new float[expectedOutput->size()]();
+        cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * expectedOutput->size(), cudaMemcpyDeviceToHost);
+
+        for(int i = 0; i < expectedOutput->size(); i++){
+            const float targetOutput = *(static_cast<float*>(expectedOutput->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
+        }
+
+        delete[] computedOutput;
+    }
+
+    SECTION("Random Input") {
+        constexpr std::uint16_t NBTRIALS = 10;
+        // Create a random number generator
+        std::random_device rd;
+        std::mt19937 gen(rd());
+        std::uniform_real_distribution<float> valueDist(
+            0.1f, 1.1f); // random float values in [0.1, 1.1)
+        std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1),
+                                                               std::size_t(10));
+        std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(4), std::size_t(5));
+        std::uniform_int_distribution<int> boolDist(0,1);
+
+        // To measure execution time of 'forward()'
+        std::chrono::time_point<std::chrono::system_clock> start;
+        std::chrono::time_point<std::chrono::system_clock> end;
+        std::chrono::duration<double, std::micro> duration{};
+        std::size_t number_of_operation = 0;
+        for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial)
+        {
+            // Create Add Operator CUDA
+            std::shared_ptr<Node> myAddCUDA = Add(2, "myaddcuda");
+            auto op_cuda = std::static_pointer_cast<OperatorTensor>(myAddCUDA -> getOperator());
+
+            // Create Add Operator CPU
+            std::shared_ptr<Node> myAddCPU = Add(2, "myaddcpu");
+            auto op_cpu = std::static_pointer_cast<OperatorTensor>(myAddCPU -> getOperator());
+            op_cpu->setDataType(DataType::Float32);
+            op_cpu->setBackend("cpu");
+
+            const std::size_t nbDims = nbDimsDist(gen);
+            std::vector<std::size_t> dims0, dims1, dims;
+            for (std::size_t i = 0; i < nbDims; ++i) {
+                const std::size_t dim = dimSizeDist(gen);
+                // To test broadcasting, set some dims to 1
+                if (boolDist(gen)) {
+                    dims0.push_back(1);
+                }else{
+                    dims0.push_back(dim);
+                }
+                if (boolDist(gen)) {
+                    dims1.push_back(1);
+                }else{
+                    dims1.push_back(dim);
+                }
+                dims.push_back(std::max(dims0[i], dims1[i]));
+            }
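+            // Broadcasting keeps the elementwise max of the two shapes, e.g.
+            // dims0 {1, 5, ...} and dims1 {3, 1, ...} give an output of {3, 5, ...}.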
+            const std::size_t nb_elements0 = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
+            const std::size_t nb_elements1 = std::accumulate(dims1.cbegin(), dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
+            const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+            number_of_operation += nb_elements;
+
+            float* array0 = new float[nb_elements0];
+            float* array1 = new float[nb_elements1];
+
+            for (std::size_t i = 0; i < nb_elements0; ++i) {
+                array0[i] = valueDist(gen);
+            }
+            for (std::size_t i = 0; i < nb_elements1; ++i) {
+                array1[i] = valueDist(gen);
+            }
+
+            // input0 CUDA
+            float* array0_d, *array1_d;
+            std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>();
+            T0_cuda->setDataType(DataType::Float32);
+            T0_cuda->setBackend("cuda");
+            T0_cuda->resize(dims0);
+            op_cuda->associateInput(0, T0_cuda);
+            cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements0);
+            cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements0, cudaMemcpyHostToDevice);
+            T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements0);
+
+            // input0 CPU
+            std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>();
+            op_cpu->associateInput(0,T0_cpu);
+            T0_cpu->setDataType(DataType::Float32);
+            T0_cpu->setBackend("cpu");
+            T0_cpu->resize(dims0);
+            T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements0);
+
+            // input1 CUDA
+            std::shared_ptr<Tensor> T1_cuda = std::make_shared<Tensor>();
+            T1_cuda->setDataType(DataType::Float32);
+            T1_cuda->setBackend("cuda");
+            T1_cuda->resize(dims1);
+            op_cuda->associateInput(1, T1_cuda);
+            cudaMalloc(reinterpret_cast<void **>(&array1_d), sizeof(float) * nb_elements1);
+            cudaMemcpy(array1_d, array1, sizeof(float) * nb_elements1, cudaMemcpyHostToDevice);
+            T1_cuda->getImpl()->setRawPtr(array1_d, nb_elements1);
+
+            // input1 CPU
+            std::shared_ptr<Tensor> T1_cpu = std::make_shared<Tensor>();
+            op_cpu->associateInput(1,T1_cpu);
+            T1_cpu->setDataType(DataType::Float32);
+            T1_cpu->setBackend("cpu");
+            T1_cpu->resize(dims1);
+            T1_cpu -> getImpl() -> setRawPtr(array1, nb_elements1);
+
+            // forward CUDA
+            op_cuda->setDataType(DataType::Float32);
+            op_cuda->setBackend("cuda");
+            start = std::chrono::system_clock::now();
+            op_cuda->forward();
+            end = std::chrono::system_clock::now();
+            duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+
+            float *computedOutput = new float[nb_elements]();
+            cudaMemcpy(computedOutput, op_cuda->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * nb_elements, cudaMemcpyDeviceToHost);
+
+            // forward CPU
+            op_cpu->forward();
+            float *computedCPU = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr());
+            REQUIRE(approxEq<float>(*computedOutput, *computedCPU));
+
+            delete[] array0;
+            delete[] array1;
+            delete[] computedOutput;
+            cudaFree(array0_d);
+            cudaFree(array1_d);
+
+        }
+    }
+
+}
+
+TEST_CASE("[gpu/operator] Add(backward)", "[Add][GPU]") {
+        std::shared_ptr<Tensor> input_0 = std::make_shared<Tensor>(Array4D<float,3,1,3,2> {
+        {                                       //
+            {                                   //
+                {{0, 1},{2, 3},{4, 5}}          //
+            },                                  //
+            {                                   //
+                {{6, 7},{8, 9},{10, 11}}        //
+            },                                  //
+            {                                   //
+                {{12, 13},{14, 15},{16, 17}}    //
+            }                                   //
+        }                                       //
+        });                                     //
+        std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array4D<float,1,3,3,2> {
+        {                                       //
+            {                                   //
+                {{20, 21},{22, 23},{24, 25}},   //
+                {{26, 27},{28, 29},{30, 31}},   //
+                {{32, 33},{34, 35},{36, 37}}    //
+            }                                   //
+        }                                       //
+        });                                     //
+
+        std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<float,2> {{100,200}});
+
+        input_0->setBackend("cuda");
+        input_1->setBackend("cuda");
+        input_2->setBackend("cuda");
+        std::shared_ptr<Node> myAdd = Add(3);
+        auto op = std::static_pointer_cast<OperatorTensor>(myAdd -> getOperator());
+        op->associateInput(0, input_0);
+        op->associateInput(1, input_1);
+        op->associateInput(2, input_2);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cuda");
+        myAdd->forward();
+
+        // Run and test backward operation
+        std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array4D<float,3,3,3,2> {
+            {                                         //
+                {                                     //
+                    {{  1,  2},{  3,  4},{  5,  6}},  //
+                    {{  7,  8},{  9, 10},{ 11, 12}},  //
+                    {{ 13, 14},{ 15, 16},{ 17, 18}}   //
+                },                                    //
+                {                                     //
+                    {{ 19, 20},{ 21, 22},{ 23, 24}},  //
+                    {{ 25, 26},{ 27, 28},{ 29, 30}},  //
+                    {{ 31, 32},{ 33, 34},{ 35, 36}}   //
+                },                                    //
+                {                                     //
+                    {{ 37, 38},{ 39, 40},{41, 42}},   //
+                    {{ 43, 44},{ 45, 46},{47, 48}},   //
+                    {{ 49, 50},{ 51, 52},{53, 54}}    //
+                }                                     //
+            }                                         //
+        });                                           //
+        myOutputGrad->setBackend("cuda");
+        op->getOutput(0)->setGrad(myOutputGrad);
+        REQUIRE_NOTHROW(myAdd->backward());
+
+        std::shared_ptr<Tensor> expectedInput1Grad = std::make_shared<Tensor>(Array4D<float,3,1,3,2> {
+        {                                         //
+            {                                     //
+                {{21, 24},{27, 30},{33, 36}}      //
+            },                                    //
+            {                                     //
+                {{75, 78},{81, 84},{87, 90}}      //
+            },                                    //
+            {                                     //
+                {{129, 132},{135, 138},{141, 144}}//
+            }                                     //
+        }                                         //
+        });                                       //
+        std::shared_ptr<Tensor> expectedInput2Grad = std::make_shared<Tensor>(Array4D<float,1,3,3,2> {
+        {                                       //
+            {                                   //
+                {{57, 60},{63, 66},{69, 72}},   //
+                {{75, 78},{81, 84},{87, 90}},   //
+                {{93, 96},{99, 102},{105, 108}} //
+            }                                   //
+        }                                       //
+        });                                     //
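+        // input_2 is broadcast over 3*3*3 = 27 output positions, so its gradient is
+        // the output gradient summed over them: 1 + 3 + ... + 53 = 729 and 2 + 4 + ... + 54 = 756.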
+        std::shared_ptr<Tensor> expectedInput3Grad = std::make_shared<Tensor>(Array1D<float,2> {{729, 756}});
+
+        float *computedGrad1Cuda = new float[expectedInput1Grad->size()]();
+        cudaMemcpy(computedGrad1Cuda, op->getInput(0)->grad()->getImpl()->rawPtr(), sizeof(float) * expectedInput1Grad->size(), cudaMemcpyDeviceToHost);
+        float *computedGrad2Cuda = new float[expectedInput2Grad->size()]();
+        cudaMemcpy(computedGrad2Cuda, op->getInput(1)->grad()->getImpl()->rawPtr(), sizeof(float) * expectedInput2Grad->size(), cudaMemcpyDeviceToHost);
+        float *computedGrad3Cuda = new float[expectedInput3Grad->size()]();
+        cudaMemcpy(computedGrad3Cuda, op->getInput(2)->grad()->getImpl()->rawPtr(), sizeof(float) * expectedInput3Grad->size(), cudaMemcpyDeviceToHost);
+
+        for(int i = 0; i < expectedInput1Grad->size(); i++){
+            const float targetOutput = *(static_cast<float*>(expectedInput1Grad->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedGrad1Cuda[i] - targetOutput) < 1e-6);
+        }
+        for(int i = 0; i < expectedInput2Grad->size(); i++){
+            const float targetOutput = *(static_cast<float*>(expectedInput2Grad->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedGrad2Cuda[i] - targetOutput) < 1e-6);
+        }
+        for(int i = 0; i < expectedInput3Grad->size(); i++){
+            const float targetOutput = *(static_cast<float*>(expectedInput3Grad->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedGrad3Cuda[i] - targetOutput) < 1e-6);
+        }
+
+        delete[] computedGrad1Cuda;
+        delete[] computedGrad2Cuda;
+        delete[] computedGrad3Cuda;
+}
\ No newline at end of file
diff --git a/unit_tests/Test_AvgPoolingImpl.cpp b/unit_tests/Test_AvgPoolingImpl.cpp
index dfadebbe07aa38371576cf4006773484494751a0..3dccd6b7f909a9e9b4f8affb151898b77d94a7cf 100644
--- a/unit_tests/Test_AvgPoolingImpl.cpp
+++ b/unit_tests/Test_AvgPoolingImpl.cpp
@@ -10,151 +10,289 @@
  ********************************************************************************/
 
 #include <array>
+#include <chrono>  // std::chrono, to time forward()
+#include <cmath>   // fabs
+#include <cuda_fp16.h> // half type
+#include <iostream> // std::cout
+#include <numeric> // std::accumulate
+#include <random>  // std::random_device, std::mt19937, std::uniform_real_distribution
 
 #include <catch2/catch_test_macros.hpp>
-#include <cuda_fp16.h>
-#include <numeric>   // std::accumulate
-#include <random>    // std::random_device, std::mt19937, std::uniform_real_distribution
-
-#include "Test_cuda.hpp"
-
-#include "aidge/data/half.hpp"
-#include "aidge/data/Tensor.hpp"
 
 #include "aidge/backend/cpu.hpp"
 #include "aidge/backend/cuda.hpp"
+#include "aidge/data/Tensor.hpp"
+#include "aidge/data/half.hpp"
+#include "aidge/utils/TensorUtils.hpp"
 
 using namespace Aidge;
 
-TEST_CASE("[gpu/operator] AvgPooling(forward)", "[AvgPooling][GPU]") {
-    std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,2,5,5> { //NCHW
-        {
-            {
-                {{  0,   1,   2,   3,   4},
-                 {  5,   6,   7,   8,   9},
-                 { 10,  11,  12,  13,  14},
-                 { 15,  16,  17,  18,  19},
-                 { 20,  21,  22,  23,  24}},
-
-                {{ 25,  26,  27,  28,  29},
-                 { 30,  31,  32,  33,  34},
-                 { 35,  36,  37,  38,  39},
-                 { 40,  41,  42,  43,  44},
-                 { 45,  46,  47,  48,  49}}
-            },
-            {
-                {{100, 101, 102, 103, 104},
-                 {105, 106, 107, 108, 109},
-                 {110, 111, 112, 113, 114},
-                 {115, 116, 117, 118, 119},
-                 {120, 121, 122, 123, 124}},
-
-                {{125, 126, 127, 128, 129},
-                 {130, 131, 132, 133, 134},
-                 {135, 136, 137, 138, 139},
-                 {140, 141, 142, 143, 144},
-                 {145, 146, 147, 148, 149}}
-            }
-        }
-    });
-    SECTION("Stride") {
-        std::shared_ptr<Node> myAvgPool = AvgPooling({2,2}, "myAvgPool", {2,2});
-        auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool -> getOperator());
+TEST_CASE("[gpu/operator] AvgPooling(forward)", "[AvgPooling][GPU]")
+{
+    std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float, 2, 2, 5, 5>{ // NCHW
+        {
+            {
+                {{  0,   1,   2,   3,   4},
+                 {  5,   6,   7,   8,   9},
+                 { 10,  11,  12,  13,  14},
+                 { 15,  16,  17,  18,  19},
+                 { 20,  21,  22,  23,  24}},
+
+                {{ 25,  26,  27,  28,  29},
+                 { 30,  31,  32,  33,  34},
+                 { 35,  36,  37,  38,  39},
+                 { 40,  41,  42,  43,  44},
+                 { 45,  46,  47,  48,  49}}
+            },
+            {
+                {{100, 101, 102, 103, 104},
+                 {105, 106, 107, 108, 109},
+                 {110, 111, 112, 113, 114},
+                 {115, 116, 117, 118, 119},
+                 {120, 121, 122, 123, 124}},
+
+                {{125, 126, 127, 128, 129},
+                 {130, 131, 132, 133, 134},
+                 {135, 136, 137, 138, 139},
+                 {140, 141, 142, 143, 144},
+                 {145, 146, 147, 148, 149}}
+            }
+        }
+    });
+    SECTION("Stride")
+    {
+        std::shared_ptr<Node> myAvgPool = AvgPooling({2, 2}, "myAvgPool", {2, 2});
+        auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool->getOperator());
+
+        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float, 2, 2, 2, 2>{
+            {{{{3, 5},
+               {13, 15}},
+              {{28, 30},
+               {38, 40}}},
+             {{{103, 105},
+               {113, 115}},
+              {{128, 130},
+               {138, 140}}}}});
+        op->associateInput(0, myInput);
         op->setDataType(DataType::Float32);
         op->setBackend("cuda");
         myAvgPool->forward();
 
-        float* computedOutput   = new float[myOutput->size()]();
+        float *computedOutput = new float[myOutput->size()]();
         cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
 
-        for(int i = 0; i < myOutput->size(); i++){
-            const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
+        for (int i = 0; i < myOutput->size(); i++)
+        {
+            const float targetOutput = *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i);
             REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
         }
 
         delete[] computedOutput;
     }
 
-    SECTION("Stride >= feature dim") {
-        std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(Array4D<float,1,1,3,3> { //NCHW
-        {
-            {
-                {{0.3745, 0.9507, 0.7320},
-                 {0.5987, 0.1560, 0.1560},
-                 {0.0581, 0.8662, 0.6011}}
-            }
-        }
-        });
-        std::shared_ptr<Node> myAvgPool = AvgPooling({3,3}, "myAvgPool", {3,3});
-        auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool -> getOperator());
-
-        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,1,1,1,1> {
-            {{{{(0.3745 + 0.9507 + 0.7320 + 0.5987 + 0.1560 + 0.1560 + 0.0581 + 0.8662 + 0.6011)/9.0}}}}
-        });
-        op->associateInput(0,myInput2);
+    SECTION("Stride >= feature dim")
+    {
+        std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(Array4D<float, 1, 1, 3, 3>{ // NCHW
+            {
+                {
+                    {{0.3745, 0.9507, 0.7320},
+                     {0.5987, 0.1560, 0.1560},
+                     {0.0581, 0.8662, 0.6011}}
+                }
+            }
+        });
+        std::shared_ptr<Node> myAvgPool = AvgPooling({3, 3}, "myAvgPool", {3, 3});
+        auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool->getOperator());
+
+        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float, 1, 1, 1, 1>{
+            {{{{(0.3745 + 0.9507 + 0.7320 + 0.5987 + 0.1560 + 0.1560 + 0.0581 + 0.8662 + 0.6011) / 9.0}}}}});
+        op->associateInput(0, myInput2);
         op->setDataType(DataType::Float32);
         op->setBackend("cuda");
         myAvgPool->forward();
 
-        float* computedOutput   = new float[myOutput->size()]();
+        float *computedOutput = new float[myOutput->size()]();
         cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
 
-        for(int i = 0; i < myOutput->size(); i++){
-            const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
+        for (int i = 0; i < myOutput->size(); i++)
+        {
+            const float targetOutput = *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i);
             REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
         }
 
         delete[] computedOutput;
     }
 
-    SECTION("half") {
-        std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(Array4D<half_float::half,1,1,3,3> { //NCHW
-        {
-            {
-                {{half_float::half(0.3745), half_float::half(0.9507), half_float::half(0.7320)},
-                 {half_float::half(0.5987), half_float::half(0.1560), half_float::half(0.1560)},
-                 {half_float::half(0.0581), half_float::half(0.8662), half_float::half(0.6011)}}
-            }
-        }
-        });
+    SECTION("half")
+    {
+        std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(Array4D<half_float::half, 1, 1, 3, 3>{ // NCHW
+            {
+                {
+                    {{half_float::half(0.3745), half_float::half(0.9507), half_float::half(0.7320)},
+                     {half_float::half(0.5987), half_float::half(0.1560), half_float::half(0.1560)},
+                     {half_float::half(0.0581), half_float::half(0.8662), half_float::half(0.6011)}}
+                }
+            }
+        });
         myInput2->setBackend("cuda");
 
-        std::shared_ptr<Node> myAvgPool = AvgPooling({3,3}, "mymyAvgPoolcdw", {3,3});
-        auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool -> getOperator());
-        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<half_float::half,1,1,1,1> {
-            {{{{(half_float::half(0.3745) + half_float::half(0.9507) + half_float::half(0.7320) + half_float::half(0.5987) + half_float::half(0.1560) + half_float::half(0.1560) + half_float::half(0.0581) + half_float::half(0.8662) + half_float::half(0.6011))/half_float::half(9.0)}}}}
-        });
-        op->associateInput(0,myInput2);
+        std::shared_ptr<Node> myAvgPool = AvgPooling({3, 3}, "myAvgPoolcdw", {3, 3});
+        auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool->getOperator());
+        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<half_float::half, 1, 1, 1, 1>{
+            {{{{(half_float::half(0.3745) + half_float::half(0.9507) + half_float::half(0.7320)
+                 + half_float::half(0.5987) + half_float::half(0.1560) + half_float::half(0.1560)
+                 + half_float::half(0.0581) + half_float::half(0.8662) + half_float::half(0.6011))
+                / half_float::half(9.0)}}}}});
+        op->associateInput(0, myInput2);
         op->setDataType(DataType::Float16);
         op->setBackend("cuda");
         myAvgPool->forward();
 
-        half_float::half* computedOutput   = new half_float::half[myOutput->size()]();
+        half_float::half *computedOutput = new half_float::half[myOutput->size()]();
         cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(half_float::half) * myOutput->size(), cudaMemcpyDeviceToHost);
 
-        for(int i = 0; i < myOutput->size(); i++){
-            const half_float::half targetOutput = *(static_cast<half_float::half*>(myOutput->getImpl()->rawPtr()) + i);
+        for (int i = 0; i < myOutput->size(); i++)
+        {
+            const half_float::half targetOutput = *(static_cast<half_float::half *>(myOutput->getImpl()->rawPtr()) + i);
             REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
         }
 
         delete[] computedOutput;
     }
+
+    SECTION("Random Input") {
+        constexpr std::uint16_t NBTRIALS = 10;
+        std::size_t kernel = 3;
+        std::size_t stride = 3;
+        // Create a random number generator
+        std::random_device rd;
+        std::mt19937 gen(rd());
+        std::uniform_real_distribution<float> valueDist(
+            0.1f, 1.1f); // random float values in [0.1, 1.1)
+        std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(kernel),
+                                                               std::size_t(10));
+
+        // To measure execution time of 'AveragePooling_Op::forward()'
+        std::chrono::time_point<std::chrono::system_clock> start;
+        std::chrono::time_point<std::chrono::system_clock> end;
+        std::chrono::duration<double, std::micro> duration{};
+        std::size_t number_of_operation = 0;
+        for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial)
+        {
+            // Create AveragePooling Operator CUDA
+            std::shared_ptr<Node> myAvgPoolCuda = AvgPooling({kernel, kernel}, "myAvgPoolCuda", {stride, stride});
+            auto op_cuda = std::static_pointer_cast<OperatorTensor>(myAvgPoolCuda->getOperator());
+            op_cuda->setDataType(DataType::Float32);
+            op_cuda->setBackend("cuda");
+
+            // Create AveragePooling Operator CPU
+            std::shared_ptr<Node> myAvgPoolCpu = AvgPooling({kernel, kernel}, "myAvgPoolCpu", {stride, stride});
+            auto op_cpu = std::static_pointer_cast<OperatorTensor>(myAvgPoolCpu->getOperator());
+            op_cpu->setDataType(DataType::Float32);
+            op_cpu->setBackend("cpu");
+
+            // generate a random Tensor
+            const std::size_t nbDims = 4;
+            std::vector<std::size_t> dims;
+            for (std::size_t i = 0; i < nbDims; ++i)
+            {
+                dims.push_back(dimSizeDist(gen));
+            }
+
+            const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+            number_of_operation += nb_elements;
+
+            // Fill input tensor
+            float *array0 = new float[nb_elements];
+            for (std::size_t i = 0; i < nb_elements; ++i)
+            {
+                array0[i] = valueDist(gen);
+            }
+
+            // input0 CUDA
+            float* array0_d;
+            std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>();
+            T0_cuda->setDataType(DataType::Float32);
+            T0_cuda->setBackend("cuda");
+            T0_cuda->resize(dims);
+            op_cuda->associateInput(0, T0_cuda);
+            cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements);
+            cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements, cudaMemcpyHostToDevice);
+            T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements);
+
+            // input0 CPU
+            std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>();
+            op_cpu->associateInput(0,T0_cpu);
+            T0_cpu->setDataType(DataType::Float32);
+            T0_cpu->setBackend("cpu");
+            T0_cpu->resize(dims);
+            T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements);
+
+            // Run inference
+            start = std::chrono::system_clock::now();
+            op_cuda->forward();
+            end = std::chrono::system_clock::now();
+            duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+
+            const std::size_t outSize =  op_cuda->getOutput(0)->size();
+            float *computed_cuda = new float[outSize]();
+            cudaMemcpy(computed_cuda, op_cuda->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * outSize, cudaMemcpyDeviceToHost);
+
+            // forward CPU
+            op_cpu->forward();
+            float *computed_cpu = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr());
+            REQUIRE(approxEq<float>(*computed_cuda, *computed_cpu));
+
+            delete[] computed_cuda;
+            delete[] array0;
+            cudaFree(array0_d);
+        }
+        std::cout << "number of elements over time spent: " << (number_of_operation / duration.count()) << std::endl;
+        std::cout << "total time: " << duration.count() << "μs" << std::endl;
+    }
+}
+
+TEST_CASE("[gpu/operator] AvgPooling(backward)", "[AvgPooling][GPU]")
+{
+    // Run forward operation
+    std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float, 1, 1, 4, 4> {// NCHW
+        {
+            {
+                {
+                    {1, 2, 3, 4},
+                    {5, 6, 7, 8},
+                    {9, 10, 11, 12},
+                    {13, 14, 15, 16}
+                }
+            }
+        }
+    });
+    myInput->setBackend("cuda");
+
+    std::shared_ptr<Node> myAvgPool = AvgPooling({2, 2}, "myAvgPool", {2, 2});
+    auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool->getOperator());
+    op->associateInput(0, myInput);
+    op->setDataType(DataType::Float32);
+    op->setBackend("cuda");
+    myAvgPool->forward();
+
+    // Run and test backward operation
+    std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array4D<float, 1,1,2,2> {
+        {
+            {
+                {
+                    {1, 2},
+                    {3, 4}
+                }
+            }
+        }
+    });
+    myOutputGrad->setBackend("cuda");
+    std::shared_ptr<Tensor> predictedOutput = op->getOutput(0);
+    std::shared_ptr<Tensor> input = op->getInput(0);
+    predictedOutput->setGrad(myOutputGrad);
+    REQUIRE_NOTHROW(myAvgPool->backward());
+
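+    // With a 2x2 kernel and stride 2, each input cell belongs to exactly one pooling
+    // window and receives that window's output gradient divided by the window size (4)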
+    std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>(Array4D<float, 1, 1, 4, 4>{
+        {
+            {
+                {
+                    {0.25, 0.25, 0.5, 0.5},
+                    {0.25, 0.25, 0.5, 0.5},
+                    {0.75, 0.75, 1, 1},
+                    {0.75, 0.75, 1, 1}
+                }
+            }
+        }
+    });
+
+    float *computedGradCuda = new float[expectedInputGrad->size()]();
+    cudaMemcpy(computedGradCuda, input->grad()->getImpl()->rawPtr(), sizeof(float) * expectedInputGrad->size(), cudaMemcpyDeviceToHost);
+
+    for(int i = 0; i < expectedInputGrad->size(); i++){
+        const float targetOutput = *(static_cast<float*>(expectedInputGrad->getImpl()->rawPtr()) + i);
+        REQUIRE(fabs(computedGradCuda[i] - targetOutput) < 1e-6);
+    }
+
+    delete[] computedGradCuda;
 }
\ No newline at end of file
diff --git a/unit_tests/Test_BatchNormImpl.cpp b/unit_tests/Test_BatchNormImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c83624020d86a2eb786d249c5ee664ca3bfdde3b
--- /dev/null
+++ b/unit_tests/Test_BatchNormImpl.cpp
@@ -0,0 +1,438 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <array>
+#include <chrono>   // std::chrono::time_point, std::chrono::system_clock
+#include <iostream> // std::cout
+#include <numeric>  // std::accumulate
+#include <random>   // std::random_device, std::mt19937, std::uniform_real_distribution
+
+#include <catch2/catch_test_macros.hpp>
+
+#include "aidge/backend/cpu.hpp"
+#include "aidge/backend/cuda.hpp"
+#include "aidge/data/Tensor.hpp"
+#include "aidge/utils/TensorUtils.hpp"
+#include "Test_cuda.hpp"
+
+using namespace Aidge;
+
+TEST_CASE("[gpu/operator] BatchNorm(forward)") {
+    SECTION("Static Input") {
+        std::shared_ptr<Node> myBatchNorm = BatchNorm<2>(3, 0.00001F, 0.1F, "mybatchnorm");
+        auto op = std::static_pointer_cast<OperatorTensor>(myBatchNorm -> getOperator());
+        op->setDataType(DataType::Float32);
+        op->setBackend("cuda");
+        std::shared_ptr<Tensor> myWeights= std::make_shared<Tensor>(Array1D<float,3> {{0.9159252643585205, 0.18772238492965698, 0.4479946792125702}});
+        std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float,3> {{0.33898890018463135, 0.3167555630207062, 0.7047033309936523}});
+        std::shared_ptr<Tensor> myMean = std::make_shared<Tensor>(Array1D<float,3> {{0.45547693967819214, 0.22650663554668427, 0.6612948179244995}});
+        std::shared_ptr<Tensor> myVar = std::make_shared<Tensor>(Array1D<float,3> {{0.02570258639752865, 0.026536229997873306, 0.15111008286476135}});
+        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,3,3,3> { //NCHW
+            {
+                    {
+                        {{0.12943482, 0.6451229 , 0.24979436},
+                        {0.7551012,  0.32007095, 0.89463896},
+                        {0.7087448,  0.6266124,  0.4782957 }},
+
+                        {{0.13796203, 0.9950787,  0.71555305},
+                        {0.01347321, 0.4395316,  0.43097174},
+                        {0.6056306 , 0.9561122 , 0.5783939 }},
+
+                        {{0.7174486 , 0.503465 ,  0.23695093},
+                        {0.5145477,  0.39576462, 0.02779444},
+                        {0.60789394 ,0.14119725 ,0.20753163}}
+                    },
+
+
+                        {{{0.74452287, 0.5354875 , 0.8148496 },
+                        {0.73356223, 0.4304034 , 0.11783765},
+                        {0.8966221,  0.41049036, 0.95982736}},
+
+                        {{0.03161403, 0.71250844, 0.14337301},
+                        {0.5338889 , 0.13484782, 0.8055851 },
+                        {0.71784616 ,0.8349626 , 0.10107189}},
+
+                        {{0.85701346, 0.58286697, 0.9836816 },
+                        {0.36061534, 0.03660944, 0.7375317 },
+                        {0.6977233,  0.51965624, 0.29440993}}
+                    }
+                }
+        });
+
+        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,3,3,3> {
+            {
+                {
+                    {{-1.5233592,   1.4222438,  -0.83586717},
+                    { 2.0504384,  -0.43444824,  2.847476  },
+                    { 1.7856512,   1.3165123,   0.46932936}},
+
+                    {{ 0.21473758 , 1.2022772,   0.8802177 },
+                    { 0.07130594 , 0.5621954,   0.55233306},
+                    { 0.7535689 ,  1.1573814,   0.72218764}},
+
+                    {{ 0.7694162 ,  0.52281666,  0.2156798 },
+                    { 0.5355886  , 0.3987003,  -0.02535689},
+                    { 0.6431629 ,  0.10533108 , 0.18177633}}},
+
+
+                    {{{ 1.990015,    0.7960079,   2.3917203 },
+                    { 1.9274082,   0.19576907, -1.5896021 },
+                    { 2.8588037 ,  0.08202624 , 3.2198315 }},
+
+                    {{ 0.09220716,  0.8767097,   0.22097193},
+                    { 0.6709106 ,  0.2111495,   0.9839494 },
+                    { 0.8828597 ,  1.0177971 ,  0.17223406}},
+
+                    {{ 0.9302539 ,  0.6143213 ,  1.0762292 },
+                    { 0.35819346, -0.01519828,  0.79256046},
+                    { 0.7466844 ,  0.5414758 ,  0.28189686}}
+                }
+            }
+        });
+        myInput->setBackend("cuda");
+        myWeights->setBackend("cuda");
+        myBias->setBackend("cuda");
+        myMean->setBackend("cuda");
+        myVar->setBackend("cuda");
+
+        op->associateInput(0,myInput);
+        op->associateInput(1,myWeights);
+        op->associateInput(2,myBias);
+        op->associateInput(3,myMean);
+        op->associateInput(4,myVar);
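+        // Inference-mode batch normalization: y = weight * (x - mean) / sqrt(var + epsilon) + bias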
+        op->forward();
+
+        float* computedOutput   = new float[myOutput->size()]();
+        cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
+
+        for(int i = 0; i < myOutput->size(); i++){
+            const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-5);
+        }
+
+        delete[] computedOutput;
+    }
+
+    SECTION("Random Input") {
+        constexpr std::uint16_t NBTRIALS = 10;
+        constexpr float epsilon = 0.00001F;
+        constexpr float momentum = 0.1F;
+        // Create a random number generator
+        std::random_device rd;
+        std::mt19937 gen(rd());
+        std::uniform_real_distribution<float> valueDist(
+            0.1f, 1.1f); // Random float distribution between 0.1 and 1.1
+        std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1),
+                                                               std::size_t(10));
+        // To measure execution time of 'forward()'
+        std::chrono::time_point<std::chrono::system_clock> start;
+        std::chrono::time_point<std::chrono::system_clock> end;
+        std::chrono::duration<double, std::micro> duration{};
+        for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial)
+        {
+            // generate a random Tensor
+            const std::size_t nbDims = 4;
+            std::vector<std::size_t> dims;
+            for (std::size_t i = 0; i < nbDims; ++i)
+            {
+                dims.push_back(dimSizeDist(gen));
+            }
+            const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
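+            // BatchNorm<2> parameters (scale, shift, mean, variance) are per-channel,
+            // so their length matches the channel dimension dims[1]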
+            const std::size_t nbChannels = dims[1];
+
+            // Create BatchNorm Operator Cuda
+            std::shared_ptr<Node> myBatchNormCuda = BatchNorm<2>(nbChannels, epsilon, momentum, "mybatchnormcuda");
+            auto op_cuda = std::static_pointer_cast<OperatorTensor>(myBatchNormCuda -> getOperator());
+            op_cuda->setDataType(DataType::Float32);
+            op_cuda->setBackend("cuda");
+
+            // Create BatchNorm Operator CPU
+            std::shared_ptr<Node> myBatchNormCpu = BatchNorm<2>(nbChannels, epsilon, momentum, "mybatchnormcpu");
+            auto op_cpu = std::static_pointer_cast<OperatorTensor>(myBatchNormCpu -> getOperator());
+            op_cpu->setDataType(DataType::Float32);
+            op_cpu->setBackend("cpu");
+
+            float* array0 = new float[nb_elements];
+            float* weights = new float[nbChannels];
+            float* bias = new float[nbChannels];
+            float* mean = new float[nbChannels];
+            float* var = new float[nbChannels];
+
+            for (std::size_t i = 0; i < nb_elements; ++i) {
+                array0[i] = valueDist(gen);
+            }
+            for (std::size_t i = 0; i < nbChannels; ++i) {
+                weights[i] = valueDist(gen);
+                bias[i] = valueDist(gen);
+                mean[i] = valueDist(gen);
+                var[i] = valueDist(gen);
+            }
+
+            // input0 CUDA
+            float* array0_d, *weight_d, *bias_d, *mean_d, *var_d;
+            std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>();
+            T0_cuda->setDataType(DataType::Float32);
+            T0_cuda->setBackend("cuda");
+            T0_cuda->resize(dims);
+            op_cuda->associateInput(0, T0_cuda);
+            cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements);
+            cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements, cudaMemcpyHostToDevice);
+            T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements);
+
+            // input0 CPU
+            std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>();
+            op_cpu->associateInput(0,T0_cpu);
+            T0_cpu->setDataType(DataType::Float32);
+            T0_cpu->setBackend("cpu");
+            T0_cpu->resize(dims);
+            T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements);
+
+            // weight CUDA
+            std::shared_ptr<Tensor> Tw_cuda = std::make_shared<Tensor>();
+            Tw_cuda->setDataType(DataType::Float32);
+            Tw_cuda->setBackend("cuda");
+            Tw_cuda->resize({nbChannels});
+            op_cuda->associateInput(1, Tw_cuda);
+            cudaMalloc(reinterpret_cast<void **>(&weight_d), sizeof(float) * nbChannels);
+            cudaMemcpy(weight_d, weights, sizeof(float) * nbChannels, cudaMemcpyHostToDevice);
+            Tw_cuda->getImpl()->setRawPtr(weight_d, nbChannels);
+
+            // weight CPU
+            std::shared_ptr<Tensor> Tw_cpu = std::make_shared<Tensor>();
+            op_cpu->associateInput(1,Tw_cpu);
+            Tw_cpu->setDataType(DataType::Float32);
+            Tw_cpu->setBackend("cpu");
+            Tw_cpu->resize({nbChannels});
+            Tw_cpu -> getImpl() -> setRawPtr(weights, nbChannels);
+
+            // bias CUDA
+            std::shared_ptr<Tensor> Tb_cuda = std::make_shared<Tensor>();
+            Tb_cuda->setDataType(DataType::Float32);
+            Tb_cuda->setBackend("cuda");
+            Tb_cuda->resize({nbChannels});
+            op_cuda->associateInput(2, Tb_cuda);
+            cudaMalloc(reinterpret_cast<void **>(&bias_d), sizeof(float) * nbChannels);
+            cudaMemcpy(bias_d, bias, sizeof(float) * nbChannels, cudaMemcpyHostToDevice);
+            Tb_cuda->getImpl()->setRawPtr(bias_d, nbChannels);
+
+            // bias CPU
+            std::shared_ptr<Tensor> Tb_cpu = std::make_shared<Tensor>();
+            op_cpu->associateInput(2,Tb_cpu);
+            Tb_cpu->setDataType(DataType::Float32);
+            Tb_cpu->setBackend("cpu");
+            Tb_cpu->resize({nbChannels});
+            Tb_cpu -> getImpl() -> setRawPtr(bias, nbChannels);
+
+            // mean CUDA
+            std::shared_ptr<Tensor> Tm_cuda = std::make_shared<Tensor>();
+            Tm_cuda->setDataType(DataType::Float32);
+            Tm_cuda->setBackend("cuda");
+            Tm_cuda->resize({nbChannels});
+            op_cuda->associateInput(3, Tm_cuda);
+            cudaMalloc(reinterpret_cast<void **>(&mean_d), sizeof(float) * nbChannels);
+            cudaMemcpy(mean_d, mean, sizeof(float) * nbChannels, cudaMemcpyHostToDevice);
+            Tm_cuda->getImpl()->setRawPtr(mean_d, nbChannels);
+
+            // mean CPU
+            std::shared_ptr<Tensor> Tm_cpu = std::make_shared<Tensor>();
+            op_cpu->associateInput(3,Tm_cpu);
+            Tm_cpu->setDataType(DataType::Float32);
+            Tm_cpu->setBackend("cpu");
+            Tm_cpu->resize({nbChannels});
+            Tm_cpu -> getImpl() -> setRawPtr(mean, nbChannels);
+
+            // var CUDA
+            std::shared_ptr<Tensor> Tv_cuda = std::make_shared<Tensor>();
+            Tv_cuda->setDataType(DataType::Float32);
+            Tv_cuda->setBackend("cuda");
+            Tv_cuda->resize({nbChannels});
+            op_cuda->associateInput(4, Tv_cuda);
+            cudaMalloc(reinterpret_cast<void **>(&var_d), sizeof(float) * nbChannels);
+            cudaMemcpy(var_d, var, sizeof(float) * nbChannels, cudaMemcpyHostToDevice);
+            Tv_cuda->getImpl()->setRawPtr(var_d, nbChannels);
+
+            // var CPU
+            std::shared_ptr<Tensor> Tv_cpu = std::make_shared<Tensor>();
+            op_cpu->associateInput(4,Tv_cpu);
+            Tv_cpu->setDataType(DataType::Float32);
+            Tv_cpu->setBackend("cpu");
+            Tv_cpu->resize({nbChannels});
+            Tv_cpu -> getImpl() -> setRawPtr(var, nbChannels);
+
+            // forward CUDA
+            start = std::chrono::system_clock::now();
+            op_cuda->forward();
+            end = std::chrono::system_clock::now();
+            duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+
+            const std::size_t outSize =  op_cuda->getOutput(0)->size();
+            float *computed_cuda = new float[outSize]();
+            cudaMemcpy(computed_cuda, op_cuda->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * outSize, cudaMemcpyDeviceToHost);
+
+            // forward CPU
+            op_cpu->forward();
+            float *computed_cpu = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr());
+            // Compare every element of the CUDA output with the CPU reference, not only the first one
+            for (std::size_t i = 0; i < outSize; ++i) {
+                REQUIRE(approxEq<float>(computed_cuda[i], computed_cpu[i]));
+            }
+
+            delete[] array0;
+            delete[] weights;
+            delete[] bias;
+            delete[] mean;
+            delete[] var;
+            delete[] computed_cuda;
+            cudaFree(array0_d);
+            cudaFree(weight_d);
+            cudaFree(bias_d);
+            cudaFree(mean_d);
+            cudaFree(var_d);
+        }
+        std::cout << "total time: " << duration.count() << "μs" << std::endl;
+    }
+}
+
+TEST_CASE("[gpu/operator] BatchNorm(backward)", "[BatchNorm][GPU]") {
+    SECTION("Static Input") {
+        std::shared_ptr<Node> myBatchNorm = BatchNorm<2>(3, 0.00001F, 0.1F, "mybatchnorm");
+        auto op = std::static_pointer_cast<OperatorTensor>(myBatchNorm -> getOperator());
+        op->setDataType(DataType::Float32);
+        op->setBackend("cuda");
+        // Forward
+        std::shared_ptr<Tensor> myWeights= std::make_shared<Tensor>(Array1D<float,3> {{-1.58390772, -0.48463920,  1.30413496}});
+        std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float,3> {{0.06150287, -0.03140282, -0.49673468}});
+        std::shared_ptr<Tensor> myMean = std::make_shared<Tensor>(Array1D<float,3> {{0.68328333, -0.47286209,  1.11688483}});
+        std::shared_ptr<Tensor> myVar = std::make_shared<Tensor>(Array1D<float,3> {{0.84838068, 1.05930495, 0.53670371}});
+        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,3,2,2> { //NCHW
+            {
+                    {
+                        {
+                            {1.46650600,  1.24083233},
+                            {-0.33106008, -0.15137172}
+                        },
+                        {
+                            { 0.06625678, -1.83266091},
+                            { 0.53444749, -0.05167147}
+                        },
+                        {
+                            { 0.41069385, -0.70850474},
+                            { 0.23363227,  0.06111236}
+                        }
+                    },
+                    {
+                        {
+                            { 0.16707586,  1.07217050},
+                            { 1.18544745,  0.03441877}
+                        },
+                        {
+                            { 0.88106865,  0.33312374},
+                            { 0.87147945,  1.46628737}
+                        },
+                       {
+                            { 0.23930393, -0.94172227},
+                            { 1.48735642,  0.46449399}
+                        }
+                    }
+            }
+        });
+
+        myInput->setBackend("cuda");
+        myWeights->setBackend("cuda");
+        myBias->setBackend("cuda");
+        myMean->setBackend("cuda");
+        myVar->setBackend("cuda");
+
+        op->associateInput(0,myInput);
+        op->associateInput(1,myWeights);
+        op->associateInput(2,myBias);
+        op->associateInput(3,myMean);
+        op->associateInput(4,myVar);
+        op->forward();
+
+        // Backward
+        std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array4D<float,2,3,2,2> {
+            {
+                {
+                    {
+                        { 1.34347093,  0.90813798},
+                        { 0.39607167,  1.20428133}
+                    },
+                    {
+                        { 0.16845724,  0.48487359},
+                        { 0.40748054, -0.21790814}
+                    },
+                    {
+                        {-1.83932650, -0.42746788},
+                        { 0.97129798,  2.04073548}
+                    }
+                },
+                {
+                    {
+                        {-0.95714629,  0.18446854},
+                        { 1.14551663, -1.38118088}
+                    },
+                    {
+                        {-0.44466951,  2.73914146},
+                        { 0.57898718,  2.23699141}
+                    },
+                    {
+                        { 0.25004527, -0.18481003},
+                        {-0.72439206,  0.87744337}
+                    }
+
+                }
+            }
+        });
+
+        myOutputGrad->setBackend("cuda");
+        std::shared_ptr<Tensor> predictedOutput = op->getOutput(0);
+        std::shared_ptr<Tensor> input = op->getInput(0);
+        std::shared_ptr<Tensor> weights = op->getInput(1);
+        std::shared_ptr<Tensor> bias = op->getInput(2);
+        predictedOutput->setGrad(myOutputGrad);
+        REQUIRE_NOTHROW(myBatchNorm->backward());
+
+        std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>(Array4D<float, 2, 3, 2, 2>{
+            {
+                {
+                    {{-0.92418045, -0.26092845},
+                     {-1.53920066, -3.14756274}},
+
+                    {{ 0.26948565, -0.18548687},
+                     { 0.21506749,  0.45458069}},
+
+                    {{-3.57358932, -1.30609703},
+                     { 1.61337423,  3.55250096}}
+                },
+                {
+                    {{ 2.41264391,  1.16695499},
+                     {-0.90373814,  3.19601130}},
+
+                    {{ 0.71554798, -1.04076481},
+                     { 0.17618656, -0.60461664}},
+
+                    {{ 0.26926503, -0.92978811},
+                     {-1.13964832,  1.51398242}}
+                }
+            }
+        });
+
+        float *computedGradCuda = new float[expectedInputGrad->size()]();
+        cudaMemcpy(computedGradCuda, input->grad()->getImpl()->rawPtr(), sizeof(float) * expectedInputGrad->size(), cudaMemcpyDeviceToHost);
+
+        for(int i = 0; i < expectedInputGrad->size(); i++){
+            const float targetOutput = *(static_cast<float*>(expectedInputGrad->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedGradCuda[i] - targetOutput) < 1e-6);
+        }
+
+        delete[] computedGradCuda;
+    }
+}
\ No newline at end of file
diff --git a/unit_tests/Test_ConvDepthWiseImpl.cpp b/unit_tests/Test_ConvDepthWiseImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4655de069cce86e80881a06673621c8159be18f6
--- /dev/null
+++ b/unit_tests/Test_ConvDepthWiseImpl.cpp
@@ -0,0 +1,314 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <array>
+#include <chrono>   // std::chrono::time_point, std::chrono::system_clock
+#include <iostream> // std::cout
+#include <numeric>  // std::accumulate
+#include <random>   // std::random_device, std::mt19937, std::uniform_real_distribution
+
+#include <catch2/catch_test_macros.hpp>
+
+#include "aidge/backend/cpu.hpp"
+#include "aidge/backend/cuda.hpp"
+#include "aidge/data/Tensor.hpp"
+#include "aidge/utils/TensorUtils.hpp"
+
+using namespace Aidge;
+
+TEST_CASE("[cpu/operator] ConvDepthWise(forward)", "[ConvDepthWise][CPU]") {
+    SECTION("Deterministic Input") {
+        std::shared_ptr<Node> myCDW = ConvDepthWise(4, {3,3}, "mycdw");
+        auto op = std::static_pointer_cast<OperatorTensor>(myCDW -> getOperator());
+        std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array4D<float,4,1,3,3> {
+            {
+                {{
+                    {  0,  1,  2},
+                    {  3,  4,  5},
+                    {  6,  7,  8}
+
+                }},
+                {{
+                    { 27, 28, 29},
+                    { 30, 31, 32},
+                    { 33, 34, 35}
+
+                }},
+                {{
+                    { 54, 55, 56},
+                    { 57, 58, 59},
+                    { 60, 61, 62}
+                }},
+                {{
+                    { 81, 82, 83},
+                    { 84, 85, 86},
+                    { 87, 88, 89}
+                }}
+            }
+        });
+        std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float,4> {{7,0,9,0}});
+        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,4,5,5> { //NCHW
+            {
+                {
+                    {{  0,   1,   2,   3,   4},
+                    {  5,   6,   7,   8,   9},
+                    { 10,  11,  12,  13,  14},
+                    { 15,  16,  17,  18,  19},
+                    { 20,  21,  22,  23,  24}},
+
+                    {{ 25,  26,  27,  28,  29},
+                    { 30,  31,  32,  33,  34},
+                    { 35,  36,  37,  38,  39},
+                    { 40,  41,  42,  43,  44},
+                    { 45,  46,  47,  48,  49}},
+
+                    {{ 50,  51,  52,  53,  54},
+                    { 55,  56,  57,  58,  59},
+                    { 60,  61,  62,  63,  64},
+                    { 65,  66,  67,  68,  69},
+                    { 70,  71,  72,  73,  74}},
+
+                    {{ 75,  76,  77,  78,  79},
+                    { 80,  81,  82,  83,  84},
+                    { 85,  86,  87,  88,  89},
+                    { 90,  91,  92,  93,  94},
+                    { 95,  96,  97,  98,  99}}
+                },
+                {
+                    {{100, 101, 102, 103, 104},
+                    {105, 106, 107, 108, 109},
+                    {110, 111, 112, 113, 114},
+                    {115, 116, 117, 118, 119},
+                    {120, 121, 122, 123, 124}},
+
+                    {{125, 126, 127, 128, 129},
+                    {130, 131, 132, 133, 134},
+                    {135, 136, 137, 138, 139},
+                    {140, 141, 142, 143, 144},
+                    {145, 146, 147, 148, 149}},
+
+                    {{150, 151, 152, 153, 154},
+                    {155, 156, 157, 158, 159},
+                    {160, 161, 162, 163, 164},
+                    {165, 166, 167, 168, 169},
+                    {170, 171, 172, 173, 174}},
+
+                    {{175, 176, 177, 178, 179},
+                    {180, 181, 182, 183, 184},
+                    {185, 186, 187, 188, 189},
+                    {190, 191, 192, 193, 194},
+                    {195, 196, 197, 198, 199}}
+                }
+            }
+        });
+        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,4,3,3> {
+            {
+                {
+                    {{   319,    355,    391},
+                    {   499,    535,    571},
+                    {   679,    715,    751}},
+
+                    {{  8745,   9024,   9303},
+                    { 10140,  10419,  10698},
+                    { 11535,  11814,  12093}},
+
+                    {{ 29337,  29859,  30381},
+                    { 31947,  32469,  32991},
+                    { 34557,  35079,  35601}},
+
+                    {{ 62061,  62826,  63591},
+                    { 65886,  66651,  67416},
+                    { 69711,  70476,  71241}}
+                },
+                {
+                    {{  3919,   3955,   3991},
+                    {  4099,   4135,   4171},
+                    {  4279,   4315,   4351}},
+
+                    {{ 36645,  36924,  37203},
+                    { 38040,  38319,  38598},
+                    { 39435,  39714,  39993}},
+
+                    {{ 81537,  82059,  82581},
+                    { 84147,  84669,  85191},
+                    { 86757,  87279,  87801}},
+
+                    {{138561, 139326, 140091},
+                    {142386, 143151, 143916},
+                    {146211, 146976, 147741}}
+                }
+            }
+        });
+        myInput->setBackend("cuda");
+        myWeights->setBackend("cuda");
+        myBias->setBackend("cuda");
+        op -> associateInput(0, myInput);
+        op -> associateInput(1, myWeights);
+        op -> associateInput(2, myBias);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cuda");
+
+        myCDW -> forward();
+
+        float* computedOutput   = new float[myOutput->size()]();
+        cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
+
+        for(int i = 0; i < myOutput->size(); i++){
+            const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
+        }
+
+        delete[] computedOutput;
+    }
+
+    SECTION("Random Input") {
+        constexpr std::uint16_t NBTRIALS = 10;
+        // Create a random number generator
+        std::random_device rd;
+        std::mt19937 gen(rd());
+        std::uniform_real_distribution<float> valueDist(
+            0.1f, 1.1f); // Random float distribution between 0.1 and 1.1
+        std::uniform_int_distribution<std::size_t> kernelDist(1, std::size_t(5));
+        std::uniform_int_distribution<std::size_t> dimSizeDist(1, std::size_t(10));
+
+        // To measure execution time of 'forward()'
+        std::chrono::time_point<std::chrono::system_clock> start;
+        std::chrono::time_point<std::chrono::system_clock> end;
+        std::chrono::duration<double, std::micro> duration{};
+        for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial)
+        {
+            const std::size_t kernel = kernelDist(gen);
+            std::uniform_int_distribution<std::size_t> resolutionDist(std::size_t(kernel+2),
+                                                               std::size_t(10));
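+            // Spatial dimensions are drawn in [kernel+2, 10] so the convolution output is never empty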
+            const std::size_t nbDims = 4;
+            // input  (batch, ch, Xin, Yin)
+            // weight (ch, 1, kernelX, kernelY) for depthwise convolution
+            std::vector<std::size_t> dims;
+            for (std::size_t i = 0; i < nbDims; ++i) {
+                if(i < 2)
+                    dims.push_back(dimSizeDist(gen));
+                else
+                    dims.push_back(resolutionDist(gen));
+            }
+            dims[1] = 1; // TODO FIX: CUDA ConvDepthWise does not match the CPU output unless the number of channels is 1
+            const std::size_t nbChannels = dims[1];
+            const std::vector<std::size_t> dimsW{nbChannels, 1, kernel, kernel}; // depthwise weights: (channels, 1, kernelH, kernelW)
+
+            const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+            const std::size_t weightSize = std::accumulate(dimsW.cbegin(), dimsW.cend(), std::size_t(1), std::multiplies<std::size_t>());
+
+            // Create ConvDepthWise Operator CUDA
+            std::shared_ptr<Node> myConvCUDA = ConvDepthWise(nbChannels,{kernel,kernel}, "myconvcuda");
+            auto op_cuda = std::static_pointer_cast<OperatorTensor>(myConvCUDA -> getOperator());
+
+            // Create ConvDepthWise Operator CPU
+            std::shared_ptr<Node> myConvCPU = ConvDepthWise(nbChannels,{kernel,kernel}, "myconvcpu");
+            auto op_cpu = std::static_pointer_cast<OperatorTensor>(myConvCPU -> getOperator());
+            op_cpu->setDataType(DataType::Float32);
+            op_cpu->setBackend("cpu");
+
+            float* array0 = new float[nb_elements];
+            float* weights = new float[weightSize];
+            float* bias = new float[nbChannels];
+
+            for (std::size_t i = 0; i < nb_elements; ++i) {
+                array0[i] = valueDist(gen);
+            }
+            for (std::size_t i = 0; i < weightSize; ++i) {
+                weights[i] = valueDist(gen);
+            }
+            for (std::size_t i = 0; i < nbChannels; ++i) {
+                bias[i] = valueDist(gen);
+            }
+
+            // input0 CUDA
+            float* array0_d, *weight_d, *bias_d;
+            std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>();
+            T0_cuda->setDataType(DataType::Float32);
+            T0_cuda->setBackend("cuda");
+            T0_cuda->resize(dims);
+            op_cuda->associateInput(0, T0_cuda);
+            cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements);
+            cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements, cudaMemcpyHostToDevice);
+            T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements);
+
+            // input0 CPU
+            std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>();
+            op_cpu->associateInput(0,T0_cpu);
+            T0_cpu->setDataType(DataType::Float32);
+            T0_cpu->setBackend("cpu");
+            T0_cpu->resize(dims);
+            T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements);
+
+            // weight CUDA
+            std::shared_ptr<Tensor> Tw_cuda = std::make_shared<Tensor>();
+            Tw_cuda->setDataType(DataType::Float32);
+            Tw_cuda->setBackend("cuda");
+            Tw_cuda->resize(dimsW);
+            op_cuda->associateInput(1, Tw_cuda);
+            cudaMalloc(reinterpret_cast<void **>(&weight_d), sizeof(float) * weightSize);
+            cudaMemcpy(weight_d, weights, sizeof(float) * weightSize, cudaMemcpyHostToDevice);
+            Tw_cuda->getImpl()->setRawPtr(weight_d, weightSize);
+
+            // weight CPU
+            std::shared_ptr<Tensor> Tw_cpu = std::make_shared<Tensor>();
+            op_cpu->associateInput(1,Tw_cpu);
+            Tw_cpu->setDataType(DataType::Float32);
+            Tw_cpu->setBackend("cpu");
+            Tw_cpu->resize(dimsW);
+            Tw_cpu -> getImpl() -> setRawPtr(weights, weightSize);
+
+            // bias CUDA
+            std::shared_ptr<Tensor> Tb_cuda = std::make_shared<Tensor>();
+            Tb_cuda->setDataType(DataType::Float32);
+            Tb_cuda->setBackend("cuda");
+            Tb_cuda->resize({nbChannels});
+            op_cuda->associateInput(2, Tb_cuda);
+            cudaMalloc(reinterpret_cast<void **>(&bias_d), sizeof(float) * nbChannels);
+            cudaMemcpy(bias_d, bias, sizeof(float) * nbChannels, cudaMemcpyHostToDevice);
+            Tb_cuda->getImpl()->setRawPtr(bias_d, nbChannels);
+
+            // bias CPU
+            std::shared_ptr<Tensor> Tb_cpu = std::make_shared<Tensor>();
+            op_cpu->associateInput(2,Tb_cpu);
+            Tb_cpu->setDataType(DataType::Float32);
+            Tb_cpu->setBackend("cpu");
+            Tb_cpu->resize({nbChannels});
+            Tb_cpu -> getImpl() -> setRawPtr(bias, nbChannels);
+
+            // forward CUDA
+            op_cuda->setDataType(DataType::Float32);
+            op_cuda->setBackend("cuda");
+            start = std::chrono::system_clock::now();
+            op_cuda->forward();
+            end = std::chrono::system_clock::now();
+            duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+
+            const std::size_t outSize =  op_cuda->getOutput(0)->size();
+            float *computed_cuda = new float[outSize]();
+            cudaMemcpy(computed_cuda, op_cuda->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * outSize, cudaMemcpyDeviceToHost);
+
+            // forward CPU
+            op_cpu->forward();
+            float *computed_cpu = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr());
+
+            // Compare every element of the CUDA output with the CPU reference, not only the first one
+            for (std::size_t i = 0; i < outSize; ++i) {
+                REQUIRE(approxEq<float>(computed_cuda[i], computed_cpu[i]));
+            }
+
+            delete[] array0;
+            delete[] weights;
+            delete[] bias;
+            delete[] computed_cuda;
+            cudaFree(array0_d);
+            cudaFree(weight_d);
+            cudaFree(bias_d);
+        }
+        std::cout << "total time: " << duration.count() << "μs" << std::endl;
+    }
+}
\ No newline at end of file
diff --git a/unit_tests/Test_ConvImpl.cpp b/unit_tests/Test_ConvImpl.cpp
index 12e40cf8266a86259c5128b425919214f2db6052..dc77e35b64fd22952e683e373fcc271c742ece75 100644
--- a/unit_tests/Test_ConvImpl.cpp
+++ b/unit_tests/Test_ConvImpl.cpp
@@ -10,15 +10,15 @@
  ********************************************************************************/
 
 #include <array>
+#include <chrono>   // std::chrono::time_point, std::chrono::system_clock
+#include <iostream> // std::cout
+#include <numeric>  // std::accumulate
+#include <random>   // std::random_device, std::mt19937, std::uniform_real_distribution
 
 #include <catch2/catch_test_macros.hpp>
 
-#include "Test_cuda.hpp"
-
-#include "aidge/data/Tensor.hpp"
-
 #include "aidge/backend/cpu.hpp"
 #include "aidge/backend/cuda.hpp"
+#include "aidge/data/Tensor.hpp"
+#include "aidge/utils/TensorUtils.hpp"
 
 using namespace Aidge;
 
@@ -222,4 +222,147 @@ TEST_CASE("[gpu/operator] Conv(forward)") {
 
         delete[] computedOutput;
     }
+
+    SECTION("Random Input") {
+        constexpr std::uint16_t NBTRIALS = 10;
+        // Create a random number generator
+        std::random_device rd;
+        std::mt19937 gen(rd());
+        std::uniform_real_distribution<float> valueDist(
+            0.1f, 1.1f); // Random float distribution between 0.1 and 1.1
+        std::uniform_int_distribution<std::size_t> kernelDist(1, std::size_t(5));
+        std::uniform_int_distribution<std::size_t> dimSizeDist(1, std::size_t(10));
+
+        // To measure execution time of 'forward()'
+        std::chrono::time_point<std::chrono::system_clock> start;
+        std::chrono::time_point<std::chrono::system_clock> end;
+        std::chrono::duration<double, std::micro> duration{};
+        for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial)
+        {
+            const std::size_t kernel = kernelDist(gen);
+            std::uniform_int_distribution<std::size_t> resolutionDist(std::size_t(kernel+2),
+                                                               std::size_t(10));
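+            // Keep spatial dimensions at least kernel+2 so the convolution always produces an output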
+            const std::size_t nbDims = 4;
+            std::vector<std::size_t> dims;
+            for (std::size_t i = 0; i < nbDims; ++i) {
+                if(i < 2)
+                    dims.push_back(dimSizeDist(gen));
+                else
+                    dims.push_back(resolutionDist(gen));
+            }
+            const std::size_t outChannels = dimSizeDist(gen);
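+            // Weight layout for Conv: (outChannels, inChannels, kernelH, kernelW)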
+            const std::vector<std::size_t> dimsW{outChannels,dims[1],kernel,kernel};
+            const std::size_t inChannels = dims[1];
+
+            const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+            const std::size_t weightSize = std::accumulate(dimsW.cbegin(), dimsW.cend(), std::size_t(1), std::multiplies<std::size_t>());
+
+            // Create Conv Operator CUDA
+            std::shared_ptr<Node> myConvCUDA = Conv(inChannels,outChannels,{kernel,kernel}, "myconvcuda");
+            auto op_cuda = std::static_pointer_cast<OperatorTensor>(myConvCUDA -> getOperator());
+
+            // Create Conv Operator CPU
+            std::shared_ptr<Node> myConvCPU = Conv(inChannels,outChannels,{kernel,kernel}, "myconvcpu");
+            auto op_cpu = std::static_pointer_cast<OperatorTensor>(myConvCPU -> getOperator());
+            op_cpu->setDataType(DataType::Float32);
+            op_cpu->setBackend("cpu");
+
+            float* array0 = new float[nb_elements];
+            float* weights = new float[weightSize];
+            float* bias = new float[outChannels];
+
+            for (std::size_t i = 0; i < nb_elements; ++i) {
+                array0[i] = valueDist(gen);
+            }
+            for (std::size_t i = 0; i < weightSize; ++i) {
+                weights[i] = valueDist(gen);
+            }
+            for (std::size_t i = 0; i < outChannels; ++i) {
+                bias[i] = valueDist(gen);
+            }
+
+            // input0 CUDA
+            float* array0_d, *weight_d, *bias_d;
+            std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>();
+            T0_cuda->setDataType(DataType::Float32);
+            T0_cuda->setBackend("cuda");
+            T0_cuda->resize(dims);
+            op_cuda->associateInput(0, T0_cuda);
+            cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements);
+            cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements, cudaMemcpyHostToDevice);
+            T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements);
+
+            // input0 CPU
+            std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>();
+            op_cpu->associateInput(0,T0_cpu);
+            T0_cpu->setDataType(DataType::Float32);
+            T0_cpu->setBackend("cpu");
+            T0_cpu->resize(dims);
+            T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements);
+
+            // weight CUDA
+            std::shared_ptr<Tensor> Tw_cuda = std::make_shared<Tensor>();
+            Tw_cuda->setDataType(DataType::Float32);
+            Tw_cuda->setBackend("cuda");
+            Tw_cuda->resize(dimsW);
+            op_cuda->associateInput(1, Tw_cuda);
+            cudaMalloc(reinterpret_cast<void **>(&weight_d), sizeof(float) * weightSize);
+            cudaMemcpy(weight_d, weights, sizeof(float) * weightSize, cudaMemcpyHostToDevice);
+            Tw_cuda->getImpl()->setRawPtr(weight_d, weightSize);
+
+            // weight CPU
+            std::shared_ptr<Tensor> Tw_cpu = std::make_shared<Tensor>();
+            op_cpu->associateInput(1,Tw_cpu);
+            Tw_cpu->setDataType(DataType::Float32);
+            Tw_cpu->setBackend("cpu");
+            Tw_cpu->resize(dimsW);
+            Tw_cpu -> getImpl() -> setRawPtr(weights, weightSize);
+
+            // bias CUDA
+            std::shared_ptr<Tensor> Tb_cuda = std::make_shared<Tensor>();
+            Tb_cuda->setDataType(DataType::Float32);
+            Tb_cuda->setBackend("cuda");
+            Tb_cuda->resize({outChannels});
+            op_cuda->associateInput(2, Tb_cuda);
+            cudaMalloc(reinterpret_cast<void **>(&bias_d), sizeof(float) * outChannels);
+            cudaMemcpy(bias_d, bias, sizeof(float) * outChannels, cudaMemcpyHostToDevice);
+            Tb_cuda->getImpl()->setRawPtr(bias_d, outChannels);
+
+            // bias CPU
+            std::shared_ptr<Tensor> Tb_cpu = std::make_shared<Tensor>();
+            op_cpu->associateInput(2,Tb_cpu);
+            Tb_cpu->setDataType(DataType::Float32);
+            Tb_cpu->setBackend("cpu");
+            Tb_cpu->resize({outChannels});
+            Tb_cpu -> getImpl() -> setRawPtr(bias, outChannels);
+
+            // forward CUDA
+            op_cuda->setDataType(DataType::Float32);
+            op_cuda->setBackend("cuda");
+            start = std::chrono::system_clock::now();
+            op_cuda->forward();
+            end = std::chrono::system_clock::now();
+            duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+
+            const std::size_t outSize =  op_cuda->getOutput(0)->size();
+            float *computed_cuda = new float[outSize]();
+            cudaMemcpy(computed_cuda, op_cuda->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * outSize, cudaMemcpyDeviceToHost);
+
+            // forward CPU
+            op_cpu->forward();
+            float *computed_cpu = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr());
+            // Compare every element of the CUDA output with the CPU reference, not only the first one
+            for (std::size_t i = 0; i < outSize; ++i) {
+                REQUIRE(approxEq<float>(computed_cuda[i], computed_cpu[i]));
+            }
+
+            delete[] array0;
+            delete[] weights;
+            delete[] bias;
+            delete[] computed_cuda;
+            cudaFree(array0_d);
+            cudaFree(weight_d);
+            cudaFree(bias_d);
+        }
+        std::cout << "total time: " << duration.count() << "μs" << std::endl;
+    }
+
 }
diff --git a/unit_tests/Test_FCImpl.cpp b/unit_tests/Test_FCImpl.cpp
index 0126755d08727597b00823b2055300e7b15accb3..472fd273b1b5eff49e0d05ebd499afdb1435770c 100644
--- a/unit_tests/Test_FCImpl.cpp
+++ b/unit_tests/Test_FCImpl.cpp
@@ -10,122 +10,342 @@
  ********************************************************************************/
 
 #include <array>
+#include <chrono>   // std::chrono::time_point, std::chrono::system_clock
+#include <iostream> // std::cout
+#include <numeric>  // std::accumulate
+#include <random>   // std::random_device, std::mt19937, std::uniform_real_distribution
 
 #include <catch2/catch_test_macros.hpp>
 
-#include "Test_cuda.hpp"
-
-#include "aidge/data/Tensor.hpp"
-
 #include "aidge/backend/cpu.hpp"
 #include "aidge/backend/cuda.hpp"
+#include "aidge/data/Tensor.hpp"
+#include "aidge/utils/TensorUtils.hpp"
 
 using namespace Aidge;
 
 TEST_CASE("[gpu/operator] FC(forward)", "[FC][GPU]") {
-    std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array2D<float, 5, 75>{
-            {{1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,
-              5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,
-              9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
-              13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15},
-             {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,
-              5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,
-              9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
-              13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15},
-             {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,
-              5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,
-              9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
-              13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15},
-             {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,
-              5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,
-              9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
-              13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15},
-             {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,
-              5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,
-              9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
-              13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15}}});
-    std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float, 5>{{1, 2, 3, 4, 5}});
-    std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float, 2, 5>{
-            {{23601, 23602, 23603, 23604, 23605}, {68601, 68602, 68603, 68604, 68605}}});
-    myWeights->setBackend("cuda");
-    myBias->setBackend("cuda");
-    std::shared_ptr<Node> myFC = FC(75, 5, false, "myfc");
-    auto op = std::static_pointer_cast<OperatorTensor>(myFC -> getOperator());
-    op -> associateInput(1, myWeights);
-    op -> associateInput(2, myBias);
-    SECTION("2D input") {
-        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array2D<float, 2, 75>{
-                {{0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16, 17, 18,
-                  19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
-                  38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56,
-                  57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74},
-                 {75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,
-                  90,  91,  92,  93,  94,  95,  96,  97,  98,  99,  100, 101, 102, 103, 104,
-                  105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
-                  120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
-                  135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149}}});
-        myInput->setBackend("cuda");
-        op->associateInput(0, myInput);
-        op -> setDataType(DataType::Float32);
-        op -> setBackend("cuda");
-        myFC->forward();
+    SECTION("Static Input") {
+        std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array2D<float, 5, 75>{
+                {{1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,
+                5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,
+                9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
+                13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15},
+                {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,
+                5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,
+                9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
+                13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15},
+                {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,
+                5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,
+                9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
+                13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15},
+                {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,
+                5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,
+                9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
+                13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15},
+                {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,
+                5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,
+                9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
+                13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15}}});
+        std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float, 5>{{1, 2, 3, 4, 5}});
+        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float, 2, 5>{
+                {{23601, 23602, 23603, 23604, 23605}, {68601, 68602, 68603, 68604, 68605}}});
+        myWeights->setBackend("cuda");
+        myBias->setBackend("cuda");
+        std::shared_ptr<Node> myFC = FC(75, 5, false, "myfc");
+        auto op = std::static_pointer_cast<OperatorTensor>(myFC -> getOperator());
+        op -> associateInput(1, myWeights);
+        op -> associateInput(2, myBias);
+        SECTION("2D input") {
+            std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array2D<float, 2, 75>{
+                    {{0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16, 17, 18,
+                    19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
+                    38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56,
+                    57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74},
+                    {75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,
+                    90,  91,  92,  93,  94,  95,  96,  97,  98,  99,  100, 101, 102, 103, 104,
+                    105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+                    120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+                    135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149}}});
+            myInput->setBackend("cuda");
+            op->associateInput(0, myInput);
+            op -> setDataType(DataType::Float32);
+            op -> setBackend("cuda");
+            myFC->forward();
+
+            float* computedOutput   = new float[myOutput->size()]();
+            cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
 
-        float* computedOutput   = new float[myOutput->size()]();
-        cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
+            for(int i = 0; i < myOutput->size(); i++){
+                const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
+                REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
+            }
 
-        for(int i = 0; i < myOutput->size(); i++){
-            const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
-            REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
+            delete[] computedOutput;
         }
+        SECTION("4D input") {
+            std::shared_ptr<Tensor> myInput =
+                    std::make_shared<Tensor>(Array4D<float, 2, 3, 5, 5>{{{{{0, 1, 2, 3, 4},
+                                                                        {5, 6, 7, 8, 9},
+                                                                        {10, 11, 12, 13, 14},
+                                                                        {15, 16, 17, 18, 19},
+                                                                        {20, 21, 22, 23, 24}},
+                                                                        {{25, 26, 27, 28, 29},
+                                                                        {30, 31, 32, 33, 34},
+                                                                        {35, 36, 37, 38, 39},
+                                                                        {40, 41, 42, 43, 44},
+                                                                        {45, 46, 47, 48, 49}},
+                                                                        {{50, 51, 52, 53, 54},
+                                                                        {55, 56, 57, 58, 59},
+                                                                        {60, 61, 62, 63, 64},
+                                                                        {65, 66, 67, 68, 69},
+                                                                        {70, 71, 72, 73, 74}}},
+                                                                    {{{75, 76, 77, 78, 79},
+                                                                        {80, 81, 82, 83, 84},
+                                                                        {85, 86, 87, 88, 89},
+                                                                        {90, 91, 92, 93, 94},
+                                                                        {95, 96, 97, 98, 99}},
+                                                                        {{100, 101, 102, 103, 104},
+                                                                        {105, 106, 107, 108, 109},
+                                                                        {110, 111, 112, 113, 114},
+                                                                        {115, 116, 117, 118, 119},
+                                                                        {120, 121, 122, 123, 124}},
+                                                                        {{125, 126, 127, 128, 129},
+                                                                        {130, 131, 132, 133, 134},
+                                                                        {135, 136, 137, 138, 139},
+                                                                        {140, 141, 142, 143, 144},
+                                                                        {145, 146, 147, 148, 149}}}}});
+            myInput->setBackend("cuda");
+            op->associateInput(0, myInput);
+            op -> setDataType(DataType::Float32);
+            op -> setBackend("cuda");
+            myFC->forward();
+
+            float* computedOutput   = new float[myOutput->size()]();
+            cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
 
-        delete[] computedOutput;
+            for(int i = 0; i < myOutput->size(); i++){
+                const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
+                REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
+            }
+
+            delete[] computedOutput;
+        }
     }
-    SECTION("4D input") {
-        std::shared_ptr<Tensor> myInput =
-                std::make_shared<Tensor>(Array4D<float, 2, 3, 5, 5>{{{{{0, 1, 2, 3, 4},
-                                                                     {5, 6, 7, 8, 9},
-                                                                     {10, 11, 12, 13, 14},
-                                                                     {15, 16, 17, 18, 19},
-                                                                     {20, 21, 22, 23, 24}},
-                                                                    {{25, 26, 27, 28, 29},
-                                                                     {30, 31, 32, 33, 34},
-                                                                     {35, 36, 37, 38, 39},
-                                                                     {40, 41, 42, 43, 44},
-                                                                     {45, 46, 47, 48, 49}},
-                                                                    {{50, 51, 52, 53, 54},
-                                                                     {55, 56, 57, 58, 59},
-                                                                     {60, 61, 62, 63, 64},
-                                                                     {65, 66, 67, 68, 69},
-                                                                     {70, 71, 72, 73, 74}}},
-                                                                   {{{75, 76, 77, 78, 79},
-                                                                     {80, 81, 82, 83, 84},
-                                                                     {85, 86, 87, 88, 89},
-                                                                     {90, 91, 92, 93, 94},
-                                                                     {95, 96, 97, 98, 99}},
-                                                                    {{100, 101, 102, 103, 104},
-                                                                     {105, 106, 107, 108, 109},
-                                                                     {110, 111, 112, 113, 114},
-                                                                     {115, 116, 117, 118, 119},
-                                                                     {120, 121, 122, 123, 124}},
-                                                                    {{125, 126, 127, 128, 129},
-                                                                     {130, 131, 132, 133, 134},
-                                                                     {135, 136, 137, 138, 139},
-                                                                     {140, 141, 142, 143, 144},
-                                                                     {145, 146, 147, 148, 149}}}}});
+
+    SECTION("Random Input"){
+        constexpr std::uint16_t NBTRIALS = 10;
+        // Create a random number generator
+        std::random_device rd;
+        std::mt19937 gen(rd());
+        std::uniform_real_distribution<float> valueDist(
+            0.1f, 1.1f); // Random float distribution between 0.1 and 1.1
+        std::uniform_int_distribution<std::size_t> dimSizeDist(1, std::size_t(10));
+
+        // To measure execution time of 'forward()'
+        std::chrono::time_point<std::chrono::system_clock> start;
+        std::chrono::time_point<std::chrono::system_clock> end;
+        std::chrono::duration<double, std::micro> duration{};
+        for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial)
+        {
+            const std::size_t nbDims = 4;
+            std::vector<std::size_t> dims;
+            for (std::size_t i = 0; i < nbDims; ++i) {
+                dims.push_back(dimSizeDist(gen));
+            }
+            const std::size_t outChannels = dimSizeDist(gen);
+
+            const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
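+            // FC flattens every dimension except the batch one: the effective number of
+            // input channels is dims[1] * dims[2] * dims[3]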
+            const std::size_t inChannels = nb_elements / dims[0];
+
+            const std::vector<std::size_t> dimsW{outChannels, inChannels};
+            const std::size_t weightSize = outChannels * inChannels;
+
+            // Create FC Operator CUDA
+            std::shared_ptr<Node> myFCCUDA = FC(inChannels, outChannels, false, "myfccuda");
+            auto op_cuda = std::static_pointer_cast<OperatorTensor>(myFCCUDA -> getOperator());
+            op_cuda->setDataType(DataType::Float32);
+            op_cuda->setBackend("cuda");
+
+            // Create FC Operator CPU
+            std::shared_ptr<Node> myFCCPU = FC(inChannels, outChannels, false, "myfccpu");
+            auto op_cpu = std::static_pointer_cast<OperatorTensor>(myFCCPU -> getOperator());
+            op_cpu->setDataType(DataType::Float32);
+            op_cpu->setBackend("cpu");
+
+
+            float* array0 = new float[nb_elements];
+            float* weights = new float[weightSize];
+            float* bias = new float[outChannels];
+
+            for (std::size_t i = 0; i < nb_elements; ++i) {
+                array0[i] = valueDist(gen);
+            }
+            for (std::size_t i = 0; i < weightSize; ++i) {
+                weights[i] = valueDist(gen);
+            }
+            for (std::size_t i = 0; i < outChannels; ++i) {
+                bias[i] = valueDist(gen);
+            }
+
+            // input0 CUDA
+            float* array0_d, *weight_d, *bias_d;
+            std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>();
+            T0_cuda->setDataType(DataType::Float32);
+            T0_cuda->setBackend("cuda");
+            T0_cuda->resize(dims);
+            op_cuda->associateInput(0, T0_cuda);
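+            // allocate a device buffer, copy the host data into it, and hand the raw
+            // pointer to the tensor implementation (setRawPtr neither copies nor takes
+            // ownership, hence the explicit cudaFree at the end of the trial)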
+            cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements);
+            cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements, cudaMemcpyHostToDevice);
+            T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements);
+
+            // input0 CPU
+            std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>();
+            op_cpu->associateInput(0,T0_cpu);
+            T0_cpu->setDataType(DataType::Float32);
+            T0_cpu->setBackend("cpu");
+            T0_cpu->resize(dims);
+            T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements);
+
+            // weight CUDA
+            std::shared_ptr<Tensor> Tw_cuda = std::make_shared<Tensor>();
+            Tw_cuda->setDataType(DataType::Float32);
+            Tw_cuda->setBackend("cuda");
+            Tw_cuda->resize(dimsW);
+            op_cuda->associateInput(1, Tw_cuda);
+            cudaMalloc(reinterpret_cast<void **>(&weight_d), sizeof(float) * weightSize);
+            cudaMemcpy(weight_d, weights, sizeof(float) * weightSize, cudaMemcpyHostToDevice);
+            Tw_cuda->getImpl()->setRawPtr(weight_d, weightSize);
+
+            // weight CPU
+            std::shared_ptr<Tensor> Tw_cpu = std::make_shared<Tensor>();
+            op_cpu->associateInput(1,Tw_cpu);
+            Tw_cpu->setDataType(DataType::Float32);
+            Tw_cpu->setBackend("cpu");
+            Tw_cpu->resize(dimsW);
+            Tw_cpu -> getImpl() -> setRawPtr(weights, weightSize);
+
+            // bias CUDA
+            std::shared_ptr<Tensor> Tb_cuda = std::make_shared<Tensor>();
+            Tb_cuda->setDataType(DataType::Float32);
+            Tb_cuda->setBackend("cuda");
+            Tb_cuda->resize({outChannels});
+            op_cuda->associateInput(2, Tb_cuda);
+            cudaMalloc(reinterpret_cast<void **>(&bias_d), sizeof(float) * outChannels);
+            cudaMemcpy(bias_d, bias, sizeof(float) * outChannels, cudaMemcpyHostToDevice);
+            Tb_cuda->getImpl()->setRawPtr(bias_d, outChannels);
+
+            // bias CPU
+            std::shared_ptr<Tensor> Tb_cpu = std::make_shared<Tensor>();
+            op_cpu->associateInput(2,Tb_cpu);
+            Tb_cpu->setDataType(DataType::Float32);
+            Tb_cpu->setBackend("cpu");
+            Tb_cpu->resize({outChannels});
+            Tb_cpu -> getImpl() -> setRawPtr(bias, outChannels);
+
+            // forward CUDA
+            start = std::chrono::system_clock::now();
+            op_cuda->forward();
+            end = std::chrono::system_clock::now();
+            duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+
+            const std::size_t outSize =  op_cuda->getOutput(0)->size();
+            float *computed_cuda = new float[outSize]();
+            cudaMemcpy(computed_cuda, op_cuda->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * outSize, cudaMemcpyDeviceToHost);
+
+            // forward CPU
+            op_cpu->forward();
+            float *computed_cpu = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr());
+            // compare every element of the CUDA output against the CPU reference
+            for (std::size_t i = 0; i < outSize; ++i) {
+                REQUIRE(approxEq<float>(computed_cuda[i], computed_cpu[i]));
+            }
+
+            delete[] array0;
+            delete[] weights;
+            delete[] bias;
+            delete[] computed_cuda;
+            cudaFree(array0_d);
+            cudaFree(weight_d);
+            cudaFree(bias_d);
+        }
+        std::cout << "total time: " << duration.count() << "μs" << std::endl;
+    }
+}
+
+TEST_CASE("[gpu/operator] FC(backward)", "[FC][GPU]") {
+    SECTION("2D input") {
+        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array2D<float, 2, 3>{
+            {
+                {0.1, 0.2, 0.3},
+                {0.4, 0.5, 0.6}
+        }});
         myInput->setBackend("cuda");
+        std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array2D<float, 2, 3>{
+                {{0.1, 0.2, 0.3},
+                {0.4, 0.5, 0.6}}});
+
+        std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float, 2>{{0.1, 0.2}});
+        myWeights->setBackend("cuda");
+        myBias->setBackend("cuda");
+        std::shared_ptr<Node> myFC = FC(3, 2, false, "myfc");
+        auto op = std::static_pointer_cast<OperatorTensor>(myFC -> getOperator());
+
         op->associateInput(0, myInput);
+        op -> associateInput(1, myWeights);
+        op -> associateInput(2, myBias);
         op -> setDataType(DataType::Float32);
         op -> setBackend("cuda");
         myFC->forward();
 
-        float* computedOutput   = new float[myOutput->size()]();
-        cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
+        // Run and test backward operation
+        std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array2D<float, 2, 2> {
+            {
+                {0.1, 0.2},
+                {0.3, 0.4}
+            }
+        });
+        myOutputGrad->setBackend("cuda");
+        std::shared_ptr<Tensor> predictedOutput = op->getOutput(0);
+        std::shared_ptr<Tensor> input = op->getInput(0);
+        predictedOutput->setGrad(myOutputGrad);
+        REQUIRE_NOTHROW(myFC->backward());
+
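+        // expected gradients, computed by hand from myInput (X), myWeights (W) and
+        // myOutputGrad (dY): dX = dY.W, dW = dY^T.X, dB = column-wise sum of dY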
+        std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>(Array2D<float,2,3> {
+            {
+                {0.09, 0.12, 0.15},
+                {0.19, 0.26, 0.33}
+            }
+        });
+        std::shared_ptr<Tensor> expectedBiasGrad = std::make_shared<Tensor>(Array1D<float,2> {
+            {0.4, 0.6}
+        });
+        std::shared_ptr<Tensor> expectedWeightsGrad = std::make_shared<Tensor>(Array2D<float,2,3> {
+            {
+                {0.13, 0.17, 0.21},
+                {0.18, 0.24, 0.3 }
+            }
+        });
+        float *computedGradCuda = new float[expectedInputGrad->size()]();
+        cudaMemcpy(computedGradCuda, input->grad()->getImpl()->rawPtr(), sizeof(float) * expectedInputGrad->size(), cudaMemcpyDeviceToHost);
+        float *computedGradWCuda = new float[expectedWeightsGrad->size()]();
+        cudaMemcpy(computedGradWCuda, op->getInput(1)->grad()->getImpl()->rawPtr(), sizeof(float) * expectedWeightsGrad->size(), cudaMemcpyDeviceToHost);
+        float *computedGradBCuda = new float[expectedBiasGrad->size()]();
+        cudaMemcpy(computedGradBCuda, op->getInput(2)->grad()->getImpl()->rawPtr(), sizeof(float) * expectedBiasGrad->size(), cudaMemcpyDeviceToHost);
 
-        for(int i = 0; i < myOutput->size(); i++){
-            const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
-            REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
+        for(int i = 0; i < expectedInputGrad->size(); i++){
+            const float targetOutput = *(static_cast<float*>(expectedInputGrad->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedGradCuda[i] - targetOutput) < 1e-6);
         }
+        for(int i = 0; i < expectedBiasGrad->size(); i++){
+            const float targetOutput = *(static_cast<float*>(expectedBiasGrad->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedGradBCuda[i] - targetOutput) < 1e-6);
+        }
+        for(int i = 0; i < expectedWeightsGrad->size(); i++){
+            const float targetOutput = *(static_cast<float*>(expectedWeightsGrad->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedGradWCuda[i] - targetOutput) < 1e-6);
+        }
 
-        delete[] computedOutput;
+        delete[] computedGradCuda;
+        delete[] computedGradWCuda;
+        delete[] computedGradBCuda;
     }
 }
\ No newline at end of file
diff --git a/unit_tests/Test_GlobalAveragePoolingImpl.cpp b/unit_tests/Test_GlobalAveragePoolingImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0a0f22ab60ced3a3f7648ce798484f72bd67839a
--- /dev/null
+++ b/unit_tests/Test_GlobalAveragePoolingImpl.cpp
@@ -0,0 +1,172 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <array>
+#include <chrono>   // std::chrono::system_clock, std::chrono::duration
+#include <cmath>    // fabs
+#include <iostream> // std::cout
+#include <memory>   // std::shared_ptr
+#include <numeric>  // std::accumulate
+#include <random>   // std::random_device, std::mt19937, std::uniform_real_distribution
+
+#include <catch2/catch_test_macros.hpp>
+
+#include "aidge/backend/cpu.hpp"
+#include "aidge/backend/cuda.hpp"
+#include "aidge/data/Tensor.hpp"
+#include "aidge/utils/TensorUtils.hpp"
+namespace Aidge {
+TEST_CASE("[gpu/operator] GlobalAveragePooling",
+          "[GlobalAveragePooling][GPU]") {
+
+    SECTION("4D-Tensor")
+    {
+      std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,1,3,4,4> { //NCHW
+        {
+            {
+                {{0, 1, 2, 3},
+                 {4, 5, 6, 7},
+                 {8, 9, 10, 11},
+                 {12, 13, 14, 15}},
+
+                {{16, 17, 18, 19},
+                 {20, 21, 22, 23},
+                 {24, 25, 26, 27},
+                 {28, 29, 30, 31}},
+
+                {{32, 33, 34, 35},
+                 {36, 37, 38, 39},
+                 {40, 41, 42, 43},
+                 {44, 45, 46, 47}}
+            }
+        }
+     });
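+    // expected output: the mean of each HxW plane, e.g. (0 + 1 + ... + 15) / 16 = 7.5
+    // for the first channel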
+    std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,1,3,1,1> {
+        {
+          {
+            {{ 7.5 }},
+            {{ 23.5 }},
+            {{ 39.5 }}
+          }
+        }
+      });
+      myInput->setBackend("cuda");
+      myInput->setDataType(DataType::Float32);
+      // Create MyGlobalAveragePooling Operator
+      std::shared_ptr<Node> globAvgPool = GlobalAveragePooling();
+      auto op = std::static_pointer_cast<OperatorTensor>(globAvgPool->getOperator());
+      op->setDataType(DataType::Float32);
+      op->setBackend("cuda");
+      op->associateInput(0, myInput);
+
+      globAvgPool->forward();
+      float* computedOutput   = new float[myOutput->size()]();
+      cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
+
+      for(int i = 0; i < myOutput->size(); i++){
+          const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
+          REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
+      }
+
+      delete[] computedOutput;
+    }
+
+    SECTION("Random Input") {
+        constexpr std::uint16_t NBTRIALS = 10;
+        // Create a random number generator
+        std::random_device rd;
+        std::mt19937 gen(rd());
+        std::uniform_real_distribution<float> valueDist(
+            0.1f, 1.1f); // Random float distribution between 0.1 and 1.1
+        std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1),
+                                                               std::size_t(10));
+
+        // To measure execution time of 'GlobalAveragePooling_Op::forward()'
+        std::chrono::time_point<std::chrono::system_clock> start;
+        std::chrono::time_point<std::chrono::system_clock> end;
+        std::chrono::duration<double, std::micro> duration{};
+        std::size_t number_of_operation = 0;
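+        // number_of_operation accumulates the processed element count so that a rough
+        // throughput (elements per microsecond) can be reported after the trials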
+        for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial)
+        {
+            // Create GlobalAveragePooling Operator CUDA
+            std::shared_ptr<Node> myGAvgPoolCuda = GlobalAveragePooling();
+            auto op_cuda = std::static_pointer_cast<OperatorTensor>(myGAvgPoolCuda->getOperator());
+            op_cuda->setDataType(DataType::Float32);
+            op_cuda->setBackend("cuda");
+
+            // Create GlobalAveragePooling Operator CPU
+            std::shared_ptr<Node> myGAvgPoolCpu = GlobalAveragePooling();
+            auto op_cpu = std::static_pointer_cast<OperatorTensor>(myGAvgPoolCpu->getOperator());
+            op_cpu->setDataType(DataType::Float32);
+            op_cpu->setBackend("cpu");
+
+            // generate a random Tensor
+            const std::size_t nbDims = 4;
+            std::vector<std::size_t> dims;
+            for (std::size_t i = 0; i < nbDims; ++i)
+            {
+                dims.push_back(dimSizeDist(gen));
+            }
+
+            const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+            number_of_operation += nb_elements;
+
+            // Fill input tensor
+            float *array0 = new float[nb_elements];
+            for (std::size_t i = 0; i < nb_elements; ++i)
+            {
+                array0[i] = valueDist(gen);
+            }
+
+            // input0 CUDA
+            float* array0_d;
+            std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>();
+            T0_cuda->setDataType(DataType::Float32);
+            T0_cuda->setBackend("cuda");
+            T0_cuda->resize(dims);
+            op_cuda->associateInput(0, T0_cuda);
+            cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements);
+            cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements, cudaMemcpyHostToDevice);
+            T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements);
+
+            // input0 CPU
+            std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>();
+            op_cpu->associateInput(0,T0_cpu);
+            T0_cpu->setDataType(DataType::Float32);
+            T0_cpu->setBackend("cpu");
+            T0_cpu->resize(dims);
+            T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements);
+
+            // Run inference
+            start = std::chrono::system_clock::now();
+            op_cuda->forward();
+            end = std::chrono::system_clock::now();
+            duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+
+            const std::size_t outSize =  op_cuda->getOutput(0)->size();
+            float *computed_cuda = new float[outSize]();
+            cudaMemcpy(computed_cuda, op_cuda->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * outSize, cudaMemcpyDeviceToHost);
+
+            // forward CPU
+            op_cpu->forward();
+            float *computed_cpu = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr());
+            // compare every element of the CUDA output against the CPU reference
+            for (std::size_t i = 0; i < outSize; ++i) {
+                REQUIRE(approxEq<float>(computed_cuda[i], computed_cpu[i]));
+            }
+
+            delete[] computed_cuda;
+            delete[] array0;
+            cudaFree(array0_d);
+        }
+        std::cout << "number of elements over time spent: " << (number_of_operation / duration.count()) << std::endl;
+        std::cout << "total time: " << duration.count() << "μs" << std::endl;
+    }
+}
+} // namespace Aidge
diff --git a/unit_tests/Test_MaxPoolingImpl.cpp b/unit_tests/Test_MaxPoolingImpl.cpp
index bc2efdd447363044dc02fab06964909756a8e2d1..99850a0715cf8feb3164d58c410a1ef689feece1 100644
--- a/unit_tests/Test_MaxPoolingImpl.cpp
+++ b/unit_tests/Test_MaxPoolingImpl.cpp
@@ -10,20 +10,20 @@
  ********************************************************************************/
 
 #include <array>
+#include <chrono>   // std::chrono::system_clock, std::chrono::duration
+#include <cmath>    // fabs
+#include <iostream> // std::cout
+#include <numeric>  // std::accumulate
+#include <random>   // std::random_device, std::mt19937, std::uniform_real_distribution
 
 #include <catch2/catch_test_macros.hpp>
 
-#include "Test_cuda.hpp"
-
-#include "aidge/data/Tensor.hpp"
-
 #include "aidge/backend/cpu.hpp"
 #include "aidge/backend/cuda.hpp"
+#include "aidge/data/Tensor.hpp"
+#include "aidge/utils/TensorUtils.hpp"
 
 using namespace Aidge;
 
 
-TEST_CASE("[cpu/operator] MaxPooling(forward)", "[MaxPooling][CPU]") {
+TEST_CASE("[gpu/operator] MaxPooling(forward)", "[MaxPooling][GPU]") {
     std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,2,5,5> { //NCHW
         {
             {
@@ -89,4 +89,95 @@ TEST_CASE("[cpu/operator] MaxPooling(forward)", "[MaxPooling][CPU]") {
 
         delete[] computedOutput;
     }
+
+    SECTION("Random Input") {
+        constexpr std::uint16_t NBTRIALS = 10;
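+        // use 3x3 non-overlapping pooling windows (stride equals the kernel size)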
+        std::size_t kernel = 3;
+        std::size_t stride = 3;
+        // Create a random number generator
+        std::random_device rd;
+        std::mt19937 gen(rd());
+        std::uniform_real_distribution<float> valueDist(
+            0.1f, 1.1f); // Random float distribution between 0.1 and 1.1
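+        // every dim is drawn in [kernel, 10] so the spatial dims always hold at least one window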
+        std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(kernel),
+                                                               std::size_t(10));
+
+        // To measure execution time of 'forward()'
+        std::chrono::time_point<std::chrono::system_clock> start;
+        std::chrono::time_point<std::chrono::system_clock> end;
+        std::chrono::duration<double, std::micro> duration{};
+        std::size_t number_of_operation = 0;
+        for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial)
+        {
+            // Create MaxPooling Operator CUDA
+            std::shared_ptr<Node> myMaxPoolCuda = MaxPooling({kernel, kernel}, "myMaxPoolCuda", {stride, stride});
+            auto op_cuda = std::static_pointer_cast<OperatorTensor>(myMaxPoolCuda->getOperator());
+            op_cuda->setDataType(DataType::Float32);
+            op_cuda->setBackend("cuda");
+
+            // Create MaxPooling Operator CPU
+            std::shared_ptr<Node> myMaxPoolCpu = MaxPooling({kernel, kernel}, "myMaxPoolCpu", {stride, stride});
+            auto op_cpu = std::static_pointer_cast<OperatorTensor>(myMaxPoolCpu->getOperator());
+            op_cpu->setDataType(DataType::Float32);
+            op_cpu->setBackend("cpu");
+
+            // generate a random Tensor
+            const std::size_t nbDims = 4;
+            std::vector<std::size_t> dims;
+            for (std::size_t i = 0; i < nbDims; ++i)
+            {
+                dims.push_back(dimSizeDist(gen));
+            }
+
+            const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+            number_of_operation += nb_elements;
+
+            // Fill input tensor
+            float *array0 = new float[nb_elements];
+            for (std::size_t i = 0; i < nb_elements; ++i)
+            {
+                array0[i] = valueDist(gen);
+            }
+
+            // input0 CUDA
+            float* array0_d;
+            std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>();
+            T0_cuda->setDataType(DataType::Float32);
+            T0_cuda->setBackend("cuda");
+            T0_cuda->resize(dims);
+            op_cuda->associateInput(0, T0_cuda);
+            cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements);
+            cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements, cudaMemcpyHostToDevice);
+            T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements);
+
+            // input0 CPU
+            std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>();
+            op_cpu->associateInput(0,T0_cpu);
+            T0_cpu->setDataType(DataType::Float32);
+            T0_cpu->setBackend("cpu");
+            T0_cpu->resize(dims);
+            T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements);
+
+            // Run inference
+            start = std::chrono::system_clock::now();
+            op_cuda->forward();
+            end = std::chrono::system_clock::now();
+            duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+
+            const std::size_t outSize =  op_cuda->getOutput(0)->size();
+            float *computed_cuda = new float[outSize]();
+            cudaMemcpy(computed_cuda, op_cuda->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * outSize, cudaMemcpyDeviceToHost);
+
+            // forward CPU
+            op_cpu->forward();
+            float *computed_cpu = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr());
+            // compare every element of the CUDA output against the CPU reference
+            for (std::size_t i = 0; i < outSize; ++i) {
+                REQUIRE(approxEq<float>(computed_cuda[i], computed_cpu[i]));
+            }
+
+            delete[] computed_cuda;
+            delete[] array0;
+            cudaFree(array0_d);
+        }
+        std::cout << "number of elements over time spent: " << (number_of_operation / duration.count()) << std::endl;
+        std::cout << "total time: " << duration.count() << "μs" << std::endl;
+    }
 }
\ No newline at end of file
diff --git a/unit_tests/Test_PadImpl.cpp b/unit_tests/Test_PadImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4e799ea6b7d11c9b446e0e4c8b9d12beae24bb05
--- /dev/null
+++ b/unit_tests/Test_PadImpl.cpp
@@ -0,0 +1,784 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <array>
+#include <chrono>   // std::chrono::system_clock, std::chrono::duration
+#include <cmath>    // fabs
+#include <iostream> // std::cout
+#include <numeric>  // std::accumulate
+#include <random>   // std::random_device, std::mt19937, std::uniform_real_distribution
+
+#include <catch2/catch_test_macros.hpp>
+
+#include "aidge/backend/cpu.hpp"
+#include "aidge/backend/cuda.hpp"
+#include "aidge/data/Tensor.hpp"
+#include "aidge/utils/TensorUtils.hpp"
+
+using namespace Aidge;
+
+TEST_CASE("[gpu/operator] Pad(forward)", "[Pad][GPU]") {
+    SECTION("Symmetric Pad") {
+        const int pv = 0; // pad value
+
+        std::shared_ptr<Node> myPad = Pad<2>({1, 1, 1, 1}, "mypad", PadBorderType::Constant, static_cast<double>(pv));
+        auto op = std::static_pointer_cast<OperatorTensor>(myPad -> getOperator());
+        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,3,5,5> { //NCHW
+            {
+                {
+                    {{  0,   1,   2,   3,   4},
+                    {  5,   6,   7,   8,   9},
+                    { 10,  11,  12,  13,  14},
+                    { 15,  16,  17,  18,  19},
+                    { 20,  21,  22,  23,  24}},
+
+                    {{ 25,  26,  27,  28,  29},
+                    { 30,  31,  32,  33,  34},
+                    { 35,  36,  37,  38,  39},
+                    { 40,  41,  42,  43,  44},
+                    { 45,  46,  47,  48,  49}},
+
+                    {{ 50,  51,  52,  53,  54},
+                    { 55,  56,  57,  58,  59},
+                    { 60,  61,  62,  63,  64},
+                    { 65,  66,  67,  68,  69},
+                    { 70,  71,  72,  73,  74}}
+                },
+                {
+                    {{ 75,  76,  77,  78,  79},
+                    { 80,  81,  82,  83,  84},
+                    { 85,  86,  87,  88,  89},
+                    { 90,  91,  92,  93,  94},
+                    { 95,  96,  97,  98,  99}},
+
+                    {{100, 101, 102, 103, 104},
+                    {105, 106, 107, 108, 109},
+                    {110, 111, 112, 113, 114},
+                    {115, 116, 117, 118, 119},
+                    {120, 121, 122, 123, 124}},
+
+                    {{125, 126, 127, 128, 129},
+                    {130, 131, 132, 133, 134},
+                    {135, 136, 137, 138, 139},
+                    {140, 141, 142, 143, 144},
+                    {145, 146, 147, 148, 149}}
+                }
+            }
+        });
+        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,3,7,7> { //NCHW
+            {
+                {
+                    {{ pv,  pv,   pv,   pv,   pv,   pv,  pv},
+                    { pv,   0,   1,   2,   3,   4,  pv},
+                    { pv,   5,   6,   7,   8,   9,  pv},
+                    { pv,  10,  11,  12,  13,  14,  pv},
+                    { pv,  15,  16,  17,  18,  19,  pv},
+                    { pv,  20,  21,  22,  23,  24,  pv},
+                    { pv,  pv,   pv,   pv,   pv,   pv,  pv}},
+
+                    {{ pv,  pv,   pv,   pv,   pv,   pv,  pv},
+                    { pv,  25,  26,  27,  28,  29,  pv},
+                    { pv,  30,  31,  32,  33,  34,  pv},
+                    { pv,  35,  36,  37,  38,  39,  pv},
+                    { pv,  40,  41,  42,  43,  44,  pv},
+                    { pv,  45,  46,  47,  48,  49,  pv},
+                    { pv,  pv,   pv,   pv,   pv,   pv,  pv}},
+
+                    {{ pv,  pv,   pv,   pv,   pv,   pv,  pv},
+                    { pv,  50,  51,  52,  53,  54,  pv},
+                    { pv,  55,  56,  57,  58,  59,  pv},
+                    { pv,  60,  61,  62,  63,  64,  pv},
+                    { pv,  65,  66,  67,  68,  69,  pv},
+                    { pv,  70,  71,  72,  73,  74,  pv},
+                    { pv,  pv,   pv,   pv,   pv,   pv,  pv}}
+                },
+                {
+                    {{ pv,  pv,   pv,   pv,   pv,   pv,  pv},
+                    { pv,  75,  76,  77,  78,  79,  pv},
+                    { pv,  80,  81,  82,  83,  84,  pv},
+                    { pv,  85,  86,  87,  88,  89,  pv},
+                    { pv,  90,  91,  92,  93,  94,  pv},
+                    { pv,  95,  96,  97,  98,  99,  pv},
+                    { pv,  pv,   pv,   pv,   pv,   pv,  pv}},
+
+                    {{ pv,  pv,   pv,   pv,   pv,   pv,  pv},
+                    {pv,  100, 101, 102, 103, 104,  pv},
+                    {pv,  105, 106, 107, 108, 109,  pv},
+                    {pv,  110, 111, 112, 113, 114,  pv},
+                    {pv,  115, 116, 117, 118, 119,  pv},
+                    {pv,  120, 121, 122, 123, 124,  pv},
+                    { pv,  pv,   pv,   pv,   pv,   pv,  pv}},
+
+                    {{ pv,  pv,   pv,   pv,   pv,   pv,  pv},
+                    {pv,  125, 126, 127, 128, 129,  pv},
+                    {pv,  130, 131, 132, 133, 134,  pv},
+                    {pv,  135, 136, 137, 138, 139,  pv},
+                    {pv,  140, 141, 142, 143, 144,  pv},
+                    {pv,  145, 146, 147, 148, 149,  pv},
+                    { pv,  pv,   pv,   pv,   pv,   pv,  pv}}
+                }
+            }
+        });
+
+        myInput->setBackend("cuda");
+        myPad->getOperator()->associateInput(0,myInput);
+        myPad->getOperator()->setDataType(DataType::Float32);
+        myPad->getOperator()->setBackend("cuda");
+
+        myPad->forward();
+
+        float* computedOutput   = new float[myOutput->size()]();
+        cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
+
+        for(int i = 0; i < myOutput->size(); i++){
+            const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
+        }
+
+        delete[] computedOutput;
+    }
+
+    SECTION("Asymmetric Pad") {
+        const int pv = 0; // pad value
+
+        std::shared_ptr<Node> myPad = Pad<2>({1, 0, 0, 1}, "mypad", PadBorderType::Constant, static_cast<double>(pv));
+        auto op = std::static_pointer_cast<OperatorTensor>(myPad -> getOperator());
+        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,3,5,5> { //NCHW
+            {
+                {
+                    {{  0,   1,   2,   3,   4},
+                    {  5,   6,   7,   8,   9},
+                    { 10,  11,  12,  13,  14},
+                    { 15,  16,  17,  18,  19},
+                    { 20,  21,  22,  23,  24}},
+
+                    {{ 25,  26,  27,  28,  29},
+                    { 30,  31,  32,  33,  34},
+                    { 35,  36,  37,  38,  39},
+                    { 40,  41,  42,  43,  44},
+                    { 45,  46,  47,  48,  49}},
+
+                    {{ 50,  51,  52,  53,  54},
+                    { 55,  56,  57,  58,  59},
+                    { 60,  61,  62,  63,  64},
+                    { 65,  66,  67,  68,  69},
+                    { 70,  71,  72,  73,  74}}
+                },
+                {
+                    {{ 75,  76,  77,  78,  79},
+                    { 80,  81,  82,  83,  84},
+                    { 85,  86,  87,  88,  89},
+                    { 90,  91,  92,  93,  94},
+                    { 95,  96,  97,  98,  99}},
+
+                    {{100, 101, 102, 103, 104},
+                    {105, 106, 107, 108, 109},
+                    {110, 111, 112, 113, 114},
+                    {115, 116, 117, 118, 119},
+                    {120, 121, 122, 123, 124}},
+
+                    {{125, 126, 127, 128, 129},
+                    {130, 131, 132, 133, 134},
+                    {135, 136, 137, 138, 139},
+                    {140, 141, 142, 143, 144},
+                    {145, 146, 147, 148, 149}}
+                }
+            }
+        });
+        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,3,6,6> { //NCHW
+            {
+                {
+                    {{ pv,   pv,   pv,   pv,   pv,  pv},
+                    { 0,   1,   2,   3,   4,  pv},
+                    { 5,   6,   7,   8,   9,  pv},
+                    { 10,  11,  12,  13,  14,  pv},
+                    { 15,  16,  17,  18,  19,  pv},
+                    { 20,  21,  22,  23,  24,  pv}},
+
+                    {{ pv,   pv,   pv,   pv,   pv,  pv},
+                    { 25,  26,  27,  28,  29,  pv},
+                    { 30,  31,  32,  33,  34,  pv},
+                    { 35,  36,  37,  38,  39,  pv},
+                    { 40,  41,  42,  43,  44,  pv},
+                    { 45,  46,  47,  48,  49,  pv}},
+
+                    {{ pv,   pv,   pv,   pv,   pv,  pv},
+                    { 50,  51,  52,  53,  54,  pv},
+                    { 55,  56,  57,  58,  59,  pv},
+                    { 60,  61,  62,  63,  64,  pv},
+                    { 65,  66,  67,  68,  69,  pv},
+                    { 70,  71,  72,  73,  74,  pv}}
+                },
+                {
+                    {{ pv,   pv,   pv,   pv,   pv,  pv},
+                    { 75,  76,  77,  78,  79,  pv},
+                    { 80,  81,  82,  83,  84,  pv},
+                    { 85,  86,  87,  88,  89,  pv},
+                    { 90,  91,  92,  93,  94,  pv},
+                    { 95,  96,  97,  98,  99,  pv}},
+
+                    {{ pv,   pv,   pv,   pv,   pv,  pv},
+                    { 100, 101, 102, 103, 104,  pv},
+                    { 105, 106, 107, 108, 109,  pv},
+                    { 110, 111, 112, 113, 114,  pv},
+                    { 115, 116, 117, 118, 119,  pv},
+                    { 120, 121, 122, 123, 124,  pv}},
+
+                    {{ pv,   pv,   pv,   pv,   pv,  pv},
+                    { 125, 126, 127, 128, 129,  pv},
+                    { 130, 131, 132, 133, 134,  pv},
+                    { 135, 136, 137, 138, 139,  pv},
+                    { 140, 141, 142, 143, 144,  pv},
+                    { 145, 146, 147, 148, 149,  pv}}
+                }
+            }
+        });
+
+        myInput->setBackend("cuda");
+        myPad->getOperator()->associateInput(0,myInput);
+        myPad->getOperator()->setDataType(DataType::Float32);
+        myPad->getOperator()->setBackend("cuda");
+
+        myPad->forward();
+
+        float* computedOutput   = new float[myOutput->size()]();
+        cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
+
+        for(int i = 0; i < myOutput->size(); i++){
+            const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
+        }
+
+        delete[] computedOutput;
+    }
+
+    SECTION("Pad Edge") {
+        std::shared_ptr<Node> myPad = Pad<2>({1, 1, 1, 1}, "mypad", PadBorderType::Edge);
+        auto op = std::static_pointer_cast<OperatorTensor>(myPad -> getOperator());
+        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,3,5,5> { //NCHW
+            {
+                {
+                    {{  0,   1,   2,   3,   4},
+                    {  5,   6,   7,   8,   9},
+                    { 10,  11,  12,  13,  14},
+                    { 15,  16,  17,  18,  19},
+                    { 20,  21,  22,  23,  24}},
+
+                    {{ 25,  26,  27,  28,  29},
+                    { 30,  31,  32,  33,  34},
+                    { 35,  36,  37,  38,  39},
+                    { 40,  41,  42,  43,  44},
+                    { 45,  46,  47,  48,  49}},
+
+                    {{ 50,  51,  52,  53,  54},
+                    { 55,  56,  57,  58,  59},
+                    { 60,  61,  62,  63,  64},
+                    { 65,  66,  67,  68,  69},
+                    { 70,  71,  72,  73,  74}}
+                },
+                {
+                    {{ 75,  76,  77,  78,  79},
+                    { 80,  81,  82,  83,  84},
+                    { 85,  86,  87,  88,  89},
+                    { 90,  91,  92,  93,  94},
+                    { 95,  96,  97,  98,  99}},
+
+                    {{100, 101, 102, 103, 104},
+                    {105, 106, 107, 108, 109},
+                    {110, 111, 112, 113, 114},
+                    {115, 116, 117, 118, 119},
+                    {120, 121, 122, 123, 124}},
+
+                    {{125, 126, 127, 128, 129},
+                    {130, 131, 132, 133, 134},
+                    {135, 136, 137, 138, 139},
+                    {140, 141, 142, 143, 144},
+                    {145, 146, 147, 148, 149}}
+                }
+            }
+        });
+        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,3,7,7> { //NCHW
+            {
+                {
+                    {{ 0,  0,   1,   2,   3,   4,  4},
+                    { 0,   0,   1,   2,   3,   4,  4},
+                    { 5,   5,   6,   7,   8,   9,  9},
+                    { 10,  10,  11,  12,  13,  14,  14},
+                    { 15,  15,  16,  17,  18,  19,  19},
+                    { 20,  20,  21,  22,  23,  24,  24},
+                    { 20,  20,  21,  22,  23,  24,  24}},
+
+                    {{ 25,  25,  26,  27,  28,  29,  29},
+                    { 25,  25,  26,  27,  28,  29,  29},
+                    { 30,  30,  31,  32,  33,  34,  34},
+                    { 35,  35,  36,  37,  38,  39,  39},
+                    { 40,  40,  41,  42,  43,  44,  44},
+                    { 45,  45,  46,  47,  48,  49,  49},
+                    { 45,  45,  46,  47,  48,  49, 49}},
+
+                    {{ 50,  50,  51,  52,  53,  54,  54},
+                    { 50,  50,  51,  52,  53,  54,  54},
+                    { 55,  55,  56,  57,  58,  59,  59},
+                    { 60,  60,  61,  62,  63,  64,  64},
+                    { 65,  65,  66,  67,  68,  69,  69},
+                    { 70,  70,  71,  72,  73,  74,  74},
+                    { 70,  70,  71,  72,  73,  74,  74}}
+                },
+                {
+                    {{ 75,  75,  76,  77,  78,  79,  79},
+                    { 75,  75,  76,  77,  78,  79,  79},
+                    { 80,  80,  81,  82,  83,  84,  84},
+                    { 85,  85,  86,  87,  88,  89,  89},
+                    { 90,  90,  91,  92,  93,  94,  94},
+                    { 95,  95,  96,  97,  98,  99,  99},
+                    { 95,  95,  96,  97,  98,  99,  99}},
+
+                    {{100,  100, 101, 102, 103, 104,  104},
+                    {100,  100, 101, 102, 103, 104,  104},
+                    {105,  105, 106, 107, 108, 109, 109},
+                    {110,  110, 111, 112, 113, 114,  114},
+                    {115,  115, 116, 117, 118, 119,  119},
+                    {120,  120, 121, 122, 123, 124,  124},
+                    {120,  120, 121, 122, 123, 124,  124}},
+
+                    {{125,  125, 126, 127, 128, 129,  129},
+                    {125,  125, 126, 127, 128, 129,  129},
+                    {130,  130, 131, 132, 133, 134,  134},
+                    {135,  135, 136, 137, 138, 139,  139},
+                    {140,  140, 141, 142, 143, 144,  144},
+                    {145,  145, 146, 147, 148, 149,  149},
+                    {145,  145, 146, 147, 148, 149,  149}}
+                }
+            }
+        });
+        myInput->setBackend("cuda");
+        myPad->getOperator()->associateInput(0,myInput);
+        myPad->getOperator()->setDataType(DataType::Float32);
+        myPad->getOperator()->setBackend("cuda");
+
+        myPad->forward();
+
+        float* computedOutput   = new float[myOutput->size()]();
+        cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
+        for(int i = 0; i < myOutput->size(); i++){
+            const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
+        }
+
+        delete[] computedOutput;
+    }
+
+    SECTION("Pad Reflect") {
+        std::shared_ptr<Node> myPad = Pad<2>({1, 1, 1, 1}, "mypad", PadBorderType::Reflect);
+        auto op = std::static_pointer_cast<OperatorTensor>(myPad -> getOperator());
+        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,3,5,5> { //NCHW
+            {
+                {
+                    {{  0,   1,   2,   3,   4},
+                    {  5,   6,   7,   8,   9},
+                    { 10,  11,  12,  13,  14},
+                    { 15,  16,  17,  18,  19},
+                    { 20,  21,  22,  23,  24}},
+
+                    {{ 25,  26,  27,  28,  29},
+                    { 30,  31,  32,  33,  34},
+                    { 35,  36,  37,  38,  39},
+                    { 40,  41,  42,  43,  44},
+                    { 45,  46,  47,  48,  49}},
+
+                    {{ 50,  51,  52,  53,  54},
+                    { 55,  56,  57,  58,  59},
+                    { 60,  61,  62,  63,  64},
+                    { 65,  66,  67,  68,  69},
+                    { 70,  71,  72,  73,  74}}
+                },
+                {
+                    {{ 75,  76,  77,  78,  79},
+                    { 80,  81,  82,  83,  84},
+                    { 85,  86,  87,  88,  89},
+                    { 90,  91,  92,  93,  94},
+                    { 95,  96,  97,  98,  99}},
+
+                    {{100, 101, 102, 103, 104},
+                    {105, 106, 107, 108, 109},
+                    {110, 111, 112, 113, 114},
+                    {115, 116, 117, 118, 119},
+                    {120, 121, 122, 123, 124}},
+
+                    {{125, 126, 127, 128, 129},
+                    {130, 131, 132, 133, 134},
+                    {135, 136, 137, 138, 139},
+                    {140, 141, 142, 143, 144},
+                    {145, 146, 147, 148, 149}}
+                }
+            }
+        });
+        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,3,7,7> { //NCHW
+            {
+                {
+                    {
+                    { 6, 5, 6, 7, 8, 9, 5},
+                    { 1, 0, 1, 2, 3, 4, 0},
+                    { 6, 5, 6, 7, 8, 9, 5},
+                    { 11, 10, 11, 12, 13, 14, 10},
+                    { 16, 15, 16, 17, 18, 19, 15},
+                    { 21, 20, 21, 22, 23, 24, 20},
+                    { 1, 0, 1, 2, 3, 4, 0}
+                    },
+                    {
+                    { 31, 30, 31, 32, 33, 34, 30},
+                    { 26, 25, 26, 27, 28, 29, 25},
+                    { 31, 30, 31, 32, 33, 34, 30},
+                    { 36, 35, 36, 37, 38, 39, 35},
+                    { 41, 40, 41, 42, 43, 44, 40},
+                    { 46, 45, 46, 47, 48, 49, 45},
+                    { 26, 25, 26, 27, 28, 29, 25}
+                    },
+                    {
+                    { 56, 55, 56, 57, 58, 59, 55},
+                    { 51, 50, 51, 52, 53, 54, 50},
+                    { 56, 55, 56, 57, 58, 59, 55},
+                    { 61, 60, 61, 62, 63, 64, 60},
+                    { 66, 65, 66, 67, 68, 69, 65},
+                    { 71, 70, 71, 72, 73, 74, 70},
+                    { 51, 50, 51, 52, 53, 54, 50}
+                    }
+                },
+                {
+                    {
+                    { 81, 80, 81, 82, 83, 84, 80},
+                    { 76, 75, 76, 77, 78, 79, 75},
+                    { 81, 80, 81, 82, 83, 84, 80},
+                    { 86, 85, 86, 87, 88, 89, 85},
+                    { 91, 90, 91, 92, 93, 94, 90},
+                    { 96, 95, 96, 97, 98, 99, 95},
+                    { 76, 75, 76, 77, 78, 79, 75}
+                    },
+                    {
+                    { 106, 105, 106, 107, 108, 109, 105},
+                    { 101, 100, 101, 102, 103, 104, 100},
+                    { 106, 105, 106, 107, 108, 109, 105},
+                    { 111, 110, 111, 112, 113, 114, 110},
+                    { 116, 115, 116, 117, 118, 119, 115},
+                    { 121, 120, 121, 122, 123, 124, 120},
+                    { 101, 100, 101, 102, 103, 104, 100}
+                    },
+                    {
+                    { 131, 130, 131, 132, 133, 134, 130},
+                    { 126, 125, 126, 127, 128, 129, 125},
+                    { 131, 130, 131, 132, 133, 134, 130},
+                    { 136, 135, 136, 137, 138, 139, 135},
+                    { 141, 140, 141, 142, 143, 144, 140},
+                    { 146, 145, 146, 147, 148, 149, 145},
+                    { 126, 125, 126, 127, 128, 129, 125}
+                    }
+                }
+            }
+        });
+        myInput->setBackend("cuda");
+        myPad->getOperator()->associateInput(0,myInput);
+        myPad->getOperator()->setDataType(DataType::Float32);
+        myPad->getOperator()->setBackend("cuda");
+
+        myPad->forward();
+
+        float* computedOutput   = new float[myOutput->size()]();
+        cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
+
+        for(int i = 0; i < myOutput->size(); i++){
+            const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
+        }
+
+        delete[] computedOutput;
+    }
+
+    SECTION("Pad Wrap") {
+        std::shared_ptr<Node> myPad = Pad<2>({1, 1, 1, 1}, "mypad", PadBorderType::Wrap);
+        auto op = std::static_pointer_cast<OperatorTensor>(myPad -> getOperator());
+        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,3,5,5> { //NCHW
+            {
+                {
+                    {{  0,   1,   2,   3,   4},
+                    {  5,   6,   7,   8,   9},
+                    { 10,  11,  12,  13,  14},
+                    { 15,  16,  17,  18,  19},
+                    { 20,  21,  22,  23,  24}},
+
+                    {{ 25,  26,  27,  28,  29},
+                    { 30,  31,  32,  33,  34},
+                    { 35,  36,  37,  38,  39},
+                    { 40,  41,  42,  43,  44},
+                    { 45,  46,  47,  48,  49}},
+
+                    {{ 50,  51,  52,  53,  54},
+                    { 55,  56,  57,  58,  59},
+                    { 60,  61,  62,  63,  64},
+                    { 65,  66,  67,  68,  69},
+                    { 70,  71,  72,  73,  74}}
+                },
+                {
+                    {{ 75,  76,  77,  78,  79},
+                    { 80,  81,  82,  83,  84},
+                    { 85,  86,  87,  88,  89},
+                    { 90,  91,  92,  93,  94},
+                    { 95,  96,  97,  98,  99}},
+
+                    {{100, 101, 102, 103, 104},
+                    {105, 106, 107, 108, 109},
+                    {110, 111, 112, 113, 114},
+                    {115, 116, 117, 118, 119},
+                    {120, 121, 122, 123, 124}},
+
+                    {{125, 126, 127, 128, 129},
+                    {130, 131, 132, 133, 134},
+                    {135, 136, 137, 138, 139},
+                    {140, 141, 142, 143, 144},
+                    {145, 146, 147, 148, 149}}
+                }
+            }
+        });
+        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,3,7,7> { //NCHW
+            {
+                {
+                    {{ 24,  20,  21,  22,  23,  24,  20},
+                    { 4,   0,   1,   2,   3,   4,  0},
+                    { 9,   5,   6,   7,   8,   9,  5},
+                    { 14,  10,  11,  12,  13,  14,  10},
+                    { 19,  15,  16,  17,  18,  19,  15},
+                    { 24,  20,  21,  22,  23,  24,  20},
+                    { 4,   0,   1,   2,   3,   4,  0}},
+
+                    {{ 49,  45,  46,  47,  48,  49, 45},
+                    { 29,  25,  26,  27,  28,  29,  25},
+                    { 34,  30,  31,  32,  33,  34,  30},
+                    { 39,  35,  36,  37,  38,  39,  35},
+                    { 44,  40,  41,  42,  43,  44,  40},
+                    { 49,  45,  46,  47,  48,  49,  45},
+                    { 29,  25,  26,  27,  28,  29,  25}},
+
+                    {{ 74,  70,  71,  72,  73,  74,  70},
+                    { 54,  50,  51,  52,  53,  54,  50},
+                    { 59,  55,  56,  57,  58,  59,  55},
+                    { 64,  60,  61,  62,  63,  64,  60},
+                    { 69,  65,  66,  67,  68,  69,  65},
+                    { 74,  70,  71,  72,  73,  74,  70},
+                    { 54,  50,  51,  52,  53,  54,  50}}
+                },
+                {
+                    {{ 99,  95,  96,  97,  98,  99,  95},
+                    { 79,  75,  76,  77,  78,  79,  75},
+                    { 84,  80,  81,  82,  83,  84,  80},
+                    { 89,  85,  86,  87,  88,  89,  85},
+                    { 94,  90,  91,  92,  93,  94,  90},
+                    { 99,  95,  96,  97,  98,  99,  95},
+                    { 79,  75,  76,  77,  78,  79,  75}},
+
+                    {{124,  120, 121, 122, 123, 124,  120},
+                    {104,  100, 101, 102, 103, 104,  100},
+                    {109,  105, 106, 107, 108, 109, 105},
+                    {114,  110, 111, 112, 113, 114,  110},
+                    {119,  115, 116, 117, 118, 119,  115},
+                    {124,  120, 121, 122, 123, 124,  120},
+                    {104,  100, 101, 102, 103, 104,  100}},
+
+                    {{149,  145, 146, 147, 148, 149,  145},
+                    {129,  125, 126, 127, 128, 129,  125},
+                    {134,  130, 131, 132, 133, 134,  130},
+                    {139,  135, 136, 137, 138, 139,  135},
+                    {144,  140, 141, 142, 143, 144,  140},
+                    {149,  145, 146, 147, 148, 149,  145},
+                    {129,  125, 126, 127, 128, 129,  125}}
+                }
+            }
+        });
+        myInput->setBackend("cuda");
+        myPad->getOperator()->associateInput(0,myInput);
+        myPad->getOperator()->setDataType(DataType::Float32);
+        myPad->getOperator()->setBackend("cuda");
+
+        myPad->forward();
+
+        float* computedOutput   = new float[myOutput->size()]();
+        cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
+
+        for(int i = 0; i < myOutput->size(); i++){
+            const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
+        }
+
+        delete[] computedOutput;
+    }
+    SECTION("Random Input") {
+        constexpr std::uint16_t NBTRIALS = 10;
+        // Create a random number generator
+        std::random_device rd;
+        std::mt19937 gen(rd());
+        std::uniform_real_distribution<float> valueDist(
+            0.1f, 1.1f); // Random float distribution between 0.1 and 1.1
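+        // only border types 0 (Constant) and 1 (Edge) are exercised, see the TODO below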
+        std::uniform_int_distribution<std::size_t> padTypeDist(std::size_t(0), std::size_t(1));
+        // TODO: fix Reflect and Wrap Pad; cpu and gpu only give the same results when padding = 1
+        std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1), std::size_t(10));
+        std::uniform_int_distribution<std::size_t> padSizeDist(std::size_t(0), std::size_t(5));
+
+        // To measure execution time of 'forward()'
+        std::chrono::time_point<std::chrono::system_clock> start;
+        std::chrono::time_point<std::chrono::system_clock> end;
+        std::chrono::duration<double, std::micro> duration{};
+        for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial)
+        {
+            const std::size_t nbDims = 4;
+            std::vector<std::size_t> dims;
+            for (std::size_t i = 0; i < nbDims; ++i) {
+                dims.push_back(dimSizeDist(gen));
+            }
+
+            const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+
+            const std::size_t borderType = padTypeDist(gen);
+            const std::size_t padding = padSizeDist(gen);
+            // Create Pad Operator CUDA
+            std::shared_ptr<Node> myPadCUDA = Pad<2>({padding, padding, padding, padding}, "mypadcuda",  static_cast<PadBorderType>(borderType));
+            auto op_cuda = std::static_pointer_cast<OperatorTensor>(myPadCUDA -> getOperator());
+            op_cuda->setDataType(DataType::Float32);
+            op_cuda->setBackend("cuda");
+
+            // Create Pad Operator CPU
+            std::shared_ptr<Node> myPadCPU = Pad<2>({padding, padding, padding, padding}, "mypadcpu", static_cast<PadBorderType>(borderType));
+            auto op_cpu = std::static_pointer_cast<OperatorTensor>(myPadCPU -> getOperator());
+            op_cpu->setDataType(DataType::Float32);
+            op_cpu->setBackend("cpu");
+
+            float* array0 = new float[nb_elements];
+            for (std::size_t i = 0; i < nb_elements; ++i) {
+                array0[i] = valueDist(gen);
+            }
+
+            // input CUDA
+            float* array0_d;
+            std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>();
+            T0_cuda->setDataType(DataType::Float32);
+            T0_cuda->setBackend("cuda");
+            T0_cuda->resize(dims);
+            op_cuda->associateInput(0, T0_cuda);
+            cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements);
+            cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements, cudaMemcpyHostToDevice);
+            T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements);
+
+            // input CPU
+            std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>();
+            op_cpu->associateInput(0,T0_cpu);
+            T0_cpu->setDataType(DataType::Float32);
+            T0_cpu->setBackend("cpu");
+            T0_cpu->resize(dims);
+            T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements);
+
+            // forward CUDA
+            start = std::chrono::system_clock::now();
+            op_cuda->forward();
+            end = std::chrono::system_clock::now();
+            duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+
+            const std::size_t outSize =  op_cuda->getOutput(0)->size();
+            float *computed_cuda = new float[outSize]();
+            cudaMemcpy(computed_cuda, op_cuda->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * outSize, cudaMemcpyDeviceToHost);
+
+            // forward CPU
+            op_cpu->forward();
+            float *computed_cpu = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr());
+            // compare every element of the CUDA output against the CPU reference
+            for (std::size_t i = 0; i < outSize; ++i) {
+                REQUIRE(approxEq<float>(computed_cuda[i], computed_cpu[i]));
+            }
+
+            delete[] array0;
+            delete[] computed_cuda;
+            cudaFree(array0_d);
+        }
+        std::cout << "total time: " << duration.count() << "μs" << std::endl;
+    }
+}
+
+TEST_CASE("[gpu/operator] Pad(backward)", "[Pad][GPU]") {
+   SECTION("Symmetric Pad") {
+        const int pv = 0; // pad value
+
+        std::shared_ptr<Node> myPad = Pad<2>({1, 1, 1, 1}, "mypad", PadBorderType::Constant, static_cast<double>(pv));
+        auto op = std::static_pointer_cast<OperatorTensor>(myPad->getOperator());
+        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,1,3,5,5> { //NCHW
+            {
+                {
+                    {{  0,   1,   2,   3,   4},
+                    {  5,   6,   7,   8,   9},
+                    { 10,  11,  12,  13,  14},
+                    { 15,  16,  17,  18,  19},
+                    { 20,  21,  22,  23,  24}},
+
+                    {{ 25,  26,  27,  28,  29},
+                    { 30,  31,  32,  33,  34},
+                    { 35,  36,  37,  38,  39},
+                    { 40,  41,  42,  43,  44},
+                    { 45,  46,  47,  48,  49}},
+
+                    {{ 50,  51,  52,  53,  54},
+                    { 55,  56,  57,  58,  59},
+                    { 60,  61,  62,  63,  64},
+                    { 65,  66,  67,  68,  69},
+                    { 70,  71,  72,  73,  74}}
+                }
+            }
+        });
+        myInput->setBackend("cuda");
+        op->associateInput(0, myInput);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cuda");
+
+        myPad->forward();
+
+        std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array4D<float,1,3,7,7> { //NCHW
+            {
+                {
+                    {{ pv,  pv,   pv,   pv,   pv,   pv,  pv},
+                    { pv,   0,   1,   2,   3,   4,  pv},
+                    { pv,   5,   6,   7,   8,   9,  pv},
+                    { pv,  10,  11,  12,  13,  14,  pv},
+                    { pv,  15,  16,  17,  18,  19,  pv},
+                    { pv,  20,  21,  22,  23,  24,  pv},
+                    { pv,  pv,   pv,   pv,   pv,   pv,  pv}},
+
+                    {{ pv,  pv,   pv,   pv,   pv,   pv,  pv},
+                    { pv,  25,  26,  27,  28,  29,  pv},
+                    { pv,  30,  31,  32,  33,  34,  pv},
+                    { pv,  35,  36,  37,  38,  39,  pv},
+                    { pv,  40,  41,  42,  43,  44,  pv},
+                    { pv,  45,  46,  47,  48,  49,  pv},
+                    { pv,  pv,   pv,   pv,   pv,   pv,  pv}},
+
+                    {{ pv,  pv,   pv,   pv,   pv,   pv,  pv},
+                    { pv,  50,  51,  52,  53,  54,  pv},
+                    { pv,  55,  56,  57,  58,  59,  pv},
+                    { pv,  60,  61,  62,  63,  64,  pv},
+                    { pv,  65,  66,  67,  68,  69,  pv},
+                    { pv,  70,  71,  72,  73,  74,  pv},
+                    { pv,  pv,   pv,   pv,   pv,   pv,  pv}}
+                }
+            }
+        });
+        myOutputGrad->setBackend("cuda");
+        op->getOutput(0)->setGrad(myOutputGrad);
+        REQUIRE_NOTHROW(myPad->backward());
+
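+        // Constant padding's backward() crops the output gradient back to the input shape, so the
+        // expected input gradient equals the interior of myOutputGrad, i.e. the original input values.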
+        float *computedGradCuda = new float[myInput->size()]();
+        cudaMemcpy(computedGradCuda, op->getInput(0)->grad()->getImpl()->rawPtr(), sizeof(float) * myInput->size(), cudaMemcpyDeviceToHost);
+
+        myInput->setBackend("cpu");
+        for (std::size_t i = 0; i < myInput->size(); ++i) {
+            const float targetOutput = *(static_cast<float*>(myInput->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedGradCuda[i] - targetOutput) < 1e-6);
+        }
+
+        delete[] computedGradCuda;
+    }
+}
\ No newline at end of file
diff --git a/unit_tests/Test_ReLUImpl.cpp b/unit_tests/Test_ReLUImpl.cpp
index 5651496561f3e1d864767ec38addd4d704b8693c..7ab38aa7def7f846555ae33ccd3871d6ee5a1539 100644
--- a/unit_tests/Test_ReLUImpl.cpp
+++ b/unit_tests/Test_ReLUImpl.cpp
@@ -1,5 +1,5 @@
 /********************************************************************************
- * Copyright (c) 2023 CEA-List
+ * Copyright (c) 2024 CEA-List
  *
  * This program and the accompanying materials are made available under the
  * terms of the Eclipse Public License 2.0 which is available at
@@ -10,123 +10,20 @@
  ********************************************************************************/
 
 #include <array>
-
 #include <catch2/catch_test_macros.hpp>
-
-#include "Test_cuda.hpp"
-
-#include "aidge/data/Tensor.hpp"
+#include <chrono>  // std::chrono::time_point, std::chrono::system_clock
+#include <numeric> // std::accumulate
+#include <random>  // std::random_device, std::mt19937, std::uniform_real_distribution
 
 #include "aidge/backend/cpu.hpp"
 #include "aidge/backend/cuda.hpp"
+#include "aidge/data/Tensor.hpp"
+#include "aidge/utils/TensorUtils.hpp"
 
 using namespace Aidge;
 
 
 TEST_CASE("[gpu/operator] ReLU(forward)", "[ReLU][GPU]") {
-    SECTION("1D Tensor") {
-        std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array1D<float,10> {
-            {0, 1, 2,-3, 4,-5,-6, 7, 8, 9}
-        });
-        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array1D<float,10> {
-            {0, 1, 2, 0, 4, 0, 0, 7, 8, 9}
-        });
-
-        std::shared_ptr<Node> myReLU = ReLU();
-        auto op = std::static_pointer_cast<OperatorTensor>(myReLU -> getOperator());
-        op->associateInput(0,input0);
-        op->setDataType(DataType::Float32);
-        op->setBackend("cuda");
-        myReLU->forward();
-
-        float* computedOutput   = new float[myOutput->size()]();
-        cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
-
-        for(int i = 0; i < myOutput->size(); i++){
-            const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
-            REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
-        }
-
-        delete[] computedOutput;
-    }
-
-    SECTION("2D Tensor") {
-        std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array2D<float,2,10> {
-            {
-                { 0, 1, 2,-3, 4,-5,-6, 7, 8, 9},
-                {-5, 4, 2,-3, 4,-5,-6, 7,-1,10}
-            }
-        });
-        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float,2,10> {
-            {
-                { 0, 1, 2, 0, 4, 0, 0, 7, 8, 9},
-                { 0, 4, 2, 0, 4, 0, 0, 7, 0,10}
-            }
-        });
-
-        std::shared_ptr<Node> myReLU = ReLU();
-        auto op = std::static_pointer_cast<OperatorTensor>(myReLU -> getOperator());
-        op->associateInput(0,input0);
-        op->setDataType(DataType::Float32);
-        op->setBackend("cuda");
-        myReLU->forward();
-        
-        float* computedOutput   = new float[myOutput->size()]();
-        cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
-
-        for(int i = 0; i < myOutput->size(); i++){
-            const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
-            REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
-        }
-
-        delete[] computedOutput;
-    }
-
-    SECTION("3D Tensor") {
-        std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array3D<float,2,2,10> {
-            {
-                {
-                    { 0, 1, 2,-3, 4,-5,-6, 7, 8, 9},
-                    {-5, 4, 2,-3, 4,-5,-6, 7,-1,10}
-                },
-                {
-                    { 0, 1, 2,-3, 4,-5,-6, 7, 8, 9},
-                    {-5, 4, 2,-3, 4,-5,-6, 7,-1,10}
-                }
-            }
-        });
-        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array3D<float,2,2,10> {
-            {
-                {
-                    { 0, 1, 2, 0, 4, 0, 0, 7, 8, 9},
-                    { 0, 4, 2, 0, 4, 0, 0, 7, 0,10}
-                },
-                {
-                    { 0, 1, 2, 0, 4, 0, 0, 7, 8, 9},
-                    { 0, 4, 2, 0, 4, 0, 0, 7, 0,10}
-                }
-            }
-        });
-
-        std::shared_ptr<Node> myReLU = ReLU();
-        auto op = std::static_pointer_cast<OperatorTensor>(myReLU -> getOperator());
-        op->associateInput(0,input0);
-        op->setDataType(DataType::Float32);
-        op->setBackend("cuda");
-        myReLU->forward();
-
-        float* computedOutput   = new float[myOutput->size()]();
-        cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
-
-        for(int i = 0; i < myOutput->size(); i++){
-            const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
-            REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
-        }
-
-        delete[] computedOutput;
-    }
-
-    SECTION("4D Tensor") {
+    SECTION("Constant Input") {
         std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array4D<float,2,2,2,10> {
             {
                 {
@@ -193,4 +90,80 @@ TEST_CASE("[gpu/operator] ReLU(forward)", "[ReLU][GPU]") {
 
         delete[] computedOutput;
     }
+    SECTION("Random Input")
+    {
+        constexpr std::uint16_t NBTRIALS = 10;
+        // Create a random number generator
+        std::random_device rd;
+        std::mt19937 gen(rd());
+        std::uniform_real_distribution<float> valueDist(
+            -1.0f, 1.0f); // Random float distribution between -1 and 1, so both ReLU branches are exercised
+        std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1),
+                                                               std::size_t(10));
+
+        std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(8)); // Max nbDims supported by cudnn is 8
+        // To measure execution time of 'forward()'
+        std::chrono::time_point<std::chrono::system_clock> start;
+        std::chrono::time_point<std::chrono::system_clock> end;
+        std::chrono::duration<double, std::micro> duration{};
+        std::size_t number_of_operation = 0;
+        for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial)
+        {
+            // Create ReLU Operator
+            std::shared_ptr<Node> myReLU = ReLU("myReLU");
+            auto op = std::static_pointer_cast<OperatorTensor>(myReLU->getOperator());
+            op->setDataType(DataType::Float32);
+            op->setBackend("cuda");
+
+            // generate a random Tensor
+            const std::size_t nbDims = nbDimsDist(gen);
+            std::vector<std::size_t> dims;
+            for (std::size_t i = 0; i < nbDims; ++i)
+            {
+                dims.push_back(dimSizeDist(gen));
+            }
+
+            const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+            number_of_operation += nb_elements;
+
+            // Create the input Tensor
+            std::shared_ptr<Tensor> T0 = std::make_shared<Tensor>();
+            T0->setDataType(DataType::Float32);
+            T0->setBackend("cuda");
+            T0->resize(dims);
+            op->associateInput(0, T0);
+
+            // Fill input tensor
+            float *input_h = new float[nb_elements];
+            float *output_h = new float[nb_elements];
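+            // Build the reference output on the host alongside the input: ReLU(x) = max(x, 0).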
+            for (std::size_t i = 0; i < nb_elements; ++i)
+            {
+                float value = valueDist(gen);
+                input_h[i] = value;
+                output_h[i] = value >= 0.0f ? value : 0.0f;
+            }
+            float *input_d;
+            cudaMalloc(reinterpret_cast<void **>(&input_d), sizeof(float) * nb_elements);
+            cudaMemcpy(input_d, input_h, sizeof(float) * nb_elements, cudaMemcpyHostToDevice);
+            T0->getImpl()->setRawPtr(input_d, nb_elements);
+
+            // Run inference
+            start = std::chrono::system_clock::now();
+            myReLU->forward();
+            end = std::chrono::system_clock::now();
+            duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+
+            float *computedOutput = new float[nb_elements]();
+            cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * nb_elements, cudaMemcpyDeviceToHost);
+
+            for (std::size_t i = 0; i < nb_elements; ++i) {
+                REQUIRE(fabs(computedOutput[i] - output_h[i]) < 1e-6);
+            }
+
+            delete[] computedOutput;
+            delete[] input_h;
+            delete[] output_h;
+            cudaFree(input_d);
+        }
+        std::cout << "number of elements over time spent: " << (number_of_operation / duration.count()) << std::endl;
+        std::cout << "total time: " << duration.count() << "μs" << std::endl;
+
+    }
 }
diff --git a/unit_tests/Test_ReshapeImpl.cpp b/unit_tests/Test_ReshapeImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..df9a4dda6d59371c8dd07f8c4442e3a3bb4a7159
--- /dev/null
+++ b/unit_tests/Test_ReshapeImpl.cpp
@@ -0,0 +1,274 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <algorithm> // std::shuffle, std::transform
+#include <array>
+#include <chrono>    // std::chrono::time_point, std::chrono::system_clock
+#include <numeric>   // std::accumulate
+#include <random>    // std::random_device, std::mt19937, std::uniform_real_distribution
+
+#include <catch2/catch_test_macros.hpp>
+
+#include "aidge/backend/cpu.hpp"
+#include "aidge/backend/cuda.hpp"
+#include "aidge/data/Tensor.hpp"
+#include "aidge/utils/TensorUtils.hpp"
+
+using namespace Aidge;
+
+
+TEST_CASE("[gpu/operator] Reshape(forward)") {
+    SECTION("1D Tensor") {
+        std::shared_ptr<Tensor> input = std::make_shared<Tensor>(Array1D<float,6> {
+            {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}
+        });
+        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float,2,3> {
+            {
+                {1.0, 2.0, 3.0},
+                {4.0, 5.0, 6.0}
+            }
+        });
+
+        std::shared_ptr<Node> myReshape = Reshape({2, 3});
+        auto op = std::static_pointer_cast<OperatorTensor>(myReshape->getOperator());
+        op->associateInput(0, input);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cuda");
+        myReshape->forward();
+
+        float* computedOutput   = new float[myOutput->size()]();
+        cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
+
+        for (std::size_t i = 0; i < myOutput->size(); ++i) {
+            const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
+        }
+
+        delete[] computedOutput;
+    }
+    SECTION("2D Tensor") {
+        std::shared_ptr<Tensor> input = std::make_shared<Tensor>(Array2D<float,2,3> {
+            {
+                {1.0, 2.0, 3.0},
+                {4.0, 5.0, 6.0}
+            }
+
+        });
+        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float,3,2> {
+            {
+                {1.0, 2.0},
+                {3.0, 4.0},
+                {5.0, 6.0}
+            }
+        });
+
+        std::shared_ptr<Node> myReshape = Reshape({3, 2});
+        auto op = std::static_pointer_cast<OperatorTensor>(myReshape->getOperator());
+        op->associateInput(0, input);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cuda");
+        myReshape->forward();
+
+        float* computedOutput   = new float[myOutput->size()]();
+        cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
+
+        for (std::size_t i = 0; i < myOutput->size(); ++i) {
+            const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
+        }
+
+        delete[] computedOutput;
+    }
+    SECTION("Random Input")
+    {
+        constexpr std::uint16_t NBTRIALS = 10;
+        // Create a random number generator
+        std::random_device rd;
+        std::mt19937 gen(rd());
+        std::uniform_real_distribution<float> valueDist(
+            0.1f, 1.1f); // Random float distribution between 0.1 and 1.1
+        std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1),
+                                                               std::size_t(10));
+
+        std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(CUDNN_DIM_MAX)); // Max nbDims supported by cudnn is 8
+        // To measure execution time of 'forward()'
+        std::chrono::time_point<std::chrono::system_clock> start;
+        std::chrono::time_point<std::chrono::system_clock> end;
+        std::chrono::duration<double, std::micro> duration{};
+        std::size_t number_of_operation = 0;
+        for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial)
+        {
+            // generate a random Tensor
+            const std::size_t nbDims = nbDimsDist(gen);
+            std::vector<std::size_t> dims, shuffledDims;
+            for (std::size_t i = 0; i < nbDims; ++i)
+            {
+                dims.push_back(dimSizeDist(gen));
+            }
+            shuffledDims = dims;
+            std::shuffle(shuffledDims.begin(), shuffledDims.end(), gen);
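+            // A shuffled permutation of dims preserves the total element count, as Reshape requires.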
+
+            std::vector<std::int64_t> shuffledIntDims(shuffledDims.size());
+            std::transform(shuffledDims.begin(), shuffledDims.end(), shuffledIntDims.begin(),
+                        [](std::size_t value) { return static_cast<std::int64_t>(value); });
+            // Create Reshape Operator CUDA
+            std::shared_ptr<Node> myReshapeCuda = Reshape(shuffledIntDims, false, "myreshapecuda");
+            auto op_cuda = std::static_pointer_cast<OperatorTensor>(myReshapeCuda->getOperator());
+            op_cuda->setDataType(DataType::Float32);
+            op_cuda->setBackend("cuda");
+
+            // Create Reshape Operator CPU
+            std::shared_ptr<Node> myReshapeCpu = Reshape(shuffledIntDims, false, "myreshapecpu");
+            auto op_cpu = std::static_pointer_cast<OperatorTensor>(myReshapeCpu->getOperator());
+            op_cpu->setDataType(DataType::Float32);
+            op_cpu->setBackend("cpu");
+
+            const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+            number_of_operation += nb_elements;
+
+            // Fill input tensor
+            float *array0 = new float[nb_elements];
+            for (std::size_t i = 0; i < nb_elements; ++i)
+            {
+                array0[i] = valueDist(gen);
+            }
+
+            // input0 CUDA
+            float* array0_d;
+            std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>();
+            T0_cuda->setDataType(DataType::Float32);
+            T0_cuda->setBackend("cuda");
+            T0_cuda->resize(dims);
+            op_cuda->associateInput(0, T0_cuda);
+            cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements);
+            cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements, cudaMemcpyHostToDevice);
+            T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements);
+
+            // input0 CPU
+            std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>();
+            op_cpu->associateInput(0, T0_cpu);
+            T0_cpu->setDataType(DataType::Float32);
+            T0_cpu->setBackend("cpu");
+            T0_cpu->resize(dims);
+            T0_cpu->getImpl()->setRawPtr(array0, nb_elements);
+
+            // Run inference
+            start = std::chrono::system_clock::now();
+            op_cuda->forward();
+            end = std::chrono::system_clock::now();
+            duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+
+            float *computed_cuda = new float[nb_elements];
+            cudaMemcpy(computed_cuda, op_cuda->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * nb_elements, cudaMemcpyDeviceToHost);
+
+            // forward CPU
+            op_cpu->forward();
+            float *computed_cpu = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr());
+            for (std::size_t i = 0; i < nb_elements; ++i) {
+                REQUIRE(fabs(computed_cuda[i] - computed_cpu[i]) < 1e-6);
+            }
+
+            delete[] computed_cuda;
+            delete[] array0;
+            cudaFree(array0_d);
+        }
+        std::cout << "number of elements over time spent: " << (number_of_operation / duration.count()) << std::endl;
+        std::cout << "total time: " << duration.count() << "μs" << std::endl;
+
+    }
+}
+
+TEST_CASE("[gpu/operator] Reshape(backward)") {
+    SECTION("1D Tensor") {
+        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array2D<float,2,3> {
+            {
+                {1.0, 2.0, 3.0},
+                {4.0, 5.0, 6.0}
+            }
+        });
+
+        std::shared_ptr<Node> myReshape = Reshape({6});
+        auto op = std::static_pointer_cast<OperatorTensor>(myReshape->getOperator());
+        op->associateInput(0, myInput);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cuda");
+        myReshape->forward();
+
+        // Run and test backward operation
+        std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array1D<float, 6> {
+            {1, 2, 3, 4, 5, 6}
+        });
+        myOutputGrad->setBackend("cuda");
+        std::shared_ptr<Tensor> predictedOutput = op->getOutput(0);
+        std::shared_ptr<Tensor> input = op->getInput(0);
+        predictedOutput->setGrad(myOutputGrad);
+        REQUIRE_NOTHROW(myReshape->backward());
+
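+        // Reshape is a view change, so backward() simply reshapes the gradient back:
+        // the input gradient carries the same values as the output gradient.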
+        std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>(Array2D<float,2,3> {
+            {
+                {1.0, 2.0, 3.0},
+                {4.0, 5.0, 6.0}
+            }
+        });
+
+        float *computedGradCuda = new float[expectedInputGrad->size()]();
+        cudaMemcpy(computedGradCuda, input->grad()->getImpl()->rawPtr(), sizeof(float) * expectedInputGrad->size(), cudaMemcpyDeviceToHost);
+        
+        for (std::size_t i = 0; i < expectedInputGrad->size(); ++i) {
+            const float targetOutput = *(static_cast<float*>(expectedInputGrad->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedGradCuda[i] - targetOutput) < 1e-6);
+        }
+
+        delete[] computedGradCuda;
+    }
+    SECTION("2D Tensor") {
+        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array2D<float,2,3> {
+            {
+                {1.0, 2.0, 3.0},
+                {4.0, 5.0, 6.0}
+            }
+        });
+
+        std::shared_ptr<Node> myReshape = Reshape({3, 2});
+        auto op = std::static_pointer_cast<OperatorTensor>(myReshape->getOperator());
+        op->associateInput(0, myInput);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cuda");
+        myReshape->forward();
+
+        // Run and test backward operation
+        std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array2D<float, 3, 2> {
+            {
+                {1.0, 2.0},
+                {3.0, 4.0},
+                {5.0, 6.0}
+            }
+        });
+        myOutputGrad->setBackend("cuda");
+        std::shared_ptr<Tensor> predictedOutput = op->getOutput(0);
+        std::shared_ptr<Tensor> input = op->getInput(0);
+        predictedOutput->setGrad(myOutputGrad);
+        REQUIRE_NOTHROW(myReshape->backward());
+
+        std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>(Array2D<float,2,3> {
+            {
+                {1.0, 2.0, 3.0},
+                {4.0, 5.0, 6.0}
+            }
+        });
+
+        float *computedGradCuda = new float[expectedInputGrad->size()]();
+        cudaMemcpy(computedGradCuda, input->grad()->getImpl()->rawPtr(), sizeof(float) * expectedInputGrad->size(), cudaMemcpyDeviceToHost);
+        
+        for (std::size_t i = 0; i < expectedInputGrad->size(); ++i) {
+            const float targetOutput = *(static_cast<float*>(expectedInputGrad->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedGradCuda[i] - targetOutput) < 1e-6);
+        }
+
+        delete[] computedGradCuda;
+    }
+}
diff --git a/version.txt b/version.txt
index 341cf11faf9a29504168de4e54beaad182c5adc5..f4778493c50025c6ab147a1fec7486ef0c706792 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-0.2.0
\ No newline at end of file
+0.2.2
\ No newline at end of file