diff --git a/.gitlab/ci/build.gitlab-ci.yml b/.gitlab/ci/build.gitlab-ci.yml
index 01c537fce668e95f5e632cce8b58eda0da36ecdd..d12c714d34b2d457a5b4a675bbd4b99c1211362b 100644
--- a/.gitlab/ci/build.gitlab-ci.yml
+++ b/.gitlab/ci/build.gitlab-ci.yml
@@ -1,3 +1,6 @@
+include:
+  - remote: 'https://gitlab.eclipse.org/eclipse/aidge/gitlab_shared_files/-/raw/main/.gitlab/ci/shared_script.gitlab-ci.yml'
+
 build:ubuntu_cpp:
   stage: build
   needs: []
@@ -6,15 +9,14 @@ build:ubuntu_cpp:
 
   script:
     # Download dependencies
+    - DEPENDENCY_JOB="build:ubuntu_cpp"
     # aidge_core
-    - 'curl --location --output build_artifacts.zip "https://gitlab.eclipse.org/api/v4/projects/5139/jobs/artifacts/main/download?job=build:ubuntu_cpp"'
-    - unzip -o build_artifacts.zip -d .
-    - rm -rf build_cpp
+    - DEPENDENCY_NAME="aidge_core"
+    - !reference [.download_dependency, script]
     # aidge_backend_cpu
-    - 'curl --location --output build_artifacts.zip "https://gitlab.eclipse.org/api/v4/projects/5140/jobs/artifacts/master/download?job=build:ubuntu_cpp"'
-    - unzip -o build_artifacts.zip -d .
-    - rm -rf build_cpp
-  
+    - DEPENDENCY_NAME="aidge_backend_cpu"
+    - !reference [.download_dependency, script]
+
     # Build current module
     - export CMAKE_PREFIX_PATH=../install_cpp
     - mkdir -p build_cpp
@@ -35,15 +37,14 @@ build:ubuntu_cpp_g++10:
     - docker
 
   script:
-    # Download dependencies
+    # Download dependencies
+    - DEPENDENCY_JOB="build:ubuntu_cpp"
     # aidge_core
-    - 'curl --location --output build_artifacts.zip "https://gitlab.eclipse.org/api/v4/projects/5139/jobs/artifacts/main/download?job=build:ubuntu_cpp"'
-    - unzip -o build_artifacts.zip -d .
-    - rm -rf build_cpp
+    - DEPENDENCY_NAME="aidge_core"
+    - !reference [.download_dependency, script]
     # aidge_backend_cpu
-    - 'curl --location --output build_artifacts.zip "https://gitlab.eclipse.org/api/v4/projects/5140/jobs/artifacts/master/download?job=build:ubuntu_cpp"'
-    - unzip -o build_artifacts.zip -d .
-    - rm -rf build_cpp
+    - DEPENDENCY_NAME="aidge_backend_cpu"
+    - !reference [.download_dependency, script]
 
     # Build current module
     - export CMAKE_PREFIX_PATH=../install_cpp
@@ -63,14 +64,14 @@ build:ubuntu_cpp_g++12:
 
   script:
     # Download dependencies
+    - DEPENDENCY_JOB="build:ubuntu_cpp"
     # aidge_core
-    - 'curl --location --output build_artifacts.zip "https://gitlab.eclipse.org/api/v4/projects/5139/jobs/artifacts/main/download?job=build:ubuntu_cpp"'
-    - unzip -o build_artifacts.zip -d .
-    - rm -rf build_cpp
+    - DEPENDENCY_NAME="aidge_core"
+    - !reference [.download_dependency, script]
     # aidge_backend_cpu
-    - 'curl --location --output build_artifacts.zip "https://gitlab.eclipse.org/api/v4/projects/5140/jobs/artifacts/master/download?job=build:ubuntu_cpp"'
-    - unzip -o build_artifacts.zip -d .
-    - rm -rf build_cpp
+    - DEPENDENCY_NAME="aidge_backend_cpu"
+    - !reference [.download_dependency, script]
+
 
     # Build current module
     - export CMAKE_PREFIX_PATH=../install_cpp
@@ -90,14 +91,14 @@ build:ubuntu_cpp_clang12:
 
   script:
     # Download dependencies
+    - DEPENDENCY_JOB="build:ubuntu_cpp"
     # aidge_core
-    - 'curl --location --output build_artifacts.zip "https://gitlab.eclipse.org/api/v4/projects/5139/jobs/artifacts/main/download?job=build:ubuntu_cpp"'
-    - unzip -o build_artifacts.zip -d .
-    - rm -rf build_cpp
+    - DEPENDENCY_NAME="aidge_core"
+    - !reference [.download_dependency, script]
     # aidge_backend_cpu
-    - 'curl --location --output build_artifacts.zip "https://gitlab.eclipse.org/api/v4/projects/5140/jobs/artifacts/master/download?job=build:ubuntu_cpp"'
-    - unzip -o build_artifacts.zip -d .
-    - rm -rf build_cpp
+    - DEPENDENCY_NAME="aidge_backend_cpu"
+    - !reference [.download_dependency, script]
+
 
     # Build current module
     - export CMAKE_PREFIX_PATH=../install_cpp
@@ -117,14 +118,13 @@ build:ubuntu_cpp_clang15:
 
   script:
     # Download dependencies
+    - DEPENDENCY_JOB="build:ubuntu_cpp"
     # aidge_core
-    - 'curl --location --output build_artifacts.zip "https://gitlab.eclipse.org/api/v4/projects/5139/jobs/artifacts/main/download?job=build:ubuntu_cpp"'
-    - unzip -o build_artifacts.zip -d .
-    - rm -rf build_cpp
+    - DEPENDENCY_NAME="aidge_core"
+    - !reference [.download_dependency, script]
     # aidge_backend_cpu
-    - 'curl --location --output build_artifacts.zip "https://gitlab.eclipse.org/api/v4/projects/5140/jobs/artifacts/master/download?job=build:ubuntu_cpp"'
-    - unzip -o build_artifacts.zip -d .
-    - rm -rf build_cpp
+    - DEPENDENCY_NAME="aidge_backend_cpu"
+    - !reference [.download_dependency, script]
 
     # Build current module
     - export CMAKE_PREFIX_PATH=../install_cpp
@@ -144,12 +144,13 @@ build:ubuntu_python:
 
   script:
     # Download dependencies
-    # aidge_core (Python)
-    - 'curl --location --output build_artifacts.zip "https://gitlab.eclipse.org/api/v4/projects/5139/jobs/artifacts/main/download?job=build:ubuntu_python"'
-    - unzip -o build_artifacts.zip -d .
-    # aidge_backend_cpu (Python)
-    - 'curl --location --output build_artifacts.zip "https://gitlab.eclipse.org/api/v4/projects/5140/jobs/artifacts/master/download?job=build:ubuntu_python"'
-    - unzip -o build_artifacts.zip -d .
+    - DEPENDENCY_JOB="build:ubuntu_python"
+    # aidge_core (python)
+    - DEPENDENCY_NAME="aidge_core"
+    - !reference [.download_dependency, script]
+    # aidge_backend_cpu (python)
+    - DEPENDENCY_NAME="aidge_backend_cpu"
+    - !reference [.download_dependency, script]
 
     - python3 -m pip install virtualenv
     - virtualenv venv
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 21f57bc75b2fa4ad8b57711b092693c70b20d628..d940def39a4e092bb01765a0b127b41c6a88914f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,11 +7,12 @@ file(READ "${CMAKE_SOURCE_DIR}/project_name.txt" project)
 message(STATUS "Project name: ${project}")
 message(STATUS "Project version: ${version}")
 
-# Note : project name is {project} and python module name is also {project} 
+# Note : project name is {project} and python module name is also {project}
 set(module_name _${project}) # target name
 
 
 project(${project})
+set(CMAKE_CXX_STANDARD 14)
 
 ##############################################
 # Define options
@@ -19,6 +20,7 @@ option(PYBIND "python binding" ON)
 option(WERROR "Warning as error" OFF)
 option(TEST "Enable tests" ON)
 option(COVERAGE "Enable coverage" OFF)
+option(ENABLE_ASAN "Enable ASan (AddressSanitizer) for runtime analysis of memory use (over/underflow, memory leak, ...)" OFF)
 
 ##############################################
 # Import utils CMakeLists
@@ -36,8 +38,9 @@ enable_language(CUDA)
 find_package(CUDAToolkit REQUIRED)
 
 find_package(aidge_core REQUIRED)
-find_package(aidge_backend_cpu REQUIRED)
-
+if(TEST)
+    find_package(aidge_backend_cpu REQUIRED)
+endif()
 ##############################################
 # Create target and set properties
 
@@ -48,11 +51,31 @@ add_library(${module_name} ${src_files} ${inc_files})
 target_link_libraries(${module_name}
     PUBLIC
         _aidge_core # _ is added because we link the target not the project
-        _aidge_backend_cpu # _ is added because we link the target not the project
         CUDA::cudart
+        CUDA::cublas
         cudnn
 )
 
+if( ${ENABLE_ASAN} )
+    message("Building ${module_name} with ASAN.")
+    set(SANITIZE_FLAGS -fsanitize=address -fno-omit-frame-pointer)
+    target_link_libraries(${module_name}
+        PUBLIC
+            -fsanitize=address
+    )
+    target_compile_options(${module_name}
+        PRIVATE
+            ${SANITIZE_FLAGS}
+    )
+endif()
+
+if(TEST)
+    target_link_libraries(${module_name}
+        PUBLIC
+            _aidge_backend_cpu # _ is added because we link the target not the project
+    )
+endif()
+
 #Set target properties
 target_include_directories(${module_name}
     PUBLIC
@@ -76,7 +99,7 @@ if (PYBIND)
 
     # Handles Python + pybind11 headers dependencies
     target_link_libraries(${module_name}
-        PUBLIC 
+        PUBLIC
             pybind11::pybind11
         PRIVATE
             Python::Python
@@ -119,8 +142,8 @@ install(DIRECTORY include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
 install(EXPORT ${project}-targets
  FILE "${project}-targets.cmake"
  DESTINATION ${INSTALL_CONFIGDIR}
-#  COMPONENT ${module_name} 
-)  
+#  COMPONENT ${module_name}
+)
 
 #Create a ConfigVersion.cmake file
 include(CMakePackageConfigHelpers)
diff --git a/include/aidge/backend/cuda.hpp b/include/aidge/backend/cuda.hpp
index cfae53b64115aa7946580d00f45be56f17163d7f..5b9b94f03a2c5a099f010ab7117479030003f5a8 100644
--- a/include/aidge/backend/cuda.hpp
+++ b/include/aidge/backend/cuda.hpp
@@ -13,7 +13,10 @@
 #define AIDGE_BACKEND_CUDA_IMPORTS_H_
 
 #include "aidge/backend/cuda/data/TensorImpl.hpp"
+#include "aidge/backend/cuda/operator/AvgPoolingImpl.hpp"
 #include "aidge/backend/cuda/operator/ConvImpl.hpp"
-#include "aidge/backend/cuda/operator/ProducerImpl.hpp"
+#include "aidge/backend/cuda/operator/FCImpl.hpp"
+#include "aidge/backend/cuda/operator/MaxPoolingImpl.hpp"
+#include "aidge/backend/cuda/operator/ReLUImpl.hpp"
 
 #endif /* AIDGE_BACKEND_CUDA_IMPORTS_H_ */
\ No newline at end of file
diff --git a/include/aidge/backend/cuda/data/TensorImpl.hpp b/include/aidge/backend/cuda/data/TensorImpl.hpp
index c61e926c88a9baf1fcdf64794c2a975a1b891356..f083a8ba9f68ed53929db95b3fd6604f31548e21 100644
--- a/include/aidge/backend/cuda/data/TensorImpl.hpp
+++ b/include/aidge/backend/cuda/data/TensorImpl.hpp
@@ -1,6 +1,10 @@
 #ifndef AIDGE_BACKEND_CUDA_DATA_TENSORIMPL_H_
 #define AIDGE_BACKEND_CUDA_DATA_TENSORIMPL_H_
 
+#include <cstddef>  // std::size_t
+#include <memory>
+#include <string>
+
 #include "aidge/backend/TensorImpl.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/utils/Registrar.hpp"
@@ -24,19 +28,27 @@ void thrust_copy(const half_float::half* srcData, half_float::half* dstData, siz
 
 /**
  * @brief Abstract class for the TensorImpl_cuda class template.
- * @details Its purpose is to provide access to base methods that are specific 
- * to the implementation (which are therefore not present in the TensorImpl 
+ * @details Its purpose is to provide access to base methods that are specific
+ * to the implementation (which are therefore not present in the TensorImpl
  * class), but whose data type does not need to be known.
  */
 class TensorImpl_cuda_ {
+protected:
+    mutable cudnnTensorDescriptor_t mCudnnTensor = nullptr;
+
 public:
     /**
      * @brief Return the CuDNN tensor descriptor of the tensor.
-     * @details This method uses lazy initialization for the descriptor 
+     * @details This method uses lazy initialization for the descriptor
      * (which is therefore mutable in the derived class).
      * @return cudnnTensorDescriptor_t CuDNN tensor descriptor.
      */
-    virtual const cudnnTensorDescriptor_t& getCudnnTensorDesc() const = 0;
+    virtual const cudnnTensorDescriptor_t& getCudnnTensorDesc(const Tensor& tensor) const = 0;
+
+    virtual ~TensorImpl_cuda_() {
+        if (mCudnnTensor != nullptr)
+            cudnnDestroyTensorDescriptor(mCudnnTensor);
+    }
 };
 
 template <class T>
@@ -54,119 +66,117 @@ private:
     }
 
 private:
-    const Tensor &mTensor;  // Impl needs to access Tensor information, but is not
-                            // supposed to change it!
-    /// Pointer to the data and its capacity
     future_std::span<T> mData;
     /// If this instance own the data, std::unique_ptr manages it
     std::unique_ptr<T, decltype(&cudaDelete)> mDataOwner;
-    mutable cudnnTensorDescriptor_t mCudnnTensor = nullptr;
 
 public:
-    static constexpr const char *Backend = "cuda";
+    static const std::string Backend;
+
+    TensorImpl_cuda(DeviceIdx_t device, std::vector<DimSize_t> dims) : TensorImpl(Backend, device, dims), mDataOwner(nullptr, cudaDelete) {}
 
-    TensorImpl_cuda(const Tensor &tensor) : TensorImpl(Backend), mTensor(tensor), mDataOwner(nullptr, cudaDelete) {}
 
     bool operator==(const TensorImpl &otherImpl) const override final;
 
-    static std::unique_ptr<TensorImpl_cuda> create(const Tensor &tensor) {
-        return std::make_unique<TensorImpl_cuda<T>>(tensor);
+    static std::shared_ptr<TensorImpl_cuda> create(DeviceIdx_t device, std::vector<DimSize_t> dims) {
+        return std::make_shared<TensorImpl_cuda<T>>(device, dims);
     }
 
     // native interface
     const future_std::span<T>& data() const { return mData; }
 
-    std::size_t size() const override { return mData.size(); }
-    std::size_t scalarSize() const override { return sizeof(T); }
-
-    void setDevice(DeviceIdx_t device) override {
-        mDevice = device;
-    }
+    std::size_t scalarSize() const noexcept override { return sizeof(T); }
 
     void copy(const void *src, NbElts_t length, NbElts_t offset = 0) override {
-        void* dst = static_cast<void*>(static_cast<T*>(rawPtr()) + offset);
-        CHECK_CUDA_STATUS(cudaMemcpy(dst, src, length * sizeof(T), cudaMemcpyDeviceToDevice));
+        AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "copy length is above capacity");
+        const T* srcT = static_cast<const T *>(src);
+        T* dstT = static_cast<T *>(rawPtr(offset));
+
+        AIDGE_ASSERT(dstT < srcT || dstT >= srcT + length, "overlapping copy is not supported");
+        CHECK_CUDA_STATUS(cudaMemcpy(dstT, srcT, length * sizeof(T), cudaMemcpyDeviceToDevice));
     }
 
-    void copyCast(const void *src, NbElts_t length, const DataType srcDt) override {
+    void copyCast(const void *src, const DataType srcDt, NbElts_t length, NbElts_t offset = 0) override {
         if (length == 0) {
             return;
         }
 
-        AIDGE_ASSERT(length <= mData.size() || length <= mTensor.size(), "copy length is above capacity");
-        if (srcDt == DataType::Float64) {
+        AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "copy length is above capacity");
+        switch (srcDt) {
+        case DataType::Float64:
             thrust_copy(static_cast<const double*>(src),
-                        static_cast<T*>(rawPtr()),
+                        static_cast<T*>(rawPtr(offset)),
                         length);
-        }
-        else if (srcDt == DataType::Float32) {
+            break;
+        case DataType::Float32:
             thrust_copy(static_cast<const float*>(src),
-                        static_cast<T*>(rawPtr()),
+                        static_cast<T*>(rawPtr(offset)),
                         length);
-        }
-        else if (srcDt == DataType::Float16) {
+            break;
+        case DataType::Float16:
             thrust_copy(static_cast<const half_float::half*>(src),
-                        static_cast<T*>(rawPtr()),
+                        static_cast<T*>(rawPtr(offset)),
                         length);
-        }
-        else if (srcDt == DataType::Int64) {
+            break;
+        case DataType::Int64:
             thrust_copy(static_cast<const int64_t*>(src),
-                        static_cast<T*>(rawPtr()),
+                        static_cast<T*>(rawPtr(offset)),
                         length);
-        }
-        else if (srcDt == DataType::UInt64) {
+            break;
+        case DataType::UInt64:
             thrust_copy(static_cast<const uint64_t*>(src),
-                        static_cast<T*>(rawPtr()),
+                        static_cast<T*>(rawPtr(offset)),
                         length);
-        }
-        else if (srcDt == DataType::Int32) {
+            break;
+        case DataType::Int32:
             thrust_copy(static_cast<const int32_t*>(src),
-                        static_cast<T*>(rawPtr()),
+                        static_cast<T*>(rawPtr(offset)),
                         length);
-        }
-        else if (srcDt == DataType::UInt32) {
+            break;
+        case DataType::UInt32:
             thrust_copy(static_cast<const uint32_t*>(src),
-                        static_cast<T*>(rawPtr()),
+                        static_cast<T*>(rawPtr(offset)),
                         length);
-        }
-        else if (srcDt == DataType::Int16) {
+            break;
+        case DataType::Int16:
             thrust_copy(static_cast<const int16_t*>(src),
-                        static_cast<T*>(rawPtr()),
+                        static_cast<T*>(rawPtr(offset)),
                         length);
-        }
-        else if (srcDt == DataType::UInt16) {
+            break;
+        case DataType::UInt16:
             thrust_copy(static_cast<const uint16_t*>(src),
-                        static_cast<T*>(rawPtr()),
+                        static_cast<T*>(rawPtr(offset)),
                         length);
-        }
-        else if (srcDt == DataType::Int8) {
+            break;
+        case DataType::Int8:
             thrust_copy(static_cast<const int8_t*>(src),
-                        static_cast<T*>(rawPtr()),
+                        static_cast<T*>(rawPtr(offset)),
                         length);
-        }
-        else if (srcDt == DataType::UInt8) {
+            break;
+        case DataType::UInt8:
             thrust_copy(static_cast<const uint8_t*>(src),
-                        static_cast<T*>(rawPtr()),
+                        static_cast<T*>(rawPtr(offset)),
                         length);
-        }
-        else {
+            break;
+        default:
             AIDGE_THROW_OR_ABORT(std::runtime_error, "Unsupported data type.");
+            break;
         }
     }
 
-    void copyFromDevice(const void *src, NbElts_t length, const std::pair<std::string, DeviceIdx_t>& device) override {
-        AIDGE_ASSERT(length <= mData.size() || length <= mTensor.size(), "copy length is above capacity");
-        CHECK_CUDA_STATUS(cudaMemcpy(rawPtr(), src, length * sizeof(T), cudaMemcpyDeviceToDevice));
+    void copyFromDevice(const void *src, const std::pair<std::string, DeviceIdx_t>& device, NbElts_t length, NbElts_t offset = 0) override {
+        AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "copy length is above capacity");
+        CHECK_CUDA_STATUS(cudaMemcpy(rawPtr(offset), src, length * sizeof(T), cudaMemcpyDeviceToDevice));
     }
 
-    void copyFromHost(const void *src, NbElts_t length) override {
-        AIDGE_ASSERT(length <= mData.size() || length <= mTensor.size(), "copy length is above capacity");
-        CHECK_CUDA_STATUS(cudaMemcpy(rawPtr(), src, length * sizeof(T), cudaMemcpyHostToDevice));
+    void copyFromHost(const void *src, NbElts_t length, NbElts_t offset = 0) override {
+        AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "copy length is above capacity");
+        CHECK_CUDA_STATUS(cudaMemcpy(rawPtr(offset), src, length * sizeof(T), cudaMemcpyHostToDevice));
     }
 
-    void copyToHost(void *dst, NbElts_t length) const override {
-        AIDGE_ASSERT(length <= mData.size() || length <= mTensor.size(), "copy length is above capacity");
-        CHECK_CUDA_STATUS(cudaMemcpy(dst, rawPtr(), length * sizeof(T), cudaMemcpyDeviceToHost));
+    void copyToHost(void *dst, NbElts_t length, NbElts_t offset = 0) const override {
+        AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "copy length is above capacity");
+        CHECK_CUDA_STATUS(cudaMemcpy(dst, rawPtr(offset), length * sizeof(T), cudaMemcpyDeviceToHost));
     }
 
     void *rawPtr(NbElts_t offset = 0) override {
@@ -175,30 +185,27 @@ public:
     };
 
     const void *rawPtr(NbElts_t offset = 0) const override {
-        AIDGE_ASSERT(mData.size() >= mTensor.size(), "accessing uninitialized const rawPtr");
+        AIDGE_ASSERT(mData.size() >= mNbElts, "accessing uninitialized const rawPtr");
         return (mData.data() + offset);
     };
 
-    const cudnnTensorDescriptor_t& getCudnnTensorDesc() const override {
+    const cudnnTensorDescriptor_t& getCudnnTensorDesc(const Tensor& tensor) const override {
         if (mCudnnTensor == nullptr) {
             CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&mCudnnTensor));
 
-            if (mTensor.size() > 0) {
+            if (tensor.size() > 0) {
                 /**
                 **      cudNN Tensors are restricted to having at least 4 dimensions :
                 **      When working with lower dimensionsal data, unused dimensions are set to 1.
                 **      Referes to the cudnnSetTensorNdDescriptor documentation from :
                 **      https://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html
                 **/
-                std::vector<int> dims(mTensor.dims().begin(), mTensor.dims().end());
+                std::vector<int> dims(tensor.dims().cbegin(), tensor.dims().cend());
+                std::vector<int> strides(tensor.strides().cbegin(), tensor.strides().cend());
 
-                if (dims.size() < 4)
+                if (dims.size() < 4) {
                     dims.resize(4, 1);
-
-                std::vector<int> strides(dims.size(), 1);
-
-                for (size_t dim = 1; dim < dims.size(); ++dim) {
-                    strides[dims.size() - dim - 1] = strides[dims.size() - dim] * dims[dims.size() - dim];
+                    strides.resize(4, 1);
                 }
 
                 CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(mCudnnTensor,
@@ -213,27 +220,27 @@ public:
     }
 
     void setRawPtr(void *ptr, NbElts_t length) override final {
-        AIDGE_ASSERT(length >= mTensor.size(), "trying to set raw pointer of insufficient capacity");
+        AIDGE_ASSERT(length >= mNbElts, "trying to set raw pointer of insufficient capacity");
         mData = future_std::span<T>(static_cast<T *>(ptr), length);
         mDataOwner.reset();
     };
 
-    virtual ~TensorImpl_cuda() {
-        if (mCudnnTensor != nullptr)
-            cudnnDestroyTensorDescriptor(mCudnnTensor);
-    }
+    virtual ~TensorImpl_cuda() = default;
 
 private:
     void lazyInit() {
-        if (mData.size() < mTensor.size()) {
+        if (mData.size() < mNbElts) {
             // Need more data, a re-allocation will occur
             AIDGE_ASSERT(mData.empty() || mDataOwner != nullptr, "trying to enlarge non-owned data");
-            mDataOwner.reset(cudaAlloc(mTensor.size()));
-            mData = future_std::span<T>(mDataOwner.get(), mTensor.size());
+            mDataOwner.reset(cudaAlloc(mNbElts));
+            mData = future_std::span<T>(mDataOwner.get(), mNbElts);
         }
     }
 };
 
+template <typename T>
+const std::string TensorImpl_cuda<T>::Backend = "cuda";
+
 namespace {
 static Registrar<Tensor> registrarTensorImpl_cuda_Float64(
         {"cuda", DataType::Float64}, Aidge::TensorImpl_cuda<double>::create);
diff --git a/include/aidge/backend/cuda/operator/AvgPoolingImpl.hpp b/include/aidge/backend/cuda/operator/AvgPoolingImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..43a6bd57c0c6431705abe73d3f3c175046d72dc9
--- /dev/null
+++ b/include/aidge/backend/cuda/operator/AvgPoolingImpl.hpp
@@ -0,0 +1,59 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_BACKEND_CUDA_OPERATOR_AVGPOOLINGIMPL_H_
+#define AIDGE_BACKEND_CUDA_OPERATOR_AVGPOOLINGIMPL_H_
+
+#include <array>
+#include <memory>
+#include <tuple>
+#include <vector>
+
+#include <cudnn.h>
+
+#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/operator/AvgPooling.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+
+namespace Aidge {
+template <DimIdx_t DIM>
+class AvgPoolingImpl_cuda : public OperatorImpl {
+private:
+    // CuDNN specific variables
+    cudnnPoolingDescriptor_t mAvgPoolingDesc = nullptr;
+    cudnnPoolingMode_t mMode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
+    std::shared_ptr<Tensor> mInputFallback;
+
+public:
+    AvgPoolingImpl_cuda(const AvgPooling_Op<DIM> &op) : OperatorImpl(op, "cuda") {}
+
+    static std::unique_ptr<AvgPoolingImpl_cuda> create(const AvgPooling_Op<2> &op) {
+        return std::make_unique<AvgPoolingImpl_cuda>(op);
+    }
+
+public:
+    void forward();
+    ~AvgPoolingImpl_cuda();
+
+private:
+    template <class T> void forward_(const Tensor& input);
+};
+
+namespace {
+// add cuda backend to AvgPooling_Op<2> implementation registry
+static Registrar<AvgPooling_Op<2>> registrarAvgPoolingImpl_cuda("cuda", Aidge::AvgPoolingImpl_cuda<2>::create);
+}  // namespace
+}  // namespace Aidge
+
+#endif /* AIDGE_BACKEND_CUDA_OPERATOR_AVGPOOLINGIMPL_H_ */
diff --git a/include/aidge/backend/cuda/operator/ConvImpl.hpp b/include/aidge/backend/cuda/operator/ConvImpl.hpp
index 0ad995b24082782c611b93fbcc040d1319a7362f..8c591927ce0e52daeff447726c114ce3ae4d0103 100644
--- a/include/aidge/backend/cuda/operator/ConvImpl.hpp
+++ b/include/aidge/backend/cuda/operator/ConvImpl.hpp
@@ -34,22 +34,30 @@ private:
     cudnnConvolutionDescriptor_t mConvDesc = nullptr;
     cudnnFilterDescriptor_t mFilterDesc = nullptr;
     cudnnConvolutionFwdAlgo_t mFwdAlgo;
+    cudnnConvolutionBwdFilterAlgo_t mBwdFilterAlgo;
+    cudnnConvolutionBwdDataAlgo_t mBwdDataAlgo;
     size_t mWorkspaceSize = 0;
-    void* mWorkspace = nullptr;
+    void* mFwdWorkspace = nullptr;
+    void* mBwdWorkspace = nullptr;
+    std::shared_ptr<Tensor> mInput0Fallback;
+    std::shared_ptr<Tensor> mInput1Fallback;
+    std::shared_ptr<Tensor> mInput2Fallback;
 
 public:
-    ConvImpl_cuda(const Conv_Op<DIM> &op) : OperatorImpl(op) {}
+    ConvImpl_cuda(const Conv_Op<DIM> &op) : OperatorImpl(op, "cuda") {}
 
-    static std::unique_ptr<ConvImpl_cuda> create(const Conv_Op<2> &op) {
+    static std::unique_ptr<ConvImpl_cuda> create(const Conv_Op<DIM> &op) {
         return std::make_unique<ConvImpl_cuda>(op);
     }
 
 public:
     void forward();
+    void backward();
     ~ConvImpl_cuda();
 
 private:
     template <class T> void forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2);
+    template <class T> void backward_(const Tensor& input0, const Tensor& input1, const Tensor& input2);
 };
 
 namespace {
diff --git a/include/aidge/backend/cuda/operator/FCImpl.hpp b/include/aidge/backend/cuda/operator/FCImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..3d8a1348d500fc533c7c9b601b09629995f97427
--- /dev/null
+++ b/include/aidge/backend/cuda/operator/FCImpl.hpp
@@ -0,0 +1,61 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_BACKEND_CUDA_OPERATOR_FCIMPL_H_
+#define AIDGE_BACKEND_CUDA_OPERATOR_FCIMPL_H_
+
+#include <array>
+#include <memory>
+#include <tuple>
+#include <vector>
+
+#include <cudnn.h>
+
+#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/operator/FC.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+
+namespace Aidge {
+class FCImplForward_cuda : public Registrable<FCImplForward_cuda,
+                                                 std::tuple<DataType>,
+                                                 void(std::size_t, std::size_t, std::size_t, bool, const void*, const void*, const void*, void*)> {};
+class FCImpl_cuda : public OperatorImpl {
+private:
+    std::shared_ptr<Tensor> mInput0Fallback;
+    std::shared_ptr<Tensor> mInput1Fallback;
+    std::shared_ptr<Tensor> mInput2Fallback;
+
+
+public:
+    FCImpl_cuda(const FC_Op &op) : OperatorImpl(op, "cuda") {}
+
+    static std::unique_ptr<FCImpl_cuda> create(const FC_Op &op) {
+        return std::make_unique<FCImpl_cuda>(op);
+    }
+
+public:
+    void forward();
+    // ~FCImpl_cuda();
+
+private:
+    template <class T> void forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2, bool noBias, std::size_t outChannels);
+};
+
+namespace {
+// add cuda backend to FC_Op implementation registry
+static Registrar<FC_Op> registrarFCImpl_cuda("cuda", Aidge::FCImpl_cuda::create);
+}  // namespace
+}  // namespace Aidge
+
+#endif /* AIDGE_BACKEND_CUDA_OPERATOR_FCIMPL_H_ */
diff --git a/include/aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..9084e01fc08cb3d00e80fc8cf6246064b20591f2
--- /dev/null
+++ b/include/aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp
@@ -0,0 +1,36 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CUDA_OPERATOR_FCIMPL_FORWARD_KERNEL_H_
+#define AIDGE_CUDA_OPERATOR_FCIMPL_FORWARD_KERNEL_H_
+
+#include <stdexcept>
+#include <cfloat>
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+#include <cuda_fp16.h>
+
+#include "aidge/data/Data.hpp"
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+
+namespace Aidge {
+
+template <class T>
+cublasStatus_t cublasGemm(cublasHandle_t handle,
+                          cublasOperation_t transa, cublasOperation_t transb,
+                          int m, int n, int k,
+                          const T *alpha,
+                          const T *A, int lda,
+                          const T *B, int ldb,
+                          const T *beta,
+                          T *C, int ldc);
+}
+#endif /* AIDGE_CUDA_OPERATOR_FCIMPL_FORWARD_KERNEL_H_ */
\ No newline at end of file
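
The cublasGemm<T> template declared above has no definition in this header; it is presumably specialized per data type in the matching .cu file. As an illustration of the intent only (not the repository's actual definition), a float specialization would simply forward to the single-precision cuBLAS routine:

    #include <cublas_v2.h>
    #include "aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp"

    namespace Aidge {
    template <>
    cublasStatus_t cublasGemm<float>(cublasHandle_t handle,
                                     cublasOperation_t transa, cublasOperation_t transb,
                                     int m, int n, int k,
                                     const float *alpha,
                                     const float *A, int lda,
                                     const float *B, int ldb,
                                     const float *beta,
                                     float *C, int ldc) {
        // C = alpha * op(A) * op(B) + beta * C, in single precision
        return cublasSgemm(handle, transa, transb, m, n, k,
                           alpha, A, lda, B, ldb, beta, C, ldc);
    }
    } // namespace Aidge
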
diff --git a/include/aidge/backend/cuda/operator/MaxPoolingImpl.hpp b/include/aidge/backend/cuda/operator/MaxPoolingImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..4d08d7fab3c7cb2baa18838fd872e44a8eccc923
--- /dev/null
+++ b/include/aidge/backend/cuda/operator/MaxPoolingImpl.hpp
@@ -0,0 +1,59 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_BACKEND_CUDA_OPERATOR_MAXPOOLINGIMPL_H_
+#define AIDGE_BACKEND_CUDA_OPERATOR_MAXPOOLINGIMPL_H_
+
+#include <array>
+#include <memory>
+#include <tuple>
+#include <vector>
+
+#include <cudnn.h>
+
+#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/operator/MaxPooling.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+
+namespace Aidge {
+template <DimIdx_t DIM>
+class MaxPoolingImpl_cuda : public OperatorImpl {
+private:
+    // CuDNN specific variables
+    cudnnPoolingDescriptor_t mMaxPoolingDesc = nullptr;
+    cudnnPoolingMode_t mMode = CUDNN_POOLING_MAX;
+    std::shared_ptr<Tensor> mInputFallback;
+
+public:
+    MaxPoolingImpl_cuda(const MaxPooling_Op<DIM> &op) : OperatorImpl(op, "cuda") {}
+
+    static std::unique_ptr<MaxPoolingImpl_cuda> create(const MaxPooling_Op<2> &op) {
+        return std::make_unique<MaxPoolingImpl_cuda>(op);
+    }
+
+public:
+    void forward();
+    ~MaxPoolingImpl_cuda();
+
+private:
+    template <class T> void forward_(const Tensor& input);
+};
+
+namespace {
+// add cuda backend to MaxPooling_Op<2> implementation registry
+static Registrar<MaxPooling_Op<2>> registrarMaxPoolingImpl_cuda("cuda", Aidge::MaxPoolingImpl_cuda<2>::create);
+}  // namespace
+}  // namespace Aidge
+
+#endif /* AIDGE_BACKEND_CUDA_OPERATOR_MAXPOOLINGIMPL_H_ */
diff --git a/include/aidge/backend/cuda/operator/ProducerImpl.hpp b/include/aidge/backend/cuda/operator/ProducerImpl.hpp
deleted file mode 100644
index 9912133072e23181df8f384841660bf89a829b60..0000000000000000000000000000000000000000
--- a/include/aidge/backend/cuda/operator/ProducerImpl.hpp
+++ /dev/null
@@ -1,40 +0,0 @@
-/********************************************************************************
- * Copyright (c) 2023 CEA-List
- *
- * This program and the accompanying materials are made available under the
- * terms of the Eclipse Public License 2.0 which is available at
- * http://www.eclipse.org/legal/epl-2.0.
- *
- * SPDX-License-Identifier: EPL-2.0
- *
- ********************************************************************************/
-
-#ifndef AIDGE_CUDA_OPERATOR_PRODUCERIMPL_H_
-#define AIDGE_CUDA_OPERATOR_PRODUCERIMPL_H_
-
-#include <memory>
-
-#include "aidge/backend/OperatorImpl.hpp"
-#include "aidge/operator/Producer.hpp"
-#include "aidge/utils/Registrar.hpp"
-#include "aidge/utils/Types.h"
-
-namespace Aidge {
-class ProducerImpl_cuda : public OperatorImpl {
-public:
-    ProducerImpl_cuda(const Producer_Op &op) : OperatorImpl(op) {}
-
-    static std::unique_ptr<ProducerImpl_cuda> create(const Producer_Op &op) {
-        return std::make_unique<ProducerImpl_cuda>(op);
-    }
-
-    NbElts_t getNbProducedData(const IOIndex_t outputIdx) const override final;
-    void forward() override;
-};
-
-namespace {
-static Registrar<Producer_Op> registrarProducerImpl_cuda("cuda", Aidge::ProducerImpl_cuda::create);
-}  // namespace
-}  // namespace Aidge
-
-#endif /* AIDGE_CUDA_OPERATOR_PRODUCERIMPL_H_ */
diff --git a/include/aidge/backend/cuda/operator/ReLUImpl.hpp b/include/aidge/backend/cuda/operator/ReLUImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..6570662fa5df27e54a9df6f357e918243a71330a
--- /dev/null
+++ b/include/aidge/backend/cuda/operator/ReLUImpl.hpp
@@ -0,0 +1,61 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_BACKEND_CUDA_OPERATOR_RELUIMPL_H_
+#define AIDGE_BACKEND_CUDA_OPERATOR_RELUIMPL_H_
+
+#include <array>
+#include <memory>
+#include <tuple>
+#include <vector>
+
+#include <cudnn.h>
+
+#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/operator/ReLU.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+
+namespace Aidge {
+class ReLUImpl_cuda : public OperatorImpl {
+private:
+    // CuDNN specific variables
+    #if CUDNN_VERSION >= 5000
+        cudnnActivationDescriptor_t mReLUDesc = nullptr;
+    #else
+        cudnnActivationMode_t mReLUDesc = CUDNN_ACTIVATION_RELU;
+    #endif
+    std::shared_ptr<Tensor> mInputFallback;
+
+public:
+    ReLUImpl_cuda(const ReLU_Op &op) : OperatorImpl(op, "cuda") {}
+
+    static std::unique_ptr<ReLUImpl_cuda> create(const ReLU_Op &op) {
+        return std::make_unique<ReLUImpl_cuda>(op);
+    }
+
+public:
+    void forward();
+    ~ReLUImpl_cuda();
+
+private:
+    template <class T> void forward_(const Tensor& input);
+};
+
+namespace {
+// add cuda backend to ReLU_Op implementation registry
+static Registrar<ReLU_Op> registrarReLUImpl_cuda("cuda", Aidge::ReLUImpl_cuda::create);
+}  // namespace
+}  // namespace Aidge
+
+#endif /* AIDGE_BACKEND_CUDA_OPERATOR_RELUIMPL_H_ */
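
mReLUDesc is lazily initialized like the other cuDNN descriptors in this backend. A rough sketch of the corresponding forward path, assuming cuDNN >= 5 and the same structure as the AvgPooling implementation further down in this patch (an assumption for illustration, not the actual ReLUImpl.cpp):

    // Hypothetical ReLUImpl_cuda::forward_<T>() sketch.
    template <class T>
    void Aidge::ReLUImpl_cuda::forward_(const Tensor& input) {
        const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
        if (mReLUDesc == nullptr) {
            CHECK_CUDNN_STATUS(cudnnCreateActivationDescriptor(&mReLUDesc));
            CHECK_CUDNN_STATUS(cudnnSetActivationDescriptor(
                mReLUDesc, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, 0.0));
        }
        const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f;
        const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f;
        CHECK_CUDNN_STATUS(cudnnActivationForward(
            CudaContext::cudnnHandle(),
            mReLUDesc,
            &alpha,
            std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input),
            input.getImpl()->rawPtr(),
            &beta,
            std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
            op.getOutput(0)->getImpl()->rawPtr()));
    }
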
diff --git a/include/aidge/backend/cuda/utils/CudaContext.hpp b/include/aidge/backend/cuda/utils/CudaContext.hpp
index 82dd395e6bbb33bae29c5d881290d6996bfb0332..7218cc24aed718f57a1866be74e7ba9124a5a7f1 100644
--- a/include/aidge/backend/cuda/utils/CudaContext.hpp
+++ b/include/aidge/backend/cuda/utils/CudaContext.hpp
@@ -2,8 +2,8 @@
 #define AIDGE_BACKEND_CUDA_CUDA_CONTEXT_H
 
 #include <vector>
-#include <cstdio>
 
+#include "aidge/utils/ErrorHandling.hpp"
 #include "aidge/backend/cuda/utils/CudaUtils.hpp"
 
 namespace Aidge {
@@ -87,7 +87,7 @@ public:
 
         if (cublas_h[dev] == NULL) {
             CHECK_CUBLAS_STATUS(cublasCreate(&cublas_h[dev]));
-            printf("CUBLAS initialized on device #%d\n", dev);
+            fmt::print("CUBLAS initialized on device #{}\n", dev);
         }
 
         return cublas_h[dev];
@@ -113,7 +113,7 @@ public:
 
         if (cudnn_h[dev] == NULL) {
             CHECK_CUDNN_STATUS(cudnnCreate(&cudnn_h[dev]));
-            printf("CUDNN initialized on device #%d\n", dev);
+            fmt::print("CUDNN initialized on device #{}\n", dev);
         }
 
         return cudnn_h[dev];
diff --git a/include/aidge/backend/cuda/utils/CudaUtils.hpp b/include/aidge/backend/cuda/utils/CudaUtils.hpp
index 2f66d0e778778400f0b7def345619d635cc37674..ab7c805224ed6fe073baf2036b84f4ed6f49b077 100644
--- a/include/aidge/backend/cuda/utils/CudaUtils.hpp
+++ b/include/aidge/backend/cuda/utils/CudaUtils.hpp
@@ -11,6 +11,9 @@
 #include <cuda.h>
 #include <cudnn.h>
 
+#include "aidge/data/half.hpp"
+#include "aidge/utils/ErrorHandling.hpp"
+
 #define CHECK_CUDNN_STATUS(status)                                             \
     do {                                                                       \
         const cudnnStatus_t e = (status);                                      \
@@ -62,6 +65,29 @@
 
 namespace Aidge {
 namespace Cuda {
+    // CuDNN scaling parameters are typically "alpha" and "beta".
+    // Their type must be "float" for HALF and FLOAT (default template)
+    // and "double" for DOUBLE (specialized template)
+    template <class T>
+    struct cudnn_scaling_type {
+        typedef float type;
+    };
+
+    template <>
+    struct cudnn_scaling_type<double> {
+        typedef double type;
+    };
+
+    template <class T>
+    struct cuda_type {
+        typedef T type;
+    };
+
+    template <>
+    struct cuda_type<half_float::half> {
+        typedef __half type;
+    };
+
     const char* cublasGetErrorString(cublasStatus_t error);
 
     // Enable Peer-to-Peer communications between devices
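
The two traits added here drive the templated operator implementations: cudnn_scaling_type<T> selects the alpha/beta type that cuDNN expects, and cuda_type<T> maps the host-side half type to the native __half used by the CUDA libraries. A small compile-time check illustrating how they resolve (illustration only):

    #include <type_traits>
    #include <cuda_fp16.h>
    #include "aidge/backend/cuda/utils/CudaUtils.hpp"
    #include "aidge/data/half.hpp"

    using namespace Aidge;
    static_assert(std::is_same<Cuda::cudnn_scaling_type<half_float::half>::type, float>::value,
                  "alpha/beta are float for half-precision tensors");
    static_assert(std::is_same<Cuda::cudnn_scaling_type<float>::type, float>::value,
                  "alpha/beta are float for single-precision tensors");
    static_assert(std::is_same<Cuda::cudnn_scaling_type<double>::type, double>::value,
                  "alpha/beta are double for double-precision tensors");
    static_assert(std::is_same<Cuda::cuda_type<half_float::half>::type, __half>::value,
                  "half_float::half maps to the native __half type");
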
diff --git a/setup.py b/setup.py
index dbf82e2826065c812a3bea5f5e1d34557b79f00d..80500f3165dd87eb7b6dd73c78b89806cc8a874a 100644
--- a/setup.py
+++ b/setup.py
@@ -62,10 +62,10 @@ class CMakeBuild(build_ext):
 
         os.chdir(str(build_temp))
 
-        # Impose to use the executable of the python 
+        # Impose to use the executable of the python
         # used to launch setup.py to setup PythonInterp
         param_py = "-DPYTHON_EXECUTABLE=" + sys.executable
-        
+
         compile_type = 'Debug'
         install_path = os.path.join(sys.prefix, "lib", "libAidge")  if "AIDGE_INSTALL" not in os.environ else os.environ["AIDGE_INSTALL"]
 
@@ -85,11 +85,11 @@ class CMakeBuild(build_ext):
             for file in files:
                 if (file.endswith('.so') or file.endswith('.pyd')) and (root != str(aidge_package.absolute())):
                     currentFile=os.path.join(root, file)
-                    shutil.copy(currentFile, str(aidge_package.absolute())) 
+                    shutil.copy(currentFile, str(aidge_package.absolute()))
 
         # Copy version.txt in aidge_package
         os.chdir(os.path.dirname(__file__))
-        shutil.copy("version.txt", str(aidge_package.absolute()))    
+        shutil.copy("version.txt", str(aidge_package.absolute()))
 
 
 if __name__ == '__main__':
@@ -108,7 +108,7 @@ if __name__ == '__main__':
         cmdclass={
             'build_ext': CMakeBuild,
         },
-        install_requires=['aidge_core', 'aidge_backend_cpu'],
+        install_requires=['aidge_core'],
         zip_safe=False,
 
     )
diff --git a/src/data/TensorImpl.cu b/src/data/TensorImpl.cu
index ecacd4d678dd7d79462332fb28e238b063d8bdd1..898475b5db325afcaedff44756cc2157cf9e2eec 100644
--- a/src/data/TensorImpl.cu
+++ b/src/data/TensorImpl.cu
@@ -91,10 +91,10 @@ template <class T>
 bool Aidge::TensorImpl_cuda<T>::operator==(const TensorImpl &otherImpl) const {
     const auto& otherImplCuda = static_cast<const TensorImpl_cuda<T>&>(otherImpl);
 
-    if (mTensor.size() != otherImplCuda.mTensor.size())
+    if (mNbElts != otherImplCuda.size())
         return false;
 
     thrust::device_ptr<T> thrustData(mData.data());
     thrust::device_ptr<T> thrustOtherData(otherImplCuda.mData.data());
-    return thrust::equal(thrustData, thrustData + mTensor.size(), thrustOtherData);
+    return thrust::equal(thrustData, thrustData + mNbElts, thrustOtherData);
 }
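
copyCast() in TensorImpl.hpp relies on thrust_copy() overloads whose definitions live in this .cu file but are not part of the hunk shown. A converting device-to-device copy of that kind can be written with Thrust roughly as follows (a sketch under that assumption; the function name and template parameters are illustrative, not the repository's code):

    #include <cstddef>
    #include <thrust/copy.h>
    #include <thrust/device_ptr.h>

    // Element-wise converting copy between two device buffers; the value
    // conversion (e.g. double -> float) happens on the device.
    template <class SRC_T, class DST_T>
    void convertingDeviceCopy(const SRC_T* srcData, DST_T* dstData, std::size_t size) {
        const thrust::device_ptr<const SRC_T> src(srcData);
        const thrust::device_ptr<DST_T> dst(dstData);
        thrust::copy(src, src + size, dst);
    }
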
diff --git a/src/operator/AvgPoolingImpl.cpp b/src/operator/AvgPoolingImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..eb9cc6a1f4412178525a5e6bccd32e94c4413d4d
--- /dev/null
+++ b/src/operator/AvgPoolingImpl.cpp
@@ -0,0 +1,90 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <cassert>
+#include <vector>
+
+#include "aidge/backend/cuda/data/TensorImpl.hpp"
+#include "aidge/backend/cuda/operator/AvgPoolingImpl.hpp"
+#include "aidge/backend/cuda/utils/CudaContext.hpp"
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+#include "aidge/operator/AvgPooling.hpp"
+#include "aidge/utils/Types.h"
+
+template <Aidge::DimIdx_t DIM>
+void Aidge::AvgPoolingImpl_cuda<DIM>::forward() {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+
+    assert(mOp.getRawInput(0) && "missing input #0");
+
+    const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0));
+
+    // Lazy-initialize CuDNN AvgPooling descriptor
+    if (mAvgPoolingDesc == nullptr) {
+        const AvgPooling_Op<DIM>& avgPoolingOp = static_cast<const AvgPooling_Op<DIM>&>(op);
+        const std::vector<int> strides(avgPoolingOp.template getAttr<AvgPoolingAttr::StrideDims>().begin(), avgPoolingOp.template getAttr<AvgPoolingAttr::StrideDims>().end());
+        const std::vector<int> paddings(DIM, 0);
+        const std::vector<int> window_dims(avgPoolingOp.template getAttr<AvgPoolingAttr::KernelDims>().begin(), avgPoolingOp.template getAttr<AvgPoolingAttr::KernelDims>().end());
+
+        CHECK_CUDNN_STATUS(cudnnCreatePoolingDescriptor(&mAvgPoolingDesc));
+        CHECK_CUDNN_STATUS(
+            cudnnSetPoolingNdDescriptor(mAvgPoolingDesc,
+                                        mMode,
+                                        CUDNN_NOT_PROPAGATE_NAN,
+                                        DIM,
+                                        &window_dims[0],
+                                        &paddings[0],
+                                        &strides[0]));
+    }
+
+    switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) {
+        case DataType::Float64:
+            forward_<double>(input);
+            break;
+        case DataType::Float32:
+            forward_<float>(input);
+            break;
+        case DataType::Float16:
+            forward_<half>(input);
+            break;
+        default:
+            AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda");
+    }
+}
+
+template <Aidge::DimIdx_t DIM>
+template <class T>
+void Aidge::AvgPoolingImpl_cuda<DIM>::forward_(const Tensor& input) {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f;
+    const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f;
+    CHECK_CUDNN_STATUS(
+        cudnnPoolingForward(
+            CudaContext::cudnnHandle(),
+            mAvgPoolingDesc,
+            &alpha,
+            std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input),
+            input.getImpl()->rawPtr(),
+            &beta,
+            std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
+            std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr()
+        )
+    );
+}
+
+template <Aidge::DimIdx_t DIM>
+Aidge::AvgPoolingImpl_cuda<DIM>::~AvgPoolingImpl_cuda() {
+    if(mAvgPoolingDesc != nullptr)
+        cudnnDestroyPoolingDescriptor(mAvgPoolingDesc);
+}
+
+// Template declarations
+template class Aidge::AvgPoolingImpl_cuda<2>;
diff --git a/src/operator/ConvImpl.cpp b/src/operator/ConvImpl.cpp
index 9c3684e89f6b27133ca99be16b332c4e9f9a27b1..096ee9485a03b736326f46e9a569c6b3c9b5a631 100644
--- a/src/operator/ConvImpl.cpp
+++ b/src/operator/ConvImpl.cpp
@@ -10,32 +10,26 @@
  ********************************************************************************/
 
 #include <cassert>
-#include <chrono>  // std::chrono::milliseconds
-#include <numeric> // std::accumulate
-#include <thread>  // std::this_thread::sleep_for
 #include <vector>
 
-#include "aidge/utils/Types.h"
-#include "aidge/operator/Conv.hpp"
-
 #include "aidge/backend/cuda/data/TensorImpl.hpp"
 #include "aidge/backend/cuda/operator/ConvImpl.hpp"
-#include "aidge/backend/cuda/utils/CudaContext.hpp"
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+#include "aidge/operator/Conv.hpp"
+#include "aidge/utils/Types.h"
 
 template <Aidge::DimIdx_t DIM>
 void Aidge::ConvImpl_cuda<DIM>::forward() {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+
     // FIXME: uncomment the following code once memory handling will work
     assert(mOp.getRawInput(0) && "missing input #0");
     assert(mOp.getRawInput(1) && "missing input #1");
 
     // Convert input data (no overhead if not needed!)
-    // TODO: right now, if needed, memory will be allocated/deallocated at each
-    // call to forward(). We might put the following shared_ptr as members of
-    // this class to avoid that.
-    std::shared_ptr<Tensor> input0Fallback, input1Fallback, input2Fallback;
-    const auto& input0 = std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->refCastFrom(input0Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)));
-    const auto& input1 = std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->refCastFrom(input1Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)));
-    const auto& input2 = std::static_pointer_cast<Tensor>(mOp.getRawInput(2))->refCastFrom(input2Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)));
+    const auto& input0 = op.getInput(0)->refCastFrom(mInput0Fallback, *op.getOutput(0));
+    const auto& input1 = op.getInput(1)->refCastFrom(mInput1Fallback, *op.getOutput(0));
+    const auto& input2 = op.getInput(2)->refCastFrom(mInput2Fallback, *op.getOutput(0));
 
     // Lazy-initialize CuDNN convolution descriptor
     if (mConvDesc == nullptr) {
@@ -45,14 +39,13 @@ void Aidge::ConvImpl_cuda<DIM>::forward() {
         const std::vector<int> upscales(convOp.template getAttr<ConvAttr::DilationDims>().begin(), convOp.template getAttr<ConvAttr::DilationDims>().end());
 
         CHECK_CUDNN_STATUS(cudnnCreateConvolutionDescriptor(&mConvDesc));
-        CHECK_CUDNN_STATUS(
-            cudnnSetConvolutionNdDescriptor(mConvDesc,
-                                            DIM,
-                                            &paddings[0],
-                                            &strides[0],
-                                            &upscales[0],
-                                            CUDNN_CROSS_CORRELATION,
-                                            DataTypeToCudnn(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType())));
+        CHECK_CUDNN_STATUS(cudnnSetConvolutionNdDescriptor(mConvDesc,
+            DIM,
+            &paddings[0],
+            &strides[0],
+            &upscales[0],
+            CUDNN_CROSS_CORRELATION,
+            DataTypeToCudnn(op.getOutput(0)->dataType())));
     }
 
     // Lazy-initialize CuDNN filter descriptor
@@ -61,14 +54,14 @@ void Aidge::ConvImpl_cuda<DIM>::forward() {
 
         CHECK_CUDNN_STATUS(cudnnCreateFilterDescriptor(&mFilterDesc));
         CHECK_CUDNN_STATUS(cudnnSetFilterNdDescriptor(mFilterDesc,
-                                                    DataTypeToCudnn(input1.dataType()),
-                                                    CUDNN_TENSOR_NCHW,
-                                                    kernels.size(),
-                                                    &kernels[0]));
+            DataTypeToCudnn(input1.dataType()),
+            CUDNN_TENSOR_NCHW,
+            kernels.size(),
+            &kernels[0]));
     }
 
     // Set forward algorithm and allocate the required workspace
-    if (mWorkspace == nullptr) {
+    if (mFwdWorkspace == nullptr) {
         // Find the best CuDNN forward algorithm (the one with the lowest compute time)
         int maxAlgoIterations = 0;
         cudnnGetConvolutionForwardAlgorithmMaxCount(CudaContext::cudnnHandle(),
@@ -80,14 +73,14 @@ void Aidge::ConvImpl_cuda<DIM>::forward() {
         std::vector<cudnnConvolutionFwdAlgoPerf_t> returnFwdAlgo(maxAlgoIterations);
 
         CHECK_CUDNN_STATUS(cudnnFindConvolutionForwardAlgorithm(
-                            CudaContext::cudnnHandle(),
-                            dynamic_cast<TensorImpl_cuda_*>(input0.getImpl().get())->getCudnnTensorDesc(),
-                            mFilterDesc,
-                            mConvDesc,
-                            dynamic_cast<TensorImpl_cuda_*>(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl().get())->getCudnnTensorDesc(),
-                            maxAlgoIterations,
-                            &returnAlgoCounts,
-                            &returnFwdAlgo[0]));
+            CudaContext::cudnnHandle(),
+            std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0),
+            mFilterDesc,
+            mConvDesc,
+            std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
+            maxAlgoIterations,
+            &returnAlgoCounts,
+            &returnFwdAlgo[0]));
         mFwdAlgo = returnFwdAlgo[0].algo;
 
         // Allocate the workspace required by the chosen CuDNN forward algorithm
@@ -95,48 +88,54 @@ void Aidge::ConvImpl_cuda<DIM>::forward() {
 
         CHECK_CUDNN_STATUS(cudnnGetConvolutionForwardWorkspaceSize(
             CudaContext::cudnnHandle(),
-            dynamic_cast<TensorImpl_cuda_*>(input0.getImpl().get())->getCudnnTensorDesc(),
+            std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0),
             mFilterDesc,
             mConvDesc,
-            dynamic_cast<TensorImpl_cuda_*>(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl().get())->getCudnnTensorDesc(),
+            std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
             mFwdAlgo,
             &workspaceSize));
 
-        CHECK_CUDA_STATUS(cudaMalloc(&mWorkspace, workspaceSize));
+        CHECK_CUDA_STATUS(cudaMalloc(&mFwdWorkspace, workspaceSize));
         mWorkspaceSize = workspaceSize;
     }
 
     // Do the actual forward computation
     // Template is only for scaling parameters, which are always in float
     // excepted when the convolution is performed in double precision.
-    if (std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType() == DataType::Float64) {
-        forward_<double>(input0, input1, input2);
-    }
-    else {
-        forward_<float>(input0, input1, input2);
+    switch(op.getOutput(0)->dataType()) {
+        case DataType::Float64:
+            forward_<double>(input0, input1, input2);
+            break;
+        case DataType::Float32:
+            forward_<float>(input0, input1, input2);
+            break;
+        case DataType::Float16:
+            forward_<half>(input0, input1, input2);
+            break;
+        default:
+            AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda");
     }
 }
 
 template <Aidge::DimIdx_t DIM>
 template <class T>
 void Aidge::ConvImpl_cuda<DIM>::forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2) {
-    const T alpha = 1.0f;
-    const T beta = 0.0f;
-
-    CHECK_CUDNN_STATUS(
-        cudnnConvolutionForward(CudaContext::cudnnHandle(),
-                                &alpha,
-                                dynamic_cast<TensorImpl_cuda_*>(input0.getImpl().get())->getCudnnTensorDesc(),
-                                input0.getImpl()->rawPtr(),
-                                mFilterDesc,
-                                input1.getImpl()->rawPtr(),
-                                mConvDesc,
-                                mFwdAlgo,
-                                mWorkspace,
-                                mWorkspaceSize,
-                                &beta,
-                                dynamic_cast<TensorImpl_cuda_*>(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl().get())->getCudnnTensorDesc(),
-                                std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->rawPtr()));
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f;
+    const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f;
+    CHECK_CUDNN_STATUS(cudnnConvolutionForward(CudaContext::cudnnHandle(),
+        &alpha,
+        std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0),
+        input0.getImpl()->rawPtr(),
+        mFilterDesc,
+        input1.getImpl()->rawPtr(),
+        mConvDesc,
+        mFwdAlgo,
+        mFwdWorkspace,
+        mWorkspaceSize,
+        &beta,
+        std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
+        op.getOutput(0)->getImpl()->rawPtr()));
 
     // Add bias (if there is any)
     if (mOp.getRawInput(2) && input2.size() > 0) {
@@ -151,12 +150,182 @@ void Aidge::ConvImpl_cuda<DIM>::forward_(const Tensor& input0, const Tensor& inp
         // TODO: find a more elegant solution(?)
 
         CHECK_CUDNN_STATUS(cudnnAddTensor(CudaContext::cudnnHandle(),
-                                            &alpha,
-                                            dynamic_cast<TensorImpl_cuda_*>(bias.getImpl().get())->getCudnnTensorDesc(),
-                                            input2.getImpl()->rawPtr(),
-                                            &alpha,
-                                            dynamic_cast<TensorImpl_cuda_*>(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl().get())->getCudnnTensorDesc(),
-                                            std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->rawPtr()));
+            &alpha,
+            std::dynamic_pointer_cast<TensorImpl_cuda_>(bias.getImpl())->getCudnnTensorDesc(bias),
+            input2.getImpl()->rawPtr(),
+            &alpha,
+            std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
+            op.getOutput(0)->getImpl()->rawPtr()));
+    }
+}
+
+template <Aidge::DimIdx_t DIM>
+void Aidge::ConvImpl_cuda<DIM>::backward() {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+
+    // FIXME: uncomment the following code once memory handling will work
+    assert(mOp.getRawInput(0) && "missing input #0");
+    assert(mOp.getRawInput(1) && "missing input #1");
+
+    // Convert input data (no overhead if not needed!)
+    const auto& input0 = op.getInput(0)->ref(mInput0Fallback, *op.getOutput(0));
+    const auto& input1 = op.getInput(1)->ref(mInput1Fallback, *op.getOutput(0));
+    const auto& input2 = op.getInput(2)->ref(mInput2Fallback, *op.getOutput(0));
+
+    // Set forward algorithm and allocate the required workspace
+    if (mBwdWorkspace == nullptr) {
+        // Find the best CuDNN backward algorithm (the one with the lowest compute time)
+        int maxAlgoIterations = 0;
+        cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(CudaContext::cudnnHandle(),
+                                                    &maxAlgoIterations);
+        assert(maxAlgoIterations > 0 && "No available CUDNN ConvolutionBackwardFilterAlgorithm");
+
+        int returnAlgoCounts = 0;
+        std::vector<cudnnConvolutionBwdFilterAlgoPerf_t> returnBwdFilterAlgo(maxAlgoIterations);
+
+        CHECK_CUDNN_STATUS(cudnnFindConvolutionBackwardFilterAlgorithm(
+            CudaContext::cudnnHandle(),
+            std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0),
+            std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
+            mConvDesc,
+            mFilterDesc,
+            maxAlgoIterations,
+            &returnAlgoCounts,
+            &returnBwdFilterAlgo[0]));
+
+        mBwdFilterAlgo = returnBwdFilterAlgo[0].algo;
+
+        maxAlgoIterations = 0;
+        cudnnGetConvolutionBackwardDataAlgorithmMaxCount(CudaContext::cudnnHandle(),
+                                                    &maxAlgoIterations);
+        assert(maxAlgoIterations > 0 && "No available CUDNN ConvolutionBackwardDataAlgorithm");
+
+        returnAlgoCounts = 0;
+        std::vector<cudnnConvolutionBwdDataAlgoPerf_t> returnBwdDataAlgo(maxAlgoIterations);
+
+        CHECK_CUDNN_STATUS(cudnnFindConvolutionBackwardDataAlgorithm(
+            CudaContext::cudnnHandle(),
+            mFilterDesc,
+            std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
+            mConvDesc,
+            std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0),
+            maxAlgoIterations,
+            &returnAlgoCounts,
+            &returnBwdDataAlgo[0]));
+
+        mBwdDataAlgo = returnBwdDataAlgo[0].algo;
+
+        // Allocate the workspace required by the chosen CuDNN backward algorithm
+        size_t workspaceSize = 0;
+        CHECK_CUDNN_STATUS(cudnnGetConvolutionBackwardFilterWorkspaceSize(
+            CudaContext::cudnnHandle(),
+            // same arguments as cudnnGetConvolutionBackwardFilterAlgorithm()
+            // -->
+            std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0),
+            std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
+            mConvDesc,
+            mFilterDesc,
+            // <--
+            mBwdFilterAlgo,
+            &workspaceSize));
+
+        size_t workspaceSizeData = 0;
+        CHECK_CUDNN_STATUS(cudnnGetConvolutionBackwardDataWorkspaceSize(
+            CudaContext::cudnnHandle(),
+            // same arguments as cudnnGetConvolutionBackwardDataAlgorithm() -->
+            mFilterDesc,
+            std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
+            mConvDesc,
+            std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0),
+            // <--
+            mBwdDataAlgo,
+            &workspaceSizeData));
+
+        if (workspaceSizeData > workspaceSize)
+            workspaceSize = workspaceSizeData;
+
+        if (workspaceSize > mWorkspaceSize) {
+            if (mFwdWorkspace != nullptr) {
+                cudaFree(mFwdWorkspace);
+            }
+            CHECK_CUDA_STATUS(cudaMalloc(&mFwdWorkspace, workspaceSize));
+            mWorkspaceSize = workspaceSize;
+        }
+
+        mBwdWorkspace = mFwdWorkspace;
+    }
+
+    // Do the actual backward computation
+    // Template is only for scaling parameters, which are always in float
+    // except when the convolution is performed in double precision.
+    if (op.getOutput(0)->dataType() == DataType::Float64) {
+        backward_<double>(input0, input1, input2);
+    }
+    else {
+        backward_<float>(input0, input1, input2);
+    }
+}
+
+template <Aidge::DimIdx_t DIM>
+template <class T>
+void Aidge::ConvImpl_cuda<DIM>::backward_(const Tensor& input0, const Tensor& input1, const Tensor& input2) {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+
+    std::shared_ptr<Tensor> gradOutputFallback;
+    const auto& gradOutput = op.getOutput(0)->grad()->refCastFrom(gradOutputFallback, *(op.getInput(0)->grad()));
+
+    const T alpha = 1.0f;
+    const T beta = 0.0f;
+
+    CHECK_CUDNN_STATUS(cudnnConvolutionBackwardFilter(
+        CudaContext::cudnnHandle(),
+        &alpha,
+        std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0),
+        input0.getImpl()->rawPtr(),
+        std::dynamic_pointer_cast<TensorImpl_cuda_>(gradOutput.getImpl())->getCudnnTensorDesc(gradOutput),
+        gradOutput.getImpl()->rawPtr(),
+        mConvDesc,
+        mBwdFilterAlgo,
+        mBwdWorkspace,
+        mWorkspaceSize,
+        &beta,
+        mFilterDesc,
+        op.getInput(1)->grad()->getImpl()->rawPtr()));
+
+    CHECK_CUDNN_STATUS(cudnnConvolutionBackwardData(
+        CudaContext::cudnnHandle(),
+        &alpha,
+        mFilterDesc,
+        input1.getImpl()->rawPtr(),
+        std::dynamic_pointer_cast<TensorImpl_cuda_>(gradOutput.getImpl())->getCudnnTensorDesc(gradOutput),
+        gradOutput.getImpl()->rawPtr(),
+        mConvDesc,
+        mBwdDataAlgo,
+        mBwdWorkspace,
+        mWorkspaceSize,
+        &beta,
+        std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->grad()->getImpl())->getCudnnTensorDesc(*op.getInput(0)),
+        op.getInput(0)->grad()->getImpl()->rawPtr()));
+
+    // Add bias (if there is any)
+    if (mOp.getRawInput(2) && input2.size() > 0) {
+        // The bias gradient tensor needs the same number of dims as the output tensor for cudnnConvolutionBackwardBias()
+        std::vector<DimSize_t> gradBiasDims(DIM+2, 1);
+        gradBiasDims[1] = op.getInput(2)->grad()->size();
+
+        // Create a dummy tensor with the right dims in order to get a CuDNN tensor descriptor (with getCudnnTensorDesc())
+        Tensor gradBias(op.getInput(2)->grad()->dataType());
+        gradBias.setBackend("cuda");
+        gradBias.resize(gradBiasDims);
+        // TODO: find a more elegant solution(?)
+
+        CHECK_CUDNN_STATUS(cudnnConvolutionBackwardBias(CudaContext::cudnnHandle(),
+            &alpha,
+            std::dynamic_pointer_cast<TensorImpl_cuda_>(gradOutput.getImpl())->getCudnnTensorDesc(gradOutput),
+            gradOutput.getImpl()->rawPtr(),
+            &beta,
+            std::dynamic_pointer_cast<TensorImpl_cuda_>(gradBias.getImpl())->getCudnnTensorDesc(gradBias),
+            op.getInput(2)->grad()->getImpl()->rawPtr()));
     }
 }
 
@@ -170,8 +339,8 @@ Aidge::ConvImpl_cuda<DIM>::~ConvImpl_cuda() {
         cudnnDestroyFilterDescriptor(mFilterDesc);
     }
 
-    if (mWorkspace != nullptr) {
-        cudaFree(mWorkspace);
+    if (mFwdWorkspace != nullptr) {
+        cudaFree(mFwdWorkspace);
     }
 }
 
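The backward pass above sizes a single device workspace large enough for both the backward-filter and backward-data algorithms and reuses the forward allocation (`mBwdWorkspace` aliases `mFwdWorkspace`). The helper below is only a minimal sketch of that sizing policy in isolation; `growWorkspace` is an illustrative name and not part of the Aidge API.

```cpp
// Sketch only: grow a shared workspace so it can serve both cuDNN backward
// algorithms; the same buffer is then reused by the forward pass.
#include <algorithm>
#include <cstddef>
#include <cuda_runtime.h>

inline void growWorkspace(void*& workspace, std::size_t& currentSize,
                          std::size_t filterBytes, std::size_t dataBytes) {
    const std::size_t required = std::max(filterBytes, dataBytes);
    if (required > currentSize) {
        if (workspace != nullptr) {
            cudaFree(workspace);   // drop the previous, smaller allocation
        }
        cudaMalloc(&workspace, required);
        currentSize = required;
    }
}
```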
diff --git a/src/operator/FCImpl.cpp b/src/operator/FCImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8b60f7fd6aa41f206b2c6eaa5d8f8daa1bd81374
--- /dev/null
+++ b/src/operator/FCImpl.cpp
@@ -0,0 +1,117 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <cassert>
+#include <chrono>  // std::chrono::milliseconds
+#include <numeric> // std::accumulate
+#include <thread>  // std::this_thread::sleep_for
+#include <vector>
+
+#include "aidge/backend/cuda/data/TensorImpl.hpp"
+#include "aidge/backend/cuda/operator/FCImpl.hpp"
+#include "aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp"
+#include "aidge/backend/cuda/utils/CudaContext.hpp"
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+#include "aidge/operator/FC.hpp"
+#include "aidge/utils/Types.h"
+
+void Aidge::FCImpl_cuda::forward() {
+    assert(mOp.getRawInput(0) && "missing input #0");
+    assert(mOp.getRawInput(1) && "missing input #1");
+    assert(mOp.getRawInput(2) && "missing input #2");
+
+    const auto& fcOp = static_cast<const FC_Op&>(mOp);
+    bool noBias = fcOp.template getAttr<FCAttr::NoBias>();
+    std::size_t outChannels = static_cast<std::size_t>(fcOp.template getAttr<FCAttr::OutChannels>());
+
+    const auto& input0 = fcOp.getInput(0)->refCastFrom(mInput0Fallback, *fcOp.getOutput(0));
+    const auto& input1 = fcOp.getInput(1)->refCastFrom(mInput1Fallback, *fcOp.getOutput(0));
+    const auto& input2 = fcOp.getInput(2)->refCastFrom(mInput2Fallback, *fcOp.getOutput(0));
+
+    switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) {
+        case DataType::Float64:
+            forward_<double>(input0, input1, input2, noBias, outChannels);
+            break;
+        case DataType::Float32:
+            forward_<float>(input0, input1, input2, noBias, outChannels);
+            break;
+        case DataType::Float16:
+            forward_<half>(input0, input1, input2, noBias, outChannels);
+            break;
+        default:
+            AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda");
+    }
+}
+
+template<class T>
+void Aidge::FCImpl_cuda::forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2, bool noBias, std::size_t outChannels)
+{
+    const T * input = static_cast<const T*>(input0.getImpl()->rawPtr());
+    const T * weights = static_cast<const T*>(input1.getImpl()->rawPtr());
+    T * output = static_cast<T*>(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->rawPtr());
+
+    // Performing output = T(weights) * input
+    //            [n x m] = [n x k] * [k x m]
+    // cuBLAS is column-major, so rather than transposing the inputs, computing a [m x n] output and transposing it back, we directly compute the output as [n x m] (see the standalone layout check after this file)
+    int n = outChannels;
+    int m = std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->size()/n;
+    int k = input0.size()/m;
+    int lda = k;  // leading dimension of the input matrix
+    int ldb = k;  // leading dimension of the weights matrix
+    int ldc = n;  // leading dimension of output
+    const T alpha = 1.0f;
+    const T beta = 0.0f;
+    CHECK_CUBLAS_STATUS(cublasGemm(CudaContext::cublasHandle(),
+                                    CUBLAS_OP_T,
+                                    CUBLAS_OP_N,
+                                    n,
+                                    m,
+                                    k,
+                                    reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&alpha),
+                                    weights,
+                                    ldb,
+                                    input, 
+                                    lda,
+                                    reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&beta),
+                                    output,
+                                    ldc));
+
+    if(!noBias){
+        T* onesVector;
+        CHECK_CUDA_STATUS(cudaMalloc((void**)&onesVector, m * sizeof(T)));
+        // Fill the vector with ones
+        std::vector<T> onesVec(m, T(1.0));
+        CHECK_CUDA_STATUS(cudaMemcpy(onesVector,
+                                    &onesVec[0],
+                                    m * sizeof(T),
+                                    cudaMemcpyHostToDevice));
+        const T * biases = static_cast<const T*>(input2.getImpl()->rawPtr());
+        // Performing output = biases * onesVector + output
+        //           [n x m] = [n x 1] * [1 x m]   + [n x m]
+        CHECK_CUBLAS_STATUS(cublasGemm(CudaContext::cublasHandle(),
+                                       CUBLAS_OP_N,
+                                       CUBLAS_OP_N,
+                                       n,
+                                       m,
+                                       1,
+                                       reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&alpha),
+                                       biases,
+                                       n,
+                                       onesVector,
+                                       1,
+                                       reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&alpha),
+                                       output,
+                                       n));
+
+        cudaFree(onesVector);
+    }
+
+}
\ No newline at end of file
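The column-major remark in `forward_()` above relies on the usual layout identity: a row-major `output[batch x out] = input[batch x in] * weights[out x in]^T` occupies exactly the same memory as the column-major `[out x batch]` product that `cublasGemm` computes with `CUBLAS_OP_T` applied to the weights, so no explicit transposition is needed. A small CPU-only check of that equivalence, with made-up shapes:

```cpp
// Hedged illustration of the column-major trick used in FCImpl.cpp: the
// row-major result and the column-major result share the same buffer layout.
#include <cassert>
#include <vector>

int main() {
    const int m = 2, k = 3, n = 4;   // batch, inChannels, outChannels
    std::vector<float> input(m * k), weights(n * k), rowMajor(m * n), colMajor(n * m);
    for (int i = 0; i < m * k; ++i) input[i] = float(i + 1);
    for (int i = 0; i < n * k; ++i) weights[i] = float(i % 5);

    // Row-major reference: output[b][o] = sum_c input[b][c] * weights[o][c]
    for (int b = 0; b < m; ++b)
        for (int o = 0; o < n; ++o)
            for (int c = 0; c < k; ++c)
                rowMajor[b * n + o] += input[b * k + c] * weights[o * k + c];

    // Column-major product C[n x m] = weights^T * input, element C(o,b) stored
    // at index b*n + o, which is what the cublasGemm call evaluates.
    for (int o = 0; o < n; ++o)
        for (int b = 0; b < m; ++b)
            for (int c = 0; c < k; ++c)
                colMajor[b * n + o] += weights[o * k + c] * input[b * k + c];

    assert(rowMajor == colMajor);    // identical buffers, no transpose needed
    return 0;
}
```

The bias path uses the same column-major view: multiplying the `[out x 1]` bias by a `[1 x batch]` vector of ones, with the accumulate scalar set to 1, adds the bias to every column of the output in a single GEMM.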
diff --git a/src/operator/FCImpl_CUDA_kernels.cu b/src/operator/FCImpl_CUDA_kernels.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5139ac1d7edf61cf347870e6add2870b2792a0e5
--- /dev/null
+++ b/src/operator/FCImpl_CUDA_kernels.cu
@@ -0,0 +1,76 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+#include <stdio.h>
+
+#include "aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp"
+
+namespace Aidge{
+
+template <>
+cublasStatus_t cublasGemm<__half>(cublasHandle_t handle,
+                                  cublasOperation_t transa, cublasOperation_t transb,
+                                  int m, int n, int k,
+                                  const __half *alpha,
+                                  const __half *A, int lda,
+                                  const __half *B, int ldb,
+                                  const __half *beta,
+                                  __half *C, int ldc)
+{
+    return cublasHgemm(handle,
+					   transa, transb,
+					   m, n, k,
+					   alpha,
+					   A, lda,
+					   B, ldb,
+					   beta,
+					   C, ldc);
+}
+
+template <>
+cublasStatus_t cublasGemm<float>(cublasHandle_t handle,
+                                 cublasOperation_t transa, cublasOperation_t transb,
+                                 int m, int n, int k,
+                                 const float *alpha,
+                                 const float *A, int lda,
+                                 const float *B, int ldb,
+                                 const float *beta,
+                                 float *C, int ldc)
+{
+    return cublasSgemm(handle,
+					   transa, transb,
+					   m, n, k,
+					   alpha,
+					   A, lda,
+					   B, ldb,
+					   beta,
+					   C, ldc);
+}
+
+template <>
+cublasStatus_t cublasGemm<double>(cublasHandle_t handle,
+								  cublasOperation_t transa, cublasOperation_t transb,
+								  int m, int n, int k,
+								  const double *alpha,
+								  const double *A, int lda,
+								  const double *B, int ldb,
+								  const double *beta,
+								  double *C, int ldc)
+{
+    return cublasDgemm(handle,
+					   transa, transb,
+					   m, n, k,
+					   alpha,
+					   A, lda,
+					   B, ldb,
+					   beta,
+					   C, ldc);
+}
+}
\ No newline at end of file
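These three specializations presumably implement a single generic entry point declared in `FCImpl_CUDA_kernels.hpp`, so that `FCImpl.cpp` can call `cublasGemm` with `__half`, `float` or `double` and let template argument deduction select the matching typed cuBLAS routine. A sketch of what that primary template declaration is assumed to look like (the real header may differ):

```cpp
// Assumed primary template (sketch); each explicit specialization above
// forwards to cublasHgemm, cublasSgemm or cublasDgemm respectively.
#include <cublas_v2.h>

template <class T>
cublasStatus_t cublasGemm(cublasHandle_t handle,
                          cublasOperation_t transa, cublasOperation_t transb,
                          int m, int n, int k,
                          const T *alpha,
                          const T *A, int lda,
                          const T *B, int ldb,
                          const T *beta,
                          T *C, int ldc);
```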
diff --git a/src/operator/MaxPoolingImpl.cpp b/src/operator/MaxPoolingImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b8d7c81948bd898b7cc4e2f3bead9c498175e2c1
--- /dev/null
+++ b/src/operator/MaxPoolingImpl.cpp
@@ -0,0 +1,91 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <cassert>
+#include <vector>
+
+#include "aidge/backend/cuda/data/TensorImpl.hpp"
+#include "aidge/backend/cuda/operator/MaxPoolingImpl.hpp"
+#include "aidge/backend/cuda/utils/CudaContext.hpp"
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+#include "aidge/operator/MaxPooling.hpp"
+#include "aidge/utils/Types.h"
+
+template <Aidge::DimIdx_t DIM>
+void Aidge::MaxPoolingImpl_cuda<DIM>::forward() {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+
+    assert(mOp.getRawInput(0) && "missing input #0");
+
+    const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0));
+
+    // Lazy-initialize CuDNN MaxPooling descriptor
+    if (mMaxPoolingDesc == nullptr) {
+        const MaxPooling_Op<DIM>& maxPoolingOp = static_cast<const MaxPooling_Op<DIM>&>(op);
+        const std::vector<int> strides(maxPoolingOp.template getAttr<MaxPoolingAttr::StrideDims>().begin(), maxPoolingOp.template getAttr<MaxPoolingAttr::StrideDims>().end());
+        const std::vector<int> paddings(DIM, 0);
+        const std::vector<int> window_dims(maxPoolingOp.template getAttr<MaxPoolingAttr::KernelDims>().begin(), maxPoolingOp.template getAttr<MaxPoolingAttr::KernelDims>().end());
+
+        CHECK_CUDNN_STATUS(cudnnCreatePoolingDescriptor(&mMaxPoolingDesc));
+        CHECK_CUDNN_STATUS(
+            cudnnSetPoolingNdDescriptor(mMaxPoolingDesc,
+                                        mMode,
+                                        CUDNN_NOT_PROPAGATE_NAN,
+                                        DIM,
+                                        &window_dims[0],
+                                        &paddings[0],
+                                        &strides[0]));
+    }
+
+    switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) {
+        case DataType::Float64:
+            forward_<double>(input);
+            break;
+        case DataType::Float32:
+            forward_<float>(input);
+            break;
+        case DataType::Float16:
+            forward_<half>(input);
+            break;
+        default:
+            AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda");
+    }
+}
+
+template <Aidge::DimIdx_t DIM>
+template <class T>
+void Aidge::MaxPoolingImpl_cuda<DIM>::forward_(const Tensor& input) {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f;
+    const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f;
+    CHECK_CUDNN_STATUS(
+        cudnnPoolingForward(
+            CudaContext::cudnnHandle(),
+            mMaxPoolingDesc,
+            &alpha,
+            std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input),
+            input.getImpl()->rawPtr(),
+            &beta,
+            std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
+            std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr()
+        )
+    );
+}
+
+template <Aidge::DimIdx_t DIM>
+Aidge::MaxPoolingImpl_cuda<DIM>::~MaxPoolingImpl_cuda() {
+    if(mMaxPoolingDesc != nullptr)
+        cudnnDestroyPoolingDescriptor(mMaxPoolingDesc);
+}
+
+
+// Template declarations
+template class Aidge::MaxPoolingImpl_cuda<2>;
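The cuDNN-backed operators above pass their alpha/beta scaling factors through `Cuda::cudnn_scaling_type<T>` because cuDNN expects host scalars in float for half- and single-precision tensors and in double only for double-precision tensors. A hedged sketch of what that trait presumably looks like (the actual definition lives in the CUDA utility headers and may differ):

```cpp
// Sketch (assumed definition): scaling factors are float for __half/float data
// and double for double data, matching the cuDNN convention for alpha/beta.
namespace Aidge { namespace Cuda {
template <class T> struct cudnn_scaling_type { using type = float; };
template <>        struct cudnn_scaling_type<double> { using type = double; };
}} // namespace Aidge::Cuda
```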
diff --git a/src/operator/ProducerImpl.cpp b/src/operator/ProducerImpl.cpp
deleted file mode 100644
index aca3c4945e357be13017e302cb6e7f12ba61237c..0000000000000000000000000000000000000000
--- a/src/operator/ProducerImpl.cpp
+++ /dev/null
@@ -1,34 +0,0 @@
-/********************************************************************************
- * Copyright (c) 2023 CEA-List
- *
- * This program and the accompanying materials are made available under the
- * terms of the Eclipse Public License 2.0 which is available at
- * http://www.eclipse.org/legal/epl-2.0.
- *
- * SPDX-License-Identifier: EPL-2.0
- *
- ********************************************************************************/
-
-#include <cassert>
-#include <numeric> // std::accumulate
-#include <vector>
-
-#include "aidge/data/Tensor.hpp"
-#include "aidge/operator/Producer.hpp"
-#include "aidge/utils/Types.h"
-
-#include "aidge/backend/cuda/operator/ProducerImpl.hpp"
-
-Aidge::DimSize_t Aidge::ProducerImpl_cuda::getNbProducedData(
-    Aidge::IOIndex_t outputIdx) const
-{
-    // Requires the whole tensors, regardless of available data on inputs
-    assert(outputIdx == 0 && "operator has only one output");
-    (void) outputIdx;
-
-    return std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->size();
-}
-
-void Aidge::ProducerImpl_cuda::forward()
-{
-}
diff --git a/src/operator/ReLUImpl.cpp b/src/operator/ReLUImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2ebd6b276e02000bdefb22fe8d2373255a1a5c2c
--- /dev/null
+++ b/src/operator/ReLUImpl.cpp
@@ -0,0 +1,78 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <cassert>
+#include <vector>
+
+#include "aidge/backend/cuda/data/TensorImpl.hpp"
+#include "aidge/backend/cuda/operator/ReLUImpl.hpp"
+#include "aidge/backend/cuda/utils/CudaContext.hpp"
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"
+#include "aidge/operator/ReLU.hpp"
+#include "aidge/utils/Types.h"
+
+void Aidge::ReLUImpl_cuda::forward() {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+
+    assert(mOp.getRawInput(0) && "missing input #0");
+
+    const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0));
+
+    // Lazy-initialize CuDNN ReLU descriptor
+    if (mReLUDesc == nullptr) {
+        #if CUDNN_VERSION >= 5000
+            CHECK_CUDNN_STATUS(cudnnCreateActivationDescriptor(&mReLUDesc));
+            CHECK_CUDNN_STATUS(cudnnSetActivationDescriptor(
+                mReLUDesc, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, 0.0));
+        #else
+            mReLUDesc = CUDNN_ACTIVATION_RELU;
+        #endif
+    }
+
+    switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) {
+        case DataType::Float64:
+            forward_<double>(input);
+            break;
+        case DataType::Float32:
+            forward_<float>(input);
+            break;
+        case DataType::Float16:
+            forward_<half>(input);
+            break;
+        default:
+            AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda");
+    }
+}
+
+template <class T>
+void Aidge::ReLUImpl_cuda::forward_(const Tensor& input) {
+    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
+    const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f;
+    const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f;
+    CHECK_CUDNN_STATUS(
+        cudnnActivationForward(CudaContext::cudnnHandle(),
+                               mReLUDesc,
+                               &alpha,
+                               std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input),
+                               input.getImpl()->rawPtr(),
+                               &beta,
+                               std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)),
+                               std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr()));
+}
+
+Aidge::ReLUImpl_cuda::~ReLUImpl_cuda() {
+    if (mReLUDesc != nullptr) {
+        #if CUDNN_VERSION >= 5000
+            cudnnDestroyActivationDescriptor(mReLUDesc);
+        #endif
+    }
+}
+
diff --git a/src/utils/CudaUtils.cpp b/src/utils/CudaUtils.cpp
index a6e0514f9a949a805561d966e2a712701c18936c..ca3263a282322e70157b7537c502a63a3edb526f 100644
--- a/src/utils/CudaUtils.cpp
+++ b/src/utils/CudaUtils.cpp
@@ -40,7 +40,7 @@ void Aidge::Cuda::setMultiDevicePeerAccess(unsigned int size, unsigned int* devi
                     CHECK_CUDA_STATUS(cudaSetDevice(devices[j]));
                     const cudaError_t status = cudaDeviceEnablePeerAccess(devices[i], 0);
                     if (status == cudaErrorPeerAccessAlreadyEnabled) {
-                        printf("Peer access already enabled between device %d and device %d\n", devices[j], devices[i]);
+                        fmt::print("Peer access already enabled between device {} and device {}\n", devices[j], devices[i]);
                     } else {
                         CHECK_CUDA_STATUS(status);
                     }
diff --git a/unit_tests/Test_AvgPoolingImpl.cpp b/unit_tests/Test_AvgPoolingImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d4d39db555e9d12c7e5135d1eb3db6ffc8f459c3
--- /dev/null
+++ b/unit_tests/Test_AvgPoolingImpl.cpp
@@ -0,0 +1,163 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <array>
+
+#include <catch2/catch_test_macros.hpp>
+#include <cuda_fp16.h>
+#include <numeric>   // std::accumulate
+#include <random>    // std::random_device, std::mt19937, std::uniform_real_distribution
+
+#include "Test_cuda.hpp"
+
+#include "aidge/data/half.hpp"
+#include "aidge/data/Tensor.hpp"
+
+#include "aidge/backend/cpu.hpp"
+#include "aidge/backend/cuda.hpp"
+
+using namespace Aidge;
+
+TEST_CASE("[gpu/operator] AvgPooling(forward)", "[AvgPooling][GPU]") {
+    std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,2,5,5> { //NCHW
+        {
+            {
+                {{  0,   1,   2,   3,   4},
+                 {  5,   6,   7,   8,   9},
+                 { 10,  11,  12,  13,  14},
+                 { 15,  16,  17,  18,  19},
+                 { 20,  21,  22,  23,  24}},
+
+                {{ 25,  26,  27,  28,  29},
+                 { 30,  31,  32,  33,  34},
+                 { 35,  36,  37,  38,  39},
+                 { 40,  41,  42,  43,  44},
+                 { 45,  46,  47,  48,  49}}
+            },
+            {
+                {{100, 101, 102, 103, 104},
+                 {105, 106, 107, 108, 109},
+                 {110, 111, 112, 113, 114},
+                 {115, 116, 117, 118, 119},
+                 {120, 121, 122, 123, 124}},
+
+                {{125, 126, 127, 128, 129},
+                 {130, 131, 132, 133, 134},
+                 {135, 136, 137, 138, 139},
+                 {140, 141, 142, 143, 144},
+                 {145, 146, 147, 148, 149}}
+            }
+        }
+    });
+    SECTION("Stride") {
+        std::shared_ptr<Node> myAvgPool = AvgPooling({2,2}, "myAvgPool", {2,2});
+        auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool -> getOperator());
+
+        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,2,2,2> {
+            {
+                {
+                    {{  3,   5},
+                     { 13,  15}},
+                    {{ 28,  30},
+                     { 38,  40}}
+                },
+                {
+                    {{103, 105},
+                     {113, 115}},
+                    {{128, 130},
+                     {138, 140}}
+                }
+            }
+        });
+        op->associateInput(0,myInput);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cuda");
+        op->computeOutputDims();
+        myAvgPool->forward();
+
+        float* computedOutput   = new float[myOutput->size()]();
+        cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
+
+        for(int i = 0; i < myOutput->size(); i++){
+            const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
+        }
+
+        delete[] computedOutput;
+    }
+
+    SECTION("Stride >= feature dim") {
+        std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(Array4D<float,1,1,3,3> { //NCHW
+        {
+            {
+                {{0.3745, 0.9507, 0.7320},
+                 {0.5987, 0.1560, 0.1560},
+                 {0.0581, 0.8662, 0.6011}}
+            }
+        }
+        });
+        std::shared_ptr<Node> myAvgPool = AvgPooling({3,3}, "myAvgPool", {3,3});
+        auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool -> getOperator());
+
+        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,1,1,1,1> {
+            {{{{(0.3745 + 0.9507 + 0.7320 + 0.5987 + 0.1560 + 0.1560 + 0.0581 + 0.8662 + 0.6011)/9.0}}}}
+        });
+        op->associateInput(0,myInput2);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cuda");
+        op->computeOutputDims();
+        myAvgPool->forward();
+
+        float* computedOutput   = new float[myOutput->size()]();
+        cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
+
+        for(int i = 0; i < myOutput->size(); i++){
+            const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
+        }
+
+        delete[] computedOutput;
+    }
+
+    SECTION("half") {
+        std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(Array4D<half_float::half,1,1,3,3> { //NCHW
+        {
+            {
+                {{half_float::half(0.3745), half_float::half(0.9507), half_float::half(0.7320)},
+                 {half_float::half(0.5987), half_float::half(0.1560), half_float::half(0.1560)},
+                 {half_float::half(0.0581), half_float::half(0.8662), half_float::half(0.6011)}}
+            }
+        }
+        });
+        myInput2->setBackend("cuda");
+
+        std::shared_ptr<Node> myAvgPool = AvgPooling({3,3}, "mymyAvgPoolcdw", {3,3});
+        auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool -> getOperator());
+        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<half_float::half,1,1,1,1> {
+            {{{{(half_float::half(0.3745) + half_float::half(0.9507) + half_float::half(0.7320) + half_float::half(0.5987) + half_float::half(0.1560) + half_float::half(0.1560) + half_float::half(0.0581) + half_float::half(0.8662) + half_float::half(0.6011))/half_float::half(9.0)}}}}
+        });
+        op->associateInput(0,myInput2);
+        op->setDataType(DataType::Float16);
+        op->setBackend("cuda");
+        op->computeOutputDims();
+        myAvgPool->forward();
+
+        half_float::half* computedOutput   = new half_float::half[myOutput->size()]();
+        cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(half_float::half) * myOutput->size(), cudaMemcpyDeviceToHost);
+
+        for(int i = 0; i < myOutput->size(); i++){
+            const half_float::half targetOutput = *(static_cast<half_float::half*>(myOutput->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
+        }
+
+        delete[] computedOutput;
+    }
+}
\ No newline at end of file
diff --git a/unit_tests/Test_CastMove.cpp b/unit_tests/Test_CastMove.cpp
index 0b68a4f9dcb6c72df91506a6f92be8c31e95f068..c96600f79967c69e43b3c334d3624f6514b6f936 100644
--- a/unit_tests/Test_CastMove.cpp
+++ b/unit_tests/Test_CastMove.cpp
@@ -18,8 +18,8 @@
 #include "aidge/graph/Node.hpp"
 #include "aidge/graph/GraphView.hpp"
 #include "aidge/graph/OpArgs.hpp"
-#include "aidge/scheduler/Scheduler.hpp"
-#include "aidge/recipies/Recipies.hpp"
+#include "aidge/scheduler/SequentialScheduler.hpp"
+#include "aidge/recipes/Recipes.hpp"
 
 #include "aidge/backend/cuda.hpp"
 
diff --git a/unit_tests/Test_FCImpl.cpp b/unit_tests/Test_FCImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..54e37db15ded5546eb8fc3caacff9bae238b452c
--- /dev/null
+++ b/unit_tests/Test_FCImpl.cpp
@@ -0,0 +1,133 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <array>
+
+#include <catch2/catch_test_macros.hpp>
+
+#include "Test_cuda.hpp"
+
+#include "aidge/data/Tensor.hpp"
+
+#include "aidge/backend/cpu.hpp"
+#include "aidge/backend/cuda.hpp"
+
+using namespace Aidge;
+
+TEST_CASE("[gpu/operator] FC(forward)", "[FC][GPU]") {
+    std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array2D<float, 5, 75>{
+            {{1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,
+              5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,
+              9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
+              13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15},
+             {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,
+              5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,
+              9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
+              13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15},
+             {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,
+              5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,
+              9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
+              13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15},
+             {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,
+              5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,
+              9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
+              13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15},
+             {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,
+              5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,
+              9,  10, 11, 12, 13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
+              13, 14, 15, 1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15}}});
+    std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float, 5>{{1, 2, 3, 4, 5}});
+    std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float, 2, 5>{
+            {{23601, 23602, 23603, 23604, 23605}, {68601, 68602, 68603, 68604, 68605}}});
+    myWeights->setBackend("cuda");
+    myBias->setBackend("cuda");
+    std::shared_ptr<Node> myFC = FC(75, 5, false, "myfc");
+    auto op = std::static_pointer_cast<OperatorTensor>(myFC -> getOperator());
+    op -> associateInput(1, myWeights);
+    op -> associateInput(2, myBias);
+    SECTION("2D input") {
+        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array2D<float, 2, 75>{
+                {{0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16, 17, 18,
+                  19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
+                  38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56,
+                  57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74},
+                 {75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,
+                  90,  91,  92,  93,  94,  95,  96,  97,  98,  99,  100, 101, 102, 103, 104,
+                  105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+                  120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+                  135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149}}});
+        myInput->setBackend("cuda");
+        op->associateInput(0, myInput);
+        op -> setDataType(DataType::Float32);
+        op -> setBackend("cuda");
+        op->computeOutputDims();
+        myFC->forward();
+
+        float* computedOutput   = new float[myOutput->size()]();
+        cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
+
+        for(int i = 0; i < myOutput->size(); i++){
+            const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
+        }
+
+        delete[] computedOutput;
+    }
+    SECTION("4D input") {
+        std::shared_ptr<Tensor> myInput =
+                std::make_shared<Tensor>(Array4D<float, 2, 3, 5, 5>{{{{{0, 1, 2, 3, 4},
+                                                                     {5, 6, 7, 8, 9},
+                                                                     {10, 11, 12, 13, 14},
+                                                                     {15, 16, 17, 18, 19},
+                                                                     {20, 21, 22, 23, 24}},
+                                                                    {{25, 26, 27, 28, 29},
+                                                                     {30, 31, 32, 33, 34},
+                                                                     {35, 36, 37, 38, 39},
+                                                                     {40, 41, 42, 43, 44},
+                                                                     {45, 46, 47, 48, 49}},
+                                                                    {{50, 51, 52, 53, 54},
+                                                                     {55, 56, 57, 58, 59},
+                                                                     {60, 61, 62, 63, 64},
+                                                                     {65, 66, 67, 68, 69},
+                                                                     {70, 71, 72, 73, 74}}},
+                                                                   {{{75, 76, 77, 78, 79},
+                                                                     {80, 81, 82, 83, 84},
+                                                                     {85, 86, 87, 88, 89},
+                                                                     {90, 91, 92, 93, 94},
+                                                                     {95, 96, 97, 98, 99}},
+                                                                    {{100, 101, 102, 103, 104},
+                                                                     {105, 106, 107, 108, 109},
+                                                                     {110, 111, 112, 113, 114},
+                                                                     {115, 116, 117, 118, 119},
+                                                                     {120, 121, 122, 123, 124}},
+                                                                    {{125, 126, 127, 128, 129},
+                                                                     {130, 131, 132, 133, 134},
+                                                                     {135, 136, 137, 138, 139},
+                                                                     {140, 141, 142, 143, 144},
+                                                                     {145, 146, 147, 148, 149}}}}});
+        myInput->setBackend("cuda");
+        op->associateInput(0, myInput);
+        op -> setDataType(DataType::Float32);
+        op -> setBackend("cuda");
+        op->computeOutputDims();
+        myFC->forward();
+
+        float* computedOutput   = new float[myOutput->size()]();
+        cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
+
+        for(int i = 0; i < myOutput->size(); i++){
+            const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
+        }
+
+        delete[] computedOutput;
+    }
+}
\ No newline at end of file
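The expected values in this test follow directly from the repeating 1..15 weight rows, the 0..149 input ramp and the per-channel bias; the standalone check below reproduces the first-channel outputs 23601 and 68601. It is only a sanity calculation, not part of the test suite.

```cpp
// Standalone verification of the first-channel FC outputs used in Test_FCImpl.cpp.
#include <cstdio>

int main() {
    for (int batch = 0; batch < 2; ++batch) {
        float acc = 1.0f;                                // bias of channel 0
        for (int i = 0; i < 75; ++i) {
            const float weight = float(i % 15 + 1);      // 1..15 repeated
            const float input  = float(batch * 75 + i);  // 0..74 then 75..149
            acc += weight * input;
        }
        std::printf("batch %d -> %.0f\n", batch, acc);   // 23601, then 68601
    }
    return 0;
}
```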
diff --git a/unit_tests/Test_MaxPoolingImpl.cpp b/unit_tests/Test_MaxPoolingImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b2ec0dfe5dc6df072b6be3b20c075190cd3f6fce
--- /dev/null
+++ b/unit_tests/Test_MaxPoolingImpl.cpp
@@ -0,0 +1,93 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <array>
+
+#include <catch2/catch_test_macros.hpp>
+
+#include "Test_cuda.hpp"
+
+#include "aidge/data/Tensor.hpp"
+
+#include "aidge/backend/cpu.hpp"
+#include "aidge/backend/cuda.hpp"
+
+using namespace Aidge;
+
+
+TEST_CASE("[cpu/operator] MaxPooling(forward)", "[MaxPooling][CPU]") {
+    std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,2,5,5> { //NCHW
+        {
+            {
+                {{-0.3848,  0.2166, -0.4373,  0.6142,  0.5277},
+                 {0.7995,  0.3638, -1.4589, -1.0843,  1.0918},
+                 {0.7147,  0.0936, -1.2902,  1.2037,  0.4874},
+                 {-0.5981,  2.1184, -0.9175,  1.3859,  0.3305},
+                 {-1.7700,  0.0563, -0.3914,  0.0538, -0.3955}},
+
+                {{-3.1409, -0.4554,  0.0524,  2.2291,  0.4859},
+                 {-0.7465, -0.6567, -2.3703, -0.6386, -1.4152},
+                 { 2.2329, -0.5850,  0.0700,  1.2838, -1.7363},
+                 { 0.2139,  0.0624, -1.0689, -0.8221, -0.8038},
+                 { 0.1886, -0.7840, -0.2313,  0.2651, -1.6244}}
+            },
+            {
+                {{ 0.4371,  1.6417,  0.9129,  0.6325,  0.5438},
+                 {-2.3552, -0.8850, -0.0232, -0.5462, -1.2011},
+                 {1.7653, -1.6668, -1.0814,  0.6182,  1.2071},
+                 {0.9541, -0.5133,  0.8664, -0.8892,  1.4585},
+                 {1.0220, -0.5107,  0.1829, -0.2301, -0.4268}},
+
+                {{ 1.0429,  0.6279, -0.2875,  0.7187, -0.1500},
+                 {1.6041,  2.9635,  1.4172, -0.7517,  0.5441},
+                 {-0.2276,  0.0857,  0.6776, -0.1389, -0.0614},
+                 {-0.1547, -0.3435,  0.0650, -0.5095, -1.8073},
+                 {1.7217,  0.3999, -0.5953,  1.0604, -0.4126}}
+            }
+        }
+    });
+    SECTION("Stride") {
+        std::shared_ptr<Node> myMaxPool = MaxPooling({2,2}, "mycdw", {2,2});
+        auto op = std::static_pointer_cast<OperatorTensor>(myMaxPool -> getOperator());
+
+        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,2,2,2> {
+            {
+                {
+                    {{  0.7995,  0.6142},
+                     { 2.1184,  1.3859}},
+                    {{ -0.4554,  2.2291},
+                     {  2.2329,  1.2838}}
+                },
+                {
+                    {{1.6417,  0.9129},
+                     {1.7653,  0.8664}},
+                    {{2.9635,  1.4172},
+                     {0.0857,  0.6776}}
+                }
+            }
+        });
+        myMaxPool->getOperator()->associateInput(0,myInput);
+        myMaxPool->getOperator()->setDataType(DataType::Float32);
+        myMaxPool->getOperator()->setBackend("cuda");
+        op->computeOutputDims();
+        myMaxPool->forward();
+        
+        float* computedOutput   = new float[myOutput->size()]();
+        cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
+
+        for(int i = 0; i < myOutput->size(); i++){
+            const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
+        }
+
+        delete[] computedOutput;
+    }
+}
\ No newline at end of file
diff --git a/unit_tests/Test_ReLUImpl.cpp b/unit_tests/Test_ReLUImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..82da6fae6737ee39fc60d771c10dc69fa2dea5f6
--- /dev/null
+++ b/unit_tests/Test_ReLUImpl.cpp
@@ -0,0 +1,200 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <array>
+
+#include <catch2/catch_test_macros.hpp>
+
+#include "Test_cuda.hpp"
+
+#include "aidge/data/Tensor.hpp"
+
+#include "aidge/backend/cpu.hpp"
+#include "aidge/backend/cuda.hpp"
+
+using namespace Aidge;
+
+
+TEST_CASE("[gpu/operator] ReLU(forward)", "[ReLU][GPU]") {
+    SECTION("1D Tensor") {
+        std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array1D<float,10> {
+            {0, 1, 2,-3, 4,-5,-6, 7, 8, 9}
+        });
+        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array1D<float,10> {
+            {0, 1, 2, 0, 4, 0, 0, 7, 8, 9}
+        });
+
+        std::shared_ptr<Node> myReLU = ReLU();
+        auto op = std::static_pointer_cast<OperatorTensor>(myReLU -> getOperator());
+        op->associateInput(0,input0);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cuda");
+        op->computeOutputDims();
+        myReLU->forward();
+
+        float* computedOutput   = new float[myOutput->size()]();
+        cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
+
+        for(int i = 0; i < myOutput->size(); i++){
+            const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
+        }
+
+        delete[] computedOutput;
+    }
+
+    SECTION("2D Tensor") {
+        std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array2D<float,2,10> {
+            {
+                { 0, 1, 2,-3, 4,-5,-6, 7, 8, 9},
+                {-5, 4, 2,-3, 4,-5,-6, 7,-1,10}
+            }
+        });
+        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float,2,10> {
+            {
+                { 0, 1, 2, 0, 4, 0, 0, 7, 8, 9},
+                { 0, 4, 2, 0, 4, 0, 0, 7, 0,10}
+            }
+        });
+
+        std::shared_ptr<Node> myReLU = ReLU();
+        auto op = std::static_pointer_cast<OperatorTensor>(myReLU -> getOperator());
+        op->associateInput(0,input0);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cuda");
+        op->computeOutputDims();
+        myReLU->forward();
+        
+        float* computedOutput   = new float[myOutput->size()]();
+        cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
+
+        for(int i = 0; i < myOutput->size(); i++){
+            const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
+        }
+
+        delete[] computedOutput;
+    }
+
+    SECTION("3D Tensor") {
+        std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array3D<float,2,2,10> {
+            {
+                {
+                    { 0, 1, 2,-3, 4,-5,-6, 7, 8, 9},
+                    {-5, 4, 2,-3, 4,-5,-6, 7,-1,10}
+                },
+                {
+                    { 0, 1, 2,-3, 4,-5,-6, 7, 8, 9},
+                    {-5, 4, 2,-3, 4,-5,-6, 7,-1,10}
+                }
+            }
+        });
+        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array3D<float,2,2,10> {
+            {
+                {
+                    { 0, 1, 2, 0, 4, 0, 0, 7, 8, 9},
+                    { 0, 4, 2, 0, 4, 0, 0, 7, 0,10}
+                },
+                {
+                    { 0, 1, 2, 0, 4, 0, 0, 7, 8, 9},
+                    { 0, 4, 2, 0, 4, 0, 0, 7, 0,10}
+                }
+            }
+        });
+
+        std::shared_ptr<Node> myReLU = ReLU();
+        auto op = std::static_pointer_cast<OperatorTensor>(myReLU -> getOperator());
+        op->associateInput(0,input0);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cuda");
+        op->computeOutputDims();
+        myReLU->forward();
+
+        float* computedOutput   = new float[myOutput->size()]();
+        cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
+
+        for(int i = 0; i < myOutput->size(); i++){
+            const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
+        }
+
+        delete[] computedOutput;
+    }
+
+    SECTION("4D Tensor") {
+        std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array4D<float,2,2,2,10> {
+            {
+                {
+                    {
+                        { 0, 1, 2,-3, 4,-5,-6, 7, 8, 9},
+                        {-5, 4, 2,-3, 4,-5,-6, 7,-1,10}
+                    },
+                    {
+                        { 0, 1, 2,-3, 4,-5,-6, 7, 8, 9},
+                        {-5, 4, 2,-3, 4,-5,-6, 7,-1,10}
+                    }
+                },
+                {
+                    {
+                        { 0, 1, 2,-3, 4,-5,-6, 7, 8, 9},
+                        {-5, 4, 2,-3, 4,-5,-6, 7,-1,10}
+                    },
+                    {
+                        { 0, 1, 2,-3, 4,-5,-6, 7, 8, 9},
+                        {-5, 4, 2,-3, 4,-5,-6, 7,-1,10}
+                    }
+                }
+            }
+        });
+        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,2,2,10> {
+            {
+                {
+                    {
+                        { 0, 1, 2, 0, 4, 0, 0, 7, 8, 9},
+                        { 0, 4, 2, 0, 4, 0, 0, 7, 0,10}
+                    },
+                    {
+                        { 0, 1, 2, 0, 4, 0, 0, 7, 8, 9},
+                        { 0, 4, 2, 0, 4, 0, 0, 7, 0,10}
+                    }
+                },
+                {
+                    {
+                        { 0, 1, 2, 0, 4, 0, 0, 7, 8, 9},
+                        { 0, 4, 2, 0, 4, 0, 0, 7, 0,10}
+                    },
+                    {
+                        { 0, 1, 2, 0, 4, 0, 0, 7, 8, 9},
+                        { 0, 4, 2, 0, 4, 0, 0, 7, 0,10}
+                    }
+                }
+            }
+        });
+
+        std::shared_ptr<Node> myReLU = ReLU();
+        auto op = std::static_pointer_cast<OperatorTensor>(myReLU -> getOperator());
+        op->associateInput(0,input0);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cuda");
+        op->computeOutputDims();
+        op->forward();
+
+        float* computedOutput   = new float[myOutput->size()]();
+        cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
+
+        for(int i = 0; i < myOutput->size(); i++){
+            const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
+            REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
+        }
+
+        delete[] computedOutput;
+    }
+}
diff --git a/version.txt b/version.txt
index 8a9ecc2ea99d607e92feae1656ddbf6fdd82a2c1..341cf11faf9a29504168de4e54beaad182c5adc5 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-0.0.1
\ No newline at end of file
+0.2.0
\ No newline at end of file