diff --git a/include/aidge/backend/cuda/operator/AvgPoolingImpl.hpp b/include/aidge/backend/cuda/operator/AvgPoolingImpl.hpp
index f32dcb1249a591d79671d8d014c56a649ebdebb6..bec3728579a3299f11cf21d7140df8adaef13b98 100644
--- a/include/aidge/backend/cuda/operator/AvgPoolingImpl.hpp
+++ b/include/aidge/backend/cuda/operator/AvgPoolingImpl.hpp
@@ -33,6 +33,7 @@ private:
     // CuDNN specific variables
     cudnnPoolingDescriptor_t mAvgPoolingDesc = nullptr;
     cudnnPoolingMode_t mMode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
+    std::shared_ptr<Tensor> mInputFallback;
 
 public:
     AvgPoolingImpl_cuda(const AvgPooling_Op<DIM> &op) : OperatorImpl(op) {}
diff --git a/include/aidge/backend/cuda/operator/FCImpl.hpp b/include/aidge/backend/cuda/operator/FCImpl.hpp
index ee2f1c57a27fe80f2b2ab4f3b3ae68bfa0cca24b..023757b17eb80249f184a144cd7c38b7ca5758e6 100644
--- a/include/aidge/backend/cuda/operator/FCImpl.hpp
+++ b/include/aidge/backend/cuda/operator/FCImpl.hpp
@@ -32,7 +32,9 @@ class FCImplForward_cuda : public Registrable<FCImplForward_cuda,
                                                  void(std::size_t , std::size_t, std::size_t, bool, const void* , const void* , const void* , void*)> {};
 class FCImpl_cuda : public OperatorImpl {
 private:
-    // CuDNN specific variables
+    std::shared_ptr<Tensor> mInput0Fallback;
+    std::shared_ptr<Tensor> mInput1Fallback;
+    std::shared_ptr<Tensor> mInput2Fallback;
 
 
 public:
diff --git a/include/aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp
index 9c83332f9857ed802f5563faef558a7278d3e992..9084e01fc08cb3d00e80fc8cf6246064b20591f2 100644
--- a/include/aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp
+++ b/include/aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp
@@ -32,11 +32,5 @@ cublasStatus_t cublasGemm(cublasHandle_t handle,
                           const T *B, int ldb,
                           const T *beta,
                           T *C, int ldc);
-// cublasGemm(cublasContext*&, cublasOperation_t, cublasOperation_t, int&, int&, int&, 
-						//  const type*, 
-						//  const __half*&, int&, 
-						//  const __half*&, int&, 
-						//  const type*, 
-						//  __half*&, int&)’
 }
 #endif /* AIDGE_CUDA_OPERATOR_FCIMPL_FORWARD_KERNEL_H_ */
\ No newline at end of file
diff --git a/include/aidge/backend/cuda/operator/MaxPoolingImpl.hpp b/include/aidge/backend/cuda/operator/MaxPoolingImpl.hpp
index 9da97736e31ddbc01e47e0bde903e5b8348a6f7f..9216eca4b901b9d7b92dbf8efb4d0a419dac9876 100644
--- a/include/aidge/backend/cuda/operator/MaxPoolingImpl.hpp
+++ b/include/aidge/backend/cuda/operator/MaxPoolingImpl.hpp
@@ -33,6 +33,7 @@ private:
     // CuDNN specific variables
     cudnnPoolingDescriptor_t mMaxPoolingDesc = nullptr;
     cudnnPoolingMode_t mMode = CUDNN_POOLING_MAX;
+    std::shared_ptr<Tensor> mInputFallback;
 
 public:
     MaxPoolingImpl_cuda(const MaxPooling_Op<DIM> &op) : OperatorImpl(op) {}
diff --git a/include/aidge/backend/cuda/operator/ReLUImpl.hpp b/include/aidge/backend/cuda/operator/ReLUImpl.hpp
index 3b6cbcc6041a4757a919203fed6d080e30051d08..27d0a61824df78bcc1c001e6fdc4a28056d5459c 100644
--- a/include/aidge/backend/cuda/operator/ReLUImpl.hpp
+++ b/include/aidge/backend/cuda/operator/ReLUImpl.hpp
@@ -35,6 +35,7 @@ private:
     #else
         cudnnActivationMode_t mReLUDesc = nullptr;
     #endif
+    std::shared_ptr<Tensor> mInputFallback;
 
 public:
     ReLUImpl_cuda(const ReLU_Op &op) : OperatorImpl(op) {}
diff --git a/src/operator/AvgPoolingImpl.cpp b/src/operator/AvgPoolingImpl.cpp
index 861533eced6112903ea288f09711f3a382db542c..eb9cc6a1f4412178525a5e6bccd32e94c4413d4d 100644
--- a/src/operator/AvgPoolingImpl.cpp
+++ b/src/operator/AvgPoolingImpl.cpp
@@ -25,8 +25,7 @@ void Aidge::AvgPoolingImpl_cuda<DIM>::forward() {
 
     assert(mOp.getRawInput(0) && "missing input #0");
 
-    std::shared_ptr<Tensor> inputFallback;
-    const auto& input = std::static_pointer_cast<Tensor>(op.getRawInput(0))->refCastFrom(inputFallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)));
+    const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0));
 
     // Lazy-initialize CuDNN AvgPooling descriptor
     if (mAvgPoolingDesc == nullptr) {
diff --git a/src/operator/FCImpl.cpp b/src/operator/FCImpl.cpp
index a8f8da8e114baaae3f7d09146cdd5b664150f260..8b60f7fd6aa41f206b2c6eaa5d8f8daa1bd81374 100644
--- a/src/operator/FCImpl.cpp
+++ b/src/operator/FCImpl.cpp
@@ -28,15 +28,14 @@ void Aidge::FCImpl_cuda::forward() {
     assert(mOp.getRawInput(1) && "missing input #1");
     assert(mOp.getRawInput(2) && "missing input #2");
 
-    std::shared_ptr<Tensor> inputFallback, input1Fallback, input2Fallback;
-    const auto& input0 = std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->refCastFrom(inputFallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)));
-    const auto& input1 = std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->refCastFrom(input1Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)));
-    const auto& input2 = std::static_pointer_cast<Tensor>(mOp.getRawInput(2))->refCastFrom(input2Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)));
-
     const auto& fcOp = static_cast<const FC_Op&>(mOp);
     bool noBias = fcOp.template getAttr<FCAttr::NoBias>();
     std::size_t outChannels = static_cast<std::size_t>(fcOp.template getAttr<FCAttr::OutChannels>());
 
+    const auto& input0 = fcOp.getInput(0)->refCastFrom(mInput0Fallback, *fcOp.getOutput(0));
+    const auto& input1 = fcOp.getInput(1)->refCastFrom(mInput1Fallback, *fcOp.getOutput(0));
+    const auto& input2 = fcOp.getInput(2)->refCastFrom(mInput2Fallback, *fcOp.getOutput(0));
+
     switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) {
         case DataType::Float64:
             forward_<double>(input0, input1, input2, noBias, outChannels);
@@ -55,17 +54,19 @@ void Aidge::FCImpl_cuda::forward() {
 template<class T>
 void Aidge::FCImpl_cuda::forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2, bool noBias, std::size_t outChannels)
 {
-
     const T * input = static_cast<const T*>(input0.getImpl()->rawPtr());
     const T * weights = static_cast<const T*>(input1.getImpl()->rawPtr());
     T * output = static_cast<T*>(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->rawPtr());
 
+    // Performing output = transpose(weights) * input
+    //             [n x m] =       [n x k]     * [k x m]
+    // cuBLAS is column-major: rather than transposing both inputs and then transposing the [m x n] result, we compute the output directly as [n x m].
     int n = outChannels;
     int m = std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->size()/n;
     int k = input0.size()/m;
-    int lda = k;
-    int ldb = k;
-    int ldc = n;
+    int lda = k;  // leading dimension of weights
+    int ldb = k;  // leading dimension of input
+    int ldc = n;  // leading dimension of output
     const T alpha = 1.0f;
     const T beta = 0.0f;
     CHECK_CUBLAS_STATUS(cublasGemm(CudaContext::cublasHandle(),
@@ -93,7 +94,8 @@ void Aidge::FCImpl_cuda::forward_(const Tensor& input0, const Tensor& input1, co
                                     m * sizeof(T),
                                     cudaMemcpyHostToDevice));
         const T * biases = static_cast<const T*>(input2.getImpl()->rawPtr());
-
+        // Broadcast the bias over the batch: output = biases * onesVector + output
+        //                                   [n x m] = [n x 1] * [1 x m]   + [n x m]
         CHECK_CUBLAS_STATUS(cublasGemm(CudaContext::cublasHandle(),
                                        CUBLAS_OP_N,
                                        CUBLAS_OP_N,
diff --git a/src/operator/MaxPoolingImpl.cpp b/src/operator/MaxPoolingImpl.cpp
index 19a567fe4b273e821f95a16989d0a09bd510fe07..b8d7c81948bd898b7cc4e2f3bead9c498175e2c1 100644
--- a/src/operator/MaxPoolingImpl.cpp
+++ b/src/operator/MaxPoolingImpl.cpp
@@ -25,8 +25,7 @@ void Aidge::MaxPoolingImpl_cuda<DIM>::forward() {
 
     assert(mOp.getRawInput(0) && "missing input #0");
 
-    std::shared_ptr<Tensor> inputFallback;
-    const auto& input = std::static_pointer_cast<Tensor>(op.getRawInput(0))->refCastFrom(inputFallback, *std::static_pointer_cast<Tensor>(op.getRawOutput(0)));
+    const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0));
 
     // Lazy-initialize CuDNN MaxPooling descriptor
     if (mMaxPoolingDesc == nullptr) {
diff --git a/src/operator/ReLUImpl.cpp b/src/operator/ReLUImpl.cpp
index c880184bc51fce65710d02d9a483e43de6184d89..2ebd6b276e02000bdefb22fe8d2373255a1a5c2c 100644
--- a/src/operator/ReLUImpl.cpp
+++ b/src/operator/ReLUImpl.cpp
@@ -24,8 +24,7 @@ void Aidge::ReLUImpl_cuda::forward() {
 
     assert(mOp.getRawInput(0) && "missing input #0");
 
-    std::shared_ptr<Tensor> inputFallback;
-    const auto& input = std::static_pointer_cast<Tensor>(op.getRawInput(0))->refCastFrom(inputFallback, *std::static_pointer_cast<Tensor>(op.getRawOutput(0)));
+    const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0));
 
     // Lazy-initialize CuDNN ReLU descriptor
     if (mReLUDesc == nullptr) {
@@ -38,11 +37,18 @@ void Aidge::ReLUImpl_cuda::forward() {
 		#endif
     }
 
-    if (std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType() == DataType::Float64) {
-        forward_<double>(input);
-    }
-    else {
-        forward_<float>(input);
+    switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) {
+        case DataType::Float64:
+            forward_<double>(input);
+            break;
+        case DataType::Float32:
+            forward_<float>(input);
+            break;
+        case DataType::Float16:
+            forward_<half>(input);
+            break;
+        default:
+            AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda");
     }
 }
 
@@ -64,7 +70,9 @@ void Aidge::ReLUImpl_cuda::forward_(const Tensor& input) {
 
 Aidge::ReLUImpl_cuda::~ReLUImpl_cuda() {
     if (mReLUDesc != nullptr) {
-        cudnnDestroyActivationDescriptor(mReLUDesc);
+		#if CUDNN_VERSION >= 5000
+            cudnnDestroyActivationDescriptor(mReLUDesc);
+		#endif
     }
 }