diff --git a/include/aidge/backend/cuda.hpp b/include/aidge/backend/cuda.hpp index d5e9d1654f0a4fe894ed0e965a25b32c9e5caa06..0d7a55aaa3d3d5d150c1e55f6fd9f87d65c3e3ab 100644 --- a/include/aidge/backend/cuda.hpp +++ b/include/aidge/backend/cuda.hpp @@ -27,19 +27,18 @@ #include "aidge/backend/cuda/operator/MulImpl.hpp" #include "aidge/backend/cuda/operator/PadImpl.hpp" #include "aidge/backend/cuda/operator/PowImpl.hpp" +#include "aidge/backend/cuda/operator/ReLUImpl.hpp" #include "aidge/backend/cuda/operator/ReduceMeanImpl.hpp" #include "aidge/backend/cuda/operator/ReduceSumImpl.hpp" -#include "aidge/backend/cuda/operator/ReLUImpl.hpp" -#include "aidge/backend/cuda/operator/ShiftMaxImpl.hpp" -#include "aidge/backend/cuda/operator/ShiftGELUImpl.hpp" #include "aidge/backend/cuda/operator/ReshapeImpl.hpp" +#include "aidge/backend/cuda/operator/ShiftGELUImpl.hpp" +#include "aidge/backend/cuda/operator/ShiftMaxImpl.hpp" #include "aidge/backend/cuda/operator/SigmoidImpl.hpp" #include "aidge/backend/cuda/operator/SubImpl.hpp" #include "aidge/backend/cuda/operator/TanhImpl.hpp" -#include "aidge/backend/cuda/operator/ShiftMaxImpl.hpp" -#include "aidge/backend/cuda/operator/ShiftGELUImpl.hpp" #include "aidge/backend/cuda/operator/ILayerNormImpl.hpp" - +#include "aidge/backend/cuda/operator/ShiftGELUImpl.hpp" +#include "aidge/backend/cuda/operator/ShiftMaxImpl.hpp" #endif /* AIDGE_BACKEND_CUDA_IMPORTS_H_ */ diff --git a/include/aidge/backend/cuda/data/TensorImpl.hpp b/include/aidge/backend/cuda/data/TensorImpl.hpp index 541afeecc751332d41ff082b790282abcad5a1b0..35f6cfc1f517a31570c2d7b25341413330728314 100644 --- a/include/aidge/backend/cuda/data/TensorImpl.hpp +++ b/include/aidge/backend/cuda/data/TensorImpl.hpp @@ -1,30 +1,38 @@ #ifndef AIDGE_BACKEND_CUDA_DATA_TENSORIMPL_H_ #define AIDGE_BACKEND_CUDA_DATA_TENSORIMPL_H_ -#include <cstddef> // std::size_t +#include <cstddef> // std::size_t #include <memory> #include <string> #include "aidge/backend/TensorImpl.hpp" #include "aidge/data/Tensor.hpp" +#include "aidge/utils/ErrorHandling.hpp" #include "aidge/utils/Registrar.hpp" #include "aidge/utils/Types.h" -#include "aidge/utils/ErrorHandling.hpp" #include "aidge/utils/future_std/span.hpp" -#include "aidge/backend/cuda/utils/CudaUtils.hpp" #include "aidge/backend/cuda/utils/CudaContext.hpp" +#include "aidge/backend/cuda/utils/CudaUtils.hpp" namespace Aidge { template <typename SRC_T, typename DST_T> -void thrust_copy(const SRC_T* /*srcData*/, DST_T* /*dstData*/, size_t /*size*/); -template <typename SRC_T, typename std::enable_if<!std::is_same<half_float::half, SRC_T>::value>::type* = nullptr> -void thrust_copy(const SRC_T* srcData, half_float::half* dstData, size_t size); -template <typename DST_T, typename std::enable_if<!std::is_same<half_float::half, DST_T>::value>::type* = nullptr> -void thrust_copy(const half_float::half* srcData, DST_T* dstData, size_t size); +void thrust_copy(const SRC_T * /*srcData*/, + DST_T * /*dstData*/, + size_t /*size*/); +template <typename SRC_T, + typename std::enable_if< + !std::is_same<half_float::half, SRC_T>::value>::type * = nullptr> +void thrust_copy(const SRC_T *srcData, half_float::half *dstData, size_t size); +template <typename DST_T, + typename std::enable_if< + !std::is_same<half_float::half, DST_T>::value>::type * = nullptr> +void thrust_copy(const half_float::half *srcData, DST_T *dstData, size_t size); template <> -void thrust_copy(const half_float::half* srcData, half_float::half* dstData, size_t size); +void thrust_copy(const half_float::half *srcData, + 
half_float::half *dstData, + size_t size); /** * @brief Abstract class for the TensorImpl_cuda class template. @@ -33,17 +41,18 @@ void thrust_copy(const half_float::half* srcData, half_float::half* dstData, siz * class), but whose data type does not need to be known. */ class TensorImpl_cuda_ { -protected: + protected: mutable cudnnTensorDescriptor_t mCudnnTensor = nullptr; -public: + public: /** * @brief Return the CuDNN tensor descriptor of the tensor. * @details This method uses lazy initialization for the descriptor * (which is therefore mutable in the derived class). * @return cudnnTensorDescriptor_t CuDNN tensor descriptor. */ - virtual const cudnnTensorDescriptor_t& getCudnnTensorDesc(const Tensor& tensor) const = 0; + virtual const cudnnTensorDescriptor_t & + getCudnnTensorDesc(const Tensor &tensor) const = 0; virtual ~TensorImpl_cuda_() { if (mCudnnTensor != nullptr) @@ -52,137 +61,199 @@ public: }; template <class T> -class TensorImpl_cuda : public TensorImpl, public TensorImpl_cuda_ { -private: - static T* cudaAlloc(NbElts_t length) { - T* data; - CHECK_CUDA_STATUS(cudaMalloc(reinterpret_cast<void**>(&data), length * sizeof(T))); +class TensorImpl_cuda : public TensorImpl, public TensorImpl_cuda_ { + private: + static T *cudaAlloc(NbElts_t length) { + T *data; + CHECK_CUDA_STATUS( + cudaMalloc(reinterpret_cast<void **>(&data), length * sizeof(T))); return data; } - static void cudaDelete(T* data) { + static void cudaDelete(T *data) { // Should not be called if data is nullptr, according to the standard cudaFree(data); } -private: + private: future_std::span<T> mData; /// If this instance own the data, std::unique_ptr manages it std::unique_ptr<T, decltype(&cudaDelete)> mDataOwner; -public: + public: static const std::string Backend; - TensorImpl_cuda(DeviceIdx_t device, std::vector<DimSize_t> dims) : TensorImpl(Backend, device, dims), mDataOwner(nullptr, cudaDelete) {} - + TensorImpl_cuda(DeviceIdx_t device, std::vector<DimSize_t> dims) + : TensorImpl(Backend, device, dims), mDataOwner(nullptr, cudaDelete) {} bool operator==(const TensorImpl &otherImpl) const override final; - static std::shared_ptr<TensorImpl_cuda> create(DeviceIdx_t device, std::vector<DimSize_t> dims) { + static std::shared_ptr<TensorImpl_cuda> + create(DeviceIdx_t device, std::vector<DimSize_t> dims) { return std::make_shared<TensorImpl_cuda<T>>(device, dims); } // native interface - const future_std::span<T>& data() const { return mData; } + const future_std::span<T> &data() const { + return mData; + } - inline std::size_t capacity() const noexcept override { return mData.size(); } + inline std::size_t capacity() const noexcept override { + return mData.size(); + } - std::size_t scalarSize() const noexcept override { return sizeof(T); } + std::size_t scalarSize() const noexcept override { + return sizeof(T); + } void zeros() override final { CHECK_CUDA_STATUS(cudaMemset(rawPtr(), T(0), mNbElts * sizeof(T))); } void copy(const void *src, NbElts_t length, NbElts_t offset = 0) override { - AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "TensorImpl_cuda<{}>::copy(): copy length ({}) is above capacity ({})", typeid(T).name(), length, mNbElts); - const T* srcT = static_cast<const T *>(src); - T* dstT = static_cast<T *>(rawPtr(offset)); - - AIDGE_ASSERT(dstT < srcT || dstT >= srcT + length, "TensorImpl_cuda<{}>::copy(): overlapping copy is not supported", typeid(T).name()); - CHECK_CUDA_STATUS(cudaMemcpy(dstT, srcT, length * sizeof(T), cudaMemcpyDeviceToDevice)); + AIDGE_ASSERT(length <= mData.size() 
|| length <= mNbElts, + "TensorImpl_cuda<{}>::copy(): copy length ({}) is above " + "capacity ({})", + typeid(T).name(), + length, + mNbElts); + const T *srcT = static_cast<const T *>(src); + T *dstT = static_cast<T *>(rawPtr(offset)); + + AIDGE_ASSERT( + dstT < srcT || dstT >= srcT + length, + "TensorImpl_cuda<{}>::copy(): overlapping copy is not supported", + typeid(T).name()); + CHECK_CUDA_STATUS(cudaMemcpy(dstT, + srcT, + length * sizeof(T), + cudaMemcpyDeviceToDevice)); } - void copyCast(const void *src, const DataType srcDt, NbElts_t length, NbElts_t offset = 0) override { + void copyCast(const void *src, + const DataType srcDt, + NbElts_t length, + NbElts_t offset = 0) override { if (length == 0) { return; } - AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "TensorImpl_cuda<{}>::copyCast(): copy length ({}) is above capacity ({})", typeid(T).name(), length, mNbElts); + AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, + "TensorImpl_cuda<{}>::copyCast(): copy length ({}) is " + "above capacity ({})", + typeid(T).name(), + length, + mNbElts); switch (srcDt) { case DataType::Float64: - thrust_copy(static_cast<const double*>(src), - static_cast<T*>(rawPtr(offset)), + thrust_copy(static_cast<const double *>(src), + static_cast<T *>(rawPtr(offset)), length); break; case DataType::Float32: - thrust_copy(static_cast<const float*>(src), - static_cast<T*>(rawPtr(offset)), + thrust_copy(static_cast<const float *>(src), + static_cast<T *>(rawPtr(offset)), length); break; case DataType::Float16: - thrust_copy(static_cast<const half_float::half*>(src), - static_cast<T*>(rawPtr(offset)), + thrust_copy(static_cast<const half_float::half *>(src), + static_cast<T *>(rawPtr(offset)), length); break; case DataType::Int64: - thrust_copy(static_cast<const int64_t*>(src), - static_cast<T*>(rawPtr(offset)), + thrust_copy(static_cast<const int64_t *>(src), + static_cast<T *>(rawPtr(offset)), length); break; case DataType::UInt64: - thrust_copy(static_cast<const uint64_t*>(src), - static_cast<T*>(rawPtr(offset)), + thrust_copy(static_cast<const uint64_t *>(src), + static_cast<T *>(rawPtr(offset)), length); break; case DataType::Int32: - thrust_copy(static_cast<const int32_t*>(src), - static_cast<T*>(rawPtr(offset)), + thrust_copy(static_cast<const int32_t *>(src), + static_cast<T *>(rawPtr(offset)), length); break; case DataType::UInt32: - thrust_copy(static_cast<const uint32_t*>(src), - static_cast<T*>(rawPtr(offset)), + thrust_copy(static_cast<const uint32_t *>(src), + static_cast<T *>(rawPtr(offset)), length); break; case DataType::Int16: - thrust_copy(static_cast<const int16_t*>(src), - static_cast<T*>(rawPtr(offset)), + thrust_copy(static_cast<const int16_t *>(src), + static_cast<T *>(rawPtr(offset)), length); break; case DataType::UInt16: - thrust_copy(static_cast<const uint16_t*>(src), - static_cast<T*>(rawPtr(offset)), + thrust_copy(static_cast<const uint16_t *>(src), + static_cast<T *>(rawPtr(offset)), length); break; case DataType::Int8: - thrust_copy(static_cast<const int8_t*>(src), - static_cast<T*>(rawPtr(offset)), + thrust_copy(static_cast<const int8_t *>(src), + static_cast<T *>(rawPtr(offset)), length); break; case DataType::UInt8: - thrust_copy(static_cast<const uint8_t*>(src), - static_cast<T*>(rawPtr(offset)), + thrust_copy(static_cast<const uint8_t *>(src), + static_cast<T *>(rawPtr(offset)), length); break; default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "TensorImpl_cuda<{}>::copyCast(): unsupported data type {}.", typeid(T).name(), srcDt); + AIDGE_THROW_OR_ABORT( + 
std::runtime_error, + "TensorImpl_cuda<{}>::copyCast(): unsupported data type {}.", + typeid(T).name(), + srcDt); break; } } - void copyFromDevice(const void *src, const std::pair<std::string, DeviceIdx_t>& device, NbElts_t length, NbElts_t offset = 0) override { - AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "TensorImpl_cuda<{}>::copyFromDevice(): copy length ({}) is above capacity ({})", typeid(T).name(), length, mNbElts); - CHECK_CUDA_STATUS(cudaMemcpy(rawPtr(offset), src, length * sizeof(T), cudaMemcpyDeviceToDevice)); + void copyFromDevice(const void *src, + const std::pair<std::string, DeviceIdx_t> &device, + NbElts_t length, + NbElts_t offset = 0) override { + AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, + "TensorImpl_cuda<{}>::copyFromDevice(): copy length ({}) " + "is above capacity ({})", + typeid(T).name(), + length, + mNbElts); + CHECK_CUDA_STATUS(cudaMemcpy(rawPtr(offset), + src, + length * sizeof(T), + cudaMemcpyDeviceToDevice)); } - void copyFromHost(const void *src, NbElts_t length, NbElts_t offset = 0) override { - AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "TensorImpl_cuda<{}>::copyFromHost(): copy length ({}) is above capacity ({})", typeid(T).name(), length, mNbElts); - CHECK_CUDA_STATUS(cudaMemcpy(rawPtr(offset), src, length * sizeof(T), cudaMemcpyHostToDevice)); + void copyFromHost(const void *src, + NbElts_t length, + NbElts_t offset = 0) override { + AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, + "TensorImpl_cuda<{}>::copyFromHost(): copy length ({}) " + "is above capacity ({})", + typeid(T).name(), + length, + mNbElts); + CHECK_CUDA_STATUS(cudaMemcpy(rawPtr(offset), + src, + length * sizeof(T), + cudaMemcpyHostToDevice)); } - void copyToHost(void *dst, NbElts_t length, NbElts_t offset = 0) const override { - AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, "TensorImpl_cuda<{}>::copyToHost(): copy length ({}) is above capacity ({})", typeid(T).name(), length, mNbElts); - CHECK_CUDA_STATUS(cudaMemcpy(dst, rawPtr(offset), length * sizeof(T), cudaMemcpyDeviceToHost)); + void copyToHost(void *dst, + NbElts_t length, + NbElts_t offset = 0) const override { + AIDGE_ASSERT(length <= mData.size() || length <= mNbElts, + "TensorImpl_cuda<{}>::copyToHost(): copy length ({}) is " + "above capacity ({})", + typeid(T).name(), + length, + mNbElts); + CHECK_CUDA_STATUS(cudaMemcpy(dst, + rawPtr(offset), + length * sizeof(T), + cudaMemcpyDeviceToHost)); } void *rawPtr(NbElts_t offset = 0) override { @@ -191,104 +262,139 @@ public: }; const void *rawPtr(NbElts_t offset = 0) const override { - AIDGE_ASSERT(mData.size() >= mNbElts, "TensorImpl_cuda<{}>::rawPtr(): accessing uninitialized const rawPtr", typeid(T).name()); + AIDGE_ASSERT(mData.size() >= mNbElts, + "TensorImpl_cuda<{}>::rawPtr(): accessing uninitialized " + "const rawPtr", + typeid(T).name()); return (mData.data() + offset); }; - const cudnnTensorDescriptor_t& getCudnnTensorDesc(const Tensor& tensor) const override { + const cudnnTensorDescriptor_t & + getCudnnTensorDesc(const Tensor &tensor) const override { if (mCudnnTensor == nullptr) { CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&mCudnnTensor)); if (tensor.size() > 0) { /** - ** cudNN Tensors are restricted to having at least 4 dimensions : - ** When working with lower dimensionsal data, unused dimensions are set to 1. 
- ** Referes to the cudnnSetTensorNdDescriptor documentation from : - ** https://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html + ** cuDNN Tensors are restricted to having at least 4 + *dimensions : + ** When working with lower dimensional data, unused + *dimensions are set to 1. + ** Refers to the cudnnSetTensorNdDescriptor documentation + *from : + ** + *https://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html **/ - std::vector<int> dims(tensor.dims().cbegin(), tensor.dims().cend()); - std::vector<int> strides(tensor.strides().cbegin(), tensor.strides().cend()); + std::vector<int> dims(tensor.dims().cbegin(), + tensor.dims().cend()); + std::vector<int> strides(tensor.strides().cbegin(), + tensor.strides().cend()); if (dims.size() < 4) { dims.resize(4, 1); strides.resize(4, 1); } - CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(mCudnnTensor, - CudaContext::data_type<T>::value, - dims.size(), - &dims[0], - &strides[0])); + CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor( + mCudnnTensor, + CudaContext::data_type<T>::value, + dims.size(), + &dims[0], + &strides[0])); } - } - else { + } else { // Compare if the shape of the tensor has changed cudnnDataType_t currentDataType; int currentNbDims; - // Since we don't know the nb dims of the current tensor, we init with CUDNN_DIM_MAX then remove the trailing zeros + // Since we don't know the nb dims of the current tensor, we init + // with CUDNN_DIM_MAX then remove the trailing zeros std::vector<int> currentDims(CUDNN_DIM_MAX); std::vector<int> currentStrides(CUDNN_DIM_MAX); - CHECK_CUDNN_STATUS(cudnnGetTensorNdDescriptor(mCudnnTensor, CUDNN_DIM_MAX, &currentDataType, &currentNbDims, currentDims.data(), currentStrides.data())); + CHECK_CUDNN_STATUS( + cudnnGetTensorNdDescriptor(mCudnnTensor, + CUDNN_DIM_MAX, + &currentDataType, + &currentNbDims, + currentDims.data(), + currentStrides.data())); // Remove the trailing zeros - currentDims.erase(std::find_if(currentDims.rbegin(), currentDims.rend(), [](int x) { return x != 0; }).base(), + currentDims.erase(std::find_if(currentDims.rbegin(), + currentDims.rend(), + [](int x) { return x != 0; }) + .base(), currentDims.end()); - std::vector<int> dims(tensor.dims().cbegin(), tensor.dims().cend()); + std::vector<int> dims(tensor.dims().cbegin(), + tensor.dims().cend()); if (dims.size() < 4) { dims.resize(4, 1); } // Update descriptor if shape has changed - if (dims!=currentDims) { - std::vector<int> strides(tensor.strides().cbegin(), tensor.strides().cend()); + if (dims != currentDims) { + std::vector<int> strides(tensor.strides().cbegin(), + tensor.strides().cend()); if (strides.size() < 4) { strides.resize(4, 1); } - CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(mCudnnTensor, - CudaContext::data_type<T>::value, - dims.size(), - &dims[0], - &strides[0])); + CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor( + mCudnnTensor, + CudaContext::data_type<T>::value, + dims.size(), + &dims[0], + &strides[0])); } } return mCudnnTensor; } void setRawPtr(void *ptr, NbElts_t length) override final { - AIDGE_ASSERT(length >= mNbElts, "TensorImpl_cuda<{}>::setRawPtr(): trying to set raw pointer (length: {}) of insufficient capacity (required: {})", typeid(T).name(), length, mNbElts); + AIDGE_ASSERT( + length >= mNbElts, + "TensorImpl_cuda<{}>::setRawPtr(): trying to set raw pointer " + "(length: {}) of insufficient capacity (required: {})", + typeid(T).name(), + length, + mNbElts); mData = future_std::span<T>(static_cast<T *>(ptr), length); mDataOwner.reset(); }; virtual ~TensorImpl_cuda() = default; -private: +
private: void lazyInit() { if (mData.size() < mNbElts) { // Need more data, a re-allocation will occur - AIDGE_ASSERT(mData.empty() || mDataOwner != nullptr, "TensorImpl_cuda<{}>: trying to enlarge non-owned data", typeid(T).name()); + AIDGE_ASSERT( + mData.empty() || mDataOwner != nullptr, + "TensorImpl_cuda<{}>: trying to enlarge non-owned data", + typeid(T).name()); mDataOwner.reset(cudaAlloc(mNbElts)); mData = future_std::span<T>(mDataOwner.get(), mNbElts); } } }; -template <typename T> -const std::string TensorImpl_cuda<T>::Backend = "cuda"; +template <typename T> const std::string TensorImpl_cuda<T>::Backend = "cuda"; namespace { -static Registrar<Tensor> registrarTensorImpl_cuda_Float64( - {"cuda", DataType::Float64}, Aidge::TensorImpl_cuda<double>::create); -static Registrar<Tensor> registrarTensorImpl_cuda_Float32( - {"cuda", DataType::Float32}, Aidge::TensorImpl_cuda<float>::create); +static Registrar<Tensor> + registrarTensorImpl_cuda_Float64({"cuda", DataType::Float64}, + Aidge::TensorImpl_cuda<double>::create); +static Registrar<Tensor> + registrarTensorImpl_cuda_Float32({"cuda", DataType::Float32}, + Aidge::TensorImpl_cuda<float>::create); static Registrar<Tensor> registrarTensorImpl_cuda_Float16( - {"cuda", DataType::Float16}, Aidge::TensorImpl_cuda<half_float::half>::create); -static Registrar<Tensor> registrarTensorImpl_cuda_Int32( - {"cuda", DataType::Int32}, Aidge::TensorImpl_cuda<int32_t>::create); -} // namespace -} // namespace Aidge + {"cuda", DataType::Float16}, + Aidge::TensorImpl_cuda<half_float::half>::create); +static Registrar<Tensor> + registrarTensorImpl_cuda_Int32({"cuda", DataType::Int32}, + Aidge::TensorImpl_cuda<int32_t>::create); +} // namespace +} // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_DATA_TENSORIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/AddImpl.hpp b/include/aidge/backend/cuda/operator/AddImpl.hpp index 42d420f8410f79100fdfdbe3eabb8b43e616a74a..70e7f80f1e4e5489afd02473662f54a4ca34c758 100644 --- a/include/aidge/backend/cuda/operator/AddImpl.hpp +++ b/include/aidge/backend/cuda/operator/AddImpl.hpp @@ -29,10 +29,10 @@ namespace Aidge { // Operator implementation entry point for the backend class AddImpl_cuda : public OperatorImpl { -public: - AddImpl_cuda(const Add_Op& op) : OperatorImpl(op, "cuda") {} + public: + AddImpl_cuda(const Add_Op &op) : OperatorImpl(op, "cuda") {} - static std::unique_ptr<AddImpl_cuda> create(const Add_Op& op) { + static std::unique_ptr<AddImpl_cuda> create(const Add_Op &op) { return std::make_unique<AddImpl_cuda>(op); } @@ -47,13 +47,19 @@ public: void forward() override; void backward() override; -private: - template <class T> void forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides); - template <class T> void backward_(const Tensor& outGrad, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides); + private: + template <class T> + void forward_(const std::vector<Tensor> &inputs, + const std::vector<std::vector<int>> &inputsDims, + const std::vector<std::vector<int>> &inputsStrides); + template <class T> + void backward_(const Tensor &outGrad, + const std::vector<std::vector<int>> &inputsDims, + const std::vector<std::vector<int>> &inputsStrides); }; // Implementation entry point registration to Operator REGISTRAR(Add_Op, "cuda", Aidge::AddImpl_cuda::create); -} // namespace Aidge +} // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_ADDIMPL_H_ */ diff --git 
a/include/aidge/backend/cuda/operator/AndImpl.hpp b/include/aidge/backend/cuda/operator/AndImpl.hpp index e90a4c5fe3d7b4cd529dcb4cb5400a6447f53e3c..69911463ce257b804af21a00a72d9ecf2bb3c5a1 100644 --- a/include/aidge/backend/cuda/operator/AndImpl.hpp +++ b/include/aidge/backend/cuda/operator/AndImpl.hpp @@ -29,10 +29,10 @@ namespace Aidge { // Operator implementation entry point for the backend class AndImpl_cuda : public OperatorImpl { -public: - AndImpl_cuda(const And_Op& op) : OperatorImpl(op, "cuda") {} + public: + AndImpl_cuda(const And_Op &op) : OperatorImpl(op, "cuda") {} - static std::unique_ptr<AndImpl_cuda> create(const And_Op& op) { + static std::unique_ptr<AndImpl_cuda> create(const And_Op &op) { return std::make_unique<AndImpl_cuda>(op); } @@ -46,12 +46,15 @@ public: void forward() override; -private: - template <class T> void forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides); + private: + template <class T> + void forward_(const std::vector<Tensor> &inputs, + const std::vector<std::vector<int>> &inputsDims, + const std::vector<std::vector<int>> &inputsStrides); }; // Implementation entry point registration to Operator REGISTRAR(And_Op, "cuda", Aidge::AndImpl_cuda::create); -} // namespace Aidge +} // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_ANDIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/AndImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/AndImpl_CUDA_kernels.hpp index bae79a03d03cd5fb7d5fdc4fbebf1dd7562370ae..588581786546f1eb9442a0c095d6346e1f4e32b7 100644 --- a/include/aidge/backend/cuda/operator/AndImpl_CUDA_kernels.hpp +++ b/include/aidge/backend/cuda/operator/AndImpl_CUDA_kernels.hpp @@ -12,26 +12,26 @@ #ifndef AIDGE_CUDA_OPERATOR_ANDIMPL_KERNELS_H_ #define AIDGE_CUDA_OPERATOR_ANDIMPL_KERNELS_H_ -#include <stdexcept> #include <cfloat> #include <cuda.h> -#include <cuda_runtime_api.h> #include <cuda_fp16.h> +#include <cuda_runtime_api.h> +#include <stdexcept> -#include "aidge/data/Data.hpp" #include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/data/Data.hpp" namespace Aidge { template <class T> -void AndForward(const T* input1, const T* input2, T* output, - const std::vector<int>& input1Dims,const std::vector<int>& input2Dims, - const std::vector<int>& inputStrides, const std::vector<int>& input2Strides,const std::vector<int>& outputStrides, +void AndForward(const T *input1, + const T *input2, + T *output, + const std::vector<int> &input1Dims, + const std::vector<int> &input2Dims, + const std::vector<int> &inputStrides, + const std::vector<int> &input2Strides, + const std::vector<int> &outputStrides, int outSize); } #endif /* AIDGE_CUDA_OPERATOR_ANDIMPL_KERNELS_H_ */ - - - - - diff --git a/include/aidge/backend/cuda/operator/ArgMaxImpl.hpp b/include/aidge/backend/cuda/operator/ArgMaxImpl.hpp index 7b4628084a913a10e48302597a4d5b77fb7f6d16..c20016371e7dfdeed8d09ae0ca58049d456281c6 100644 --- a/include/aidge/backend/cuda/operator/ArgMaxImpl.hpp +++ b/include/aidge/backend/cuda/operator/ArgMaxImpl.hpp @@ -29,10 +29,10 @@ namespace Aidge { // Operator implementation entry point for the backend class ArgMaxImpl_cuda : public OperatorImpl { -public: - ArgMaxImpl_cuda(const ArgMax_Op& op) : OperatorImpl(op, "cuda") {} + public: + ArgMaxImpl_cuda(const ArgMax_Op &op) : OperatorImpl(op, "cuda") {} - static std::unique_ptr<ArgMaxImpl_cuda> create(const ArgMax_Op& op) { + static std::unique_ptr<ArgMaxImpl_cuda> create(const ArgMax_Op &op) { 
return std::make_unique<ArgMaxImpl_cuda>(op); } @@ -46,15 +46,17 @@ public: void forward() override; -private: + private: // CuDNN specific variables std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback; - template <class T> void forward_(const Tensor& input, std::int32_t axis, DimSize_t selectLastIdx); + template <class T> + void + forward_(const Tensor &input, std::int32_t axis, DimSize_t selectLastIdx); }; // Implementation entry point registration to Operator REGISTRAR(ArgMax_Op, "cuda", Aidge::ArgMaxImpl_cuda::create); -} // namespace Aidge +} // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_ARGMAXIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/ArgMaxImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/ArgMaxImpl_CUDA_kernels.hpp index 8c07bf597f6422a26cedd4176fdb1ef29bcabcef..860bb08bb6bd264b10858af432a952762c8c71cc 100644 --- a/include/aidge/backend/cuda/operator/ArgMaxImpl_CUDA_kernels.hpp +++ b/include/aidge/backend/cuda/operator/ArgMaxImpl_CUDA_kernels.hpp @@ -12,20 +12,23 @@ #ifndef AIDGE_CUDA_OPERATOR_ARGMAXIMPL_KERNEL_H_ #define AIDGE_CUDA_OPERATOR_ARGMAXIMPL_KERNEL_H_ -#include <stdexcept> #include <cfloat> #include <cuda.h> -#include <cuda_runtime_api.h> #include <cuda_fp16.h> +#include <cuda_runtime_api.h> +#include <stdexcept> -#include "aidge/data/Data.hpp" #include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/data/Data.hpp" -namespace Aidge -{ - template <class T> - void ArgMax_cuda_forward_kernel(const T* input, T* output, - const std::vector<int>& inputDims, const std::vector<int>& inputStrides, - int axis, int total_elems, std::size_t selectLastIdx); +namespace Aidge { +template <class T> +void ArgMax_cuda_forward_kernel(const T *input, + T *output, + const std::vector<int> &inputDims, + const std::vector<int> &inputStrides, + int axis, + int total_elems, + std::size_t selectLastIdx); } #endif /* AIDGE_CUDA_OPERATOR_ARGMAXIMPL_KERNEL_H_ */ \ No newline at end of file diff --git a/include/aidge/backend/cuda/operator/AvgPoolingImpl.hpp b/include/aidge/backend/cuda/operator/AvgPoolingImpl.hpp index 1c4efcf66850330fe9747c500093efa4456fa3f1..e6f9a4b8b71d15ee3b916677d029591fa26858f5 100644 --- a/include/aidge/backend/cuda/operator/AvgPoolingImpl.hpp +++ b/include/aidge/backend/cuda/operator/AvgPoolingImpl.hpp @@ -28,12 +28,13 @@ namespace Aidge { // Operator implementation entry point for the backend -template <DimIdx_t DIM> -class AvgPoolingImpl_cuda : public OperatorImpl { -public: - AvgPoolingImpl_cuda(const AvgPooling_Op<DIM>& op) : OperatorImpl(op, "cuda") {} +template <DimIdx_t DIM> class AvgPoolingImpl_cuda : public OperatorImpl { + public: + AvgPoolingImpl_cuda(const AvgPooling_Op<DIM> &op) + : OperatorImpl(op, "cuda") {} - static std::unique_ptr<AvgPoolingImpl_cuda> create(const AvgPooling_Op<DIM>& op) { + static std::unique_ptr<AvgPoolingImpl_cuda> + create(const AvgPooling_Op<DIM> &op) { return std::make_unique<AvgPoolingImpl_cuda>(op); } @@ -49,19 +50,19 @@ public: void backward() override; ~AvgPoolingImpl_cuda(); -private: + private: // CuDNN specific variables cudnnPoolingDescriptor_t mAvgPoolingDesc = nullptr; cudnnPoolingMode_t mMode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback; - template <class T> void forward_(const Tensor& input); - template <class T> void backward_(const Tensor& output_grad); + template <class T> void forward_(const Tensor &input); + template <class T> void backward_(const Tensor &output_grad); }; // Implementation entry point 
registration to Operator using AvgPooling2D_Op = AvgPooling_Op<2>; REGISTRAR(AvgPooling2D_Op, "cuda", Aidge::AvgPoolingImpl_cuda<2>::create); -} // namespace Aidge +} // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_AVGPOOLINGIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/BatchNormImpl.hpp b/include/aidge/backend/cuda/operator/BatchNormImpl.hpp index 025ef406fa6a988e758707b11fb2ceab6c829f26..e152653c55340712feadb50f731828b29a548020 100644 --- a/include/aidge/backend/cuda/operator/BatchNormImpl.hpp +++ b/include/aidge/backend/cuda/operator/BatchNormImpl.hpp @@ -28,12 +28,13 @@ namespace Aidge { // Operator implementation entry point for the backend -template <DimIdx_t DIM> -class BatchNormImpl_cuda : public OperatorImpl { -public: - BatchNormImpl_cuda(const BatchNorm_Op<DIM>& op) : OperatorImpl(op, "cuda") {} +template <DimIdx_t DIM> class BatchNormImpl_cuda : public OperatorImpl { + public: + BatchNormImpl_cuda(const BatchNorm_Op<DIM> &op) + : OperatorImpl(op, "cuda") {} - static std::unique_ptr<BatchNormImpl_cuda> create(const BatchNorm_Op<DIM>& op) { + static std::unique_ptr<BatchNormImpl_cuda> + create(const BatchNorm_Op<DIM> &op) { return std::make_unique<BatchNormImpl_cuda>(op); } @@ -49,19 +50,27 @@ public: void backward() override; ~BatchNormImpl_cuda(); -private: + private: // CuDNN specific variables cudnnTensorDescriptor_t mBNDesc = nullptr; cudnnBatchNormMode_t mMode; double mEpsilon; - template <class T> void forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2, const Tensor& input3, const Tensor& input4); - template <class T> void backward_(const Tensor& input0, const Tensor& input1, const Tensor& input2); + template <class T> + void forward_(const Tensor &input0, + const Tensor &input1, + const Tensor &input2, + const Tensor &input3, + const Tensor &input4); + template <class T> + void backward_(const Tensor &input0, + const Tensor &input1, + const Tensor &input2); }; // Implementation entry point registration to Operator using BatchNorm2D_Op = BatchNorm_Op<2>; REGISTRAR(BatchNorm2D_Op, "cuda", Aidge::BatchNormImpl_cuda<2>::create); -} // namespace Aidge +} // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_BATCHNORMIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/ConvImpl.hpp b/include/aidge/backend/cuda/operator/ConvImpl.hpp index 27f3781a6824dd71d228b90c71df58b12ea0a6b3..b58352b8bcba670a5cb571d38c069c663094dedb 100644 --- a/include/aidge/backend/cuda/operator/ConvImpl.hpp +++ b/include/aidge/backend/cuda/operator/ConvImpl.hpp @@ -27,49 +27,53 @@ #include "aidge/backend/cuda/utils/CudaUtils.hpp" - namespace Aidge { // Operator implementation entry point for the backend -template <DimIdx_t DIM> -class ConvImpl_cuda : public OperatorImpl { -public: - ConvImpl_cuda(const Operator&op, bool depthWise = false) : OperatorImpl(op, "cuda"), mDepthWise(depthWise) {} +template <DimIdx_t DIM> class ConvImpl_cuda : public OperatorImpl { + public: + ConvImpl_cuda(const Operator &op, bool depthWise = false) + : OperatorImpl(op, "cuda"), mDepthWise(depthWise) {} - static std::unique_ptr<ConvImpl_cuda<DIM>> create(const Conv_Op<DIM>& op) { + static std::unique_ptr<ConvImpl_cuda<DIM>> create(const Conv_Op<DIM> &op) { return std::make_unique<ConvImpl_cuda<DIM>>(op); } - static std::unique_ptr<ConvImpl_cuda<DIM>> createDW(const ConvDepthWise_Op<DIM> &op) { + static std::unique_ptr<ConvImpl_cuda<DIM>> + createDW(const ConvDepthWise_Op<DIM> &op) { return std::make_unique<ConvImpl_cuda<DIM>>(op, true); } virtual std::vector<ImplSpec> 
getAvailableImplSpecs() const override { - return { - {DataType::Any} - }; + return {{DataType::Any}}; } void forward() override; void backward() override; ~ConvImpl_cuda(); -private: + private: // CuDNN specific variables cudnnConvolutionDescriptor_t mConvDesc = nullptr; cudnnFilterDescriptor_t mFilterDesc = nullptr; - cudnnConvolutionFwdAlgo_t mFwdAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; + cudnnConvolutionFwdAlgo_t mFwdAlgo = + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; cudnnConvolutionBwdFilterAlgo_t mBwdFilterAlgo; cudnnConvolutionBwdDataAlgo_t mBwdDataAlgo; size_t mWorkspaceSize = 0; - void* mFwdWorkspace = nullptr; - void* mBwdWorkspace = nullptr; + void *mFwdWorkspace = nullptr; + void *mBwdWorkspace = nullptr; std::shared_ptr<Tensor> mInput0Fallback; std::shared_ptr<Tensor> mInput1Fallback; std::shared_ptr<Tensor> mInput2Fallback; bool mDepthWise = false; - template <class T> void forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2); - template <class T> void backward_(const Tensor& input0, const Tensor& input1, const Tensor& input2); + template <class T> + void + forward_(const Tensor &input0, const Tensor &input1, const Tensor &input2); + template <class T> + void backward_(const Tensor &input0, + const Tensor &input1, + const Tensor &input2); }; // Implementation entry point registration to Operator @@ -77,6 +81,6 @@ using Conv2D_Op = Conv_Op<2>; using ConvDepthWise2D_Op = ConvDepthWise_Op<2>; REGISTRAR(Conv2D_Op, "cuda", Aidge::ConvImpl_cuda<2>::create); REGISTRAR(ConvDepthWise2D_Op, "cuda", Aidge::ConvImpl_cuda<2>::createDW); -} // namespace Aidge +} // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_CONVIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/DivImpl.hpp b/include/aidge/backend/cuda/operator/DivImpl.hpp index fbd3c73f1741d05549f06290ba9166b8d11c604d..90686a177a0e66701a6410dac811dc25e1472341 100644 --- a/include/aidge/backend/cuda/operator/DivImpl.hpp +++ b/include/aidge/backend/cuda/operator/DivImpl.hpp @@ -29,10 +29,10 @@ namespace Aidge { // Operator implementation entry point for the backend class DivImpl_cuda : public OperatorImpl { -public: - DivImpl_cuda(const Div_Op& op) : OperatorImpl(op, "cuda") {} + public: + DivImpl_cuda(const Div_Op &op) : OperatorImpl(op, "cuda") {} - static std::unique_ptr<DivImpl_cuda> create(const Div_Op& op) { + static std::unique_ptr<DivImpl_cuda> create(const Div_Op &op) { return std::make_unique<DivImpl_cuda>(op); } @@ -47,13 +47,16 @@ public: void forward() override; void backward() override; -private: - template <class T> void forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides); - template <class T> void backward_(const Tensor& outGrad); + private: + template <class T> + void forward_(const std::vector<Tensor> &inputs, + const std::vector<std::vector<int>> &inputsDims, + const std::vector<std::vector<int>> &inputsStrides); + template <class T> void backward_(const Tensor &outGrad); }; // Implementation entry point registration to Operator REGISTRAR(Div_Op, "cuda", Aidge::DivImpl_cuda::create); -} // namespace Aidge +} // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_DIVIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/DivImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/DivImpl_CUDA_kernels.hpp index 512bec77bb63570ffeb8f1681e4e25cd323535fa..cc5999a640eb8342bf24744c35ebf1688a63ccd9 100644 --- a/include/aidge/backend/cuda/operator/DivImpl_CUDA_kernels.hpp +++ 
b/include/aidge/backend/cuda/operator/DivImpl_CUDA_kernels.hpp @@ -12,28 +12,29 @@ #ifndef AIDGE_CUDA_OPERATOR_DIVIMPL_KERNELS_H_ #define AIDGE_CUDA_OPERATOR_DIVIMPL_KERNELS_H_ -#include <stdexcept> #include <cfloat> #include <cuda.h> -#include <cuda_runtime_api.h> #include <cuda_fp16.h> +#include <cuda_runtime_api.h> +#include <stdexcept> -#include "aidge/data/Data.hpp" #include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/data/Data.hpp" #include "aidge/utils/Types.h" namespace Aidge { template <class T> -void divForward(const T* input1, T* output, const T* intput2, - const std::vector<int>& input1Dims,const std::vector<int>& input2Dims, const std::vector<int>& outputDims, - const std::vector<int>& input1Strides, const std::vector<int>& input2Strides,const std::vector<int>& outputStrides, +void divForward(const T *input1, + T *output, + const T *intput2, + const std::vector<int> &input1Dims, + const std::vector<int> &input2Dims, + const std::vector<int> &outputDims, + const std::vector<int> &input1Strides, + const std::vector<int> &input2Strides, + const std::vector<int> &outputStrides, int outSize); } #endif /* AIDGE_CUDA_OPERATOR_DIVIMPL_KERNELS_H_ */ - - - - - diff --git a/include/aidge/backend/cuda/operator/FCImpl.hpp b/include/aidge/backend/cuda/operator/FCImpl.hpp index 8380754ea2419b2baff6de5126f8b6ff3e640178..3bd1da3560efe18018345fb2fe4586e17e80279f 100644 --- a/include/aidge/backend/cuda/operator/FCImpl.hpp +++ b/include/aidge/backend/cuda/operator/FCImpl.hpp @@ -29,10 +29,10 @@ namespace Aidge { // Operator implementation entry point for the backend class FCImpl_cuda : public OperatorImpl { -public: - FCImpl_cuda(const FC_Op& op) : OperatorImpl(op, "cuda") {} + public: + FCImpl_cuda(const FC_Op &op) : OperatorImpl(op, "cuda") {} - static std::unique_ptr<FCImpl_cuda> create(const FC_Op& op) { + static std::unique_ptr<FCImpl_cuda> create(const FC_Op &op) { return std::make_unique<FCImpl_cuda>(op); } @@ -47,17 +47,25 @@ public: void forward() override; void backward() override; -private: + private: std::shared_ptr<Tensor> mInput0Fallback; std::shared_ptr<Tensor> mInput1Fallback; std::shared_ptr<Tensor> mInput2Fallback; - template <class T> void forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2, std::size_t outChannels); - template <class T> void backward_(const Tensor& input0, const Tensor& input1, const Tensor& input2, std::size_t outChannels); + template <class T> + void forward_(const Tensor &input0, + const Tensor &input1, + const Tensor &input2, + std::size_t outChannels); + template <class T> + void backward_(const Tensor &input0, + const Tensor &input1, + const Tensor &input2, + std::size_t outChannels); }; // Implementation entry point registration to Operator REGISTRAR(FC_Op, "cuda", Aidge::FCImpl_cuda::create); -} // namespace Aidge +} // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_FCIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp index a956960df0a4dccb4ef9eb0634e5f61b9ddede0a..a3ecb6adad818a11d90dfebff33710a2bb36ea60 100644 --- a/include/aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp +++ b/include/aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp @@ -12,34 +12,45 @@ #ifndef AIDGE_CUDA_OPERATOR_FCIMPL_KERNELS_H_ #define AIDGE_CUDA_OPERATOR_FCIMPL_KERNELS_H_ -#include <stdexcept> #include <cfloat> #include <cuda.h> -#include <cuda_runtime_api.h> #include <cuda_fp16.h> +#include <cuda_runtime_api.h> +#include <stdexcept> 
-#include "aidge/data/Data.hpp" #include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/data/Data.hpp" namespace Aidge { template <class T> cublasStatus_t cublasGemm(cublasHandle_t handle, - cublasOperation_t transa, cublasOperation_t transb, - int m, int n, int k, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, const T *alpha, - const T *A, int lda, - const T *B, int ldb, + const T *A, + int lda, + const T *B, + int ldb, const T *beta, - T *C, int ldc); + T *C, + int ldc); template <class T> -cublasStatus_t cublasGemv(cublasHandle_t handle, cublasOperation_t trans, - int m, int n, - const T *alpha, - const T *A, int lda, - const T *x, int incx, +cublasStatus_t cublasGemv(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + const T *alpha, + const T *A, + int lda, + const T *x, + int incx, const T *beta, - T *y, int incy); -} + T *y, + int incy); +} // namespace Aidge #endif /* AIDGE_CUDA_OPERATOR_FCIMPL_KERNELS_H_ */ \ No newline at end of file diff --git a/include/aidge/backend/cuda/operator/GlobalAveragePoolingImpl.hpp b/include/aidge/backend/cuda/operator/GlobalAveragePoolingImpl.hpp index 5b0cf07ab8687b9746d13af2274465ad923e6571..65c97376dae44fc346dd98a0e2b0fc6f85378682 100644 --- a/include/aidge/backend/cuda/operator/GlobalAveragePoolingImpl.hpp +++ b/include/aidge/backend/cuda/operator/GlobalAveragePoolingImpl.hpp @@ -29,35 +29,37 @@ namespace Aidge { // Operator implementation entry point for the backend class GlobalAveragePoolingImpl_cuda : public OperatorImpl { -public: - GlobalAveragePoolingImpl_cuda(const GlobalAveragePooling_Op& op) : OperatorImpl(op, "cuda") {} + public: + GlobalAveragePoolingImpl_cuda(const GlobalAveragePooling_Op &op) + : OperatorImpl(op, "cuda") {} - static std::unique_ptr<GlobalAveragePoolingImpl_cuda> create(const GlobalAveragePooling_Op& op) { + static std::unique_ptr<GlobalAveragePoolingImpl_cuda> + create(const GlobalAveragePooling_Op &op) { return std::make_unique<GlobalAveragePoolingImpl_cuda>(op); } virtual std::vector<ImplSpec> getAvailableImplSpecs() const override { - return { - {DataType::Any} - }; + return {{DataType::Any}}; } void forward() override; void backward() override; ~GlobalAveragePoolingImpl_cuda(); -private: + private: // CuDNN specific variables cudnnPoolingDescriptor_t mGlobalAveragePoolingDesc = nullptr; cudnnPoolingMode_t mMode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback; - template <class T> void forward_(const Tensor& input); - template <class T> void backward_(const Tensor& output_grad); + template <class T> void forward_(const Tensor &input); + template <class T> void backward_(const Tensor &output_grad); }; // Implementation entry point registration to Operator -REGISTRAR(GlobalAveragePooling_Op, "cuda", Aidge::GlobalAveragePoolingImpl_cuda::create); -} // namespace Aidge +REGISTRAR(GlobalAveragePooling_Op, + "cuda", + Aidge::GlobalAveragePoolingImpl_cuda::create); +} // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_GLOBALAVERAGEPOOLINGIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/ILayerNormImpl.hpp b/include/aidge/backend/cuda/operator/ILayerNormImpl.hpp index 0d858c4719899094f996ca4f82f075df547a6fd4..01e0dad76522fc55c638a6c51c29d46297601bb4 100644 --- a/include/aidge/backend/cuda/operator/ILayerNormImpl.hpp +++ b/include/aidge/backend/cuda/operator/ILayerNormImpl.hpp @@ -30,10 +30,11 @@ namespace Aidge { class ILayerNormImpl_cuda : public OperatorImpl { -public: + public: 
ILayerNormImpl_cuda(const ILayerNorm_Op &op) : OperatorImpl(op, "cuda") {} - static std::unique_ptr<ILayerNormImpl_cuda> create(const ILayerNorm_Op &op) { + static std::unique_ptr<ILayerNormImpl_cuda> + create(const ILayerNorm_Op &op) { return std::make_unique<ILayerNormImpl_cuda>(op); } @@ -48,18 +49,20 @@ public: void forward() override; void backward() override; -private: + private: std::shared_ptr<Tensor> mInput0Fallback; std::shared_ptr<Tensor> mInput1Fallback; std::shared_ptr<Tensor> mInput2Fallback; std::shared_ptr<Tensor> mOutputGradFallback; - template <class T> void forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2); - template <class T> void backward_(const Tensor& output_grad); + template <class T> + void + forward_(const Tensor &input0, const Tensor &input1, const Tensor &input2); + template <class T> void backward_(const Tensor &output_grad); }; // Implementation entry point registration to Operator REGISTRAR(ILayerNorm_Op, "cuda", Aidge::ILayerNormImpl_cuda::create); -} // namespace Aidge +} // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_ILAYERNORMIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/ILayerNormImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/ILayerNormImpl_CUDA_kernels.hpp index aa54029ea29bc46809f227038a1a23d91bc161ee..c9269380544babc7de7c50065e275155d9bafd8e 100644 --- a/include/aidge/backend/cuda/operator/ILayerNormImpl_CUDA_kernels.hpp +++ b/include/aidge/backend/cuda/operator/ILayerNormImpl_CUDA_kernels.hpp @@ -14,79 +14,113 @@ #ifndef AIDGE_CUDA_OPERATOR_ILAYERNORMIMPL_FORWARD_KERNEL_H_ #define AIDGE_CUDA_OPERATOR_ILAYERNORMIMPL_FORWARD_KERNEL_H_ -#include <stdexcept> #include <cfloat> #include <cuda.h> -#include <cuda_runtime_api.h> #include <cuda_fp16.h> +#include <cuda_runtime_api.h> +#include <stdexcept> -#include "aidge/data/Data.hpp" #include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/data/Data.hpp" namespace Aidge { /** - * @brief Compute the forward for ILayerNorm - * @param input: Input tensor - * @param SF: Scaling factor of input tensor - * @param dims: Dimensions of input tensor - * @param quantized_tensor: Quantized output tensor - * @param square_tensor: Tensor use for computation - * @param weight: weight of ILayerNorm layer - * @param bias: bias of ILayerNorm layer - * @param new_SF: Scaling factor of output that can be use to dequantify -*/ + * @brief Compute the forward for ILayerNorm + * @param input: Input tensor + * @param SF: Scaling factor of input tensor + * @param dims: Dimensions of input tensor + * @param quantized_tensor: Quantized output tensor + * @param square_tensor: Tensor use for computation + * @param weight: weight of ILayerNorm layer + * @param bias: bias of ILayerNorm layer + * @param new_SF: Scaling factor of output that can be use to dequantify + */ template <class T> -__global__ void ILayerNormforward_(T* input, double SF, int* dims, int* quantized_tensor,long long int* square_tensor, T* weight, T* biase, double new_SF); +__global__ void ILayerNormforward_(T *input, + double SF, + int *dims, + int *quantized_tensor, + long long int *square_tensor, + T *weight, + T *biase, + double new_SF); /** - * @brief Wrapper function to execute ILayerNormforward_ - * @note Output correspond to the non-quantized tensor, to obtain the quantized tensor we need to copy quantized_tensor and not input_cuda_tensor - * @param input: Input tensor - * @param output: Output tensor (not quantized) - * @param SF: Scaling factor of input tensor - * @param weight_raw: weight 
of ILayerNorm layer - * @param bias_raw: bias of ILayerNorm layer - * @param size: Number of elements in the input tensor - * @param dims: Dimensions of input tensor -*/ + * @brief Wrapper function to execute ILayerNormforward_ + * @note Output correspond to the non-quantized tensor, to obtain the quantized + * tensor we need to copy quantized_tensor and not input_cuda_tensor + * @param input: Input tensor + * @param output: Output tensor (not quantized) + * @param SF: Scaling factor of input tensor + * @param weight_raw: weight of ILayerNorm layer + * @param bias_raw: bias of ILayerNorm layer + * @param size: Number of elements in the input tensor + * @param dims: Dimensions of input tensor + */ template <class T> -void ILayerNormforward(const T* input, T* output, double SF, const T* weight_raw, const T* bias_raw, size_t size, std::vector<long unsigned int> dims_input); +void ILayerNormforward(const T *input, + T *output, + double SF, + const T *weight_raw, + const T *bias_raw, + size_t size, + std::vector<long unsigned int> dims_input); /** - * @brief Compute the backward for ILayerNorm - * @param output_grad: Gradient of output tensor - * @param input_tensor: Input tensor - * @param output_tensor: Output tensor obtained after forward - * @param mean: Arithmetic mean of input tensor - * @param var: Arithmetic variance of input tensor - * @param weight: weight of ILayerNorm layer - * @param bias: bias of ILayerNorm layer - * @param input_grad: Gradient of input tensor - * @param weight_grad: Gradient of ILayerNorm weight - * @param bias_grad: Gradient of ILayerNorm bias - * @param size: Number of elements in the input tensor -*/ + * @brief Compute the backward for ILayerNorm + * @param output_grad: Gradient of output tensor + * @param input_tensor: Input tensor + * @param output_tensor: Output tensor obtained after forward + * @param mean: Arithmetic mean of input tensor + * @param var: Arithmetic variance of input tensor + * @param weight: weight of ILayerNorm layer + * @param bias: bias of ILayerNorm layer + * @param input_grad: Gradient of input tensor + * @param weight_grad: Gradient of ILayerNorm weight + * @param bias_grad: Gradient of ILayerNorm bias + * @param size: Number of elements in the input tensor + */ template <class T> -__global__ void ILayerNormbackward_(T* output_grad, T* input_tensor, T* output_tensor, T* mean, T* var, T* weight, T* bias, T* input_grad, T* weight_grad, T* bias_grad, int size); +__global__ void ILayerNormbackward_(T *output_grad, + T *input_tensor, + T *output_tensor, + T *mean, + T *var, + T *weight, + T *bias, + T *input_grad, + T *weight_grad, + T *bias_grad, + int size); /** - * @brief Wrapper function to execute ILayerNormbackward_ - * @param input_tensor: Input tensor - * @param output_grad: Gradient of output tensor - * @param output_tensor: Output tensor obtained after forward - * @param mean: Arithmetic mean of input tensor - * @param var: Arithmetic variance of input tensor - * @param weight: weight of ILayerNorm layer - * @param bias: bias of ILayerNorm layer - * @param input_grad: Gradient of input tensor - * @param weight_grad: Gradient of ILayerNorm weight - * @param bias_grad: Gradient of ILayerNorm bias - * @param size: Number of elements in the input tensor -*/ + * @brief Wrapper function to execute ILayerNormbackward_ + * @param input_tensor: Input tensor + * @param output_grad: Gradient of output tensor + * @param output_tensor: Output tensor obtained after forward + * @param mean: Arithmetic mean of input tensor + * @param var: 
Arithmetic variance of input tensor + * @param weight: weight of ILayerNorm layer + * @param bias: bias of ILayerNorm layer + * @param input_grad: Gradient of input tensor + * @param weight_grad: Gradient of ILayerNorm weight + * @param bias_grad: Gradient of ILayerNorm bias + * @param size: Number of elements in the input tensor + */ template <class T> -void ILayerNormbackward(const T* input_tensor, const T* output_grad, const T* output_tensor,const T* mean,const T* var, const T* weight, const T* bias, T* input_grad, T* weight_grad, T* bias_grad, size_t size); +void ILayerNormbackward(const T *input_tensor, + const T *output_grad, + const T *output_tensor, + const T *mean, + const T *var, + const T *weight, + const T *bias, + T *input_grad, + T *weight_grad, + T *bias_grad, + size_t size); -} +} // namespace Aidge #endif /* AIDGE_CUDA_OPERATOR_ILAYERNORMIMPL_FORWARD_KERNEL_H_ */ \ No newline at end of file diff --git a/include/aidge/backend/cuda/operator/LnImpl.hpp b/include/aidge/backend/cuda/operator/LnImpl.hpp index fbbccc11275b5c11bbaa86d05a2c19a1a46c11c1..a72ddb0dbd7bc37202bf96fbad9681bfc11f5a72 100644 --- a/include/aidge/backend/cuda/operator/LnImpl.hpp +++ b/include/aidge/backend/cuda/operator/LnImpl.hpp @@ -29,10 +29,10 @@ namespace Aidge { // Operator implementation entry point for the backend class LnImpl_cuda : public OperatorImpl { -public: - LnImpl_cuda(const Ln_Op& op) : OperatorImpl(op, "cuda") {} + public: + LnImpl_cuda(const Ln_Op &op) : OperatorImpl(op, "cuda") {} - static std::unique_ptr<LnImpl_cuda> create(const Ln_Op& op) { + static std::unique_ptr<LnImpl_cuda> create(const Ln_Op &op) { return std::make_unique<LnImpl_cuda>(op); } @@ -47,16 +47,16 @@ public: void forward() override; void backward() override; -private: + private: std::shared_ptr<Tensor> mInputFallback; std::shared_ptr<Tensor> mOutputGradFallback; - template <class T> void forward_(const Tensor& input); - template <class T> void backward_(const Tensor& output_grad); + template <class T> void forward_(const Tensor &input); + template <class T> void backward_(const Tensor &output_grad); }; // Implementation entry point registration to Operator REGISTRAR(Ln_Op, "cuda", Aidge::LnImpl_cuda::create); -} // namespace Aidge +} // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_LNIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/LnImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/LnImpl_CUDA_kernels.hpp index 9652d88116ca2cac92abbc517f8bc650655f43cc..305a201edd3cb9169630cf554a005715df3c4970 100644 --- a/include/aidge/backend/cuda/operator/LnImpl_CUDA_kernels.hpp +++ b/include/aidge/backend/cuda/operator/LnImpl_CUDA_kernels.hpp @@ -12,25 +12,19 @@ #ifndef AIDGE_CUDA_OPERATOR_LNIMPL_KERNELS_H_ #define AIDGE_CUDA_OPERATOR_LNIMPL_KERNELS_H_ -#include <stdexcept> #include <cfloat> #include <cuda.h> -#include <cuda_runtime_api.h> #include <cuda_fp16.h> +#include <cuda_runtime_api.h> +#include <stdexcept> -#include "aidge/data/Data.hpp" #include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/data/Data.hpp" #include "aidge/utils/Types.h" namespace Aidge { -template <class T> -void lnForward(const T* input, T* output, int size); +template <class T> void lnForward(const T *input, T *output, int size); } #endif /* AIDGE_CUDA_OPERATOR_LNIMPL_KERNELS_H_ */ - - - - - diff --git a/include/aidge/backend/cuda/operator/MaxPoolingImpl.hpp b/include/aidge/backend/cuda/operator/MaxPoolingImpl.hpp index 474a408f9697e8e91ffe9c8e2a79a79d7968e80a..44e09d0b8d5ed94c698d3774db2ac320bfd1fca8 100644 --- 
a/include/aidge/backend/cuda/operator/MaxPoolingImpl.hpp +++ b/include/aidge/backend/cuda/operator/MaxPoolingImpl.hpp @@ -28,38 +28,37 @@ namespace Aidge { // Operator implementation entry point for the backend -template <DimIdx_t DIM> -class MaxPoolingImpl_cuda : public OperatorImpl { -public: - MaxPoolingImpl_cuda(const MaxPooling_Op<DIM>& op) : OperatorImpl(op, "cuda") {} +template <DimIdx_t DIM> class MaxPoolingImpl_cuda : public OperatorImpl { + public: + MaxPoolingImpl_cuda(const MaxPooling_Op<DIM> &op) + : OperatorImpl(op, "cuda") {} - static std::unique_ptr<MaxPoolingImpl_cuda> create(const MaxPooling_Op<DIM>& op) { + static std::unique_ptr<MaxPoolingImpl_cuda> + create(const MaxPooling_Op<DIM> &op) { return std::make_unique<MaxPoolingImpl_cuda>(op); } virtual std::vector<ImplSpec> getAvailableImplSpecs() const override { - return { - {DataType::Any} - }; + return {{DataType::Any}}; } void forward() override; void backward() override; ~MaxPoolingImpl_cuda(); -private: + private: // CuDNN specific variables cudnnPoolingDescriptor_t mMaxPoolingDesc = nullptr; cudnnPoolingMode_t mMode = CUDNN_POOLING_MAX; std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback; - template <class T> void forward_(const Tensor& input); - template <class T> void backward_(const Tensor& output_grad); + template <class T> void forward_(const Tensor &input); + template <class T> void backward_(const Tensor &output_grad); }; // Implementation entry point registration to Operator using MaxPooling2D_Op = MaxPooling_Op<2>; REGISTRAR(MaxPooling2D_Op, "cuda", Aidge::MaxPoolingImpl_cuda<2>::create); -} // namespace Aidge +} // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_MAXPOOLINGIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/MulImpl.hpp b/include/aidge/backend/cuda/operator/MulImpl.hpp index 9a1a4d79d32c7a962d2086319d948e60a9f51049..d4995a1a7ecbdf8c6777769fe68f9e5c851e917c 100644 --- a/include/aidge/backend/cuda/operator/MulImpl.hpp +++ b/include/aidge/backend/cuda/operator/MulImpl.hpp @@ -29,10 +29,10 @@ namespace Aidge { // Operator implementation entry point for the backend class MulImpl_cuda : public OperatorImpl { -public: - MulImpl_cuda(const Mul_Op& op) : OperatorImpl(op, "cuda") {} + public: + MulImpl_cuda(const Mul_Op &op) : OperatorImpl(op, "cuda") {} - static std::unique_ptr<MulImpl_cuda> create(const Mul_Op& op) { + static std::unique_ptr<MulImpl_cuda> create(const Mul_Op &op) { return std::make_unique<MulImpl_cuda>(op); } @@ -47,13 +47,19 @@ public: void forward() override; void backward() override; -private: - template <class T> void forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides); - template <class T> void backward_(const Tensor& outputGrad, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides); + private: + template <class T> + void forward_(const std::vector<Tensor> &inputs, + const std::vector<std::vector<int>> &inputsDims, + const std::vector<std::vector<int>> &inputsStrides); + template <class T> + void backward_(const Tensor &outputGrad, + const std::vector<std::vector<int>> &inputsDims, + const std::vector<std::vector<int>> &inputsStrides); }; // Implementation entry point registration to Operator REGISTRAR(Mul_Op, "cuda", Aidge::MulImpl_cuda::create); -} // namespace Aidge +} // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_MULIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/PadImpl.hpp 
b/include/aidge/backend/cuda/operator/PadImpl.hpp index a0f7037c811cd3cb130cffed0bb7746e33220074..14482b2c78780100e4b301235834665ed92e7441 100644 --- a/include/aidge/backend/cuda/operator/PadImpl.hpp +++ b/include/aidge/backend/cuda/operator/PadImpl.hpp @@ -28,12 +28,11 @@ namespace Aidge { // Operator implementation entry point for the backend -template <DimIdx_t DIM> -class PadImpl_cuda : public OperatorImpl { -public: - PadImpl_cuda(const Pad_Op<DIM>& op) : OperatorImpl(op, "cuda") {} +template <DimIdx_t DIM> class PadImpl_cuda : public OperatorImpl { + public: + PadImpl_cuda(const Pad_Op<DIM> &op) : OperatorImpl(op, "cuda") {} - static std::unique_ptr<PadImpl_cuda> create(const Pad_Op<DIM>& op) { + static std::unique_ptr<PadImpl_cuda> create(const Pad_Op<DIM> &op) { return std::make_unique<PadImpl_cuda>(op); } @@ -48,20 +47,20 @@ public: void forward() override; void backward() override; -private: + private: // CuDNN specific variables std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback; int mLeftPad, mTopPad; double mPadVal; unsigned int mPadType; - template <class T> void forward_(const Tensor& input); - template <class T> void backward_(const Tensor& outGrad); + template <class T> void forward_(const Tensor &input); + template <class T> void backward_(const Tensor &outGrad); }; // Implementation entry point registration to Operator using Pad2D_Op = Pad_Op<2>; REGISTRAR(Pad2D_Op, "cuda", Aidge::PadImpl_cuda<2>::create); -} // namespace Aidge +} // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_PADIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/PadImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/PadImpl_CUDA_kernels.hpp index 11ddb0ea8b0e6603bf009c4ae0a7fa3247a8904f..5924a65d43b15946f3adeeea4ad9992fcbe90e8e 100644 --- a/include/aidge/backend/cuda/operator/PadImpl_CUDA_kernels.hpp +++ b/include/aidge/backend/cuda/operator/PadImpl_CUDA_kernels.hpp @@ -12,26 +12,25 @@ #ifndef AIDGE_CUDA_OPERATOR_PADIMPL_KERNELS_H_ #define AIDGE_CUDA_OPERATOR_PADIMPL_KERNELS_H_ -#include "aidge/data/Data.hpp" #include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/data/Data.hpp" -namespace Aidge -{ +namespace Aidge { - template <class T> - void cudaPadding(const cudaDeviceProp &deviceProp, - unsigned int nbOutputs, - unsigned int outputsWidth, - unsigned int outputsHeight, - unsigned int nbChannels, - unsigned int batchSize, - unsigned int inputWidth, - unsigned int inputHeight, - int leftPad, - int topPad, - unsigned int padType, - T padValue, - const T *input, - T *outputs); +template <class T> +void cudaPadding(const cudaDeviceProp &deviceProp, + unsigned int nbOutputs, + unsigned int outputsWidth, + unsigned int outputsHeight, + unsigned int nbChannels, + unsigned int batchSize, + unsigned int inputWidth, + unsigned int inputHeight, + int leftPad, + int topPad, + unsigned int padType, + T padValue, + const T *input, + T *outputs); } #endif /* AIDGE_CUDA_OPERATOR_PADIMPL_KERNELS_H_ */ \ No newline at end of file diff --git a/include/aidge/backend/cuda/operator/PowImpl.hpp b/include/aidge/backend/cuda/operator/PowImpl.hpp index 9b53d8dc04985794238f79cff9c78c44408fb6d7..5a8b31978bc90de4334ec3adb8ef373d6a4720bd 100644 --- a/include/aidge/backend/cuda/operator/PowImpl.hpp +++ b/include/aidge/backend/cuda/operator/PowImpl.hpp @@ -29,10 +29,10 @@ namespace Aidge { // Operator implementation entry point for the backend class PowImpl_cuda : public OperatorImpl { -public: - PowImpl_cuda(const Pow_Op& op) : OperatorImpl(op, "cuda") {} + public: + 
PowImpl_cuda(const Pow_Op &op) : OperatorImpl(op, "cuda") {} - static std::unique_ptr<PowImpl_cuda> create(const Pow_Op& op) { + static std::unique_ptr<PowImpl_cuda> create(const Pow_Op &op) { return std::make_unique<PowImpl_cuda>(op); } @@ -47,13 +47,16 @@ public: void forward() override; void backward() override; -private: - template <class T> void forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides); - template <class T> void backward_(const Tensor& outGrad); + private: + template <class T> + void forward_(const std::vector<Tensor> &inputs, + const std::vector<std::vector<int>> &inputsDims, + const std::vector<std::vector<int>> &inputsStrides); + template <class T> void backward_(const Tensor &outGrad); }; // Implementation entry point registration to Operator REGISTRAR(Pow_Op, "cuda", Aidge::PowImpl_cuda::create); -} // namespace Aidge +} // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_POWIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/PowImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/PowImpl_CUDA_kernels.hpp index e89bea53ba766b0bd90f0c7acd631b0370d96298..84de2e6b9758cd2b49fc29bf28e06092d166726d 100644 --- a/include/aidge/backend/cuda/operator/PowImpl_CUDA_kernels.hpp +++ b/include/aidge/backend/cuda/operator/PowImpl_CUDA_kernels.hpp @@ -12,27 +12,28 @@ #ifndef AIDGE_CUDA_OPERATOR_POWIMPL_KERNELS_H_ #define AIDGE_CUDA_OPERATOR_POWIMPL_KERNELS_H_ -#include <stdexcept> #include <cfloat> #include <cuda.h> -#include <cuda_runtime_api.h> #include <cuda_fp16.h> +#include <cuda_runtime_api.h> +#include <stdexcept> -#include "aidge/data/Data.hpp" #include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/data/Data.hpp" namespace Aidge { template <class T> -void powForward(const T* input, T* output, const T* exponent, - const std::vector<int>& inputDims,const std::vector<int>& exponentDims, const std::vector<int>& outputDims, - const std::vector<int>& inputStrides, const std::vector<int>& exponentStrides,const std::vector<int>& outputStrides, +void powForward(const T *input, + T *output, + const T *exponent, + const std::vector<int> &inputDims, + const std::vector<int> &exponentDims, + const std::vector<int> &outputDims, + const std::vector<int> &inputStrides, + const std::vector<int> &exponentStrides, + const std::vector<int> &outputStrides, int outSize); } #endif /* AIDGE_CUDA_OPERATOR_POWIMPL_KERNELS_H_ */ - - - - - diff --git a/include/aidge/backend/cuda/operator/ReLUImpl.hpp b/include/aidge/backend/cuda/operator/ReLUImpl.hpp index 306a56c4d0959dc4d818a6791173c375f5435360..7f8aaaf1b6e96a0b8708b00ddb458058e8ca88b3 100644 --- a/include/aidge/backend/cuda/operator/ReLUImpl.hpp +++ b/include/aidge/backend/cuda/operator/ReLUImpl.hpp @@ -29,39 +29,37 @@ namespace Aidge { // Operator implementation entry point for the backend class ReLUImpl_cuda : public OperatorImpl { -public: - ReLUImpl_cuda(const ReLU_Op& op) : OperatorImpl(op, "cuda") {} + public: + ReLUImpl_cuda(const ReLU_Op &op) : OperatorImpl(op, "cuda") {} - static std::unique_ptr<ReLUImpl_cuda> create(const ReLU_Op& op) { + static std::unique_ptr<ReLUImpl_cuda> create(const ReLU_Op &op) { return std::make_unique<ReLUImpl_cuda>(op); } virtual std::vector<ImplSpec> getAvailableImplSpecs() const override { - return { - {DataType::Any} - }; + return {{DataType::Any}}; } void forward() override; void backward() override; ~ReLUImpl_cuda(); -private: - // CuDNN specific variables - #if CUDNN_VERSION >= 5000 - 
cudnnActivationDescriptor_t mReLUDesc = nullptr; - #else - cudnnActivationMode_t mReLUDesc = nullptr; - #endif + private: +// CuDNN specific variables +#if CUDNN_VERSION >= 5000 + cudnnActivationDescriptor_t mReLUDesc = nullptr; +#else + cudnnActivationMode_t mReLUDesc = nullptr; +#endif std::shared_ptr<Tensor> mInputFallback; std::shared_ptr<Tensor> mOutputGradFallback; - template <class T> void forward_(const Tensor& input); - template <class T> void backward_(const Tensor& output_grad); + template <class T> void forward_(const Tensor &input); + template <class T> void backward_(const Tensor &output_grad); }; // Implementation entry point registration to Operator REGISTRAR(ReLU_Op, "cuda", Aidge::ReLUImpl_cuda::create); -} // namespace Aidge +} // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_RELUIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/ReduceImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/ReduceImpl_CUDA_kernels.hpp index 9d352b8b1d14aeaa4230accd7aa81c279c18b7a8..385f274843490267ddb753c261eb6e60c4dd95c3 100644 --- a/include/aidge/backend/cuda/operator/ReduceImpl_CUDA_kernels.hpp +++ b/include/aidge/backend/cuda/operator/ReduceImpl_CUDA_kernels.hpp @@ -12,19 +12,18 @@ #ifndef AIDGE_CUDA_OPERATOR_REDUCEIMPL_KERNEL_H_ #define AIDGE_CUDA_OPERATOR_REDUCEIMPL_KERNEL_H_ -#include "aidge/data/Data.hpp" #include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/data/Data.hpp" -namespace Aidge -{ +namespace Aidge { - template <class T> - void ReduceBackward(const T* input, - T* output, - const std::vector<std::size_t>& inputDims, - const std::vector<std::size_t>& outputDims, - const std::vector<int>& axes, - const std::vector<std::size_t>& factors, - int outSize); +template <class T> +void ReduceBackward(const T *input, + T *output, + const std::vector<std::size_t> &inputDims, + const std::vector<std::size_t> &outputDims, + const std::vector<int> &axes, + const std::vector<std::size_t> &factors, + int outSize); } #endif /* AIDGE_CUDA_OPERATOR_REDUCEIMPL_KERNEL_H_ */ \ No newline at end of file diff --git a/include/aidge/backend/cuda/operator/ReduceMeanImpl.hpp b/include/aidge/backend/cuda/operator/ReduceMeanImpl.hpp index 1f6878480d69e19f8c73a12862cc12b2d675440d..3eda27795289871bbe82414c6ea5e618b1608060 100644 --- a/include/aidge/backend/cuda/operator/ReduceMeanImpl.hpp +++ b/include/aidge/backend/cuda/operator/ReduceMeanImpl.hpp @@ -29,10 +29,11 @@ namespace Aidge { // Operator implementation entry point for the backend class ReduceMeanImpl_cuda : public OperatorImpl { -public: - ReduceMeanImpl_cuda(const ReduceMean_Op& op) : OperatorImpl(op, "cuda") {} + public: + ReduceMeanImpl_cuda(const ReduceMean_Op &op) : OperatorImpl(op, "cuda") {} - static std::unique_ptr<ReduceMeanImpl_cuda> create(const ReduceMean_Op& op) { + static std::unique_ptr<ReduceMeanImpl_cuda> + create(const ReduceMean_Op &op) { return std::make_unique<ReduceMeanImpl_cuda>(op); } @@ -47,16 +48,19 @@ public: void forward() override; void backward() override; -private: + private: // CuDNN specific variables std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback; - template <class T> void forward_(const Tensor& input, const std::vector<int>& axes, bool keepDims); - template <class T> void backward_(const Tensor& output_grad, const std::vector<int>& axes); + template <class T> + void + forward_(const Tensor &input, const std::vector<int> &axes, bool keepDims); + template <class T> + void backward_(const Tensor &output_grad, const std::vector<int> &axes); }; // Implementation 
entry point registration to Operator REGISTRAR(ReduceMean_Op, "cuda", Aidge::ReduceMeanImpl_cuda::create); -} // namespace Aidge +} // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_REDUCEMEANIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/ReduceSumImpl.hpp b/include/aidge/backend/cuda/operator/ReduceSumImpl.hpp index 10af90ba3a4ffc1d1464dd73f15313315b0c0032..16538964b7dced5082e41fd0416593af98694e13 100644 --- a/include/aidge/backend/cuda/operator/ReduceSumImpl.hpp +++ b/include/aidge/backend/cuda/operator/ReduceSumImpl.hpp @@ -29,10 +29,10 @@ namespace Aidge { // Operator implementation entry point for the backend class ReduceSumImpl_cuda : public OperatorImpl { -public: - ReduceSumImpl_cuda(const ReduceSum_Op& op) : OperatorImpl(op, "cuda") {} + public: + ReduceSumImpl_cuda(const ReduceSum_Op &op) : OperatorImpl(op, "cuda") {} - static std::unique_ptr<ReduceSumImpl_cuda> create(const ReduceSum_Op& op) { + static std::unique_ptr<ReduceSumImpl_cuda> create(const ReduceSum_Op &op) { return std::make_unique<ReduceSumImpl_cuda>(op); } @@ -47,16 +47,19 @@ public: void forward() override; void backward() override; -private: + private: // CuDNN specific variables std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback; - template <class T> void forward_(const Tensor& input, const std::vector<int>& axes, bool keepDims); - template <class T> void backward_(const Tensor& output_grad, const std::vector<int>& axes); + template <class T> + void + forward_(const Tensor &input, const std::vector<int> &axes, bool keepDims); + template <class T> + void backward_(const Tensor &output_grad, const std::vector<int> &axes); }; // Implementation entry point registration to Operator REGISTRAR(ReduceSum_Op, "cuda", Aidge::ReduceSumImpl_cuda::create); -} // namespace Aidge +} // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_REDUCESUMIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/ReshapeImpl.hpp b/include/aidge/backend/cuda/operator/ReshapeImpl.hpp index 2c8ebd68cff0313031279f83109043eb17d919b5..e8e231b9d66bf8016dee86a22d73282c3dc7ea67 100644 --- a/include/aidge/backend/cuda/operator/ReshapeImpl.hpp +++ b/include/aidge/backend/cuda/operator/ReshapeImpl.hpp @@ -29,10 +29,10 @@ namespace Aidge { // Operator implementation entry point for the backend class ReshapeImpl_cuda : public OperatorImpl { -public: - ReshapeImpl_cuda(const Reshape_Op& op) : OperatorImpl(op, "cuda") {} + public: + ReshapeImpl_cuda(const Reshape_Op &op) : OperatorImpl(op, "cuda") {} - static std::unique_ptr<ReshapeImpl_cuda> create(const Reshape_Op& op) { + static std::unique_ptr<ReshapeImpl_cuda> create(const Reshape_Op &op) { return std::make_unique<ReshapeImpl_cuda>(op); } @@ -47,12 +47,12 @@ public: void forward() override; void backward() override; -private: + private: std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback; }; // Implementation entry point registration to Operator REGISTRAR(Reshape_Op, "cuda", Aidge::ReshapeImpl_cuda::create); -} // namespace Aidge +} // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_RESHAPEIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/ShiftGELUImpl.hpp b/include/aidge/backend/cuda/operator/ShiftGELUImpl.hpp index 1eff6dfbb1777d8dbd823d7bc9b94894bb2646b9..c501d0f3ef9ad6249fd15f90f67a51cb2dae6fed 100644 --- a/include/aidge/backend/cuda/operator/ShiftGELUImpl.hpp +++ b/include/aidge/backend/cuda/operator/ShiftGELUImpl.hpp @@ -30,7 +30,7 @@ namespace Aidge { class ShiftGELUImpl_cuda : public OperatorImpl { -public: + public: 
ShiftGELUImpl_cuda(const ShiftGELU_Op &op) : OperatorImpl(op, "cuda") {} static std::unique_ptr<ShiftGELUImpl_cuda> create(const ShiftGELU_Op &op) { @@ -45,21 +45,19 @@ public: }; } - void forward() override; void backward() override; -private: + private: std::shared_ptr<Tensor> mInputFallback; std::shared_ptr<Tensor> mOutputGradFallback; - template <class T> void forward_(const Tensor& input); - template <class T> void backward_(const Tensor& output_grad); - + template <class T> void forward_(const Tensor &input); + template <class T> void backward_(const Tensor &output_grad); }; // Implementation entry point registration to Operator REGISTRAR(ShiftGELU_Op, "cuda", Aidge::ShiftGELUImpl_cuda::create); -} // namespace Aidge +} // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_SHIFTGELUIMPL_H_ */ \ No newline at end of file diff --git a/include/aidge/backend/cuda/operator/ShiftGELUImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/ShiftGELUImpl_CUDA_kernels.hpp index 14268521451a631ccb9194d44ed7543af8d494f5..4ac4cd96fcdc59901282447d3e6e729988ee4d5e 100644 --- a/include/aidge/backend/cuda/operator/ShiftGELUImpl_CUDA_kernels.hpp +++ b/include/aidge/backend/cuda/operator/ShiftGELUImpl_CUDA_kernels.hpp @@ -14,65 +14,91 @@ #ifndef AIDGE_CUDA_OPERATOR_SHIFTGELUIMPL_KERNELS_H_ #define AIDGE_CUDA_OPERATOR_SHIFTGELUIMPL_KERNELS_H_ -#include <stdexcept> #include <cfloat> #include <cuda.h> -#include <cuda_runtime_api.h> #include <cuda_fp16.h> +#include <cuda_runtime_api.h> +#include <stdexcept> -#include "aidge/data/Data.hpp" #include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/data/Data.hpp" namespace Aidge { /** - * @brief Compute the forward for ShiftGELU - * @param input: Input tensor - * @param quantized_tensor: Quantized output tensor - * @param GELUtensor: Pointer to an empty memory block allocated on the GPU (just use for computation) - * @param SumTensor: Pointer to an empty memory block allocated on the GPU (just use for computation) - * @param dims: Dimensions of input tensor - * @param SF: Scaling factor of input tensor - * @param N: Arithmetic precision, currently set at 15 like I-ViT (the greater the N, the more precise the operation, but the greater the number of bits required) - * @param output_bits: Desired bit precision (8 for int8, for example) -*/ + * @brief Compute the forward for ShiftGELU + * @param input: Input tensor + * @param quantized_tensor: Quantized output tensor + * @param GELUtensor: Pointer to an empty memory block allocated on the GPU + * (just use for computation) + * @param SumTensor: Pointer to an empty memory block allocated on the GPU + * (just use for computation) + * @param dims: Dimensions of input tensor + * @param SF: Scaling factor of input tensor + * @param N: Arithmetic precision, currently set at 15 like I-ViT (the greater + * the N, the more precise the operation, but the greater the number of bits + * required) + * @param output_bits: Desired bit precision (8 for int8, for example) + */ template <class T> -__global__ void ShiftGELUforward_(T* input,int* quantized_tensor,int* GELUtensor,int* SumTensor, int* dims, double SF, int N, int output_bits); +__global__ void ShiftGELUforward_(T *input, + int *quantized_tensor, + int *GELUtensor, + int *SumTensor, + int *dims, + double SF, + int N, + int output_bits); /** - * @brief Wrapper function to execute ShiftGELUforward_ - * @note Output correspond to the non-quantized tensor, to obtain the quantized tensor we need to copy quantized_tensor and not input_cuda_tensor - * @param 
input: Input tensor - * @param output: Output tensor (not quantized) - * @param SF: Scaling factor of input tensor - * @param N: Arithmetic precision, currently set at 15 like I-ViT (the greater the N, the more precise the operation, but the greater the number of bits required) - * @param output_bits: Desired bit precision (8 for int8, for example) - * @param size: Number of elements in the input tensor - * @param dims_input: Dimensions of input tensor -*/ + * @brief Wrapper function to execute ShiftGELUforward_ + * @note Output correspond to the non-quantized tensor, to obtain the quantized + * tensor we need to copy quantized_tensor and not input_cuda_tensor + * @param input: Input tensor + * @param output: Output tensor (not quantized) + * @param SF: Scaling factor of input tensor + * @param N: Arithmetic precision, currently set at 15 like I-ViT (the greater + * the N, the more precise the operation, but the greater the number of bits + * required) + * @param output_bits: Desired bit precision (8 for int8, for example) + * @param size: Number of elements in the input tensor + * @param dims_input: Dimensions of input tensor + */ template <class T> -void ShiftGELUforward(const T* input, T* output, double SF,int N, int output_bits, size_t size, std::vector<long unsigned int> dims_input); +void ShiftGELUforward(const T *input, + T *output, + double SF, + int N, + int output_bits, + size_t size, + std::vector<long unsigned int> dims_input); /** - * @brief Compute the backward for ShiftGELU - * @param input_grad: Gradient of input tensor (that we want to obtain) - * @param output_tensor: Output tensor obtained after forward - * @param output_grad: Gradient of output tensor - * @param size: Number of elements in the input tensor -*/ + * @brief Compute the backward for ShiftGELU + * @param input_grad: Gradient of input tensor (that we want to obtain) + * @param output_tensor: Output tensor obtained after forward + * @param output_grad: Gradient of output tensor + * @param size: Number of elements in the input tensor + */ template <class T> -__global__ void ShiftGELUbackward_(T* input_grad, const T* output_tensor, const T* output_grad, int size); +__global__ void ShiftGELUbackward_(T *input_grad, + const T *output_tensor, + const T *output_grad, + int size); /** - * @brief Wrapper function to execute ShiftGELUbackward_ - * @param output_tensor: Output tensor obtained after forward - * @param output_grad: Gradient of output tensor - * @param input_grad: Gradient of input tensor (that we want to obtain) - * @param size: Number of elements in the input tensor -*/ + * @brief Wrapper function to execute ShiftGELUbackward_ + * @param output_tensor: Output tensor obtained after forward + * @param output_grad: Gradient of output tensor + * @param input_grad: Gradient of input tensor (that we want to obtain) + * @param size: Number of elements in the input tensor + */ template <class T> -void ShiftGELUbackward(const T* output_tensor, const T* output_grad, T* input_grad, size_t size); +void ShiftGELUbackward(const T *output_tensor, + const T *output_grad, + T *input_grad, + size_t size); -} +} // namespace Aidge #endif /* AIDGE_CUDA_OPERATOR_SHIFTGELUIMPL_FORWARD_KERNEL_H_ */ diff --git a/include/aidge/backend/cuda/operator/ShiftMaxImpl.hpp b/include/aidge/backend/cuda/operator/ShiftMaxImpl.hpp index 3e6e3744cb544d0928a9229aa5110cf776f0c507..b21182e821934f29a69bc95c72a97507d7daf901 100644 --- a/include/aidge/backend/cuda/operator/ShiftMaxImpl.hpp +++ b/include/aidge/backend/cuda/operator/ShiftMaxImpl.hpp 
@@ -30,7 +30,7 @@ namespace Aidge { class ShiftMaxImpl_cuda : public OperatorImpl { -public: + public: ShiftMaxImpl_cuda(const ShiftMax_Op &op) : OperatorImpl(op, "cuda") {} static std::unique_ptr<ShiftMaxImpl_cuda> create(const ShiftMax_Op &op) { @@ -48,17 +48,16 @@ public: void forward() override; void backward() override; -private: + private: std::shared_ptr<Tensor> mInputFallback; std::shared_ptr<Tensor> mOutputGradFallback; - template <class T> void forward_(const Tensor& input); - template <class T> void backward_(const Tensor& output_grad); - + template <class T> void forward_(const Tensor &input); + template <class T> void backward_(const Tensor &output_grad); }; // Implementation entry point registration to Operator REGISTRAR(ShiftMax_Op, "cuda", Aidge::ShiftMaxImpl_cuda::create); -} // namespace Aidge +} // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_SHIFTMAXIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/ShiftMaxImpl_CUDA_kernels.hpp b/include/aidge/backend/cuda/operator/ShiftMaxImpl_CUDA_kernels.hpp index 037a7cbb6362a8eca5a9e6f5a277b29a6a6bd907..5ed878f3f350ac2746c937b2054f97fc14479825 100644 --- a/include/aidge/backend/cuda/operator/ShiftMaxImpl_CUDA_kernels.hpp +++ b/include/aidge/backend/cuda/operator/ShiftMaxImpl_CUDA_kernels.hpp @@ -14,66 +14,92 @@ #ifndef AIDGE_CUDA_OPERATOR_SHIFTMAXIMPL_KERNELS_H_ #define AIDGE_CUDA_OPERATOR_SHIFTMAXIMPL_KERNELS_H_ -#include <stdexcept> #include <cfloat> #include <cuda.h> -#include <cuda_runtime_api.h> #include <cuda_fp16.h> +#include <cuda_runtime_api.h> +#include <stdexcept> -#include "aidge/data/Data.hpp" #include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/data/Data.hpp" namespace Aidge { /** - * @brief Compute the forward for ShiftMax - * @param input: Input tensor - * @param quantized_tensor: Quantized output tensor - * @param factor: Pointer to an empty memory block allocated on the GPU (just use for computation) - * @param dims: Dimensions of input tensor - * @param SF: Scaling factor of input tensor - * @param N: Arithmetic precision, currently set at 15 like I-ViT (the greater the N, the more precise the operation, but the greater the number of bits required) - * @param output_bits: Desired bit precision (8 for int8, for example) - * @param new_SF: Scaling factor of output that can be use to dequantify -*/ + * @brief Compute the forward for ShiftMax + * @param input: Input tensor + * @param quantized_tensor: Quantized output tensor + * @param factor: Pointer to an empty memory block allocated on the GPU (just + * use for computation) + * @param dims: Dimensions of input tensor + * @param SF: Scaling factor of input tensor + * @param N: Arithmetic precision, currently set at 15 like I-ViT (the greater + * the N, the more precise the operation, but the greater the number of bits + * required) + * @param output_bits: Desired bit precision (8 for int8, for example) + * @param new_SF: Scaling factor of output that can be use to dequantify + */ template <class T> -__global__ void ShiftMaxforward_(T* input,int* quantized_tensor,int* factor, int* dims, double SF, int N, int output_bits,double new_SF); +__global__ void ShiftMaxforward_(T *input, + int *quantized_tensor, + int *factor, + int *dims, + double SF, + int N, + int output_bits, + double new_SF); /** - * @brief Wrapper function to execute ShiftMaxforward_ - * @note Output correspond to the non-quantized tensor, to obtain the quantized tensor we need to copy quantized_tensor and not input_cuda_tensor - * @param input: Input tensor - * @param 
output: Output tensor (not quantized) - * @param SF: Scaling factor of input tensor - * @param N: Arithmetic precision, currently set at 15 like I-ViT (the greater the N, the more precise the operation, but the greater the number of bits required) - * @param output_bits: Desired bit precision (8 for int8, for example) - * @param size: Number of elements in the input tensor - * @param dims_input: Dimensions of input tensor -*/ + * @brief Wrapper function to execute ShiftMaxforward_ + * @note Output correspond to the non-quantized tensor, to obtain the quantized + * tensor we need to copy quantized_tensor and not input_cuda_tensor + * @param input: Input tensor + * @param output: Output tensor (not quantized) + * @param SF: Scaling factor of input tensor + * @param N: Arithmetic precision, currently set at 15 like I-ViT (the greater + * the N, the more precise the operation, but the greater the number of bits + * required) + * @param output_bits: Desired bit precision (8 for int8, for example) + * @param size: Number of elements in the input tensor + * @param dims_input: Dimensions of input tensor + */ template <class T> -void ShiftMaxforward(const T* input, T* output, double SF,int N, int output_bits, size_t size, std::vector<long unsigned int> dims_input); +void ShiftMaxforward(const T *input, + T *output, + double SF, + int N, + int output_bits, + size_t size, + std::vector<long unsigned int> dims_input); /** - * @brief Compute the backward for ShiftMax - * @param input_grad: Gradient of input tensor (that we want to obtain) - * @param output_tensor: Output tensor obtained after forward - * @param output_grad: Gradient of output tensor - * @param dims: Dimensions of input tensor -*/ + * @brief Compute the backward for ShiftMax + * @param input_grad: Gradient of input tensor (that we want to obtain) + * @param output_tensor: Output tensor obtained after forward + * @param output_grad: Gradient of output tensor + * @param dims: Dimensions of input tensor + */ template <class T> -__global__ void ShiftMaxbackward_(T* input_grad, const T* output_tensor, const T* output_grad, const int* dims); +__global__ void ShiftMaxbackward_(T *input_grad, + const T *output_tensor, + const T *output_grad, + const int *dims); /** - * @brief Wrapper function to execute ShiftMaxbackward_ - * @param output_tensor: Output tensor obtained after forward - * @param output_grad: Gradient of output tensor - * @param input_grad: Gradient of input tensor (that we want to obtain) - * @param size: Number of elements in the input tensor - * @param dims: Dimensions of input tensor -*/ + * @brief Wrapper function to execute ShiftMaxbackward_ + * @param output_tensor: Output tensor obtained after forward + * @param output_grad: Gradient of output tensor + * @param input_grad: Gradient of input tensor (that we want to obtain) + * @param size: Number of elements in the input tensor + * @param dims: Dimensions of input tensor + */ template <class T> -void ShiftMaxbackward(const T* output_tensor, const T* output_grad, T* input_grad, size_t size, std::vector<long unsigned int> dims); +void ShiftMaxbackward(const T *output_tensor, + const T *output_grad, + T *input_grad, + size_t size, + std::vector<long unsigned int> dims); -} +} // namespace Aidge #endif /* AIDGE_CUDA_OPERATOR_SHIFTMAXIMPL_FORWARD_KERNEL_H_ */ diff --git a/include/aidge/backend/cuda/operator/SigmoidImpl.hpp b/include/aidge/backend/cuda/operator/SigmoidImpl.hpp index dc1434c8ecc8568bd4f82c7c7ce5db78cc1885a9..2437d5cf58a68c6282d21aaf1b3b7868c57c3221 100644 --- 
a/include/aidge/backend/cuda/operator/SigmoidImpl.hpp +++ b/include/aidge/backend/cuda/operator/SigmoidImpl.hpp @@ -29,39 +29,37 @@ namespace Aidge { // Operator implementation entry point for the backend class SigmoidImpl_cuda : public OperatorImpl { -public: - SigmoidImpl_cuda(const Sigmoid_Op& op) : OperatorImpl(op, "cuda") {} + public: + SigmoidImpl_cuda(const Sigmoid_Op &op) : OperatorImpl(op, "cuda") {} - static std::unique_ptr<SigmoidImpl_cuda> create(const Sigmoid_Op& op) { + static std::unique_ptr<SigmoidImpl_cuda> create(const Sigmoid_Op &op) { return std::make_unique<SigmoidImpl_cuda>(op); } virtual std::vector<ImplSpec> getAvailableImplSpecs() const override { - return { - {DataType::Any} - }; + return {{DataType::Any}}; } void forward() override; void backward() override; ~SigmoidImpl_cuda(); -private: - // CuDNN specific variables - #if CUDNN_VERSION >= 5000 - cudnnActivationDescriptor_t mSigmoidDesc = nullptr; - #else - cudnnActivationMode_t mSigmoidDesc = nullptr; - #endif + private: +// CuDNN specific variables +#if CUDNN_VERSION >= 5000 + cudnnActivationDescriptor_t mSigmoidDesc = nullptr; +#else + cudnnActivationMode_t mSigmoidDesc = nullptr; +#endif std::shared_ptr<Tensor> mInputFallback; std::shared_ptr<Tensor> mOutputGradFallback; - template <class T> void forward_(const Tensor& input); - template <class T> void backward_(const Tensor& output_grad); + template <class T> void forward_(const Tensor &input); + template <class T> void backward_(const Tensor &output_grad); }; // Implementation entry point registration to Operator REGISTRAR(Sigmoid_Op, "cuda", Aidge::SigmoidImpl_cuda::create); -} // namespace Aidge +} // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_SIGMOIDIMPL_H_ */ diff --git a/include/aidge/backend/cuda/operator/SubImpl.hpp b/include/aidge/backend/cuda/operator/SubImpl.hpp index 529d0b2b2dd4a0ec8a3dae5bf0219f8a4f2968c6..973bd1a1f0322bfcc8267af569b35f392ab71deb 100644 --- a/include/aidge/backend/cuda/operator/SubImpl.hpp +++ b/include/aidge/backend/cuda/operator/SubImpl.hpp @@ -29,10 +29,10 @@ namespace Aidge { // Operator implementation entry point for the backend class SubImpl_cuda : public OperatorImpl { -public: - SubImpl_cuda(const Sub_Op& op) : OperatorImpl(op, "cuda") {} + public: + SubImpl_cuda(const Sub_Op &op) : OperatorImpl(op, "cuda") {} - static std::unique_ptr<SubImpl_cuda> create(const Sub_Op& op) { + static std::unique_ptr<SubImpl_cuda> create(const Sub_Op &op) { return std::make_unique<SubImpl_cuda>(op); } @@ -47,13 +47,19 @@ public: void forward() override; void backward() override; -private: - template <class T> void forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides); - template <class T> void backward_(const Tensor& outGrad, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides); + private: + template <class T> + void forward_(const std::vector<Tensor> &inputs, + const std::vector<std::vector<int>> &inputsDims, + const std::vector<std::vector<int>> &inputsStrides); + template <class T> + void backward_(const Tensor &outGrad, + const std::vector<std::vector<int>> &inputsDims, + const std::vector<std::vector<int>> &inputsStrides); }; // Implementation entry point registration to Operator REGISTRAR(Sub_Op, "cuda", Aidge::SubImpl_cuda::create); -} // namespace Aidge +} // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_SUBIMPL_H_ */ diff --git 
a/include/aidge/backend/cuda/operator/TanhImpl.hpp b/include/aidge/backend/cuda/operator/TanhImpl.hpp index a87d7bd8c318149cb625a3cf0122f7eac1ea6149..c83061fc2b79152bc49d009bfbc4a271f9b52b9b 100644 --- a/include/aidge/backend/cuda/operator/TanhImpl.hpp +++ b/include/aidge/backend/cuda/operator/TanhImpl.hpp @@ -29,39 +29,37 @@ namespace Aidge { // Operator implementation entry point for the backend class TanhImpl_cuda : public OperatorImpl { -public: - TanhImpl_cuda(const Tanh_Op& op) : OperatorImpl(op, "cuda") {} + public: + TanhImpl_cuda(const Tanh_Op &op) : OperatorImpl(op, "cuda") {} - static std::unique_ptr<TanhImpl_cuda> create(const Tanh_Op& op) { + static std::unique_ptr<TanhImpl_cuda> create(const Tanh_Op &op) { return std::make_unique<TanhImpl_cuda>(op); } virtual std::vector<ImplSpec> getAvailableImplSpecs() const override { - return { - {DataType::Any} - }; + return {{DataType::Any}}; } void forward() override; void backward() override; ~TanhImpl_cuda(); -private: - // CuDNN specific variables - #if CUDNN_VERSION >= 5000 - cudnnActivationDescriptor_t mTanhDesc = nullptr; - #else - cudnnActivationMode_t mTanhDesc = nullptr; - #endif + private: +// CuDNN specific variables +#if CUDNN_VERSION >= 5000 + cudnnActivationDescriptor_t mTanhDesc = nullptr; +#else + cudnnActivationMode_t mTanhDesc = nullptr; +#endif std::shared_ptr<Tensor> mInputFallback; std::shared_ptr<Tensor> mOutputGradFallback; - template <class T> void forward_(const Tensor& input); - template <class T> void backward_(const Tensor& output_grad); + template <class T> void forward_(const Tensor &input); + template <class T> void backward_(const Tensor &output_grad); }; // Implementation entry point registration to Operator REGISTRAR(Tanh_Op, "cuda", Aidge::TanhImpl_cuda::create); -} // namespace Aidge +} // namespace Aidge #endif /* AIDGE_BACKEND_CUDA_OPERATOR_TANHIMPL_H_ */ diff --git a/include/aidge/backend/cuda/utils/CudaContext.hpp b/include/aidge/backend/cuda/utils/CudaContext.hpp index f21886e502b9017aa55e250e7257d16bc5d04501..1d66f4eab386ea55bcda1455de279fabaf68b147 100644 --- a/include/aidge/backend/cuda/utils/CudaContext.hpp +++ b/include/aidge/backend/cuda/utils/CudaContext.hpp @@ -3,19 +3,18 @@ #include <vector> -#include "aidge/utils/ErrorHandling.hpp" #include "aidge/backend/cuda/utils/CudaUtils.hpp" +#include "aidge/utils/ErrorHandling.hpp" namespace Aidge { class CudaContext { -public: - static int nbDevice(){ + public: + static int nbDevice() { int count = 1; CHECK_CUDA_STATUS(cudaGetDeviceCount(&count)); return count; } - static void setDevice(int device = -1) - { + static void setDevice(int device = -1) { static int prevDevice = 0; if (device >= 0) @@ -26,27 +25,25 @@ public: CHECK_CUDA_STATUS(cudaSetDevice(device)); } - static std::pair<size_t, size_t> getMemInfo(){ + static std::pair<size_t, size_t> getMemInfo() { size_t free; size_t total; - CHECK_CUDA_STATUS(cudaMemGetInfo (&free, &total)); + CHECK_CUDA_STATUS(cudaMemGetInfo(&free, &total)); return std::make_pair(free, total); } - - static int getDevice(){ + static int getDevice() { int dev; CHECK_CUDA_STATUS(cudaGetDevice(&dev)); return dev; } - static const cudaDeviceProp& getDeviceProp() - { + static const cudaDeviceProp &getDeviceProp() { static std::vector<cudaDeviceProp> deviceProp; static std::vector<bool> init; if (deviceProp.empty()) { -//#pragma omp critical(CudaContext__getDeviceProp) + // #pragma omp critical(CudaContext__getDeviceProp) if (deviceProp.empty()) { int count = 1; CHECK_CUDA_STATUS(cudaGetDeviceCount(&count)); @@ -68,12 
+65,11 @@ public: } // Declare cublas handle - static cublasHandle_t& cublasHandle() - { + static cublasHandle_t &cublasHandle() { static std::vector<cublasHandle_t> cublas_h; if (cublas_h.empty()) { -//#pragma omp critical(CudaContext__cublasHandle) + // #pragma omp critical(CudaContext__cublasHandle) if (cublas_h.empty()) { int count = 1; CHECK_CUDA_STATUS(cudaGetDeviceCount(&count)); @@ -94,12 +90,11 @@ public: } // Declare cudnn handle - static cudnnHandle_t& cudnnHandle() - { + static cudnnHandle_t &cudnnHandle() { static std::vector<cudnnHandle_t> cudnn_h; if (cudnn_h.empty()) { -//#pragma omp critical(CudaContext__cudnnHandle) + // #pragma omp critical(CudaContext__cudnnHandle) if (cudnn_h.empty()) { int count = 1; CHECK_CUDA_STATUS(cudaGetDeviceCount(&count)); @@ -119,54 +114,50 @@ public: return cudnn_h[dev]; } - template <class T> - struct data_type { + template <class T> struct data_type { static const cudnnDataType_t value = CUDNN_DATA_FLOAT; - // Dummy value by default + // Dummy value by default }; }; -} +} // namespace Aidge namespace Aidge { - template <> - struct CudaContext::data_type<half_float::half> { - static const cudnnDataType_t value = CUDNN_DATA_HALF; - }; +template <> struct CudaContext::data_type<half_float::half> { + static const cudnnDataType_t value = CUDNN_DATA_HALF; +}; - template <> - struct CudaContext::data_type<float> { - static const cudnnDataType_t value = CUDNN_DATA_FLOAT; - }; +template <> struct CudaContext::data_type<float> { + static const cudnnDataType_t value = CUDNN_DATA_FLOAT; +}; - template <> - struct CudaContext::data_type<double> { - static const cudnnDataType_t value = CUDNN_DATA_DOUBLE; - }; +template <> struct CudaContext::data_type<double> { + static const cudnnDataType_t value = CUDNN_DATA_DOUBLE; +}; - inline cudnnDataType_t DataTypeToCudnn(DataType type) { - switch (type) { - case DataType::Float64: - return CUDNN_DATA_DOUBLE; - case DataType::Float32: - return CUDNN_DATA_FLOAT; - case DataType::Float16: - return CUDNN_DATA_HALF; - case DataType::Int8: - return CUDNN_DATA_INT8; - case DataType::UInt8: - return CUDNN_DATA_UINT8; - case DataType::Int32: - return CUDNN_DATA_INT32; +inline cudnnDataType_t DataTypeToCudnn(DataType type) { + switch (type) { + case DataType::Float64: + return CUDNN_DATA_DOUBLE; + case DataType::Float32: + return CUDNN_DATA_FLOAT; + case DataType::Float16: + return CUDNN_DATA_HALF; + case DataType::Int8: + return CUDNN_DATA_INT8; + case DataType::UInt8: + return CUDNN_DATA_UINT8; + case DataType::Int32: + return CUDNN_DATA_INT32; #if CUDNN_VERSION >= 8100 - case DataType::Int64: - return CUDNN_DATA_INT64; + case DataType::Int64: + return CUDNN_DATA_INT64; #endif - default: - assert(false && "Unsupported CuDNN type"); - } - - return CUDNN_DATA_FLOAT; // TODO: undefined behavior + default: + assert(false && "Unsupported CuDNN type"); } + + return CUDNN_DATA_FLOAT; // TODO: undefined behavior } +} // namespace Aidge #endif // AIDGE_BACKEND_CUDA_CUDA_CONTEXT_H diff --git a/include/aidge/backend/cuda/utils/CudaUtils.hpp b/include/aidge/backend/cuda/utils/CudaUtils.hpp index ab7c805224ed6fe073baf2036b84f4ed6f49b077..1601dd0c0944992fa0a978dcf38cd4bcb6c9771c 100644 --- a/include/aidge/backend/cuda/utils/CudaUtils.hpp +++ b/include/aidge/backend/cuda/utils/CudaUtils.hpp @@ -1,11 +1,11 @@ #ifndef AIDGE_BACKEND_CUDA_CUDA_UTILS_H #define AIDGE_BACKEND_CUDA_CUDA_UTILS_H -#include <string> +#include <iostream> #include <memory> #include <sstream> -#include <iostream> #include <stdexcept> +#include <string> #include 
<cublas_v2.h> #include <cuda.h> @@ -14,86 +14,85 @@ #include "aidge/data/half.hpp" #include "aidge/utils/ErrorHandling.hpp" -#define CHECK_CUDNN_STATUS(status) \ - do { \ - const cudnnStatus_t e = (status); \ - if (e != CUDNN_STATUS_SUCCESS) { \ - std::stringstream error; \ - error << "CUDNN failure: " << cudnnGetErrorString(e) << " (" \ - << static_cast<int>(e) << ") in " << __FILE__ << ':' << __LINE__; \ - int status_dev; \ - if (cudaGetDevice(&status_dev) == cudaSuccess) \ - error << " on device #" << status_dev; \ - std::cerr << error.str() << std::endl; \ - cudaDeviceReset(); \ - throw std::runtime_error(error.str()); \ - } \ - } while(0) +#define CHECK_CUDNN_STATUS(status) \ + do { \ + const cudnnStatus_t e = (status); \ + if (e != CUDNN_STATUS_SUCCESS) { \ + std::stringstream error; \ + error << "CUDNN failure: " << cudnnGetErrorString(e) << " (" \ + << static_cast<int>(e) << ") in " << __FILE__ << ':' \ + << __LINE__; \ + int status_dev; \ + if (cudaGetDevice(&status_dev) == cudaSuccess) \ + error << " on device #" << status_dev; \ + std::cerr << error.str() << std::endl; \ + cudaDeviceReset(); \ + throw std::runtime_error(error.str()); \ + } \ + } while (0) -#define CHECK_CUDA_STATUS(status) \ - do { \ - const cudaError_t e = (status); \ - if ((e) != cudaSuccess) { \ - std::stringstream error; \ - error << "Cuda failure: " << cudaGetErrorString(e) << " (" \ - << static_cast<int>(e) << ") in " << __FILE__ << ':' << __LINE__; \ - int status_dev; \ - if (cudaGetDevice(&status_dev) == cudaSuccess) \ - error << " on device #" << status_dev; \ - std::cerr << error.str() << std::endl; \ - cudaDeviceReset(); \ - throw std::runtime_error(error.str()); \ - } \ - } while(0) +#define CHECK_CUDA_STATUS(status) \ + do { \ + const cudaError_t e = (status); \ + if ((e) != cudaSuccess) { \ + std::stringstream error; \ + error << "Cuda failure: " << cudaGetErrorString(e) << " (" \ + << static_cast<int>(e) << ") in " << __FILE__ << ':' \ + << __LINE__; \ + int status_dev; \ + if (cudaGetDevice(&status_dev) == cudaSuccess) \ + error << " on device #" << status_dev; \ + std::cerr << error.str() << std::endl; \ + cudaDeviceReset(); \ + throw std::runtime_error(error.str()); \ + } \ + } while (0) -#define CHECK_CUBLAS_STATUS(status) \ - do { \ - const cublasStatus_t e = (status); \ - if (e != CUBLAS_STATUS_SUCCESS) { \ - std::stringstream error; \ - error << "Cublas failure: " \ - << Aidge::Cuda::cublasGetErrorString(e) << " (" \ - << static_cast<int>(e) << ") in " << __FILE__ << ':' << __LINE__; \ - int status_dev; \ - if (cudaGetDevice(&status_dev) == cudaSuccess) \ - error << " on device #" << status_dev; \ - std::cerr << error.str() << std::endl; \ - cudaDeviceReset(); \ - throw std::runtime_error(error.str()); \ - } \ - } while(0) +#define CHECK_CUBLAS_STATUS(status) \ + do { \ + const cublasStatus_t e = (status); \ + if (e != CUBLAS_STATUS_SUCCESS) { \ + std::stringstream error; \ + error << "Cublas failure: " \ + << Aidge::Cuda::cublasGetErrorString(e) << " (" \ + << static_cast<int>(e) << ") in " << __FILE__ << ':' \ + << __LINE__; \ + int status_dev; \ + if (cudaGetDevice(&status_dev) == cudaSuccess) \ + error << " on device #" << status_dev; \ + std::cerr << error.str() << std::endl; \ + cudaDeviceReset(); \ + throw std::runtime_error(error.str()); \ + } \ + } while (0) namespace Aidge { namespace Cuda { - // CuDNN scaling parameters are typically "alpha" and "beta". 
- // Their type must be "float" for HALF and FLOAT (default template) - // and "double" for DOUBLE (specialized template) - template <class T> - struct cudnn_scaling_type { - typedef float type; - }; +// CuDNN scaling parameters are typically "alpha" and "beta". +// Their type must be "float" for HALF and FLOAT (default template) +// and "double" for DOUBLE (specialized template) +template <class T> struct cudnn_scaling_type { + typedef float type; +}; - template <> - struct cudnn_scaling_type<double> { - typedef double type; - }; +template <> struct cudnn_scaling_type<double> { + typedef double type; +}; - template <class T> - struct cuda_type { - typedef T type; - }; +template <class T> struct cuda_type { + typedef T type; +}; - template <> - struct cuda_type<half_float::half> { - typedef __half type; - }; +template <> struct cuda_type<half_float::half> { + typedef __half type; +}; - const char* cublasGetErrorString(cublasStatus_t error); +const char *cublasGetErrorString(cublasStatus_t error); - // Enable Peer-to-Peer communications between devices - // when it is possible - void setMultiDevicePeerAccess(unsigned int size, unsigned int* devices); -} -} +// Enable Peer-to-Peer communications between devices +// when it is possible +void setMultiDevicePeerAccess(unsigned int size, unsigned int *devices); +} // namespace Cuda +} // namespace Aidge #endif // AIDGE_BACKEND_CUDA_CUDA_UTILS_H diff --git a/include/aidge/utils/sys_info/CudaVersionInfo.hpp b/include/aidge/utils/sys_info/CudaVersionInfo.hpp index 17490476b18d62da66671a28f76709349e3ba805..7e8e8dbbb42cff84cbb28a9310d92643998ad508 100644 --- a/include/aidge/utils/sys_info/CudaVersionInfo.hpp +++ b/include/aidge/utils/sys_info/CudaVersionInfo.hpp @@ -1,7 +1,7 @@ #ifndef AIDGE_UTILS_SYS_INFO_CUDA_VERSION_INFO_H #define AIDGE_UTILS_SYS_INFO_CUDA_VERSION_INFO_H -#include "aidge/backend/cuda/utils/CudaUtils.hpp" // CHECK_CUDA_STATUS +#include "aidge/backend/cuda/utils/CudaUtils.hpp" // CHECK_CUDA_STATUS #include "aidge/utils/Log.hpp" namespace Aidge { @@ -16,9 +16,15 @@ namespace Aidge { #define CUDA_COMPILER_VERSION "Unknown version" #endif void showCudaVersion() { - Log::info("Aidge backend CUDA: {} ({}), {} {}", PROJECT_VERSION, GIT_COMMIT_HASH, __DATE__, __TIME__); + Log::info("Aidge backend CUDA: {} ({}), {} {}", + PROJECT_VERSION, + GIT_COMMIT_HASH, + __DATE__, + __TIME__); Log::info("CUDA compiler version: {}", CUDA_COMPILER_VERSION); - Log::info("CuDNN version: {}.{}.{}\n", CUDNN_MAJOR, CUDNN_MINOR, + Log::info("CuDNN version: {}.{}.{}\n", + CUDNN_MAJOR, + CUDNN_MINOR, CUDNN_PATCHLEVEL); int deviceCount = 0; @@ -43,11 +49,14 @@ void showCudaVersion() { cudaRuntimeGetVersion(&runtimeVersion); Log::info( "\tCUDA Driver Version / Runtime Version: {}.{} / {}.{}", - (driverVersion / 1000), ((driverVersion % 100) / 10), - (runtimeVersion / 1000), ((runtimeVersion % 100) / 10)); + (driverVersion / 1000), + ((driverVersion % 100) / 10), + (runtimeVersion / 1000), + ((runtimeVersion % 100) / 10)); Log::info("\tCUDA Capability Major/Minor version number: {}.{}", - deviceProp.major, deviceProp.minor); + deviceProp.major, + deviceProp.minor); } } -} // namespace Aidge -#endif // AIDGE_UTILS_SYS_INFO_CUDA_VERSION_INFO_H +} // namespace Aidge +#endif // AIDGE_UTILS_SYS_INFO_CUDA_VERSION_INFO_H diff --git a/python_binding/pybind_backend_cuda.cpp b/python_binding/pybind_backend_cuda.cpp index 3d7564459781d6933827aa66b405b03085806467..dcbc785a2c8b0974ebb375f885694030dc566fd4 100644 --- a/python_binding/pybind_backend_cuda.cpp +++ 
b/python_binding/pybind_backend_cuda.cpp @@ -17,9 +17,9 @@ namespace py = pybind11; namespace Aidge { -void init_cuda_sys_info(py::module& m); +void init_cuda_sys_info(py::module &m); -void init_Aidge(py::module& m){ +void init_Aidge(py::module &m) { init_cuda_sys_info(m); } diff --git a/python_binding/utils/sys_info/pybind_CudaVersionInfo.cpp b/python_binding/utils/sys_info/pybind_CudaVersionInfo.cpp index 64f650903ec75d579ffd58dbd6d7db7bbaf573a2..5e6f2db9868de3f1bf374f25f1f4574846ab0c45 100644 --- a/python_binding/utils/sys_info/pybind_CudaVersionInfo.cpp +++ b/python_binding/utils/sys_info/pybind_CudaVersionInfo.cpp @@ -1,9 +1,9 @@ -#include <pybind11/pybind11.h> #include "aidge/utils/sys_info/CudaVersionInfo.hpp" +#include <pybind11/pybind11.h> namespace py = pybind11; namespace Aidge { -void init_cuda_sys_info(py::module& m){ +void init_cuda_sys_info(py::module &m) { m.def("show_cuda_version", &showCudaVersion); } -} +} // namespace Aidge diff --git a/src/operator/AddImpl.cpp b/src/operator/AddImpl.cpp index de7ea925554906ea5fe1e5dcba268b17a06a47bd..9ed7874a8ef1ecfc228720e054b35de903030a57 100644 --- a/src/operator/AddImpl.cpp +++ b/src/operator/AddImpl.cpp @@ -22,27 +22,39 @@ #include "aidge/utils/Types.h" void Aidge::AddImpl_cuda::forward() { - const Add_Op& op = static_cast<const Add_Op&>(mOp); + const Add_Op &op = static_cast<const Add_Op &>(mOp); // Check inputs AIDGE_ASSERT(op.getInput(0), "missing input in Add operator"); - AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run Add forward because the 0-th input has no implementation."); + AIDGE_ASSERT(op.getInput(0)->hasImpl(), + "cannot run Add forward because the 0-th input has no " + "implementation."); DataType datatypeFirstInput = op.getInput(0)->dataType(); for (IOIndex_t i = 1; i < op.nbInputs(); ++i) { AIDGE_ASSERT(op.getInput(i), "missing input in Add operator"); - AIDGE_ASSERT(op.getInput(i)->hasImpl(), "cannot run Add forward because the {}-th input has no implementation.", i); - AIDGE_ASSERT(op.getInput(i)->dataType() == datatypeFirstInput, "Cannot add inputs with two differents data type."); + AIDGE_ASSERT(op.getInput(i)->hasImpl(), + "cannot run Add forward because the {}-th input has no " + "implementation.", + i); + AIDGE_ASSERT(op.getInput(i)->dataType() == datatypeFirstInput, + "Cannot add inputs with two differents data type."); } std::vector<std::shared_ptr<Tensor>> inputFallbacks(op.nbInputs()); std::vector<Tensor> inputs(op.nbInputs()); std::vector<std::vector<int>> dims(op.nbInputs()); // For broadcasted dims - std::vector<std::vector<int>> strides(op.nbInputs()); // For the cooresponding strides + std::vector<std::vector<int>> strides( + op.nbInputs()); // For the cooresponding strides for (IOIndex_t i = 0; i < op.nbInputs(); ++i) { - inputs[i] = op.getInput(i)->refCastFrom(inputFallbacks[i], *op.getOutput(0)); + inputs[i] = + op.getInput(i)->refCastFrom(inputFallbacks[i], *op.getOutput(0)); // Get tensor dims and broadcast them - std::copy(inputs[i].dims().begin(), inputs[i].dims().end(), std::back_inserter(dims[i])); - dims[i].insert(dims[i].cbegin(), op.getOutput(0)->nbDims() - dims[i].size(), int(1)); + std::copy(inputs[i].dims().begin(), + inputs[i].dims().end(), + std::back_inserter(dims[i])); + dims[i].insert(dims[i].cbegin(), + op.getOutput(0)->nbDims() - dims[i].size(), + int(1)); if (dims[i].size() < 4) { dims[i].resize(4, 1); @@ -58,76 +70,106 @@ void Aidge::AddImpl_cuda::forward() { strides[i] = tensorStrides; } - switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { - 
case DataType::Float64: - forward_<double>(inputs, dims, strides); - break; - case DataType::Float32: - forward_<float>(inputs, dims, strides); - break; - case DataType::Float16: - forward_<half>(inputs, dims, strides); - break; - default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + switch ( + std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + forward_<double>(inputs, dims, strides); + break; + case DataType::Float32: + forward_<float>(inputs, dims, strides); + break; + case DataType::Float16: + forward_<half>(inputs, dims, strides); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, + "Data type is not supported by Backend Cuda"); } } template <class T> -void Aidge::AddImpl_cuda::forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +void Aidge::AddImpl_cuda::forward_( + const std::vector<Tensor> &inputs, + const std::vector<std::vector<int>> &inputsDims, + const std::vector<std::vector<int>> &inputsStrides) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; // Create a Tensor descriptor with the broadcasted dims and strides cudnnTensorDescriptor_t tensorDesc; CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&tensorDesc)); - CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc, CudaContext::data_type<T>::value, inputsDims[0].size(), inputsDims[0].data(), inputsStrides[0].data())); - // Add first input CHECK_CUDNN_STATUS( - cudnnAddTensor(CudaContext::cudnnHandle(), - &alpha, - tensorDesc, - inputs[0].getImpl()->rawPtr(), - &beta, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), - std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr()) - ); + cudnnSetTensorNdDescriptor(tensorDesc, + CudaContext::data_type<T>::value, + inputsDims[0].size(), + inputsDims[0].data(), + inputsStrides[0].data())); + // Add first input + CHECK_CUDNN_STATUS(cudnnAddTensor( + CudaContext::cudnnHandle(), + &alpha, + tensorDesc, + inputs[0].getImpl()->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), + std::static_pointer_cast<Tensor>(op.getRawOutput(0)) + ->getImpl() + ->rawPtr())); // Add other inputs if there are any - for (size_t i = 1; i < op.nbInputs(); ++i) - { - CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc, CudaContext::data_type<T>::value, inputsDims[i].size(), inputsDims[i].data(), inputsStrides[i].data())); + for (size_t i = 1; i < op.nbInputs(); ++i) { + CHECK_CUDNN_STATUS( + cudnnSetTensorNdDescriptor(tensorDesc, + CudaContext::data_type<T>::value, + inputsDims[i].size(), + inputsDims[i].data(), + inputsStrides[i].data())); CHECK_CUDNN_STATUS( cudnnAddTensor(CudaContext::cudnnHandle(), - &alpha, - tensorDesc, - inputs[i].getImpl()->rawPtr(), - &alpha, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), - std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr()) - ); + &alpha, + tensorDesc, + inputs[i].getImpl()->rawPtr(), + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>( + op.getOutput(0)->getImpl()) + 
->getCudnnTensorDesc(*op.getOutput(0)), + std::static_pointer_cast<Tensor>(op.getRawOutput(0)) + ->getImpl() + ->rawPtr())); } CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(tensorDesc)); } void Aidge::AddImpl_cuda::backward() { - const Add_Op& op = static_cast<const Add_Op&>(mOp); + const Add_Op &op = static_cast<const Add_Op &>(mOp); // Check output - AIDGE_ASSERT(op.getOutput(0)->grad(), "missing output gradient in Add operator"); - AIDGE_ASSERT(op.getOutput(0)->grad()->hasImpl(), "cannot run Add backward because the output gradient has no implementation."); + AIDGE_ASSERT(op.getOutput(0)->grad(), + "missing output gradient in Add operator"); + AIDGE_ASSERT(op.getOutput(0)->grad()->hasImpl(), + "cannot run Add backward because the output gradient has no " + "implementation."); std::shared_ptr<Tensor> outputGradFallback; - const auto& outputGrad = op.getOutput(0)->grad()->refCastFrom(outputGradFallback, *op.getOutput(0)->grad()); + const auto &outputGrad = + op.getOutput(0)->grad()->refCastFrom(outputGradFallback, + *op.getOutput(0)->grad()); std::vector<std::vector<int>> dims(op.nbInputs()); // For broadcasted dims - std::vector<std::vector<int>> strides(op.nbInputs()); // For the cooresponding strides + std::vector<std::vector<int>> strides( + op.nbInputs()); // For the cooresponding strides for (IOIndex_t i = 0; i < op.nbInputs(); ++i) { std::shared_ptr<Tensor> inputFallback; - const Tensor input = op.getInput(i)->refCastFrom(inputFallback, *op.getOutput(0)); + const Tensor input = + op.getInput(i)->refCastFrom(inputFallback, *op.getOutput(0)); // Get tensor dims and broadcast them - std::copy(input.dims().begin(), input.dims().end(), std::back_inserter(dims[i])); - dims[i].insert(dims[i].cbegin(), op.getOutput(0)->nbDims() - dims[i].size(), int(1)); + std::copy(input.dims().begin(), + input.dims().end(), + std::back_inserter(dims[i])); + dims[i].insert(dims[i].cbegin(), + op.getOutput(0)->nbDims() - dims[i].size(), + int(1)); // Compute the corresponding strides std::vector<int> tensorStrides(dims[i].size()); @@ -139,77 +181,89 @@ void Aidge::AddImpl_cuda::backward() { strides[i] = tensorStrides; } - switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { - case DataType::Float64: - backward_<double>(outputGrad, dims, strides); - break; - case DataType::Float32: - backward_<float>(outputGrad, dims, strides); - break; - case DataType::Float16: - backward_<half>(outputGrad, dims, strides); - break; - default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + switch ( + std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + backward_<double>(outputGrad, dims, strides); + break; + case DataType::Float32: + backward_<float>(outputGrad, dims, strides); + break; + case DataType::Float16: + backward_<half>(outputGrad, dims, strides); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, + "Data type is not supported by Backend Cuda"); } } template <class T> -void Aidge::AddImpl_cuda::backward_(const Tensor& outputGrad, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +void Aidge::AddImpl_cuda::backward_( + const Tensor &outputGrad, + const std::vector<std::vector<int>> &inputsDims, + const std::vector<std::vector<int>> &inputsStrides) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); const typename Cuda::cudnn_scaling_type<T>::type 
alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; - for (std::size_t i = 0; i < inputsDims.size(); i++) - { - if (op.getInput(i)->size() == op.getOutput(0)->size()) - { + for (std::size_t i = 0; i < inputsDims.size(); i++) { + if (op.getInput(i)->size() == op.getOutput(0)->size()) { // TODO: Test if we can avoid copy and simply set rawPtr - op.getInput(i)->grad()->getImpl()->copy(outputGrad.getImpl()->rawPtr(), op.getInput(i)->grad()->size()); - } - else // In case of broadcasting + op.getInput(i)->grad()->getImpl()->copy( + outputGrad.getImpl()->rawPtr(), + op.getInput(i)->grad()->size()); + } else // In case of broadcasting { - // Gradient with respect to input_i: sum outputGrad over the broadcasted dimensions using cudnnReduceTensor + // Gradient with respect to input_i: sum outputGrad over the + // broadcasted dimensions using cudnnReduceTensor cudnnReduceTensorDescriptor_t reduceDesc; CHECK_CUDNN_STATUS(cudnnCreateReduceTensorDescriptor(&reduceDesc)); - CHECK_CUDNN_STATUS(cudnnSetReduceTensorDescriptor(reduceDesc, - CUDNN_REDUCE_TENSOR_ADD, - CudaContext::data_type<T>::value, - CUDNN_PROPAGATE_NAN, - CUDNN_REDUCE_TENSOR_NO_INDICES, - CUDNN_32BIT_INDICES)); - - cudnnTensorDescriptor_t outputDesc = std::dynamic_pointer_cast<TensorImpl_cuda_>(outputGrad.getImpl())->getCudnnTensorDesc(*op.getOutput(0)); + CHECK_CUDNN_STATUS(cudnnSetReduceTensorDescriptor( + reduceDesc, + CUDNN_REDUCE_TENSOR_ADD, + CudaContext::data_type<T>::value, + CUDNN_PROPAGATE_NAN, + CUDNN_REDUCE_TENSOR_NO_INDICES, + CUDNN_32BIT_INDICES)); + + cudnnTensorDescriptor_t outputDesc = + std::dynamic_pointer_cast<TensorImpl_cuda_>( + outputGrad.getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)); // Create a Tensor descriptor with the broadcasted dims and strides cudnnTensorDescriptor_t tensorDesc; CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&tensorDesc)); - CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc, - CudaContext::data_type<T>::value, - inputsDims[i].size(), - inputsDims[i].data(), - inputsStrides[i].data())); + CHECK_CUDNN_STATUS( + cudnnSetTensorNdDescriptor(tensorDesc, + CudaContext::data_type<T>::value, + inputsDims[i].size(), + inputsDims[i].data(), + inputsStrides[i].data())); size_t workspaceSize; - CHECK_CUDNN_STATUS(cudnnGetReductionWorkspaceSize(CudaContext::cudnnHandle(), - reduceDesc, - outputDesc, - tensorDesc, - &workspaceSize)); + CHECK_CUDNN_STATUS( + cudnnGetReductionWorkspaceSize(CudaContext::cudnnHandle(), + reduceDesc, + outputDesc, + tensorDesc, + &workspaceSize)); void *d_workspace; CHECK_CUDA_STATUS(cudaMalloc(&d_workspace, workspaceSize)); - CHECK_CUDNN_STATUS(cudnnReduceTensor(CudaContext::cudnnHandle(), - reduceDesc, - NULL, - 0, - d_workspace, - workspaceSize, - &alpha, - outputDesc, - outputGrad.getImpl()->rawPtr(), - &beta, - tensorDesc, - op.getInput(i)->grad()->getImpl()->rawPtr())); + CHECK_CUDNN_STATUS(cudnnReduceTensor( + CudaContext::cudnnHandle(), + reduceDesc, + NULL, + 0, + d_workspace, + workspaceSize, + &alpha, + outputDesc, + outputGrad.getImpl()->rawPtr(), + &beta, + tensorDesc, + op.getInput(i)->grad()->getImpl()->rawPtr())); CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(tensorDesc)); } diff --git a/src/operator/AndImpl.cpp b/src/operator/AndImpl.cpp index e1ee9ebcb9437b89666da21a915907b5434ece26..ddbf69ab48d23c10a970bc30c807c6639b54cef4 100644 --- a/src/operator/AndImpl.cpp +++ b/src/operator/AndImpl.cpp @@ -23,27 +23,39 @@ #include "aidge/utils/Types.h" void Aidge::AndImpl_cuda::forward() { - const And_Op& op = static_cast<const 
And_Op&>(mOp); + const And_Op &op = static_cast<const And_Op &>(mOp); // Check inputs AIDGE_ASSERT(op.getInput(0), "missing input in And operator"); - AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run And forward because the 0-th input has no implementation."); + AIDGE_ASSERT(op.getInput(0)->hasImpl(), + "cannot run And forward because the 0-th input has no " + "implementation."); DataType datatypeFirstInput = op.getInput(0)->dataType(); for (IOIndex_t i = 1; i < op.nbInputs(); ++i) { AIDGE_ASSERT(op.getInput(i), "missing input in And operator"); - AIDGE_ASSERT(op.getInput(i)->hasImpl(), "cannot run And forward because the {}-th input has no implementation.", i); - AIDGE_ASSERT(op.getInput(i)->dataType() == datatypeFirstInput, "Cannot And inputs with two differents data type."); + AIDGE_ASSERT(op.getInput(i)->hasImpl(), + "cannot run And forward because the {}-th input has no " + "implementation.", + i); + AIDGE_ASSERT(op.getInput(i)->dataType() == datatypeFirstInput, + "Cannot And inputs with two differents data type."); } std::vector<std::shared_ptr<Tensor>> inputFallbacks(op.nbInputs()); std::vector<Tensor> inputs(op.nbInputs()); std::vector<std::vector<int>> dims(op.nbInputs()); // For broadcasted dims - std::vector<std::vector<int>> strides(op.nbInputs()); // For the cooresponding strides + std::vector<std::vector<int>> strides( + op.nbInputs()); // For the cooresponding strides for (IOIndex_t i = 0; i < op.nbInputs(); ++i) { - inputs[i] = op.getInput(i)->refCastFrom(inputFallbacks[i], *op.getOutput(0)); + inputs[i] = + op.getInput(i)->refCastFrom(inputFallbacks[i], *op.getOutput(0)); // Get tensor dims and broadcast them - std::copy(inputs[i].dims().begin(), inputs[i].dims().end(), std::back_inserter(dims[i])); - dims[i].insert(dims[i].cbegin(), op.getOutput(0)->nbDims() - dims[i].size(), int(1)); + std::copy(inputs[i].dims().begin(), + inputs[i].dims().end(), + std::back_inserter(dims[i])); + dims[i].insert(dims[i].cbegin(), + op.getOutput(0)->nbDims() - dims[i].size(), + int(1)); if (dims[i].size() < 4) { dims[i].resize(4, 1); @@ -59,37 +71,48 @@ void Aidge::AndImpl_cuda::forward() { strides[i] = tensorStrides; } - switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { - case DataType::Float64: - forward_<double>(inputs, dims, strides); - break; - case DataType::Float32: - forward_<float>(inputs, dims, strides); - break; - case DataType::Float16: - forward_<half>(inputs, dims, strides); - break; - default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + switch ( + std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + forward_<double>(inputs, dims, strides); + break; + case DataType::Float32: + forward_<float>(inputs, dims, strides); + break; + case DataType::Float16: + forward_<half>(inputs, dims, strides); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, + "Data type is not supported by Backend Cuda"); } } template <class T> -void Aidge::AndImpl_cuda::forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); - const T * input1Ptr = static_cast<const T*>(inputs[0].getImpl()->rawPtr()); - const T * input2Ptr = static_cast<const T*>(inputs[1].getImpl()->rawPtr()); - T * outputPtr = static_cast<T*>(op.getOutput(0)->getImpl()->rawPtr()); +void Aidge::AndImpl_cuda::forward_( + const std::vector<Tensor> 
&inputs, + const std::vector<std::vector<int>> &inputsDims, + const std::vector<std::vector<int>> &inputsStrides) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); + const T *input1Ptr = static_cast<const T *>(inputs[0].getImpl()->rawPtr()); + const T *input2Ptr = static_cast<const T *>(inputs[1].getImpl()->rawPtr()); + T *outputPtr = static_cast<T *>(op.getOutput(0)->getImpl()->rawPtr()); std::vector<int> outputStrides(op.getOutput(0)->nbDims(), 1); - if(op.getOutput(0)->nbDims()>1) { - for (int i = op.getOutput(0)->nbDims()-2; i >= 0; i--) { - outputStrides[i] = outputStrides[i+1] * op.getOutput(0)->dims()[i+1]; + if (op.getOutput(0)->nbDims() > 1) { + for (int i = op.getOutput(0)->nbDims() - 2; i >= 0; i--) { + outputStrides[i] = + outputStrides[i + 1] * op.getOutput(0)->dims()[i + 1]; } } - Aidge::AndForward<T>(input1Ptr, input2Ptr, outputPtr, - inputsDims[0], inputsDims[1], - inputsStrides[0], inputsStrides[1], outputStrides, - static_cast<int>(op.getOutput(0)->size())); + Aidge::AndForward<T>(input1Ptr, + input2Ptr, + outputPtr, + inputsDims[0], + inputsDims[1], + inputsStrides[0], + inputsStrides[1], + outputStrides, + static_cast<int>(op.getOutput(0)->size())); } \ No newline at end of file diff --git a/src/operator/ArgMaxImpl.cpp b/src/operator/ArgMaxImpl.cpp index 50d00592ca70333d6fbdd7a10761a0ea2e9beb4b..2820ba52ad550a7c00e7e0c382ef4a17bc0fda2c 100644 --- a/src/operator/ArgMaxImpl.cpp +++ b/src/operator/ArgMaxImpl.cpp @@ -23,52 +23,66 @@ #include "aidge/utils/Types.h" void Aidge::ArgMaxImpl_cuda::forward() { - const ArgMax_Op& op = dynamic_cast<const ArgMax_Op&>(mOp); + const ArgMax_Op &op = dynamic_cast<const ArgMax_Op &>(mOp); AIDGE_ASSERT(mOp.getRawInput(0), "missing input in ArgMax operator"); - AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run ArgMax forward because the input has no implementation."); + AIDGE_ASSERT( + op.getInput(0)->hasImpl(), + "cannot run ArgMax forward because the input has no implementation."); - const auto& input = std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->refCastFrom(mInputFallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); + const auto &input = + std::static_pointer_cast<Tensor>(mOp.getRawInput(0)) + ->refCastFrom( + mInputFallback, + *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); const std::int32_t axis = op.axis(); const DimSize_t selectLastIdx = op.selectLastIndex(); - switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { - case DataType::Float64: - forward_<double>(input, axis, selectLastIdx); - break; - case DataType::Float32: - forward_<float>(input, axis, selectLastIdx); - break; - case DataType::Float16: - forward_<half>(input, axis, selectLastIdx); - break; - default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + switch ( + std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + forward_<double>(input, axis, selectLastIdx); + break; + case DataType::Float32: + forward_<float>(input, axis, selectLastIdx); + break; + case DataType::Float16: + forward_<half>(input, axis, selectLastIdx); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, + "Data type is not supported by Backend Cuda"); } } - template <class T> -void Aidge::ArgMaxImpl_cuda::forward_(const Tensor& input, std::int32_t axis, DimSize_t selectLastIdx) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); - +void Aidge::ArgMaxImpl_cuda::forward_(const Tensor &input, + std::int32_t axis, 
+ DimSize_t selectLastIdx) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); - const T * inputPtr = static_cast<const T*>(input.getImpl()->rawPtr()); - T * outputPtr = static_cast<T*>(op.getOutput(0)->getImpl()->rawPtr()); + const T *inputPtr = static_cast<const T *>(input.getImpl()->rawPtr()); + T *outputPtr = static_cast<T *>(op.getOutput(0)->getImpl()->rawPtr()); std::vector<int> inputStrides(op.getInput(0)->nbDims(), 1); - if(op.getInput(0)->nbDims()>1) { - for (int i = op.getInput(0)->nbDims()-2; i >= 0; i--) { - inputStrides[i] = inputStrides[i+1] * op.getInput(0)->dims()[i+1]; + if (op.getInput(0)->nbDims() > 1) { + for (int i = op.getInput(0)->nbDims() - 2; i >= 0; i--) { + inputStrides[i] = + inputStrides[i + 1] * op.getInput(0)->dims()[i + 1]; } } std::vector<int> inputShape(input.nbDims()); // Use std::transform to convert each element - std::transform(input.dims().begin(), input.dims().end(), inputShape.begin(), - [](size_t value) { - return static_cast<int>(value); - }); - Aidge::ArgMax_cuda_forward_kernel<T>(inputPtr, outputPtr, - inputShape, inputStrides, - axis, static_cast<int>(op.getInput(0)->size()), selectLastIdx); + std::transform(input.dims().begin(), + input.dims().end(), + inputShape.begin(), + [](size_t value) { return static_cast<int>(value); }); + Aidge::ArgMax_cuda_forward_kernel<T>( + inputPtr, + outputPtr, + inputShape, + inputStrides, + axis, + static_cast<int>(op.getInput(0)->size()), + selectLastIdx); } diff --git a/src/operator/AvgPoolingImpl.cpp b/src/operator/AvgPoolingImpl.cpp index d1270ee4b0a556e1053f3cfde8d71ec5efbee279..3ae52d114f005408722e5f2259013db91a6a0f1c 100644 --- a/src/operator/AvgPoolingImpl.cpp +++ b/src/operator/AvgPoolingImpl.cpp @@ -21,27 +21,30 @@ template <Aidge::DimIdx_t DIM> void Aidge::AvgPoolingImpl_cuda<DIM>::forward() { - const AvgPooling_Op<DIM>& op = dynamic_cast<const AvgPooling_Op<DIM>&>(mOp); + const AvgPooling_Op<DIM> &op = + dynamic_cast<const AvgPooling_Op<DIM> &>(mOp); AIDGE_ASSERT(mOp.getRawInput(0), "missing input #0"); - const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); + const auto &input = + op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); // Lazy-initialize CuDNN AvgPooling descriptor if (mAvgPoolingDesc == nullptr) { - const std::vector<int> strides(op.strideDims().begin(), op.strideDims().end()); + const std::vector<int> strides(op.strideDims().begin(), + op.strideDims().end()); const std::vector<int> paddings(DIM, 0); - const std::vector<int> window_dims(op.kernelDims().begin(), op.kernelDims().end()); + const std::vector<int> window_dims(op.kernelDims().begin(), + op.kernelDims().end()); CHECK_CUDNN_STATUS(cudnnCreatePoolingDescriptor(&mAvgPoolingDesc)); - CHECK_CUDNN_STATUS( - cudnnSetPoolingNdDescriptor(mAvgPoolingDesc, - mMode, - CUDNN_NOT_PROPAGATE_NAN, - DIM, - &window_dims[0], - &paddings[0], - &strides[0])); + CHECK_CUDNN_STATUS(cudnnSetPoolingNdDescriptor(mAvgPoolingDesc, + mMode, + CUDNN_NOT_PROPAGATE_NAN, + DIM, + &window_dims[0], + &paddings[0], + &strides[0])); } // Do the actual forward computation @@ -49,77 +52,85 @@ void Aidge::AvgPoolingImpl_cuda<DIM>::forward() { // excepted when the convolution is performed in double precision. 
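For readers unfamiliar with the convention the comment above refers to: cuDNN expects the host-side alpha/beta scaling factors to be passed as float for every tensor data type except double, which is why only two instantiations are dispatched here. A minimal sketch of a trait encoding that rule is shown below; it is illustrative only, and the actual Cuda::cudnn_scaling_type helper in this backend may be defined differently.

// Illustrative only: scaling factors are float unless the tensor data type is double.
template <class T> struct scaling_type_sketch { using type = float; };
template <> struct scaling_type_sketch<double> { using type = double; };
// e.g. both scaling_type_sketch<half>::type and scaling_type_sketch<float>::type are float.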
if (op.getOutput(0)->dataType() == DataType::Float64) { forward_<double>(input); - } - else { + } else { forward_<float>(input); } } template <Aidge::DimIdx_t DIM> template <class T> -void Aidge::AvgPoolingImpl_cuda<DIM>::forward_(const Tensor& input) { - const AvgPooling_Op<DIM>& op = dynamic_cast<const AvgPooling_Op<DIM>&>(mOp); +void Aidge::AvgPoolingImpl_cuda<DIM>::forward_(const Tensor &input) { + const AvgPooling_Op<DIM> &op = + dynamic_cast<const AvgPooling_Op<DIM> &>(mOp); const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; - CHECK_CUDNN_STATUS( - cudnnPoolingForward( - CudaContext::cudnnHandle(), - mAvgPoolingDesc, - &alpha, - std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), - input.getImpl()->rawPtr(), - &beta, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), - op.getOutput(0)->getImpl()->rawPtr() - ) - ); + CHECK_CUDNN_STATUS(cudnnPoolingForward( + CudaContext::cudnnHandle(), + mAvgPoolingDesc, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl()) + ->getCudnnTensorDesc(input), + input.getImpl()->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), + op.getOutput(0)->getImpl()->rawPtr())); } template <Aidge::DimIdx_t DIM> void Aidge::AvgPoolingImpl_cuda<DIM>::backward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); - AIDGE_ASSERT(mAvgPoolingDesc != nullptr, "AvgPool descriptor must be created during forward!"); + AIDGE_ASSERT(mAvgPoolingDesc != nullptr, + "AvgPool descriptor must be created during forward!"); AIDGE_ASSERT(op.getOutput(0)->grad(), "missing output grad #0"); - const auto& output_grad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getOutput(0)->grad()); + const auto &output_grad = + op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, + *op.getOutput(0)->grad()); // Do the actual backward computation // Template is only for scaling parameters, which are always in float // excepted when the convolution is performed in double precision. 
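As a point of reference for what cudnnPoolingBackward computes in the average-pooling case (leaving aside padding-count subtleties): every input element accumulates the corresponding output gradient divided by the window size, summed over all windows that cover it. A naive 1-D version of that semantics, purely for illustration and not part of the patch:

// dy: output gradient (outLen elements), dx: input gradient (inLen elements),
// window size K, stride S, no padding.
void avgpool1d_backward_ref(const float *dy, float *dx,
                            int inLen, int outLen, int K, int S) {
    for (int i = 0; i < inLen; ++i)
        dx[i] = 0.f;
    for (int o = 0; o < outLen; ++o)
        for (int k = 0; k < K; ++k)
            dx[o * S + k] += dy[o] / static_cast<float>(K); // each window spreads dy[o]/K
}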
if (op.getInput(0)->grad()->dataType() == DataType::Float64) { backward_<double>(output_grad); - } - else { + } else { backward_<float>(output_grad); } } template <Aidge::DimIdx_t DIM> template <class T> -void Aidge::AvgPoolingImpl_cuda<DIM>::backward_(const Tensor& output_grad) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +void Aidge::AvgPoolingImpl_cuda<DIM>::backward_(const Tensor &output_grad) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); const T alpha = 1.0f; const T beta = 0.0f; - CHECK_CUDNN_STATUS( - cudnnPoolingBackward(CudaContext::cudnnHandle(), - mAvgPoolingDesc, - &alpha, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), - std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr(), - std::dynamic_pointer_cast<TensorImpl_cuda_>(output_grad.getImpl())->getCudnnTensorDesc(output_grad), - output_grad.getImpl()->rawPtr(), - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->getImpl())->getCudnnTensorDesc(*op.getInput(0)), - op.getInput(0)->getImpl()->rawPtr(), - &beta, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->grad()->getImpl())->getCudnnTensorDesc(*op.getInput(0)), - op.getInput(0)->grad()->getImpl()->rawPtr())); + CHECK_CUDNN_STATUS(cudnnPoolingBackward( + CudaContext::cudnnHandle(), + mAvgPoolingDesc, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), + std::static_pointer_cast<Tensor>(op.getRawOutput(0)) + ->getImpl() + ->rawPtr(), + std::dynamic_pointer_cast<TensorImpl_cuda_>(output_grad.getImpl()) + ->getCudnnTensorDesc(output_grad), + output_grad.getImpl()->rawPtr(), + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getInput(0)), + op.getInput(0)->getImpl()->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>( + op.getInput(0)->grad()->getImpl()) + ->getCudnnTensorDesc(*op.getInput(0)), + op.getInput(0)->grad()->getImpl()->rawPtr())); } template <Aidge::DimIdx_t DIM> Aidge::AvgPoolingImpl_cuda<DIM>::~AvgPoolingImpl_cuda() { - if(mAvgPoolingDesc != nullptr) + if (mAvgPoolingDesc != nullptr) cudnnDestroyPoolingDescriptor(mAvgPoolingDesc); } diff --git a/src/operator/BatchNormImpl.cpp b/src/operator/BatchNormImpl.cpp index 5cf079326a0ea003fb72875bcaebefe847086ecb..dee6bcfed60ddd3e1b1aeab26495e722a1fb4231 100644 --- a/src/operator/BatchNormImpl.cpp +++ b/src/operator/BatchNormImpl.cpp @@ -20,8 +20,7 @@ #include "aidge/operator/BatchNorm.hpp" #include "aidge/utils/Types.h" -template <Aidge::DimIdx_t DIM> -void Aidge::BatchNormImpl_cuda<DIM>::forward() { +template <Aidge::DimIdx_t DIM> void Aidge::BatchNormImpl_cuda<DIM>::forward() { // FIXME: uncomment the following code once memory handling will work AIDGE_ASSERT(mOp.getRawInput(0), "missing input #0"); AIDGE_ASSERT(mOp.getRawInput(1), "missing input #1"); @@ -29,17 +28,37 @@ void Aidge::BatchNormImpl_cuda<DIM>::forward() { AIDGE_ASSERT(mOp.getRawInput(3), "missing input #3"); AIDGE_ASSERT(mOp.getRawInput(4), "missing input #4"); + std::shared_ptr<Tensor> input0Fallback, input1Fallback, input2Fallback, + input3Fallback, input4Fallback; + const auto &input0 = + std::static_pointer_cast<Tensor>(mOp.getRawInput(0)) + ->refCastFrom( + input0Fallback, + *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); + const auto &input1 = + std::static_pointer_cast<Tensor>(mOp.getRawInput(1)) + ->refCastFrom( + input1Fallback, + 
*std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); + const auto &input2 = + std::static_pointer_cast<Tensor>(mOp.getRawInput(2)) + ->refCastFrom( + input2Fallback, + *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); + const auto &input3 = + std::static_pointer_cast<Tensor>(mOp.getRawInput(3)) + ->refCastFrom( + input3Fallback, + *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); + const auto &input4 = + std::static_pointer_cast<Tensor>(mOp.getRawInput(4)) + ->refCastFrom( + input4Fallback, + *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); - std::shared_ptr<Tensor> input0Fallback, input1Fallback, input2Fallback, input3Fallback, input4Fallback; - const auto& input0 = std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->refCastFrom(input0Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); - const auto& input1 = std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->refCastFrom(input1Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); - const auto& input2 = std::static_pointer_cast<Tensor>(mOp.getRawInput(2))->refCastFrom(input2Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); - const auto& input3 = std::static_pointer_cast<Tensor>(mOp.getRawInput(3))->refCastFrom(input3Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); - const auto& input4 = std::static_pointer_cast<Tensor>(mOp.getRawInput(4))->refCastFrom(input4Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); - - if (mBNDesc == nullptr) - { - const BatchNorm_Op<DIM>& bnOp = static_cast<const BatchNorm_Op<DIM>&>(mOp); + if (mBNDesc == nullptr) { + const BatchNorm_Op<DIM> &bnOp = + static_cast<const BatchNorm_Op<DIM> &>(mOp); mEpsilon = static_cast<double>(bnOp.epsilon()); mMode = CUDNN_BATCHNORM_SPATIAL; @@ -50,8 +69,10 @@ void Aidge::BatchNormImpl_cuda<DIM>::forward() { CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&mBNDesc)); CHECK_CUDNN_STATUS(cudnnDeriveBNTensorDescriptor( - mBNDesc, std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0), mMode)); - + mBNDesc, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl()) + ->getCudnnTensorDesc(input0), + mMode)); cudnnDataType_t dataType; const unsigned int nbDimsRequested = DIM; @@ -59,159 +80,218 @@ void Aidge::BatchNormImpl_cuda<DIM>::forward() { std::vector<int> strides(nbDimsRequested); int nbDims; CHECK_CUDNN_STATUS(cudnnGetTensorNdDescriptor(mBNDesc, - nbDimsRequested, - &dataType, - &nbDims, - &dims[0], - &strides[0])); + nbDimsRequested, + &dataType, + &nbDims, + &dims[0], + &strides[0])); dims.resize(nbDims); strides.resize(nbDims); } - switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { - case DataType::Float64: - forward_<double>(input0, input1, input2, input3, input4); - break; - case DataType::Float32: - forward_<float>(input0, input1, input2, input3, input4); - break; - case DataType::Float16: - forward_<half>(input0, input1, input2, input3, input4); - break; - default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + switch ( + std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + forward_<double>(input0, input1, input2, input3, input4); + break; + case DataType::Float32: + forward_<float>(input0, input1, input2, input3, input4); + break; + case DataType::Float16: + forward_<half>(input0, input1, input2, input3, input4); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, + "Data type is not supported by 
Backend Cuda"); } } template <Aidge::DimIdx_t DIM> template <class T> -void Aidge::BatchNormImpl_cuda<DIM>::forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2, const Tensor& input3, const Tensor& input4) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +void Aidge::BatchNormImpl_cuda<DIM>::forward_(const Tensor &input0, + const Tensor &input1, + const Tensor &input2, + const Tensor &input3, + const Tensor &input4) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; cudnnTensorDescriptor_t tensorDesc; - // For scale, bias, var and mean, if we have a 1D tensor, the dim should go on the channels - if (input1.nbDims() == 1) - { + // For scale, bias, var and mean, if we have a 1D tensor, the dim should go + // on the channels + if (input1.nbDims() == 1) { CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&tensorDesc)); - const std::vector<int> dims = {1, static_cast<int>(input1.size()),1, 1}; - const std::vector<int> strides = {static_cast<int>(input1.size()), 1, 1, 1}; - CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc, CudaContext::data_type<T>::value, dims.size(), dims.data(), strides.data())); - } - else { - tensorDesc = std::dynamic_pointer_cast<TensorImpl_cuda_>(input1.getImpl())->getCudnnTensorDesc(input1); + const std::vector<int> dims = {1, + static_cast<int>(input1.size()), + 1, + 1}; + const std::vector<int> strides = {static_cast<int>(input1.size()), + 1, + 1, + 1}; + CHECK_CUDNN_STATUS( + cudnnSetTensorNdDescriptor(tensorDesc, + CudaContext::data_type<T>::value, + dims.size(), + dims.data(), + strides.data())); + } else { + tensorDesc = + std::dynamic_pointer_cast<TensorImpl_cuda_>(input1.getImpl()) + ->getCudnnTensorDesc(input1); } - CHECK_CUDNN_STATUS( - cudnnBatchNormalizationForwardInference( - CudaContext::cudnnHandle(), - mMode, - &alpha, - &beta, - std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0), - input0.getImpl()->rawPtr(), - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), - std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr(), - tensorDesc, - input1.getImpl()->rawPtr(), - input2.getImpl()->rawPtr(), - input3.getImpl()->rawPtr(), - input4.getImpl()->rawPtr(), - mEpsilon) - ); - if (input1.nbDims() == 1) - { + CHECK_CUDNN_STATUS(cudnnBatchNormalizationForwardInference( + CudaContext::cudnnHandle(), + mMode, + &alpha, + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl()) + ->getCudnnTensorDesc(input0), + input0.getImpl()->rawPtr(), + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), + std::static_pointer_cast<Tensor>(op.getRawOutput(0)) + ->getImpl() + ->rawPtr(), + tensorDesc, + input1.getImpl()->rawPtr(), + input2.getImpl()->rawPtr(), + input3.getImpl()->rawPtr(), + input4.getImpl()->rawPtr(), + mEpsilon)); + if (input1.nbDims() == 1) { CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(tensorDesc)); } } template <Aidge::DimIdx_t DIM> void Aidge::BatchNormImpl_cuda<DIM>::backward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); - AIDGE_ASSERT(mBNDesc != nullptr, "BatchNorm descriptor must be created during forward!"); + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); + AIDGE_ASSERT(mBNDesc != nullptr, + "BatchNorm descriptor must be 
created during forward!"); for (IOIndex_t i = 0; i < (op.nbInputs() - 2); ++i) { - AIDGE_ASSERT(op.getInput(i), "missing input # {} in BatchNorm operator", i); - AIDGE_ASSERT(op.getInput(i)->hasImpl(), "cannot run BatchNorm backward because the {}-th input has no implementation.", i); + AIDGE_ASSERT(op.getInput(i), + "missing input # {} in BatchNorm operator", + i); + AIDGE_ASSERT(op.getInput(i)->hasImpl(), + "cannot run BatchNorm backward because the {}-th input " + "has no implementation.", + i); } - AIDGE_ASSERT(op.getOutput(0)->grad(), "missing outputGrad in BatchNorm operator"); - AIDGE_ASSERT(op.getOutput(0)->grad()->hasImpl(), "cannot run BatchNorm backward because the output grad has no implementation."); - - std::shared_ptr<Tensor> input0Fallback, input1Fallback, input2Fallback, outputGradFallback; - const auto& input0 = std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->refCastFrom(input0Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); - const auto& weights = std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->refCastFrom(input1Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); - const auto& bias = std::static_pointer_cast<Tensor>(mOp.getRawInput(2))->refCastFrom(input2Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); - const auto& outputGrad = op.getOutput(0)->grad()->refCastFrom(outputGradFallback, *op.getOutput(0)->grad()); - - - switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { - case DataType::Float64: - backward_<double>(input0, outputGrad, weights); - break; - case DataType::Float32: - backward_<float>(input0, outputGrad, weights); - break; - case DataType::Float16: - backward_<half>(input0, outputGrad, weights); - break; - default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + AIDGE_ASSERT(op.getOutput(0)->grad(), + "missing outputGrad in BatchNorm operator"); + AIDGE_ASSERT(op.getOutput(0)->grad()->hasImpl(), + "cannot run BatchNorm backward because the output grad has " + "no implementation."); + + std::shared_ptr<Tensor> input0Fallback, input1Fallback, input2Fallback, + outputGradFallback; + const auto &input0 = + std::static_pointer_cast<Tensor>(mOp.getRawInput(0)) + ->refCastFrom( + input0Fallback, + *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); + const auto &weights = + std::static_pointer_cast<Tensor>(mOp.getRawInput(1)) + ->refCastFrom( + input1Fallback, + *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); + const auto &bias = + std::static_pointer_cast<Tensor>(mOp.getRawInput(2)) + ->refCastFrom( + input2Fallback, + *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); + const auto &outputGrad = + op.getOutput(0)->grad()->refCastFrom(outputGradFallback, + *op.getOutput(0)->grad()); + + switch ( + std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + backward_<double>(input0, outputGrad, weights); + break; + case DataType::Float32: + backward_<float>(input0, outputGrad, weights); + break; + case DataType::Float16: + backward_<half>(input0, outputGrad, weights); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, + "Data type is not supported by Backend Cuda"); } } template <Aidge::DimIdx_t DIM> template <class T> -void Aidge::BatchNormImpl_cuda<DIM>::backward_(const Tensor& input0, const Tensor& outputGrad, const Tensor& weights) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +void Aidge::BatchNormImpl_cuda<DIM>::backward_(const Tensor 
&input0, + const Tensor &outputGrad, + const Tensor &weights) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; const typename Cuda::cudnn_scaling_type<T>::type alphaData = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type betaData = 0.0f; cudnnTensorDescriptor_t scaleBiasDesc; - // For scale, bias, var and mean, if we have a 1D tensor, the dim should go on the channels - if (weights.nbDims() == 1) - { + // For scale, bias, var and mean, if we have a 1D tensor, the dim should go + // on the channels + if (weights.nbDims() == 1) { CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&scaleBiasDesc)); - const std::vector<int> dims = {1, static_cast<int>(weights.size()),1, 1}; - const std::vector<int> strides = {static_cast<int>(weights.size()), 1, 1, 1}; - CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(scaleBiasDesc, CudaContext::data_type<T>::value, dims.size(), dims.data(), strides.data())); - } - else { - scaleBiasDesc = std::dynamic_pointer_cast<TensorImpl_cuda_>(weights.getImpl())->getCudnnTensorDesc(weights); + const std::vector<int> dims = {1, + static_cast<int>(weights.size()), + 1, + 1}; + const std::vector<int> strides = {static_cast<int>(weights.size()), + 1, + 1, + 1}; + CHECK_CUDNN_STATUS( + cudnnSetTensorNdDescriptor(scaleBiasDesc, + CudaContext::data_type<T>::value, + dims.size(), + dims.data(), + strides.data())); + } else { + scaleBiasDesc = + std::dynamic_pointer_cast<TensorImpl_cuda_>(weights.getImpl()) + ->getCudnnTensorDesc(weights); } - CHECK_CUDNN_STATUS( - cudnnBatchNormalizationBackward( - CudaContext::cudnnHandle(), - mMode, - &alphaData, - &betaData, - &alpha, - &beta, - std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0), - input0.getImpl()->rawPtr(), - std::dynamic_pointer_cast<TensorImpl_cuda_>(outputGrad.getImpl())->getCudnnTensorDesc(outputGrad), - outputGrad.getImpl()->rawPtr(), - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->grad()->getImpl())->getCudnnTensorDesc(*op.getInput(0)), - op.getInput(0)->grad()->getImpl()->rawPtr(), - scaleBiasDesc, - weights.getImpl()->rawPtr(), - op.getInput(1)->grad()->getImpl()->rawPtr(), - op.getInput(2)->grad()->getImpl()->rawPtr(), - mEpsilon, - nullptr, - nullptr) // TODO add savedMean and savedVar? + CHECK_CUDNN_STATUS(cudnnBatchNormalizationBackward( + CudaContext::cudnnHandle(), + mMode, + &alphaData, + &betaData, + &alpha, + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl()) + ->getCudnnTensorDesc(input0), + input0.getImpl()->rawPtr(), + std::dynamic_pointer_cast<TensorImpl_cuda_>(outputGrad.getImpl()) + ->getCudnnTensorDesc(outputGrad), + outputGrad.getImpl()->rawPtr(), + std::dynamic_pointer_cast<TensorImpl_cuda_>( + op.getInput(0)->grad()->getImpl()) + ->getCudnnTensorDesc(*op.getInput(0)), + op.getInput(0)->grad()->getImpl()->rawPtr(), + scaleBiasDesc, + weights.getImpl()->rawPtr(), + op.getInput(1)->grad()->getImpl()->rawPtr(), + op.getInput(2)->grad()->getImpl()->rawPtr(), + mEpsilon, + nullptr, + nullptr) // TODO add savedMean and savedVar? 
); - if (weights.nbDims() == 1) - { + if (weights.nbDims() == 1) { CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(scaleBiasDesc)); } } template <Aidge::DimIdx_t DIM> Aidge::BatchNormImpl_cuda<DIM>::~BatchNormImpl_cuda() { - if(mBNDesc != nullptr) - { + if (mBNDesc != nullptr) { cudnnDestroyTensorDescriptor(mBNDesc); } } diff --git a/src/operator/ConvImpl.cpp b/src/operator/ConvImpl.cpp index 24e01db03692ffaa884b31a224a1947a9e1645a0..ca9f3aa32cf748b410d67c1d76ade216a0e5a829 100644 --- a/src/operator/ConvImpl.cpp +++ b/src/operator/ConvImpl.cpp @@ -20,21 +20,25 @@ #include "aidge/operator/ConvDepthWise.hpp" #include "aidge/utils/Types.h" -template <Aidge::DimIdx_t DIM> -void Aidge::ConvImpl_cuda<DIM>::forward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +template <Aidge::DimIdx_t DIM> void Aidge::ConvImpl_cuda<DIM>::forward() { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); AIDGE_ASSERT(op.getInput(0), "missing input #0"); - AIDGE_ASSERT(op.getInput(0)->hasImpl(), "the 0-th input has no implementation."); + AIDGE_ASSERT(op.getInput(0)->hasImpl(), + "the 0-th input has no implementation."); AIDGE_ASSERT(op.getInput(1), "missing input #1"); - AIDGE_ASSERT(op.getInput(1)->hasImpl(), "the 1-th input has no implementation."); + AIDGE_ASSERT(op.getInput(1)->hasImpl(), + "the 1-th input has no implementation."); // Convert input data (no overhead if not needed!) - const auto& input0 = op.getInput(0)->refCastFrom(mInput0Fallback, *op.getOutput(0)); - const auto& input1 = op.getInput(1)->refCastFrom(mInput1Fallback, *op.getOutput(0)); + const auto &input0 = + op.getInput(0)->refCastFrom(mInput0Fallback, *op.getOutput(0)); + const auto &input1 = + op.getInput(1)->refCastFrom(mInput1Fallback, *op.getOutput(0)); Tensor input2; - if(op.getInput(2) && op.getInput(2)->hasImpl()) { - input2 = op.getInput(2)->refCastFrom(mInput2Fallback, *op.getOutput(0)); + if (op.getInput(2) && op.getInput(2)->hasImpl()) { + input2 = + op.getInput(2)->refCastFrom(mInput2Fallback, *op.getOutput(0)); } // Lazy-initialize CuDNN convolution descriptor @@ -42,18 +46,24 @@ void Aidge::ConvImpl_cuda<DIM>::forward() { const std::vector<int> paddings(DIM, 0); std::vector<int> strides, upscales; if (mDepthWise) { - const ConvDepthWise_Op<DIM>& convDWOp = static_cast<const ConvDepthWise_Op<DIM>&>(mOp); - strides = std::vector<int>(convDWOp.strideDims().begin(), convDWOp.strideDims().end()); - upscales = std::vector<int>(convDWOp.dilationDims().begin(), convDWOp.dilationDims().end()); - } - else { - const Conv_Op<DIM>& convOp = static_cast<const Conv_Op<DIM>&>(mOp); - strides = std::vector<int>(convOp.strideDims().begin(), convOp.strideDims().end()); - upscales = std::vector<int>(convOp.dilationDims().begin(), convOp.dilationDims().end()); + const ConvDepthWise_Op<DIM> &convDWOp = + static_cast<const ConvDepthWise_Op<DIM> &>(mOp); + strides = std::vector<int>(convDWOp.strideDims().begin(), + convDWOp.strideDims().end()); + upscales = std::vector<int>(convDWOp.dilationDims().begin(), + convDWOp.dilationDims().end()); + } else { + const Conv_Op<DIM> &convOp = + static_cast<const Conv_Op<DIM> &>(mOp); + strides = std::vector<int>(convOp.strideDims().begin(), + convOp.strideDims().end()); + upscales = std::vector<int>(convOp.dilationDims().begin(), + convOp.dilationDims().end()); } CHECK_CUDNN_STATUS(cudnnCreateConvolutionDescriptor(&mConvDesc)); - CHECK_CUDNN_STATUS(cudnnSetConvolutionNdDescriptor(mConvDesc, + CHECK_CUDNN_STATUS(cudnnSetConvolutionNdDescriptor( + mConvDesc, DIM, 
&paddings[0], &strides[0], @@ -64,27 +74,33 @@ void Aidge::ConvImpl_cuda<DIM>::forward() { // Lazy-initialize CuDNN filter descriptor if (mFilterDesc == nullptr) { - const std::vector<int> kernels(input1.dims().begin(), input1.dims().end()); + const std::vector<int> kernels(input1.dims().begin(), + input1.dims().end()); CHECK_CUDNN_STATUS(cudnnCreateFilterDescriptor(&mFilterDesc)); - CHECK_CUDNN_STATUS(cudnnSetFilterNdDescriptor(mFilterDesc, - DataTypeToCudnn(input1.dataType()), - CUDNN_TENSOR_NCHW, - kernels.size(), - &kernels[0])); + CHECK_CUDNN_STATUS( + cudnnSetFilterNdDescriptor(mFilterDesc, + DataTypeToCudnn(input1.dataType()), + CUDNN_TENSOR_NCHW, + kernels.size(), + &kernels[0])); } // Set forward algorithm and allocate the required workspace if (mFwdWorkspace == nullptr) { - // Allocate the workspace required by the chosen CuDNN forward algorithm + // Allocate the workspace required by the chosen CuDNN forward + // algorithm size_t workspaceSize = 0; CHECK_CUDNN_STATUS(cudnnGetConvolutionForwardWorkspaceSize( CudaContext::cudnnHandle(), - std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0), + std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl()) + ->getCudnnTensorDesc(input0), mFilterDesc, mConvDesc, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), + std::dynamic_pointer_cast<TensorImpl_cuda_>( + op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), mFwdAlgo, &workspaceSize)); @@ -97,21 +113,24 @@ void Aidge::ConvImpl_cuda<DIM>::forward() { // excepted when the convolution is performed in double precision. if (op.getOutput(0)->dataType() == DataType::Float64) { forward_<double>(input0, input1, input2); - } - else { + } else { forward_<float>(input0, input1, input2); } } template <Aidge::DimIdx_t DIM> template <class T> -void Aidge::ConvImpl_cuda<DIM>::forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +void Aidge::ConvImpl_cuda<DIM>::forward_(const Tensor &input0, + const Tensor &input1, + const Tensor &input2) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; - CHECK_CUDNN_STATUS(cudnnConvolutionForward(CudaContext::cudnnHandle(), + CHECK_CUDNN_STATUS(cudnnConvolutionForward( + CudaContext::cudnnHandle(), &alpha, - std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0), + std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl()) + ->getCudnnTensorDesc(input0), input0.getImpl()->rawPtr(), mFilterDesc, input1.getImpl()->rawPtr(), @@ -120,63 +139,81 @@ void Aidge::ConvImpl_cuda<DIM>::forward_(const Tensor& input0, const Tensor& inp mFwdWorkspace, mWorkspaceSize, &beta, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), op.getOutput(0)->getImpl()->rawPtr())); // Add bias (if there is any) if (mOp.getRawInput(2) && input2.size() > 0) { - // Bias tensor needs to have the same number of dims than output tensor for cudnnAddTensor() - std::vector<DimSize_t> biasDims(DIM+2, 1); + // Bias tensor needs to have the same number of dims than output tensor + // for cudnnAddTensor() + std::vector<DimSize_t> biasDims(DIM 
+ 2, 1); biasDims[1] = input2.size(); - // Create a dummy tensor with the right dims in order to get a CuDNN tensor descriptor (with getCudnnTensorDesc()) + // Create a dummy tensor with the right dims in order to get a CuDNN + // tensor descriptor (with getCudnnTensorDesc()) Tensor bias(input2.dataType()); bias.setBackend("cuda"); bias.resize(biasDims); // TODO: find a more elegant solution(?) - CHECK_CUDNN_STATUS(cudnnAddTensor(CudaContext::cudnnHandle(), + CHECK_CUDNN_STATUS(cudnnAddTensor( + CudaContext::cudnnHandle(), &alpha, - std::dynamic_pointer_cast<TensorImpl_cuda_>(bias.getImpl())->getCudnnTensorDesc(bias), + std::dynamic_pointer_cast<TensorImpl_cuda_>(bias.getImpl()) + ->getCudnnTensorDesc(bias), input2.getImpl()->rawPtr(), &alpha, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), + std::dynamic_pointer_cast<TensorImpl_cuda_>( + op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), op.getOutput(0)->getImpl()->rawPtr())); } } -template <Aidge::DimIdx_t DIM> -void Aidge::ConvImpl_cuda<DIM>::backward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +template <Aidge::DimIdx_t DIM> void Aidge::ConvImpl_cuda<DIM>::backward() { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); AIDGE_ASSERT(op.getInput(0), "missing input #0"); - AIDGE_ASSERT(op.getInput(0)->hasImpl(), "the 0-th input has no implementation."); + AIDGE_ASSERT(op.getInput(0)->hasImpl(), + "the 0-th input has no implementation."); AIDGE_ASSERT(op.getInput(1), "missing input #1"); - AIDGE_ASSERT(op.getInput(1)->hasImpl(), "the 1-th input has no implementation."); + AIDGE_ASSERT(op.getInput(1)->hasImpl(), + "the 1-th input has no implementation."); // Convert input data (no overhead if not needed!) 
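Regarding the bias path handled in forward_() above: viewing the C-element bias as an NCHW tensor of shape {1, C, 1, 1} lets cudnnAddTensor broadcast it over the batch and spatial dimensions, and since both scaling arguments passed to the call are alpha = 1 it reduces to an in-place accumulation. A reference version of that broadcast add, for illustration only and not part of the patch:

// y has shape {N, C, H, W} in NCHW layout; bias has C elements.
void add_bias_nchw_ref(float *y, const float *bias, int N, int C, int H, int W) {
    for (int n = 0; n < N; ++n)
        for (int c = 0; c < C; ++c)
            for (int i = 0; i < H * W; ++i)
                y[(n * C + c) * H * W + i] += bias[c]; // broadcast over N, H and W
}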
- const auto& input0 = op.getInput(0)->refCastFrom(mInput0Fallback, *op.getOutput(0)); - const auto& input1 = op.getInput(1)->refCastFrom(mInput1Fallback, *op.getOutput(0)); + const auto &input0 = + op.getInput(0)->refCastFrom(mInput0Fallback, *op.getOutput(0)); + const auto &input1 = + op.getInput(1)->refCastFrom(mInput1Fallback, *op.getOutput(0)); Tensor input2; - if(op.getInput(2) && op.getInput(2)->hasImpl()) { - input2 = op.getInput(2)->refCastFrom(mInput2Fallback, *op.getOutput(0)); + if (op.getInput(2) && op.getInput(2)->hasImpl()) { + input2 = + op.getInput(2)->refCastFrom(mInput2Fallback, *op.getOutput(0)); } // Set forward algorithm and allocate the required workspace if (mBwdWorkspace == nullptr) { - // Find the best CuDNN backward algorithm (the one with the lowest compute time) + // Find the best CuDNN backward algorithm (the one with the lowest + // compute time) int maxAlgoIterations = 0; - cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(CudaContext::cudnnHandle(), - &maxAlgoIterations); - assert(maxAlgoIterations > 0 && "No available CUDNN ConvolutionBackwardFilterAlgorithm"); + cudnnGetConvolutionBackwardFilterAlgorithmMaxCount( + CudaContext::cudnnHandle(), + &maxAlgoIterations); + assert(maxAlgoIterations > 0 && + "No available CUDNN ConvolutionBackwardFilterAlgorithm"); int returnAlgoCounts = 0; - std::vector<cudnnConvolutionBwdFilterAlgoPerf_t> returnBwdFilterAlgo(maxAlgoIterations); + std::vector<cudnnConvolutionBwdFilterAlgoPerf_t> returnBwdFilterAlgo( + maxAlgoIterations); CHECK_CUDNN_STATUS(cudnnFindConvolutionBackwardFilterAlgorithm( CudaContext::cudnnHandle(), - std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0), - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), + std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl()) + ->getCudnnTensorDesc(input0), + std::dynamic_pointer_cast<TensorImpl_cuda_>( + op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), mConvDesc, mFilterDesc, maxAlgoIterations, @@ -186,33 +223,43 @@ void Aidge::ConvImpl_cuda<DIM>::backward() { mBwdFilterAlgo = returnBwdFilterAlgo[0].algo; maxAlgoIterations = 0; - cudnnGetConvolutionBackwardDataAlgorithmMaxCount(CudaContext::cudnnHandle(), - &maxAlgoIterations); - assert(maxAlgoIterations > 0 && "No available CUDNN ConvolutionBackwardDataAlgorithm"); + cudnnGetConvolutionBackwardDataAlgorithmMaxCount( + CudaContext::cudnnHandle(), + &maxAlgoIterations); + assert(maxAlgoIterations > 0 && + "No available CUDNN ConvolutionBackwardDataAlgorithm"); returnAlgoCounts = 0; - std::vector<cudnnConvolutionBwdDataAlgoPerf_t> returnBwdDataAlgo(maxAlgoIterations); + std::vector<cudnnConvolutionBwdDataAlgoPerf_t> returnBwdDataAlgo( + maxAlgoIterations); CHECK_CUDNN_STATUS(cudnnFindConvolutionBackwardDataAlgorithm( CudaContext::cudnnHandle(), mFilterDesc, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), + std::dynamic_pointer_cast<TensorImpl_cuda_>( + op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), mConvDesc, - std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0), + std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl()) + ->getCudnnTensorDesc(input0), maxAlgoIterations, &returnAlgoCounts, &returnBwdDataAlgo[0])); mBwdDataAlgo = returnBwdDataAlgo[0].algo; - // Allocate the workspace required by the chosen CuDNN backward algorithm + // Allocate the workspace 
required by the chosen CuDNN backward + // algorithm size_t workspaceSize = 0; CHECK_CUDNN_STATUS(cudnnGetConvolutionBackwardFilterWorkspaceSize( CudaContext::cudnnHandle(), // same arguments as cudnnGetConvolutionBackwardFilterAlgorithm() // --> - std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0), - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), + std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl()) + ->getCudnnTensorDesc(input0), + std::dynamic_pointer_cast<TensorImpl_cuda_>( + op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), mConvDesc, mFilterDesc, // <-- @@ -224,9 +271,12 @@ void Aidge::ConvImpl_cuda<DIM>::backward() { CudaContext::cudnnHandle(), // same arguments as cudnnGetConvolutionBackwardDataAlgorithm() --> mFilterDesc, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), + std::dynamic_pointer_cast<TensorImpl_cuda_>( + op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), mConvDesc, - std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0), + std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl()) + ->getCudnnTensorDesc(input0), // <-- mBwdDataAlgo, &workspaceSizeData)); @@ -250,19 +300,22 @@ void Aidge::ConvImpl_cuda<DIM>::backward() { // excepted when the convolution is performed in double precision. if (op.getOutput(0)->dataType() == DataType::Float64) { backward_<double>(input0, input1, input2); - } - else { + } else { backward_<float>(input0, input1, input2); } } template <Aidge::DimIdx_t DIM> template <class T> -void Aidge::ConvImpl_cuda<DIM>::backward_(const Tensor& input0, const Tensor& input1, const Tensor& input2) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +void Aidge::ConvImpl_cuda<DIM>::backward_(const Tensor &input0, + const Tensor &input1, + const Tensor &input2) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); std::shared_ptr<Tensor> gradOutputFallback; - const auto& gradOutput = op.getOutput(0)->grad()->refCastFrom(gradOutputFallback, *(op.getInput(0)->grad())); + const auto &gradOutput = + op.getOutput(0)->grad()->refCastFrom(gradOutputFallback, + *(op.getInput(0)->grad())); const T alpha = 1.0f; const T beta = 0.0f; @@ -270,9 +323,11 @@ void Aidge::ConvImpl_cuda<DIM>::backward_(const Tensor& input0, const Tensor& in CHECK_CUDNN_STATUS(cudnnConvolutionBackwardFilter( CudaContext::cudnnHandle(), &alpha, - std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl())->getCudnnTensorDesc(input0), + std::dynamic_pointer_cast<TensorImpl_cuda_>(input0.getImpl()) + ->getCudnnTensorDesc(input0), input0.getImpl()->rawPtr(), - std::dynamic_pointer_cast<TensorImpl_cuda_>(gradOutput.getImpl())->getCudnnTensorDesc(gradOutput), + std::dynamic_pointer_cast<TensorImpl_cuda_>(gradOutput.getImpl()) + ->getCudnnTensorDesc(gradOutput), gradOutput.getImpl()->rawPtr(), mConvDesc, mBwdFilterAlgo, @@ -287,40 +342,47 @@ void Aidge::ConvImpl_cuda<DIM>::backward_(const Tensor& input0, const Tensor& in &alpha, mFilterDesc, input1.getImpl()->rawPtr(), - std::dynamic_pointer_cast<TensorImpl_cuda_>(gradOutput.getImpl())->getCudnnTensorDesc(gradOutput), + std::dynamic_pointer_cast<TensorImpl_cuda_>(gradOutput.getImpl()) + ->getCudnnTensorDesc(gradOutput), gradOutput.getImpl()->rawPtr(), mConvDesc, mBwdDataAlgo, mBwdWorkspace, mWorkspaceSize, &beta, - 
std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->grad()->getImpl())->getCudnnTensorDesc(*op.getInput(0)), + std::dynamic_pointer_cast<TensorImpl_cuda_>( + op.getInput(0)->grad()->getImpl()) + ->getCudnnTensorDesc(*op.getInput(0)), op.getInput(0)->grad()->getImpl()->rawPtr())); // Add bias (if there is any) if (mOp.getRawInput(2) && input2.size() > 0) { - // Bias tensor needs to have the same number of dims than output tensor for cudnnAddTensor() - std::vector<DimSize_t> gradBiasDims(DIM+2, 1); + // Bias tensor needs to have the same number of dims than output tensor + // for cudnnAddTensor() + std::vector<DimSize_t> gradBiasDims(DIM + 2, 1); gradBiasDims[1] = op.getInput(2)->grad()->size(); - // Create a dummy tensor with the right dims in order to get a CuDNN tensor descriptor (with getCudnnTensorDesc()) + // Create a dummy tensor with the right dims in order to get a CuDNN + // tensor descriptor (with getCudnnTensorDesc()) Tensor gradBias(op.getInput(2)->grad()->dataType()); gradBias.setBackend("cuda"); gradBias.resize(gradBiasDims); // TODO: find a more elegant solution(?) - CHECK_CUDNN_STATUS(cudnnConvolutionBackwardBias(CudaContext::cudnnHandle(), + CHECK_CUDNN_STATUS(cudnnConvolutionBackwardBias( + CudaContext::cudnnHandle(), &alpha, - std::dynamic_pointer_cast<TensorImpl_cuda_>(gradOutput.getImpl())->getCudnnTensorDesc(gradOutput), + std::dynamic_pointer_cast<TensorImpl_cuda_>(gradOutput.getImpl()) + ->getCudnnTensorDesc(gradOutput), gradOutput.getImpl()->rawPtr(), &beta, - std::dynamic_pointer_cast<TensorImpl_cuda_>(gradBias.getImpl())->getCudnnTensorDesc(gradBias), + std::dynamic_pointer_cast<TensorImpl_cuda_>(gradBias.getImpl()) + ->getCudnnTensorDesc(gradBias), op.getInput(2)->grad()->getImpl()->rawPtr())); } } -template <Aidge::DimIdx_t DIM> -Aidge::ConvImpl_cuda<DIM>::~ConvImpl_cuda() { +template <Aidge::DimIdx_t DIM> Aidge::ConvImpl_cuda<DIM>::~ConvImpl_cuda() { if (mConvDesc != nullptr) { cudnnDestroyConvolutionDescriptor(mConvDesc); } @@ -334,7 +396,6 @@ Aidge::ConvImpl_cuda<DIM>::~ConvImpl_cuda() { } } - // Template declarations template class Aidge::ConvImpl_cuda<1>; template class Aidge::ConvImpl_cuda<2>; diff --git a/src/operator/DivImpl.cpp b/src/operator/DivImpl.cpp index 0326a60c1a3aabf43ca3a1d892328991d6d72366..4ae5c2b28cdd61956bc3c3b0bb7886ce99dcdf85 100644 --- a/src/operator/DivImpl.cpp +++ b/src/operator/DivImpl.cpp @@ -23,27 +23,39 @@ #include "aidge/utils/Types.h" void Aidge::DivImpl_cuda::forward() { - const Div_Op& op = static_cast<const Div_Op&>(mOp); + const Div_Op &op = static_cast<const Div_Op &>(mOp); // Check inputs AIDGE_ASSERT(op.getInput(0), "missing input in Div operator"); - AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run Div forward because the 0-th input has no implementation."); + AIDGE_ASSERT(op.getInput(0)->hasImpl(), + "cannot run Div forward because the 0-th input has no " + "implementation."); DataType datatypeFirstInput = op.getInput(0)->dataType(); for (IOIndex_t i = 1; i < op.nbInputs(); ++i) { AIDGE_ASSERT(op.getInput(i), "missing input in Div operator"); - AIDGE_ASSERT(op.getInput(i)->hasImpl(), "cannot run Div forward because the {}-th input has no implementation.", i); - AIDGE_ASSERT(op.getInput(i)->dataType() == datatypeFirstInput, "Cannot Div inputs with two differents data type."); + AIDGE_ASSERT(op.getInput(i)->hasImpl(), + "cannot run Div forward because the {}-th input has no " + "implementation.", + i); + AIDGE_ASSERT(op.getInput(i)->dataType() == datatypeFirstInput, + "Cannot Div inputs with two differents data 
type."); } std::vector<std::shared_ptr<Tensor>> inputFallbacks(op.nbInputs()); std::vector<Tensor> inputs(op.nbInputs()); std::vector<std::vector<int>> dims(op.nbInputs()); // For broadcasted dims - std::vector<std::vector<int>> strides(op.nbInputs()); // For the cooresponding strides + std::vector<std::vector<int>> strides( + op.nbInputs()); // For the cooresponding strides for (IOIndex_t i = 0; i < op.nbInputs(); ++i) { - inputs[i] = op.getInput(i)->refCastFrom(inputFallbacks[i], *op.getOutput(0)); + inputs[i] = + op.getInput(i)->refCastFrom(inputFallbacks[i], *op.getOutput(0)); // Get tensor dims and broadcast them - std::copy(inputs[i].dims().begin(), inputs[i].dims().end(), std::back_inserter(dims[i])); - dims[i].insert(dims[i].cbegin(), op.getOutput(0)->nbDims() - dims[i].size(), int(1)); + std::copy(inputs[i].dims().begin(), + inputs[i].dims().end(), + std::back_inserter(dims[i])); + dims[i].insert(dims[i].cbegin(), + op.getOutput(0)->nbDims() - dims[i].size(), + int(1)); if (dims[i].size() < 4) { dims[i].resize(4, 1); @@ -59,54 +71,67 @@ void Aidge::DivImpl_cuda::forward() { strides[i] = tensorStrides; } - switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { - case DataType::Float64: - forward_<double>(inputs, dims, strides); - break; - case DataType::Float32: - forward_<float>(inputs, dims, strides); - break; - case DataType::Float16: - forward_<half>(inputs, dims, strides); - break; - default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + switch ( + std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + forward_<double>(inputs, dims, strides); + break; + case DataType::Float32: + forward_<float>(inputs, dims, strides); + break; + case DataType::Float16: + forward_<half>(inputs, dims, strides); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, + "Data type is not supported by Backend Cuda"); } } template <class T> -void Aidge::DivImpl_cuda::forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +void Aidge::DivImpl_cuda::forward_( + const std::vector<Tensor> &inputs, + const std::vector<std::vector<int>> &inputsDims, + const std::vector<std::vector<int>> &inputsStrides) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); // const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; // const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; - const T * input1Ptr = static_cast<const T*>(inputs[0].getImpl()->rawPtr()); - const T * input2Ptr = static_cast<const T*>(inputs[1].getImpl()->rawPtr()); - T * outputPtr = static_cast<T*>(op.getOutput(0)->getImpl()->rawPtr()); + const T *input1Ptr = static_cast<const T *>(inputs[0].getImpl()->rawPtr()); + const T *input2Ptr = static_cast<const T *>(inputs[1].getImpl()->rawPtr()); + T *outputPtr = static_cast<T *>(op.getOutput(0)->getImpl()->rawPtr()); std::vector<int> outputStrides(op.getOutput(0)->nbDims(), 1); - if(op.getOutput(0)->nbDims()>1) { - for (int i = op.getOutput(0)->nbDims()-2; i >= 0; i--) { - outputStrides[i] = outputStrides[i+1] * op.getOutput(0)->dims()[i+1]; + if (op.getOutput(0)->nbDims() > 1) { + for (int i = op.getOutput(0)->nbDims() - 2; i >= 0; i--) { + outputStrides[i] = + outputStrides[i + 1] * op.getOutput(0)->dims()[i + 1]; } } - std::vector<int> outDims(std::max(op.getOutput(0)->nbDims(),std::size_t(4)), 
1); + std::vector<int> outDims( + std::max(op.getOutput(0)->nbDims(), std::size_t(4)), + 1); for (std::size_t i = 0; i < op.getOutput(0)->nbDims(); i++) { outDims[i] = static_cast<int>(op.getOutput(0)->dims()[i]); } - Aidge::divForward<T>(input1Ptr, outputPtr, input2Ptr, - inputsDims[0], inputsDims[1], outDims, - inputsStrides[0], inputsStrides[1], outputStrides, - static_cast<int>(op.getOutput(0)->size())); + Aidge::divForward<T>(input1Ptr, + outputPtr, + input2Ptr, + inputsDims[0], + inputsDims[1], + outDims, + inputsStrides[0], + inputsStrides[1], + outputStrides, + static_cast<int>(op.getOutput(0)->size())); } void Aidge::DivImpl_cuda::backward() { // TODO } -template <class T> -void Aidge::DivImpl_cuda::backward_(const Tensor& outGrad) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +template <class T> void Aidge::DivImpl_cuda::backward_(const Tensor &outGrad) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; // TODO diff --git a/src/operator/FCImpl.cpp b/src/operator/FCImpl.cpp index 1a7bb8edb51312d08467354e20723ad19176bfee..0478c14cd4086ce4d43343735bbd01ed6aef58f1 100644 --- a/src/operator/FCImpl.cpp +++ b/src/operator/FCImpl.cpp @@ -28,91 +28,110 @@ void Aidge::FCImpl_cuda::forward() { AIDGE_ASSERT(mOp.getRawInput(1), "missing input #1"); AIDGE_ASSERT(mOp.getRawInput(2), "missing input #2"); - const auto& fcOp = static_cast<const FC_Op&>(mOp); + const auto &fcOp = static_cast<const FC_Op &>(mOp); std::size_t outChannels = fcOp.outChannels(); - const auto& input0 = fcOp.getInput(0)->refCastFrom(mInput0Fallback, *fcOp.getOutput(0)); - const auto& input1 = fcOp.getInput(1)->refCastFrom(mInput1Fallback, *fcOp.getOutput(0)); - const auto& input2 = (fcOp.getInput(2)) ? fcOp.getInput(2)->refCastFrom(mInput2Fallback, *fcOp.getOutput(0)) : Tensor(); - - switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { - case DataType::Float64: - forward_<double>(input0, input1, input2, outChannels); - break; - case DataType::Float32: - forward_<float>(input0, input1, input2, outChannels); - break; - case DataType::Float16: - forward_<half>(input0, input1, input2, outChannels); - break; - default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + const auto &input0 = + fcOp.getInput(0)->refCastFrom(mInput0Fallback, *fcOp.getOutput(0)); + const auto &input1 = + fcOp.getInput(1)->refCastFrom(mInput1Fallback, *fcOp.getOutput(0)); + const auto &input2 = + (fcOp.getInput(2)) ? 
fcOp.getInput(2)->refCastFrom(mInput2Fallback, + *fcOp.getOutput(0)) + : Tensor(); + + switch ( + std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + forward_<double>(input0, input1, input2, outChannels); + break; + case DataType::Float32: + forward_<float>(input0, input1, input2, outChannels); + break; + case DataType::Float16: + forward_<half>(input0, input1, input2, outChannels); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, + "Data type is not supported by Backend Cuda"); } } -template<class T> -void Aidge::FCImpl_cuda::forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2, std::size_t outChannels) -{ - const T * input = static_cast<const T*>(input0.getImpl()->rawPtr()); - const T * weights = static_cast<const T*>(input1.getImpl()->rawPtr()); - T * output = static_cast<T*>(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->rawPtr()); +template <class T> +void Aidge::FCImpl_cuda::forward_(const Tensor &input0, + const Tensor &input1, + const Tensor &input2, + std::size_t outChannels) { + const T *input = static_cast<const T *>(input0.getImpl()->rawPtr()); + const T *weights = static_cast<const T *>(input1.getImpl()->rawPtr()); + T *output = + static_cast<T *>(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)) + ->getImpl() + ->rawPtr()); // Performing output = T(weights) * input // [n x m] = [n x k] * [k x m] - // cublas is column-major so instead of transposing inputs, computing output [m x n] and transposing output, we compute output as [n x m] + // cublas is column-major so instead of transposing inputs, computing + // output [m x n] and transposing output, we compute output as [n x m] int n = outChannels; - int m = std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->size()/n; - int k = input0.size()/m; - int lda = k; // leading dimension of weights - int ldb = k; // leading dimension of input - int ldc = n; // leading dimension of output + int m = std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)) + ->getImpl() + ->size() / + n; + int k = input0.size() / m; + int lda = k; // leading dimension of weights + int ldb = k; // leading dimension of input + int ldc = n; // leading dimension of output const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; - CHECK_CUBLAS_STATUS(cublasGemm(CudaContext::cublasHandle(), - CUBLAS_OP_T, - CUBLAS_OP_N, - n, - m, - k, - reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&alpha), - weights, - ldb, - input, - lda, - reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&beta), - output, - ldc)); - - if(!input2.empty()){ - T* onesVector; - CHECK_CUDA_STATUS(cudaMalloc((void**)&onesVector, m * sizeof(T))); + CHECK_CUBLAS_STATUS(cublasGemm( + CudaContext::cublasHandle(), + CUBLAS_OP_T, + CUBLAS_OP_N, + n, + m, + k, + reinterpret_cast<const typename Cuda::cuda_type<T>::type *>(&alpha), + weights, + ldb, + input, + lda, + reinterpret_cast<const typename Cuda::cuda_type<T>::type *>(&beta), + output, + ldc)); + + if (!input2.empty()) { + T *onesVector; + CHECK_CUDA_STATUS(cudaMalloc((void **)&onesVector, m * sizeof(T))); // Fill the vector with ones std::vector<T> onesVec(m, T(1.0)); CHECK_CUDA_STATUS(cudaMemcpy(onesVector, - &onesVec[0], - m * sizeof(T), - cudaMemcpyHostToDevice)); - const T * biases = static_cast<const T*>(input2.getImpl()->rawPtr()); + &onesVec[0], + m * sizeof(T), + cudaMemcpyHostToDevice)); + const T *biases = static_cast<const T 
*>(input2.getImpl()->rawPtr()); // Performing output = biases * onesVector + output // [n x m] = [n x 1] * [1 x m] + [n x m] - CHECK_CUBLAS_STATUS(cublasGemm(CudaContext::cublasHandle(), - CUBLAS_OP_N, - CUBLAS_OP_N, - n, - m, - 1, - reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&alpha), - biases, - n, - onesVector, - 1, - reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&alpha), - output, - n)); + CHECK_CUBLAS_STATUS(cublasGemm( + CudaContext::cublasHandle(), + CUBLAS_OP_N, + CUBLAS_OP_N, + n, + m, + 1, + reinterpret_cast<const typename Cuda::cuda_type<T>::type *>( + &alpha), + biases, + n, + onesVector, + 1, + reinterpret_cast<const typename Cuda::cuda_type<T>::type *>( + &alpha), + output, + n)); CHECK_CUDA_STATUS(cudaFree(onesVector)); } - } void Aidge::FCImpl_cuda::backward() { @@ -120,45 +139,56 @@ void Aidge::FCImpl_cuda::backward() { AIDGE_ASSERT(mOp.getRawInput(1), "missing input #1"); AIDGE_ASSERT(mOp.getRawInput(2), "missing input #2"); - const auto& fcOp = static_cast<const FC_Op&>(mOp); + const auto &fcOp = static_cast<const FC_Op &>(mOp); std::size_t outChannels = fcOp.outChannels(); - const auto& input0 = fcOp.getInput(0)->refCastFrom(mInput0Fallback, *fcOp.getOutput(0)); - const auto& input1 = fcOp.getInput(1)->refCastFrom(mInput1Fallback, *fcOp.getOutput(0)); - const auto& input2 = (fcOp.getInput(2)) ? fcOp.getInput(2)->refCastFrom(mInput2Fallback, *fcOp.getOutput(0)) : Tensor(); - - switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { - case DataType::Float64: - backward_<double>(input0, input1, input2, outChannels); - break; - case DataType::Float32: - backward_<float>(input0, input1, input2, outChannels); - break; - case DataType::Float16: - backward_<half>(input0, input1, input2, outChannels); - break; - default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + const auto &input0 = + fcOp.getInput(0)->refCastFrom(mInput0Fallback, *fcOp.getOutput(0)); + const auto &input1 = + fcOp.getInput(1)->refCastFrom(mInput1Fallback, *fcOp.getOutput(0)); + const auto &input2 = + (fcOp.getInput(2)) ? 
fcOp.getInput(2)->refCastFrom(mInput2Fallback, + *fcOp.getOutput(0)) + : Tensor(); + + switch ( + std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + backward_<double>(input0, input1, input2, outChannels); + break; + case DataType::Float32: + backward_<float>(input0, input1, input2, outChannels); + break; + case DataType::Float16: + backward_<half>(input0, input1, input2, outChannels); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, + "Data type is not supported by Backend Cuda"); } } -template<class T> -void Aidge::FCImpl_cuda::backward_(const Tensor& input0, const Tensor& input1, const Tensor& input2, std::size_t outChannels) -{ +template <class T> +void Aidge::FCImpl_cuda::backward_(const Tensor &input0, + const Tensor &input1, + const Tensor &input2, + std::size_t outChannels) { const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; const typename Cuda::cudnn_scaling_type<T>::type betaData = 0.0f; - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); - const T * input = static_cast<const T*>(input0.getImpl()->rawPtr()); - const T * weights = static_cast<const T*>(input1.getImpl()->rawPtr()); - const T * outputGrad = static_cast<const T*>(op.getOutput(0)->grad()->getImpl()->rawPtr()); - T * weightsGrad = static_cast<T*>(op.getInput(1)->grad()->getImpl()->rawPtr()); + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); + const T *input = static_cast<const T *>(input0.getImpl()->rawPtr()); + const T *weights = static_cast<const T *>(input1.getImpl()->rawPtr()); + const T *outputGrad = + static_cast<const T *>(op.getOutput(0)->grad()->getImpl()->rawPtr()); + T *weightsGrad = + static_cast<T *>(op.getInput(1)->grad()->getImpl()->rawPtr()); // Performing weightsGrad = (input) * T(outputGrad) // [n x m] = [n x k] * [k x m] int m = input1.dims()[1]; - int k = input0.size()/m; - int n = input1.size()/m; + int k = input0.size() / m; + int n = input1.size() / m; CHECK_CUBLAS_STATUS(cublasGemm( CudaContext::cublasHandle(), CUBLAS_OP_N, @@ -166,38 +196,41 @@ void Aidge::FCImpl_cuda::backward_(const Tensor& input0, const Tensor& input1, c m, n, k, - reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&alpha), + reinterpret_cast<const typename Cuda::cuda_type<T>::type *>(&alpha), input, m, outputGrad, n, - reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&beta), + reinterpret_cast<const typename Cuda::cuda_type<T>::type *>(&beta), weightsGrad, m)); - if(!input2.empty()){ - T * biasGrad = static_cast<T*>(op.getInput(2)->grad()->getImpl()->rawPtr()); - T* onesVector; - CHECK_CUDA_STATUS(cudaMalloc((void**)&onesVector, m * sizeof(T))); + if (!input2.empty()) { + T *biasGrad = + static_cast<T *>(op.getInput(2)->grad()->getImpl()->rawPtr()); + T *onesVector; + CHECK_CUDA_STATUS(cudaMalloc((void **)&onesVector, m * sizeof(T))); // Fill the vector with ones std::vector<T> onesVec(m, T(1.0)); CHECK_CUDA_STATUS(cudaMemcpy(onesVector, - &onesVec[0], - m * sizeof(T), - cudaMemcpyHostToDevice)); + &onesVec[0], + m * sizeof(T), + cudaMemcpyHostToDevice)); // Performing biasGrad = outputGrad * onesVector - CHECK_CUBLAS_STATUS(cublasGemv(CudaContext::cublasHandle(), - CUBLAS_OP_N, - outChannels, - k, - reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&alpha), - outputGrad, - outChannels, - onesVector, - 1, - reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&beta), - biasGrad, - 1)); + CHECK_CUBLAS_STATUS(cublasGemv( + 
CudaContext::cublasHandle(), + CUBLAS_OP_N, + outChannels, + k, + reinterpret_cast<const typename Cuda::cuda_type<T>::type *>( + &alpha), + outputGrad, + outChannels, + onesVector, + 1, + reinterpret_cast<const typename Cuda::cuda_type<T>::type *>(&beta), + biasGrad, + 1)); CHECK_CUDA_STATUS(cudaFree(onesVector)); } // Performing inputGrad = (weights) * (outputGrad) @@ -205,16 +238,15 @@ void Aidge::FCImpl_cuda::backward_(const Tensor& input0, const Tensor& input1, c CudaContext::cublasHandle(), CUBLAS_OP_N, CUBLAS_OP_N, - op.getInput(1)->grad()->size()/outChannels, + op.getInput(1)->grad()->size() / outChannels, k, outChannels, - reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&alpha), - weights,//w - op.getInput(1)->grad()->size()/outChannels, - outputGrad,//dY + reinterpret_cast<const typename Cuda::cuda_type<T>::type *>(&alpha), + weights, // w + op.getInput(1)->grad()->size() / outChannels, + outputGrad, // dY outChannels, - reinterpret_cast<const typename Cuda::cuda_type<T>::type*>(&betaData), - static_cast<T*>(op.getInput(0)->grad()->getImpl()->rawPtr()),//dX - op.getInput(1)->grad()->size()/outChannels)); - + reinterpret_cast<const typename Cuda::cuda_type<T>::type *>(&betaData), + static_cast<T *>(op.getInput(0)->grad()->getImpl()->rawPtr()), // dX + op.getInput(1)->grad()->size() / outChannels)); } diff --git a/src/operator/GlobalAveragePoolingImpl.cpp b/src/operator/GlobalAveragePoolingImpl.cpp index 8c83d477094d9cce41807d888cca57bd614e9cc6..d8392f9c8e52d8d226fec8713148e35468460975 100644 --- a/src/operator/GlobalAveragePoolingImpl.cpp +++ b/src/operator/GlobalAveragePoolingImpl.cpp @@ -20,92 +20,113 @@ #include "aidge/utils/Types.h" void Aidge::GlobalAveragePoolingImpl_cuda::forward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); AIDGE_ASSERT(mOp.getRawInput(0), "missing input #0"); - const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); + const auto &input = + op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); // Lazy-initialize CuDNN GlobalAveragePooling descriptor if (mGlobalAveragePoolingDesc == nullptr) { - int poolingDims = 2; // Assuming 2D pooling - int windowDims[2] = {static_cast<int>(input.dims().at(2)), static_cast<int>(input.dims().at(3))}; // Pooling window dimensions matching spatial dimensions of input tensor - int padding[2] = {0, 0}; // No padding - int stride[2] = {1, 1}; // Stride of 1 - CHECK_CUDNN_STATUS(cudnnCreatePoolingDescriptor(&mGlobalAveragePoolingDesc)); + int poolingDims = 2; // Assuming 2D pooling + int windowDims[2] = {static_cast<int>(input.dims().at(2)), + static_cast<int>(input.dims().at( + 3))}; // Pooling window dimensions matching + // spatial dimensions of input tensor + int padding[2] = {0, 0}; // No padding + int stride[2] = {1, 1}; // Stride of 1 CHECK_CUDNN_STATUS( - cudnnSetPoolingNdDescriptor(mGlobalAveragePoolingDesc, mMode, CUDNN_NOT_PROPAGATE_NAN, poolingDims, windowDims, padding, stride) - // cudnnSetPooling2dDesccomputedOutputriptor(mGlobalAveragePoolingDesc, mMode, CUDNN_NOT_PROPAGATE_NAN, 1, 1, 0, 0, 1, 1) + cudnnCreatePoolingDescriptor(&mGlobalAveragePoolingDesc)); + CHECK_CUDNN_STATUS( + cudnnSetPoolingNdDescriptor(mGlobalAveragePoolingDesc, + mMode, + CUDNN_NOT_PROPAGATE_NAN, + poolingDims, + windowDims, + padding, + stride) + // cudnnSetPooling2dDesccomputedOutputriptor(mGlobalAveragePoolingDesc, + // mMode, CUDNN_NOT_PROPAGATE_NAN, 1, 1, 0, 0, 1, 1) ); } if 
(op.getOutput(0)->dataType() == DataType::Float64) { forward_<double>(input); - } - else { + } else { forward_<float>(input); } } template <class T> -void Aidge::GlobalAveragePoolingImpl_cuda::forward_(const Tensor& input) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +void Aidge::GlobalAveragePoolingImpl_cuda::forward_(const Tensor &input) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; - CHECK_CUDNN_STATUS( - cudnnPoolingForward( - CudaContext::cudnnHandle(), - mGlobalAveragePoolingDesc, - &alpha, - std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), - input.getImpl()->rawPtr(), - &beta, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), - std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr() - ) - ); + CHECK_CUDNN_STATUS(cudnnPoolingForward( + CudaContext::cudnnHandle(), + mGlobalAveragePoolingDesc, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl()) + ->getCudnnTensorDesc(input), + input.getImpl()->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), + std::static_pointer_cast<Tensor>(op.getRawOutput(0)) + ->getImpl() + ->rawPtr())); } void Aidge::GlobalAveragePoolingImpl_cuda::backward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); - AIDGE_ASSERT(mGlobalAveragePoolingDesc != nullptr, "GlobalAvgPool descriptor must be created during forward!"); + AIDGE_ASSERT(mGlobalAveragePoolingDesc != nullptr, + "GlobalAvgPool descriptor must be created during forward!"); AIDGE_ASSERT(op.getOutput(0)->grad(), "missing output grad #0"); - const auto& output_grad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getOutput(0)->grad()); + const auto &output_grad = + op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, + *op.getOutput(0)->grad()); if (op.getOutput(0)->dataType() == DataType::Float64) { backward_<double>(output_grad); - } - else { + } else { backward_<float>(output_grad); } } template <class T> -void Aidge::GlobalAveragePoolingImpl_cuda::backward_(const Tensor& output_grad) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +void Aidge::GlobalAveragePoolingImpl_cuda::backward_( + const Tensor &output_grad) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); const T alpha = 1.0f; const T beta = 0.0f; - CHECK_CUDNN_STATUS( - cudnnPoolingBackward(CudaContext::cudnnHandle(), - mGlobalAveragePoolingDesc, - &alpha, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), - std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr(), - std::dynamic_pointer_cast<TensorImpl_cuda_>(output_grad.getImpl())->getCudnnTensorDesc(output_grad), - output_grad.getImpl()->rawPtr(), - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->getImpl())->getCudnnTensorDesc(*op.getInput(0)), - op.getInput(0)->getImpl()->rawPtr(), - &beta, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->grad()->getImpl())->getCudnnTensorDesc(*op.getInput(0)), - op.getInput(0)->grad()->getImpl()->rawPtr())); + CHECK_CUDNN_STATUS(cudnnPoolingBackward( + CudaContext::cudnnHandle(), + 
mGlobalAveragePoolingDesc, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), + std::static_pointer_cast<Tensor>(op.getRawOutput(0)) + ->getImpl() + ->rawPtr(), + std::dynamic_pointer_cast<TensorImpl_cuda_>(output_grad.getImpl()) + ->getCudnnTensorDesc(output_grad), + output_grad.getImpl()->rawPtr(), + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getInput(0)), + op.getInput(0)->getImpl()->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>( + op.getInput(0)->grad()->getImpl()) + ->getCudnnTensorDesc(*op.getInput(0)), + op.getInput(0)->grad()->getImpl()->rawPtr())); } Aidge::GlobalAveragePoolingImpl_cuda::~GlobalAveragePoolingImpl_cuda() { - if(mGlobalAveragePoolingDesc != nullptr) + if (mGlobalAveragePoolingDesc != nullptr) cudnnDestroyPoolingDescriptor(mGlobalAveragePoolingDesc); } - diff --git a/src/operator/ILayerNormImpl.cpp b/src/operator/ILayerNormImpl.cpp index 47dd1d5d1a3f127c9e08788f605796020a7814a7..41b9b48c57894316d71ae361cae8159d4d4a9ca8 100644 --- a/src/operator/ILayerNormImpl.cpp +++ b/src/operator/ILayerNormImpl.cpp @@ -11,14 +11,14 @@ * ********************************************************************************/ +#include <algorithm> // For std::max #include <cassert> -#include <chrono> // std::chrono::milliseconds -#include <numeric> // std::accumulate -#include <thread> // std::this_thread::sleep_for -#include <vector> -#include <algorithm> // For std::max -#include <cmath> // For pow +#include <chrono> // std::chrono::milliseconds +#include <cmath> // For pow +#include <numeric> // std::accumulate +#include <thread> // std::this_thread::sleep_for #include <typeinfo> +#include <vector> #include "aidge/backend/cuda/data/TensorImpl.hpp" #include "aidge/backend/cuda/operator/ILayerNormImpl.hpp" @@ -30,52 +30,60 @@ void Aidge::ILayerNormImpl_cuda::forward() { - - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); assert(mOp.getRawInput(0) && "missing input #0"); assert(mOp.getRawInput(1) && "missing input #1"); assert(mOp.getRawInput(2) && "missing input #2"); - const auto& input0 = op.getInput(0)->refCastFrom(mInput0Fallback, *op.getOutput(0)); - const auto& input1 = op.getInput(1)->refCastFrom(mInput1Fallback, *op.getOutput(0)); - const auto& input2 = op.getInput(2)->refCastFrom(mInput2Fallback, *op.getOutput(0)); - - switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { - case DataType::Float64: - forward_<double>(input0, input1, input2); - break; - case DataType::Float32: - forward_<float>(input0, input1, input2); - break; - default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + const auto &input0 = + op.getInput(0)->refCastFrom(mInput0Fallback, *op.getOutput(0)); + const auto &input1 = + op.getInput(1)->refCastFrom(mInput1Fallback, *op.getOutput(0)); + const auto &input2 = + op.getInput(2)->refCastFrom(mInput2Fallback, *op.getOutput(0)); + + switch ( + std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + forward_<double>(input0, input1, input2); + break; + case DataType::Float32: + forward_<float>(input0, input1, input2); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, + "Data type is not supported by Backend Cuda"); } } - -template<class T> -void Aidge::ILayerNormImpl_cuda::forward_(const Tensor& input0, const 
Tensor& input1, const Tensor& input2) -{ - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); - const T * input_raw = static_cast<const T*>(input0.getImpl()->rawPtr()); - const T * weight = static_cast<const T*>(input1.getImpl()->rawPtr()); - const T * bias = static_cast<const T*>(input2.getImpl()->rawPtr()); - T * output = static_cast<T*>(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->rawPtr()); +template <class T> +void Aidge::ILayerNormImpl_cuda::forward_(const Tensor &input0, + const Tensor &input1, + const Tensor &input2) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); + const T *input_raw = static_cast<const T *>(input0.getImpl()->rawPtr()); + const T *weight = static_cast<const T *>(input1.getImpl()->rawPtr()); + const T *bias = static_cast<const T *>(input2.getImpl()->rawPtr()); + T *output = + static_cast<T *>(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)) + ->getImpl() + ->rawPtr()); int N = 15; int output_bits = 8; size_t size = input0.size(); std::vector<DimSize_t> dims_input = input0.dims(); - // maybe find a most efficient way to compute scaling factor (a max and min function could help to retrieve scaling factor value) + // maybe find a most efficient way to compute scaling factor (a max and min + // function could help to retrieve scaling factor value) double min = std::numeric_limits<double>::max(); double max = std::numeric_limits<double>::min(); - for(std::size_t i = 0; i < dims_input[0]; i++) { - for(std::size_t j = 0; j < dims_input[1]; j++) { - for(std::size_t k = 0; k < dims_input[2]; k++) { - for(std::size_t l = 0; l < dims_input[3]; l++) { + for (std::size_t i = 0; i < dims_input[0]; i++) { + for (std::size_t j = 0; j < dims_input[1]; j++) { + for (std::size_t k = 0; k < dims_input[2]; k++) { + for (std::size_t l = 0; l < dims_input[3]; l++) { std::vector<std::size_t> coordIdx = {i, j, k, l}; std::size_t newFlatIdx = input0.getIdx(coordIdx); if (newFlatIdx < min) { @@ -84,57 +92,76 @@ void Aidge::ILayerNormImpl_cuda::forward_(const Tensor& input0, const Tensor& in if (newFlatIdx > max) { max = newFlatIdx; } - } - } + } + } } } double m = std::max(std::abs(min), std::abs(max)); - double normalization_factor = static_cast<double>(1 << (output_bits - 1)) - 1; - double scaling_factor = m / normalization_factor; - - // The new scaling factor that we can use to dequantify the returned tensor (not used here) - // double new_SF = 1/std::pow(2,2*output_bits-1); - - ILayerNormforward(input_raw, output, scaling_factor, weight, bias, size, dims_input); + double normalization_factor = + static_cast<double>(1 << (output_bits - 1)) - 1; + double scaling_factor = m / normalization_factor; + + // The new scaling factor that we can use to dequantify the returned tensor + // (not used here) double new_SF = 1/std::pow(2,2*output_bits-1); + + ILayerNormforward(input_raw, + output, + scaling_factor, + weight, + bias, + size, + dims_input); } void Aidge::ILayerNormImpl_cuda::backward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); assert(op.getOutput(0)->grad() && "missing output #0"); - const auto& output_grad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getOutput(0)->grad()); + const auto &output_grad = + op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, + *op.getOutput(0)->grad()); if (op.getInput(0)->grad()->dataType() == DataType::Float64) { backward_<double>(output_grad); - } - else { + } else 
{ backward_<float>(output_grad); } } template <class T> -void Aidge::ILayerNormImpl_cuda::backward_(const Tensor& output_grad) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +void Aidge::ILayerNormImpl_cuda::backward_(const Tensor &output_grad) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); size_t size = output_grad.size(); std::vector<DimSize_t> dims_input = output_grad.dims(); - const T * output = static_cast<const T*>(std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr()); - - T * input_grad = static_cast<T*>(op.getInput(0)->grad()->getImpl()->rawPtr()); - T * weight_grad = static_cast<T*>(op.getInput(1)->grad()->getImpl()->rawPtr()); - T * bias_grad = static_cast<T*>(op.getInput(2)->grad()->getImpl()->rawPtr()); - - const T * input = static_cast<const T*>(op.getInput(0)->getImpl()->rawPtr()); - const T * weight = static_cast<const T*>(op.getInput(1)->getImpl()->rawPtr()); - const T * bias = static_cast<const T*>(op.getInput(2)->getImpl()->rawPtr()); + const T *output = static_cast<const T *>( + std::static_pointer_cast<Tensor>(op.getRawOutput(0)) + ->getImpl() + ->rawPtr()); + + T *input_grad = + static_cast<T *>(op.getInput(0)->grad()->getImpl()->rawPtr()); + T *weight_grad = + static_cast<T *>(op.getInput(1)->grad()->getImpl()->rawPtr()); + T *bias_grad = + static_cast<T *>(op.getInput(2)->grad()->getImpl()->rawPtr()); + + const T *input = + static_cast<const T *>(op.getInput(0)->getImpl()->rawPtr()); + const T *weight = + static_cast<const T *>(op.getInput(1)->getImpl()->rawPtr()); + const T *bias = + static_cast<const T *>(op.getInput(2)->getImpl()->rawPtr()); // maybe find a most efficient way to compute mean and variance tensor - std::vector<std::vector<std::vector<std::vector<T>>>> means(dims_input[0], - std::vector<std::vector<std::vector<T>>>(dims_input[1], + std::vector<std::vector<std::vector<std::vector<T>>>> means( + dims_input[0], + std::vector<std::vector<std::vector<T>>>( + dims_input[1], std::vector<std::vector<T>>(dims_input[2], - std::vector<T>(dims_input[3], 0.0f)))); + std::vector<T>(dims_input[3], 0.0f)))); for (std::size_t i = 0; i < dims_input[0]; i++) { for (std::size_t j = 0; j < dims_input[1]; j++) { @@ -157,16 +184,20 @@ void Aidge::ILayerNormImpl_cuda::backward_(const Tensor& output_grad) { for (const auto &vec3d : means) { for (const auto &vec2d : vec3d) { for (const auto &vec1d : vec2d) { - flat_means.insert(flat_means.end(), vec1d.begin(), vec1d.end()); + flat_means.insert(flat_means.end(), + vec1d.begin(), + vec1d.end()); } } } - std::vector<std::vector<std::vector<std::vector<T>>>> vars(dims_input[0], - std::vector<std::vector<std::vector<T>>>(dims_input[1], + std::vector<std::vector<std::vector<std::vector<T>>>> vars( + dims_input[0], + std::vector<std::vector<std::vector<T>>>( + dims_input[1], std::vector<std::vector<T>>(dims_input[2], - std::vector<T>(dims_input[3], 0.0f)))); - + std::vector<T>(dims_input[3], 0.0f)))); + for (std::size_t i = 0; i < dims_input[0]; i++) { for (std::size_t j = 0; j < dims_input[1]; j++) { for (std::size_t k = 0; k < dims_input[2]; k++) { @@ -196,9 +227,20 @@ void Aidge::ILayerNormImpl_cuda::backward_(const Tensor& output_grad) { } } - const T* mean_ = flat_means.data(); - const T* var_ = flat_vars.data(); - const T * output_grad_raw = static_cast<const T*>(output_grad.getImpl()->rawPtr()); - - ILayerNormbackward(output, output_grad_raw, input, mean_, var_, weight, bias, input_grad, weight_grad, bias_grad, size); + const T *mean_ = flat_means.data(); 
+ const T *var_ = flat_vars.data(); + const T *output_grad_raw = + static_cast<const T *>(output_grad.getImpl()->rawPtr()); + + ILayerNormbackward(output, + output_grad_raw, + input, + mean_, + var_, + weight, + bias, + input_grad, + weight_grad, + bias_grad, + size); } diff --git a/src/operator/LnImpl.cpp b/src/operator/LnImpl.cpp index ed09ed45f5006c3760376a9d6f44f29d05bcfabe..1b88a601b29775abaca9ab1f7008142ab06d8b8c 100644 --- a/src/operator/LnImpl.cpp +++ b/src/operator/LnImpl.cpp @@ -21,60 +21,66 @@ #include "aidge/utils/Types.h" void Aidge::LnImpl_cuda::forward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); assert(mOp.getRawInput(0) && "missing input #0"); - const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); + const auto &input = + op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); - switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { - case DataType::Float64: - forward_<double>(input); - break; - case DataType::Float32: - forward_<float>(input); - break; - case DataType::Float16: - forward_<half>(input); - break; - default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + switch ( + std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + forward_<double>(input); + break; + case DataType::Float32: + forward_<float>(input); + break; + case DataType::Float16: + forward_<half>(input); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, + "Data type is not supported by Backend Cuda"); } } -template <class T> -void Aidge::LnImpl_cuda::forward_(const Tensor& input) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); - const T * inputPtr = static_cast<const T*>(input.getImpl()->rawPtr()); - T * outputPtr = static_cast<T*>(op.getOutput(0)->getImpl()->rawPtr()); - +template <class T> void Aidge::LnImpl_cuda::forward_(const Tensor &input) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); + const T *inputPtr = static_cast<const T *>(input.getImpl()->rawPtr()); + T *outputPtr = static_cast<T *>(op.getOutput(0)->getImpl()->rawPtr()); - Aidge::lnForward<T>(inputPtr, outputPtr, static_cast<int>(op.getOutput(0)->size())); + Aidge::lnForward<T>(inputPtr, + outputPtr, + static_cast<int>(op.getOutput(0)->size())); } void Aidge::LnImpl_cuda::backward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); assert(op.getOutput(0)->grad() && "missing output #0"); - const auto& output_grad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getOutput(0)->grad()); + const auto &output_grad = + op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, + *op.getOutput(0)->grad()); - switch(op.getInput(0)->grad()->dataType()) { - case DataType::Float64: - backward_<double>(output_grad); - break; - case DataType::Float32: - backward_<float>(output_grad); - break; - case DataType::Float16: - backward_<half>(output_grad); - break; - default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + switch (op.getInput(0)->grad()->dataType()) { + case DataType::Float64: + backward_<double>(output_grad); + break; + case DataType::Float32: + backward_<float>(output_grad); + break; + case DataType::Float16: + backward_<half>(output_grad); + break; + default: + 
AIDGE_THROW_OR_ABORT(std::runtime_error, + "Data type is not supported by Backend Cuda"); } } template <class T> -void Aidge::LnImpl_cuda::backward_(const Tensor& output_grad) { - //TODO +void Aidge::LnImpl_cuda::backward_(const Tensor &output_grad) { + // TODO } diff --git a/src/operator/MaxPoolingImpl.cpp b/src/operator/MaxPoolingImpl.cpp index 39050635102ebebaed8192cb4bb338e2bc31d5e8..7a6518af1cd603609e65fa9d0bba3003e04b42df 100644 --- a/src/operator/MaxPoolingImpl.cpp +++ b/src/operator/MaxPoolingImpl.cpp @@ -21,109 +21,120 @@ template <Aidge::DimIdx_t DIM> void Aidge::MaxPoolingImpl_cuda<DIM>::forward() { - const MaxPooling_Op<DIM>& op_ = static_cast<const MaxPooling_Op<DIM>&>(mOp); + const MaxPooling_Op<DIM> &op_ = + static_cast<const MaxPooling_Op<DIM> &>(mOp); AIDGE_ASSERT(mOp.getRawInput(0), "missing input #0"); - const auto& input = op_.getInput(0)->refCastFrom(mInputFallback, *op_.getOutput(0)); + const auto &input = + op_.getInput(0)->refCastFrom(mInputFallback, *op_.getOutput(0)); // Lazy-initialize CuDNN MaxPooling descriptor if (mMaxPoolingDesc == nullptr) { - const std::vector<int> strides(op_.strideDims().begin(), op_.strideDims().end()); + const std::vector<int> strides(op_.strideDims().begin(), + op_.strideDims().end()); const std::vector<int> paddings(DIM, 0); - const std::vector<int> window_dims(op_.kernelDims().begin(), op_.kernelDims().end()); + const std::vector<int> window_dims(op_.kernelDims().begin(), + op_.kernelDims().end()); CHECK_CUDNN_STATUS(cudnnCreatePoolingDescriptor(&mMaxPoolingDesc)); - CHECK_CUDNN_STATUS( - cudnnSetPoolingNdDescriptor(mMaxPoolingDesc, - mMode, - CUDNN_NOT_PROPAGATE_NAN, - DIM, - &window_dims[0], - &paddings[0], - &strides[0])); + CHECK_CUDNN_STATUS(cudnnSetPoolingNdDescriptor(mMaxPoolingDesc, + mMode, + CUDNN_NOT_PROPAGATE_NAN, + DIM, + &window_dims[0], + &paddings[0], + &strides[0])); } - // Do the actual forward computation // Template is only for scaling parameters, which are always in float // excepted when the convolution is performed in double precision. 
if (op_.getOutput(0)->dataType() == DataType::Float64) { forward_<double>(input); - } - else { + } else { forward_<float>(input); } } template <Aidge::DimIdx_t DIM> template <class T> -void Aidge::MaxPoolingImpl_cuda<DIM>::forward_(const Tensor& input) { - const MaxPooling_Op<DIM>& op_ = static_cast<const MaxPooling_Op<DIM>&>(mOp); +void Aidge::MaxPoolingImpl_cuda<DIM>::forward_(const Tensor &input) { + const MaxPooling_Op<DIM> &op_ = + static_cast<const MaxPooling_Op<DIM> &>(mOp); const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; - CHECK_CUDNN_STATUS( - cudnnPoolingForward( - CudaContext::cudnnHandle(), - mMaxPoolingDesc, - &alpha, - std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), - input.getImpl()->rawPtr(), - &beta, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op_.getOutput(0)->getImpl())->getCudnnTensorDesc(*op_.getOutput(0)), - op_.getOutput(0)->getImpl()->rawPtr() - ) - ); + CHECK_CUDNN_STATUS(cudnnPoolingForward( + CudaContext::cudnnHandle(), + mMaxPoolingDesc, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl()) + ->getCudnnTensorDesc(input), + input.getImpl()->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>( + op_.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op_.getOutput(0)), + op_.getOutput(0)->getImpl()->rawPtr())); } template <Aidge::DimIdx_t DIM> void Aidge::MaxPoolingImpl_cuda<DIM>::backward() { - const MaxPooling_Op<DIM>& op_ = static_cast<const MaxPooling_Op<DIM>&>(mOp); + const MaxPooling_Op<DIM> &op_ = + static_cast<const MaxPooling_Op<DIM> &>(mOp); - AIDGE_ASSERT(mMaxPoolingDesc != nullptr, "MaxPool descriptor must be created during forward!"); + AIDGE_ASSERT(mMaxPoolingDesc != nullptr, + "MaxPool descriptor must be created during forward!"); AIDGE_ASSERT(op_.getOutput(0)->grad(), "missing output grad #0"); - const auto& output_grad = op_.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op_.getOutput(0)->grad()); + const auto &output_grad = + op_.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, + *op_.getOutput(0)->grad()); // Do the actual backward computation // Template is only for scaling parameters, which are always in float // excepted when the convolution is performed in double precision. 
if (op_.getOutput(0)->dataType() == DataType::Float64) { backward_<double>(output_grad); - } - else { + } else { backward_<float>(output_grad); } } template <Aidge::DimIdx_t DIM> template <class T> -void Aidge::MaxPoolingImpl_cuda<DIM>::backward_(const Tensor& output_grad) { - const MaxPooling_Op<DIM>& op_ = static_cast<const MaxPooling_Op<DIM>&>(mOp); +void Aidge::MaxPoolingImpl_cuda<DIM>::backward_(const Tensor &output_grad) { + const MaxPooling_Op<DIM> &op_ = + static_cast<const MaxPooling_Op<DIM> &>(mOp); const T alpha = 1.0f; const T beta = 0.0f; - CHECK_CUDNN_STATUS( - cudnnPoolingBackward(CudaContext::cudnnHandle(), - mMaxPoolingDesc, - &alpha, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op_.getOutput(0)->getImpl())->getCudnnTensorDesc(*op_.getOutput(0)), - op_.getOutput(0)->getImpl()->rawPtr(), - std::dynamic_pointer_cast<TensorImpl_cuda_>(output_grad.getImpl())->getCudnnTensorDesc(output_grad), - output_grad.getImpl()->rawPtr(), - std::dynamic_pointer_cast<TensorImpl_cuda_>(op_.getInput(0)->getImpl())->getCudnnTensorDesc(*op_.getInput(0)), - op_.getInput(0)->getImpl()->rawPtr(), - &beta, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op_.getInput(0)->grad()->getImpl())->getCudnnTensorDesc(*op_.getInput(0)), - op_.getInput(0)->grad()->getImpl()->rawPtr())); + CHECK_CUDNN_STATUS(cudnnPoolingBackward( + CudaContext::cudnnHandle(), + mMaxPoolingDesc, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>( + op_.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op_.getOutput(0)), + op_.getOutput(0)->getImpl()->rawPtr(), + std::dynamic_pointer_cast<TensorImpl_cuda_>(output_grad.getImpl()) + ->getCudnnTensorDesc(output_grad), + output_grad.getImpl()->rawPtr(), + std::dynamic_pointer_cast<TensorImpl_cuda_>(op_.getInput(0)->getImpl()) + ->getCudnnTensorDesc(*op_.getInput(0)), + op_.getInput(0)->getImpl()->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>( + op_.getInput(0)->grad()->getImpl()) + ->getCudnnTensorDesc(*op_.getInput(0)), + op_.getInput(0)->grad()->getImpl()->rawPtr())); } template <Aidge::DimIdx_t DIM> Aidge::MaxPoolingImpl_cuda<DIM>::~MaxPoolingImpl_cuda() { - if(mMaxPoolingDesc != nullptr) + if (mMaxPoolingDesc != nullptr) cudnnDestroyPoolingDescriptor(mMaxPoolingDesc); } - // Template declarations template class Aidge::MaxPoolingImpl_cuda<2>; diff --git a/src/operator/MulImpl.cpp b/src/operator/MulImpl.cpp index af87251e8f29eded7d24cca2f08b880557ebb482..ed66e27fdcc2dc3e332349033e6866e4557f316b 100644 --- a/src/operator/MulImpl.cpp +++ b/src/operator/MulImpl.cpp @@ -11,9 +11,9 @@ #include <algorithm> #include <cassert> +#include <chrono> #include <numeric> #include <vector> -#include <chrono> #include "aidge/backend/cuda/data/TensorImpl.hpp" #include "aidge/backend/cuda/operator/MulImpl.hpp" @@ -23,27 +23,39 @@ #include "aidge/utils/Types.h" void Aidge::MulImpl_cuda::forward() { - const Mul_Op& op = static_cast<const Mul_Op&>(mOp); + const Mul_Op &op = static_cast<const Mul_Op &>(mOp); // Check inputs AIDGE_ASSERT(op.getInput(0), "missing input in Mul operator"); - AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run Mul forward because the 0-th input has no implementation."); + AIDGE_ASSERT(op.getInput(0)->hasImpl(), + "cannot run Mul forward because the 0-th input has no " + "implementation."); DataType datatypeFirstInput = op.getInput(0)->dataType(); for (IOIndex_t i = 1; i < op.nbInputs(); ++i) { AIDGE_ASSERT(op.getInput(i), "missing input in Mul operator"); - AIDGE_ASSERT(op.getInput(i)->hasImpl(), "cannot run Mul forward because the {}-th input has no 
implementation.", i); - AIDGE_ASSERT(op.getInput(i)->dataType() == datatypeFirstInput, "Cannot Mul inputs with two differents data type."); + AIDGE_ASSERT(op.getInput(i)->hasImpl(), + "cannot run Mul forward because the {}-th input has no " + "implementation.", + i); + AIDGE_ASSERT(op.getInput(i)->dataType() == datatypeFirstInput, + "Cannot Mul inputs with two differents data type."); } std::vector<std::shared_ptr<Tensor>> inputFallbacks(op.nbInputs()); std::vector<Tensor> inputs(op.nbInputs()); std::vector<std::vector<int>> dims(op.nbInputs()); // For broadcasted dims - std::vector<std::vector<int>> strides(op.nbInputs()); // For the cooresponding strides + std::vector<std::vector<int>> strides( + op.nbInputs()); // For the cooresponding strides for (IOIndex_t i = 0; i < op.nbInputs(); ++i) { - inputs[i] = op.getInput(i)->refCastFrom(inputFallbacks[i], *op.getOutput(0)); + inputs[i] = + op.getInput(i)->refCastFrom(inputFallbacks[i], *op.getOutput(0)); // Get tensor dims and broadcast them - std::copy(inputs[i].dims().begin(), inputs[i].dims().end(), std::back_inserter(dims[i])); - dims[i].insert(dims[i].cbegin(), op.getOutput(0)->nbDims() - dims[i].size(), int(1)); + std::copy(inputs[i].dims().begin(), + inputs[i].dims().end(), + std::back_inserter(dims[i])); + dims[i].insert(dims[i].cbegin(), + op.getOutput(0)->nbDims() - dims[i].size(), + int(1)); if (dims[i].size() < 4) { dims[i].resize(4, 1); @@ -59,62 +71,90 @@ void Aidge::MulImpl_cuda::forward() { strides[i] = tensorStrides; } - switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { - case DataType::Float64: - forward_<double>(inputs, dims, strides); - break; - case DataType::Float32: - forward_<float>(inputs, dims, strides); - break; - case DataType::Float16: - forward_<half>(inputs, dims, strides); - break; - default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + switch ( + std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + forward_<double>(inputs, dims, strides); + break; + case DataType::Float32: + forward_<float>(inputs, dims, strides); + break; + case DataType::Float16: + forward_<half>(inputs, dims, strides); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, + "Data type is not supported by Backend Cuda"); } } template <class T> -void Aidge::MulImpl_cuda::forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +void Aidge::MulImpl_cuda::forward_( + const std::vector<Tensor> &inputs, + const std::vector<std::vector<int>> &inputsDims, + const std::vector<std::vector<int>> &inputsStrides) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; // Create a Tensor descriptor with the broadcasted dims and strides cudnnTensorDescriptor_t tensorDesc0, tensorDesc1; CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&tensorDesc0)); - CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc0, CudaContext::data_type<T>::value, inputsDims[0].size(), inputsDims[0].data(), inputsStrides[0].data())); + CHECK_CUDNN_STATUS( + cudnnSetTensorNdDescriptor(tensorDesc0, + CudaContext::data_type<T>::value, + inputsDims[0].size(), + inputsDims[0].data(), + inputsStrides[0].data())); 
CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&tensorDesc1)); - CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc1, CudaContext::data_type<T>::value, inputsDims[1].size(), inputsDims[1].data(), inputsStrides[1].data())); + CHECK_CUDNN_STATUS( + cudnnSetTensorNdDescriptor(tensorDesc1, + CudaContext::data_type<T>::value, + inputsDims[1].size(), + inputsDims[1].data(), + inputsStrides[1].data())); // Multiply inputs cudnnOpTensorDescriptor_t opTensorDesc; CHECK_CUDNN_STATUS(cudnnCreateOpTensorDescriptor(&opTensorDesc)); - CHECK_CUDNN_STATUS(cudnnSetOpTensorDescriptor(opTensorDesc, CUDNN_OP_TENSOR_MUL, CudaContext::data_type<T>::value, CUDNN_PROPAGATE_NAN)); - if(inputs[0].size()>inputs[1].size()) { - CHECK_CUDNN_STATUS(cudnnOpTensor(CudaContext::cudnnHandle(), - opTensorDesc, - &alpha, - tensorDesc0, - inputs[0].getImpl()->rawPtr(), - &alpha, - tensorDesc1, - inputs[1].getImpl()->rawPtr(), - &beta, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), - std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr())); - } - else { - CHECK_CUDNN_STATUS(cudnnOpTensor(CudaContext::cudnnHandle(), - opTensorDesc, - &alpha, - tensorDesc1, - inputs[1].getImpl()->rawPtr(), - &alpha, - tensorDesc0, - inputs[0].getImpl()->rawPtr(), - &beta, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), - std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr())); + CHECK_CUDNN_STATUS( + cudnnSetOpTensorDescriptor(opTensorDesc, + CUDNN_OP_TENSOR_MUL, + CudaContext::data_type<T>::value, + CUDNN_PROPAGATE_NAN)); + if (inputs[0].size() > inputs[1].size()) { + CHECK_CUDNN_STATUS( + cudnnOpTensor(CudaContext::cudnnHandle(), + opTensorDesc, + &alpha, + tensorDesc0, + inputs[0].getImpl()->rawPtr(), + &alpha, + tensorDesc1, + inputs[1].getImpl()->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>( + op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), + std::static_pointer_cast<Tensor>(op.getRawOutput(0)) + ->getImpl() + ->rawPtr())); + } else { + CHECK_CUDNN_STATUS( + cudnnOpTensor(CudaContext::cudnnHandle(), + opTensorDesc, + &alpha, + tensorDesc1, + inputs[1].getImpl()->rawPtr(), + &alpha, + tensorDesc0, + inputs[0].getImpl()->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>( + op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), + std::static_pointer_cast<Tensor>(op.getRawOutput(0)) + ->getImpl() + ->rawPtr())); } CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(tensorDesc0)); @@ -123,24 +163,35 @@ void Aidge::MulImpl_cuda::forward_(const std::vector<Tensor>& inputs, const std: } void Aidge::MulImpl_cuda::backward() { - const Mul_Op& op = static_cast<const Mul_Op&>(mOp); + const Mul_Op &op = static_cast<const Mul_Op &>(mOp); // Check output - AIDGE_ASSERT(op.getOutput(0)->grad(), "missing output gradient in Mul operator"); - AIDGE_ASSERT(op.getOutput(0)->grad()->hasImpl(), "cannot run Mul backward because the output gradient has no implementation."); + AIDGE_ASSERT(op.getOutput(0)->grad(), + "missing output gradient in Mul operator"); + AIDGE_ASSERT(op.getOutput(0)->grad()->hasImpl(), + "cannot run Mul backward because the output gradient has no " + "implementation."); std::shared_ptr<Tensor> outputGradFallback; - const auto& outputGrad = op.getOutput(0)->grad()->refCastFrom(outputGradFallback, *op.getOutput(0)->grad()); + const auto &outputGrad = + 
op.getOutput(0)->grad()->refCastFrom(outputGradFallback, + *op.getOutput(0)->grad()); std::vector<std::vector<int>> dims(op.nbInputs()); // For broadcasted dims - std::vector<std::vector<int>> strides(op.nbInputs()); // For the cooresponding strides + std::vector<std::vector<int>> strides( + op.nbInputs()); // For the cooresponding strides for (IOIndex_t i = 0; i < op.nbInputs(); ++i) { std::shared_ptr<Tensor> inputFallback; - const Tensor input = op.getInput(i)->refCastFrom(inputFallback, *op.getOutput(0)); + const Tensor input = + op.getInput(i)->refCastFrom(inputFallback, *op.getOutput(0)); // Get tensor dims and broadcast them - std::copy(input.dims().begin(), input.dims().end(), std::back_inserter(dims[i])); - dims[i].insert(dims[i].cbegin(), op.getOutput(0)->nbDims() - dims[i].size(), int(1)); - + std::copy(input.dims().begin(), + input.dims().end(), + std::back_inserter(dims[i])); + dims[i].insert(dims[i].cbegin(), + op.getOutput(0)->nbDims() - dims[i].size(), + int(1)); + if (dims[i].size() < 4) { dims[i].resize(4, 1); } @@ -155,66 +206,88 @@ void Aidge::MulImpl_cuda::backward() { strides[i] = tensorStrides; } - switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { - case DataType::Float64: - backward_<double>(outputGrad, dims, strides); - break; - case DataType::Float32: - backward_<float>(outputGrad, dims, strides); - break; - case DataType::Float16: - backward_<half>(outputGrad, dims, strides); - break; - default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + switch ( + std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + backward_<double>(outputGrad, dims, strides); + break; + case DataType::Float32: + backward_<float>(outputGrad, dims, strides); + break; + case DataType::Float16: + backward_<half>(outputGrad, dims, strides); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, + "Data type is not supported by Backend Cuda"); } } template <class T> -void Aidge::MulImpl_cuda::backward_(const Tensor& outputGrad, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +void Aidge::MulImpl_cuda::backward_( + const Tensor &outputGrad, + const std::vector<std::vector<int>> &inputsDims, + const std::vector<std::vector<int>> &inputsStrides) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; - // Create a Tensor descriptor with the broadcasted dims and strides cudnnTensorDescriptor_t tensorDesc0, tensorDesc1; CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&tensorDesc0)); - CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc0, CudaContext::data_type<T>::value, inputsDims[0].size(), inputsDims[0].data(), inputsStrides[0].data())); + CHECK_CUDNN_STATUS( + cudnnSetTensorNdDescriptor(tensorDesc0, + CudaContext::data_type<T>::value, + inputsDims[0].size(), + inputsDims[0].data(), + inputsStrides[0].data())); CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&tensorDesc1)); - CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc1, CudaContext::data_type<T>::value, inputsDims[1].size(), inputsDims[1].data(), inputsStrides[1].data())); - + CHECK_CUDNN_STATUS( + cudnnSetTensorNdDescriptor(tensorDesc1, + CudaContext::data_type<T>::value, + inputsDims[1].size(), + inputsDims[1].data(), + inputsStrides[1].data())); 
+ // Create the operation descriptor cudnnOpTensorDescriptor_t opTensorDesc; CHECK_CUDNN_STATUS(cudnnCreateOpTensorDescriptor(&opTensorDesc)); - CHECK_CUDNN_STATUS(cudnnSetOpTensorDescriptor(opTensorDesc, CUDNN_OP_TENSOR_MUL, CudaContext::data_type<T>::value, CUDNN_PROPAGATE_NAN)); + CHECK_CUDNN_STATUS( + cudnnSetOpTensorDescriptor(opTensorDesc, + CUDNN_OP_TENSOR_MUL, + CudaContext::data_type<T>::value, + CUDNN_PROPAGATE_NAN)); // Input0_grad = output_grad * Input1 - CHECK_CUDNN_STATUS(cudnnOpTensor(CudaContext::cudnnHandle(), - opTensorDesc, - &alpha, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), - outputGrad.getImpl()->rawPtr(), - &alpha, - tensorDesc1, - op.getInput(1)->getImpl()->rawPtr(), - &beta, - tensorDesc0, - op.getInput(0)->grad()->getImpl()->rawPtr())); + CHECK_CUDNN_STATUS(cudnnOpTensor( + CudaContext::cudnnHandle(), + opTensorDesc, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), + outputGrad.getImpl()->rawPtr(), + &alpha, + tensorDesc1, + op.getInput(1)->getImpl()->rawPtr(), + &beta, + tensorDesc0, + op.getInput(0)->grad()->getImpl()->rawPtr())); // Input1_grad = output_grad * Input0 - CHECK_CUDNN_STATUS(cudnnOpTensor(CudaContext::cudnnHandle(), - opTensorDesc, - &alpha, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), - outputGrad.getImpl()->rawPtr(), - &alpha, - tensorDesc0, - op.getInput(0)->getImpl()->rawPtr(), - &beta, - tensorDesc1, - op.getInput(1)->grad()->getImpl()->rawPtr())); - + CHECK_CUDNN_STATUS(cudnnOpTensor( + CudaContext::cudnnHandle(), + opTensorDesc, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), + outputGrad.getImpl()->rawPtr(), + &alpha, + tensorDesc0, + op.getInput(0)->getImpl()->rawPtr(), + &beta, + tensorDesc1, + op.getInput(1)->grad()->getImpl()->rawPtr())); + CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(tensorDesc0)); CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(tensorDesc1)); CHECK_CUDNN_STATUS(cudnnDestroyOpTensorDescriptor(opTensorDesc)); diff --git a/src/operator/PadImpl.cpp b/src/operator/PadImpl.cpp index 3606ba66d002f1467aa65771015cab02c066d5a5..6eed8a662d8bfe7bd1b9b0dc2abdfb54023f1c9f 100644 --- a/src/operator/PadImpl.cpp +++ b/src/operator/PadImpl.cpp @@ -21,15 +21,15 @@ #include "aidge/utils/ErrorHandling.hpp" #include "aidge/utils/Types.h" -template <Aidge::DimIdx_t DIM> -void Aidge::PadImpl_cuda<DIM>::forward() -{ +template <Aidge::DimIdx_t DIM> void Aidge::PadImpl_cuda<DIM>::forward() { const Pad_Op<DIM> &op = static_cast<const Pad_Op<DIM> &>(mOp); AIDGE_ASSERT(op.getInput(0), "missing input in Pad operator"); - AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run Pad forward input has no implementation."); + AIDGE_ASSERT(op.getInput(0)->hasImpl(), + "cannot run Pad forward input has no implementation."); - const auto &input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); + const auto &input = + op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); auto paddingBorders = op.beginEndBorders(); @@ -38,8 +38,7 @@ void Aidge::PadImpl_cuda<DIM>::forward() mPadVal = op.borderValue(); mPadType = static_cast<unsigned int>(op.borderType()); - switch (op.getOutput(0)->dataType()) - { + switch (op.getOutput(0)->dataType()) { case DataType::Float64: forward_<double>(input); break; @@ -50,17 +49,21 @@ void 
Aidge::PadImpl_cuda<DIM>::forward() forward_<half>(input); break; default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + AIDGE_THROW_OR_ABORT(std::runtime_error, + "Data type is not supported by Backend Cuda"); } } template <Aidge::DimIdx_t DIM> template <class T> -void Aidge::PadImpl_cuda<DIM>::forward_(const Tensor &input) -{ - const auto outDims = std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(); +void Aidge::PadImpl_cuda<DIM>::forward_(const Tensor &input) { + const auto outDims = + std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(); const T *inputPtr = static_cast<const T *>(input.getImpl()->rawPtr()); - T *output = static_cast<T *>(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->rawPtr()); + T *output = + static_cast<T *>(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)) + ->getImpl() + ->rawPtr()); Aidge::cudaPadding(CudaContext::getDeviceProp(), outDims[1], outDims[3], @@ -77,15 +80,18 @@ void Aidge::PadImpl_cuda<DIM>::forward_(const Tensor &input) output); } -template <Aidge::DimIdx_t DIM> -void Aidge::PadImpl_cuda<DIM>::backward() -{ +template <Aidge::DimIdx_t DIM> void Aidge::PadImpl_cuda<DIM>::backward() { const Pad_Op<DIM> &op = static_cast<const Pad_Op<DIM> &>(mOp); - AIDGE_ASSERT(op.getOutput(0)->grad(), "missing output gradient in Pad operator"); - AIDGE_ASSERT(op.getOutput(0)->grad(), "cannot run Pad backward, output gradient has no implementation."); + AIDGE_ASSERT(op.getOutput(0)->grad(), + "missing output gradient in Pad operator"); + AIDGE_ASSERT( + op.getOutput(0)->grad(), + "cannot run Pad backward, output gradient has no implementation."); - const auto &outGrad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getInput(0)); + const auto &outGrad = + op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, + *op.getInput(0)); auto paddingBorders = op.beginEndBorders(); @@ -94,8 +100,8 @@ void Aidge::PadImpl_cuda<DIM>::backward() mPadVal = op.borderValue(); mPadType = static_cast<unsigned int>(op.borderType()); - switch (std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) - { + switch ( + std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { case DataType::Float64: backward_<double>(outGrad); break; @@ -106,17 +112,18 @@ void Aidge::PadImpl_cuda<DIM>::backward() backward_<half>(outGrad); break; default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + AIDGE_THROW_OR_ABORT(std::runtime_error, + "Data type is not supported by Backend Cuda"); } } template <Aidge::DimIdx_t DIM> template <class T> -void Aidge::PadImpl_cuda<DIM>::backward_(const Tensor &outGrad) -{ - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +void Aidge::PadImpl_cuda<DIM>::backward_(const Tensor &outGrad) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); const auto inputGradDims = op.getInput(0)->grad()->dims(); - T *inputGrad = static_cast<T *>(op.getInput(0)->grad()->getImpl()->rawPtr()); + T *inputGrad = + static_cast<T *>(op.getInput(0)->grad()->getImpl()->rawPtr()); Aidge::cudaPadding(CudaContext::getDeviceProp(), inputGradDims[1], inputGradDims[3], diff --git a/src/operator/PowImpl.cpp b/src/operator/PowImpl.cpp index 84af8c2a74c8ebaeb7d7380975089086e4db31da..60a47d2f38de9e257933f220d99bed306772d17f 100644 --- a/src/operator/PowImpl.cpp +++ b/src/operator/PowImpl.cpp @@ -23,27 +23,39 @@ #include "aidge/utils/Types.h" void Aidge::PowImpl_cuda::forward() { - const Pow_Op& op = 
static_cast<const Pow_Op&>(mOp); + const Pow_Op &op = static_cast<const Pow_Op &>(mOp); // Check inputs AIDGE_ASSERT(op.getInput(0), "missing input in Pow operator"); - AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run Pow forward because the 0-th input has no implementation."); + AIDGE_ASSERT(op.getInput(0)->hasImpl(), + "cannot run Pow forward because the 0-th input has no " + "implementation."); DataType datatypeFirstInput = op.getInput(0)->dataType(); for (IOIndex_t i = 1; i < op.nbInputs(); ++i) { AIDGE_ASSERT(op.getInput(i), "missing input in Pow operator"); - AIDGE_ASSERT(op.getInput(i)->hasImpl(), "cannot run Pow forward because the {}-th input has no implementation.", i); - AIDGE_ASSERT(op.getInput(i)->dataType() == datatypeFirstInput, "Cannot Pow inputs with two differents data type."); + AIDGE_ASSERT(op.getInput(i)->hasImpl(), + "cannot run Pow forward because the {}-th input has no " + "implementation.", + i); + AIDGE_ASSERT(op.getInput(i)->dataType() == datatypeFirstInput, + "Cannot Pow inputs with two differents data type."); } std::vector<std::shared_ptr<Tensor>> inputFallbacks(op.nbInputs()); std::vector<Tensor> inputs(op.nbInputs()); std::vector<std::vector<int>> dims(op.nbInputs()); // For broadcasted dims - std::vector<std::vector<int>> strides(op.nbInputs()); // For the cooresponding strides + std::vector<std::vector<int>> strides( + op.nbInputs()); // For the cooresponding strides for (IOIndex_t i = 0; i < op.nbInputs(); ++i) { - inputs[i] = op.getInput(i)->refCastFrom(inputFallbacks[i], *op.getOutput(0)); + inputs[i] = + op.getInput(i)->refCastFrom(inputFallbacks[i], *op.getOutput(0)); // Get tensor dims and broadcast them - std::copy(inputs[i].dims().begin(), inputs[i].dims().end(), std::back_inserter(dims[i])); - dims[i].insert(dims[i].cbegin(), op.getOutput(0)->nbDims() - dims[i].size(), int(1)); + std::copy(inputs[i].dims().begin(), + inputs[i].dims().end(), + std::back_inserter(dims[i])); + dims[i].insert(dims[i].cbegin(), + op.getOutput(0)->nbDims() - dims[i].size(), + int(1)); if (dims[i].size() < 4) { dims[i].resize(4, 1); @@ -59,54 +71,67 @@ void Aidge::PowImpl_cuda::forward() { strides[i] = tensorStrides; } - switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { - case DataType::Float64: - forward_<double>(inputs, dims, strides); - break; - case DataType::Float32: - forward_<float>(inputs, dims, strides); - break; - case DataType::Float16: - forward_<half>(inputs, dims, strides); - break; - default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + switch ( + std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + forward_<double>(inputs, dims, strides); + break; + case DataType::Float32: + forward_<float>(inputs, dims, strides); + break; + case DataType::Float16: + forward_<half>(inputs, dims, strides); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, + "Data type is not supported by Backend Cuda"); } } template <class T> -void Aidge::PowImpl_cuda::forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +void Aidge::PowImpl_cuda::forward_( + const std::vector<Tensor> &inputs, + const std::vector<std::vector<int>> &inputsDims, + const std::vector<std::vector<int>> &inputsStrides) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); // const typename 
Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; // const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; - const T * input1Ptr = static_cast<const T*>(inputs[0].getImpl()->rawPtr()); - const T * input2Ptr = static_cast<const T*>(inputs[1].getImpl()->rawPtr()); - T * outputPtr = static_cast<T*>(op.getOutput(0)->getImpl()->rawPtr()); + const T *input1Ptr = static_cast<const T *>(inputs[0].getImpl()->rawPtr()); + const T *input2Ptr = static_cast<const T *>(inputs[1].getImpl()->rawPtr()); + T *outputPtr = static_cast<T *>(op.getOutput(0)->getImpl()->rawPtr()); std::vector<int> outputStrides(op.getOutput(0)->nbDims(), 1); - if(op.getOutput(0)->nbDims()>1) { - for (int i = op.getOutput(0)->nbDims()-2; i >= 0; i--) { - outputStrides[i] = outputStrides[i+1] * op.getOutput(0)->dims()[i+1]; + if (op.getOutput(0)->nbDims() > 1) { + for (int i = op.getOutput(0)->nbDims() - 2; i >= 0; i--) { + outputStrides[i] = + outputStrides[i + 1] * op.getOutput(0)->dims()[i + 1]; } } - std::vector<int> outDims(std::max(op.getOutput(0)->nbDims(),std::size_t(4)), 1); + std::vector<int> outDims( + std::max(op.getOutput(0)->nbDims(), std::size_t(4)), + 1); for (std::size_t i = 0; i < op.getOutput(0)->nbDims(); i++) { outDims[i] = static_cast<int>(op.getOutput(0)->dims()[i]); } - Aidge::powForward<T>(input1Ptr, outputPtr, input2Ptr, - inputsDims[0], inputsDims[1], outDims, - inputsStrides[0], inputsStrides[1], outputStrides, - static_cast<int>(op.getOutput(0)->size())); + Aidge::powForward<T>(input1Ptr, + outputPtr, + input2Ptr, + inputsDims[0], + inputsDims[1], + outDims, + inputsStrides[0], + inputsStrides[1], + outputStrides, + static_cast<int>(op.getOutput(0)->size())); } void Aidge::PowImpl_cuda::backward() { // TODO } -template <class T> -void Aidge::PowImpl_cuda::backward_(const Tensor& outGrad) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +template <class T> void Aidge::PowImpl_cuda::backward_(const Tensor &outGrad) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; // TODO diff --git a/src/operator/ReLUImpl.cpp b/src/operator/ReLUImpl.cpp index 80d52045e832b42a95b6d7448f2016530bb9d1ac..688b035a8ff99defb7d47c240f1f9ebc0fe7ac78 100644 --- a/src/operator/ReLUImpl.cpp +++ b/src/operator/ReLUImpl.cpp @@ -20,21 +20,25 @@ #include "aidge/utils/Types.h" void Aidge::ReLUImpl_cuda::forward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); assert(mOp.getRawInput(0) && "missing input #0"); - const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); + const auto &input = + op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); // Lazy-initialize CuDNN ReLU descriptor if (mReLUDesc == nullptr) { - #if CUDNN_VERSION >= 5000 - CHECK_CUDNN_STATUS(cudnnCreateActivationDescriptor(&mReLUDesc)); - CHECK_CUDNN_STATUS(cudnnSetActivationDescriptor( - mReLUDesc, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, 0.0)); - #else - mReLUDesc = CUDNN_ACTIVATION_RELU; - #endif +#if CUDNN_VERSION >= 5000 + CHECK_CUDNN_STATUS(cudnnCreateActivationDescriptor(&mReLUDesc)); + CHECK_CUDNN_STATUS( + cudnnSetActivationDescriptor(mReLUDesc, + CUDNN_ACTIVATION_RELU, + CUDNN_NOT_PROPAGATE_NAN, + 0.0)); +#else + mReLUDesc = CUDNN_ACTIVATION_RELU; +#endif } // Do the actual forward computation @@ -42,44 +46,51 @@ void Aidge::ReLUImpl_cuda::forward() { 
// excepted when the convolution is performed in double precision. if (op.getOutput(0)->dataType() == DataType::Float64) { forward_<double>(input); - } - else { + } else { forward_<float>(input); } } -template <class T> -void Aidge::ReLUImpl_cuda::forward_(const Tensor& input) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +template <class T> void Aidge::ReLUImpl_cuda::forward_(const Tensor &input) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; - CHECK_CUDNN_STATUS( - cudnnActivationForward(CudaContext::cudnnHandle(), - mReLUDesc, - &alpha, - std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), - input.getImpl()->rawPtr(), - &beta, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), - std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr())); + CHECK_CUDNN_STATUS(cudnnActivationForward( + CudaContext::cudnnHandle(), + mReLUDesc, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl()) + ->getCudnnTensorDesc(input), + input.getImpl()->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), + std::static_pointer_cast<Tensor>(op.getRawOutput(0)) + ->getImpl() + ->rawPtr())); } void Aidge::ReLUImpl_cuda::backward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); assert(op.getOutput(0)->grad() && "missing output #0"); - const auto& output_grad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getOutput(0)->grad()); + const auto &output_grad = + op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, + *op.getOutput(0)->grad()); // Lazy-initialize CuDNN ReLU descriptor if (mReLUDesc == nullptr) { - #if CUDNN_VERSION >= 5000 - CHECK_CUDNN_STATUS(cudnnCreateActivationDescriptor(&mReLUDesc)); - CHECK_CUDNN_STATUS(cudnnSetActivationDescriptor( - mReLUDesc, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, 0.0)); - #else - mReLUDesc = CUDNN_ACTIVATION_RELU; - #endif +#if CUDNN_VERSION >= 5000 + CHECK_CUDNN_STATUS(cudnnCreateActivationDescriptor(&mReLUDesc)); + CHECK_CUDNN_STATUS( + cudnnSetActivationDescriptor(mReLUDesc, + CUDNN_ACTIVATION_RELU, + CUDNN_NOT_PROPAGATE_NAN, + 0.0)); +#else + mReLUDesc = CUDNN_ACTIVATION_RELU; +#endif } // Do the actual backward computation @@ -87,37 +98,44 @@ void Aidge::ReLUImpl_cuda::backward() { // excepted when the convolution is performed in double precision. 
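    // Note on the cudnnActivationBackward call issued by backward_<T> below:
    // its tensor arguments are, in order, y (the forward output), dy (the
    // gradient with respect to the output), x (the forward input) and dx (the
    // gradient written to input #0's grad tensor), each given as a cuDNN
    // tensor descriptor followed by a raw device pointer, with alpha/beta
    // blending the result into dx.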
if (op.getInput(0)->grad()->dataType() == DataType::Float64) { backward_<double>(output_grad); - } - else { + } else { backward_<float>(output_grad); } } template <class T> -void Aidge::ReLUImpl_cuda::backward_(const Tensor& output_grad) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +void Aidge::ReLUImpl_cuda::backward_(const Tensor &output_grad) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; - CHECK_CUDNN_STATUS( - cudnnActivationBackward(CudaContext::cudnnHandle(), - mReLUDesc, - &alpha, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), - std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr(), - std::dynamic_pointer_cast<TensorImpl_cuda_>(output_grad.getImpl())->getCudnnTensorDesc(output_grad), - output_grad.getImpl()->rawPtr(), - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->getImpl())->getCudnnTensorDesc(*op.getInput(0)), - std::static_pointer_cast<Tensor>(op.getRawInput(0))->getImpl()->rawPtr(), - &beta, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->grad()->getImpl())->getCudnnTensorDesc(*op.getInput(0)->grad()), - op.getInput(0)->grad()->getImpl()->rawPtr())); + CHECK_CUDNN_STATUS(cudnnActivationBackward( + CudaContext::cudnnHandle(), + mReLUDesc, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), + std::static_pointer_cast<Tensor>(op.getRawOutput(0)) + ->getImpl() + ->rawPtr(), + std::dynamic_pointer_cast<TensorImpl_cuda_>(output_grad.getImpl()) + ->getCudnnTensorDesc(output_grad), + output_grad.getImpl()->rawPtr(), + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getInput(0)), + std::static_pointer_cast<Tensor>(op.getRawInput(0)) + ->getImpl() + ->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>( + op.getInput(0)->grad()->getImpl()) + ->getCudnnTensorDesc(*op.getInput(0)->grad()), + op.getInput(0)->grad()->getImpl()->rawPtr())); } Aidge::ReLUImpl_cuda::~ReLUImpl_cuda() { if (mReLUDesc != nullptr) { - #if CUDNN_VERSION >= 5000 - cudnnDestroyActivationDescriptor(mReLUDesc); - #endif +#if CUDNN_VERSION >= 5000 + cudnnDestroyActivationDescriptor(mReLUDesc); +#endif } } - diff --git a/src/operator/ReduceMeanImpl.cpp b/src/operator/ReduceMeanImpl.cpp index ff83ea5153a95e109ce7ef83c42ed4d672561ad1..9f3cbc570e765e18120332a0eba25e0366a2aae7 100644 --- a/src/operator/ReduceMeanImpl.cpp +++ b/src/operator/ReduceMeanImpl.cpp @@ -15,93 +15,114 @@ #include <vector> #include "aidge/backend/cuda/data/TensorImpl.hpp" -#include "aidge/backend/cuda/operator/ReduceMeanImpl.hpp" #include "aidge/backend/cuda/operator/ReduceImpl_CUDA_kernels.hpp" +#include "aidge/backend/cuda/operator/ReduceMeanImpl.hpp" #include "aidge/backend/cuda/utils/CudaContext.hpp" #include "aidge/backend/cuda/utils/CudaUtils.hpp" #include "aidge/operator/ReduceMean.hpp" #include "aidge/utils/Types.h" void Aidge::ReduceMeanImpl_cuda::forward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); AIDGE_ASSERT(op.getInput(0), "missing input in ReduceMean operator"); - AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run ReduceMean forward because the input has no implementation."); + AIDGE_ASSERT(op.getInput(0)->hasImpl(), 
+ "cannot run ReduceMean forward because the input has no " + "implementation."); - const auto& input = std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->refCastFrom(mInputFallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); + const auto &input = + std::static_pointer_cast<Tensor>(mOp.getRawInput(0)) + ->refCastFrom( + mInputFallback, + *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); - const ReduceMean_Op& rmOp = static_cast<const ReduceMean_Op&>(mOp); + const ReduceMean_Op &rmOp = static_cast<const ReduceMean_Op &>(mOp); bool keepDims = rmOp.keepDims(); - auto axes = rmOp.axes(); + auto axes = rmOp.axes(); if (axes.empty()) { - input.getImpl()->copy(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->rawPtr(), input.size()); - } - else { - switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { - case DataType::Float64: - forward_<double>(input, axes, keepDims); - break; - case DataType::Float32: - forward_<float>(input, axes, keepDims); - break; - case DataType::Float16: - forward_<half>(input, axes, keepDims); - break; - default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + input.getImpl()->copy( + std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)) + ->getImpl() + ->rawPtr(), + input.size()); + } else { + switch (std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)) + ->dataType()) { + case DataType::Float64: + forward_<double>(input, axes, keepDims); + break; + case DataType::Float32: + forward_<float>(input, axes, keepDims); + break; + case DataType::Float16: + forward_<half>(input, axes, keepDims); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, + "Data type is not supported by Backend Cuda"); } } } - template <class T> -void Aidge::ReduceMeanImpl_cuda::forward_(const Tensor& input, const std::vector<int>& axes, bool keepDims) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +void Aidge::ReduceMeanImpl_cuda::forward_(const Tensor &input, + const std::vector<int> &axes, + bool keepDims) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; cudnnReduceTensorDescriptor_t reduceDesc; cudnnTensorDescriptor_t outputDesc; if (keepDims) { - outputDesc = std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)); + outputDesc = std::dynamic_pointer_cast<TensorImpl_cuda_>( + op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)); CHECK_CUDNN_STATUS(cudnnCreateReduceTensorDescriptor(&reduceDesc)); - CHECK_CUDNN_STATUS(cudnnSetReduceTensorDescriptor(reduceDesc, - CUDNN_REDUCE_TENSOR_AVG, - CudaContext::data_type<T>::value, - CUDNN_PROPAGATE_NAN, - CUDNN_REDUCE_TENSOR_NO_INDICES, - CUDNN_32BIT_INDICES)); - + CHECK_CUDNN_STATUS( + cudnnSetReduceTensorDescriptor(reduceDesc, + CUDNN_REDUCE_TENSOR_AVG, + CudaContext::data_type<T>::value, + CUDNN_PROPAGATE_NAN, + CUDNN_REDUCE_TENSOR_NO_INDICES, + CUDNN_32BIT_INDICES)); size_t workspaceSize; - CHECK_CUDNN_STATUS(cudnnGetReductionWorkspaceSize(CudaContext::cudnnHandle(), - reduceDesc, - std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), - outputDesc, - &workspaceSize)); + CHECK_CUDNN_STATUS(cudnnGetReductionWorkspaceSize( + CudaContext::cudnnHandle(), + reduceDesc, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl()) + ->getCudnnTensorDesc(input), + 
outputDesc, + &workspaceSize)); void *d_workspace; CHECK_CUDA_STATUS(cudaMalloc(&d_workspace, workspaceSize)); - CHECK_CUDNN_STATUS(cudnnReduceTensor(CudaContext::cudnnHandle(), - reduceDesc, - NULL, - 0, - d_workspace, - workspaceSize, - &alpha, - std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), - input.getImpl()->rawPtr(), - &beta, - outputDesc, - std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr())); + CHECK_CUDNN_STATUS(cudnnReduceTensor( + CudaContext::cudnnHandle(), + reduceDesc, + NULL, + 0, + d_workspace, + workspaceSize, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl()) + ->getCudnnTensorDesc(input), + input.getImpl()->rawPtr(), + &beta, + outputDesc, + std::static_pointer_cast<Tensor>(op.getRawOutput(0)) + ->getImpl() + ->rawPtr())); CHECK_CUDNN_STATUS(cudnnDestroyReduceTensorDescriptor(reduceDesc)); - } - else { + } else { CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&outputDesc)); std::vector<int> outputDims; - std::copy(input.dims().begin(), input.dims().end(), std::back_inserter(outputDims)); - for (const auto axis:axes) { + std::copy(input.dims().begin(), + input.dims().end(), + std::back_inserter(outputDims)); + for (const auto axis : axes) { outputDims[axis] = 1; } if (outputDims.size() < 4) { @@ -114,39 +135,50 @@ void Aidge::ReduceMeanImpl_cuda::forward_(const Tensor& input, const std::vector outputStrides[i - 1] = product; product *= outputDims[i - 1]; } - CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(outputDesc, CudaContext::data_type<T>::value, outputDims.size(), outputDims.data(), outputStrides.data())); - - CHECK_CUDNN_STATUS(cudnnCreateReduceTensorDescriptor(&reduceDesc)); - CHECK_CUDNN_STATUS(cudnnSetReduceTensorDescriptor(reduceDesc, - CUDNN_REDUCE_TENSOR_AVG, - CudaContext::data_type<T>::value, - CUDNN_PROPAGATE_NAN, - CUDNN_REDUCE_TENSOR_NO_INDICES, - CUDNN_32BIT_INDICES)); + CHECK_CUDNN_STATUS( + cudnnSetTensorNdDescriptor(outputDesc, + CudaContext::data_type<T>::value, + outputDims.size(), + outputDims.data(), + outputStrides.data())); + CHECK_CUDNN_STATUS(cudnnCreateReduceTensorDescriptor(&reduceDesc)); + CHECK_CUDNN_STATUS( + cudnnSetReduceTensorDescriptor(reduceDesc, + CUDNN_REDUCE_TENSOR_AVG, + CudaContext::data_type<T>::value, + CUDNN_PROPAGATE_NAN, + CUDNN_REDUCE_TENSOR_NO_INDICES, + CUDNN_32BIT_INDICES)); size_t workspaceSize; - CHECK_CUDNN_STATUS(cudnnGetReductionWorkspaceSize(CudaContext::cudnnHandle(), - reduceDesc, - std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), - outputDesc, - &workspaceSize)); + CHECK_CUDNN_STATUS(cudnnGetReductionWorkspaceSize( + CudaContext::cudnnHandle(), + reduceDesc, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl()) + ->getCudnnTensorDesc(input), + outputDesc, + &workspaceSize)); void *d_workspace; CHECK_CUDA_STATUS(cudaMalloc(&d_workspace, workspaceSize)); - CHECK_CUDNN_STATUS(cudnnReduceTensor(CudaContext::cudnnHandle(), - reduceDesc, - NULL, - 0, - d_workspace, - workspaceSize, - &alpha, - std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), - input.getImpl()->rawPtr(), - &beta, - outputDesc, - std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr())); + CHECK_CUDNN_STATUS(cudnnReduceTensor( + CudaContext::cudnnHandle(), + reduceDesc, + NULL, + 0, + d_workspace, + workspaceSize, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl()) + ->getCudnnTensorDesc(input), + input.getImpl()->rawPtr(), + &beta, + 
outputDesc, + std::static_pointer_cast<Tensor>(op.getRawOutput(0)) + ->getImpl() + ->rawPtr())); CHECK_CUDNN_STATUS(cudnnDestroyReduceTensorDescriptor(reduceDesc)); CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(outputDesc)); @@ -154,47 +186,57 @@ void Aidge::ReduceMeanImpl_cuda::forward_(const Tensor& input, const std::vector } void Aidge::ReduceMeanImpl_cuda::backward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); - AIDGE_ASSERT(op.getOutput(0)->grad(), "missing outputGrad in ReduceMean operator"); - AIDGE_ASSERT(op.getOutput(0)->grad()->hasImpl(), "cannot run ReduceMean backward because the output grad has no implementation."); - - const auto& outGrad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getInput(0)->grad()); - - const ReduceMean_Op& rmOp = static_cast<const ReduceMean_Op&>(mOp); - auto axes = rmOp.axes(); - switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { - case DataType::Float64: - backward_<double>(outGrad, axes); - break; - case DataType::Float32: - backward_<float>(outGrad, axes); - break; - case DataType::Float16: - backward_<half>(outGrad, axes); - break; - default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); + AIDGE_ASSERT(op.getOutput(0)->grad(), + "missing outputGrad in ReduceMean operator"); + AIDGE_ASSERT(op.getOutput(0)->grad()->hasImpl(), + "cannot run ReduceMean backward because the output grad has " + "no implementation."); + + const auto &outGrad = + op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, + *op.getInput(0)->grad()); + + const ReduceMean_Op &rmOp = static_cast<const ReduceMean_Op &>(mOp); + auto axes = rmOp.axes(); + switch ( + std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + backward_<double>(outGrad, axes); + break; + case DataType::Float32: + backward_<float>(outGrad, axes); + break; + case DataType::Float16: + backward_<half>(outGrad, axes); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, + "Data type is not supported by Backend Cuda"); } } template <class T> -void Aidge::ReduceMeanImpl_cuda::backward_(const Tensor& outGrad, const std::vector<int>& axes) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +void Aidge::ReduceMeanImpl_cuda::backward_(const Tensor &outGrad, + const std::vector<int> &axes) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); // const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; // const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; - const T * outputGrad = static_cast<const T*>(op.getOutput(0)->grad()->getImpl()->rawPtr()); - T * inputGrad = static_cast<T*>(op.getInput(0)->grad()->getImpl()->rawPtr()); + const T *outputGrad = + static_cast<const T *>(op.getOutput(0)->grad()->getImpl()->rawPtr()); + T *inputGrad = + static_cast<T *>(op.getInput(0)->grad()->getImpl()->rawPtr()); std::vector<std::size_t> factors; - for (auto axis:axes) { + for (auto axis : axes) { factors.push_back(op.getInput(0)->grad()->dims()[axis]); } - + Aidge::ReduceBackward(outputGrad, - inputGrad, - outGrad.dims(), - op.getInput(0)->grad()->dims(), - axes, - factors, - static_cast<int>(op.getInput(0)->grad()->size())); + inputGrad, + outGrad.dims(), + op.getInput(0)->grad()->dims(), + axes, + factors, + static_cast<int>(op.getInput(0)->grad()->size())); } diff --git a/src/operator/ReduceSumImpl.cpp 
b/src/operator/ReduceSumImpl.cpp index 895584d87dab88f3f71a424a02a3b32954c4dc43..9bd6d839d209874704feb4830b3e73704f795e4d 100644 --- a/src/operator/ReduceSumImpl.cpp +++ b/src/operator/ReduceSumImpl.cpp @@ -15,93 +15,114 @@ #include <vector> #include "aidge/backend/cuda/data/TensorImpl.hpp" -#include "aidge/backend/cuda/operator/ReduceSumImpl.hpp" #include "aidge/backend/cuda/operator/ReduceImpl_CUDA_kernels.hpp" +#include "aidge/backend/cuda/operator/ReduceSumImpl.hpp" #include "aidge/backend/cuda/utils/CudaContext.hpp" #include "aidge/backend/cuda/utils/CudaUtils.hpp" #include "aidge/operator/ReduceSum.hpp" #include "aidge/utils/Types.h" void Aidge::ReduceSumImpl_cuda::forward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); AIDGE_ASSERT(op.getInput(0), "missing input in ReduceSum operator"); - AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run ReduceSum forward because the input has no implementation."); + AIDGE_ASSERT(op.getInput(0)->hasImpl(), + "cannot run ReduceSum forward because the input has no " + "implementation."); - const auto& input = std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->refCastFrom(mInputFallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); + const auto &input = + std::static_pointer_cast<Tensor>(mOp.getRawInput(0)) + ->refCastFrom( + mInputFallback, + *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); - const ReduceSum_Op& rsOp = static_cast<const ReduceSum_Op&>(mOp); + const ReduceSum_Op &rsOp = static_cast<const ReduceSum_Op &>(mOp); bool keepDims = rsOp.keepDims(); - auto axes = rsOp.axes(); + auto axes = rsOp.axes(); if (axes.empty()) { - input.getImpl()->copy(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->rawPtr(), input.size()); - } - else { - switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { - case DataType::Float64: - forward_<double>(input, axes, keepDims); - break; - case DataType::Float32: - forward_<float>(input, axes, keepDims); - break; - case DataType::Float16: - forward_<half>(input, axes, keepDims); - break; - default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + input.getImpl()->copy( + std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)) + ->getImpl() + ->rawPtr(), + input.size()); + } else { + switch (std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)) + ->dataType()) { + case DataType::Float64: + forward_<double>(input, axes, keepDims); + break; + case DataType::Float32: + forward_<float>(input, axes, keepDims); + break; + case DataType::Float16: + forward_<half>(input, axes, keepDims); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, + "Data type is not supported by Backend Cuda"); } } } - template <class T> -void Aidge::ReduceSumImpl_cuda::forward_(const Tensor& input, const std::vector<int>& axes, bool keepDims) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +void Aidge::ReduceSumImpl_cuda::forward_(const Tensor &input, + const std::vector<int> &axes, + bool keepDims) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; cudnnReduceTensorDescriptor_t reduceDesc; cudnnTensorDescriptor_t outputDesc; if (keepDims) { - outputDesc = std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)); + 
outputDesc = std::dynamic_pointer_cast<TensorImpl_cuda_>( + op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)); CHECK_CUDNN_STATUS(cudnnCreateReduceTensorDescriptor(&reduceDesc)); - CHECK_CUDNN_STATUS(cudnnSetReduceTensorDescriptor(reduceDesc, - CUDNN_REDUCE_TENSOR_ADD, - CudaContext::data_type<T>::value, - CUDNN_PROPAGATE_NAN, - CUDNN_REDUCE_TENSOR_NO_INDICES, - CUDNN_32BIT_INDICES)); - + CHECK_CUDNN_STATUS( + cudnnSetReduceTensorDescriptor(reduceDesc, + CUDNN_REDUCE_TENSOR_ADD, + CudaContext::data_type<T>::value, + CUDNN_PROPAGATE_NAN, + CUDNN_REDUCE_TENSOR_NO_INDICES, + CUDNN_32BIT_INDICES)); size_t workspaceSize; - CHECK_CUDNN_STATUS(cudnnGetReductionWorkspaceSize(CudaContext::cudnnHandle(), - reduceDesc, - std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), - outputDesc, - &workspaceSize)); + CHECK_CUDNN_STATUS(cudnnGetReductionWorkspaceSize( + CudaContext::cudnnHandle(), + reduceDesc, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl()) + ->getCudnnTensorDesc(input), + outputDesc, + &workspaceSize)); void *d_workspace; CHECK_CUDA_STATUS(cudaMalloc(&d_workspace, workspaceSize)); - CHECK_CUDNN_STATUS(cudnnReduceTensor(CudaContext::cudnnHandle(), - reduceDesc, - NULL, - 0, - d_workspace, - workspaceSize, - &alpha, - std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), - input.getImpl()->rawPtr(), - &beta, - outputDesc, - std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr())); + CHECK_CUDNN_STATUS(cudnnReduceTensor( + CudaContext::cudnnHandle(), + reduceDesc, + NULL, + 0, + d_workspace, + workspaceSize, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl()) + ->getCudnnTensorDesc(input), + input.getImpl()->rawPtr(), + &beta, + outputDesc, + std::static_pointer_cast<Tensor>(op.getRawOutput(0)) + ->getImpl() + ->rawPtr())); CHECK_CUDNN_STATUS(cudnnDestroyReduceTensorDescriptor(reduceDesc)); - } - else { + } else { CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&outputDesc)); std::vector<int> outputDims; - std::copy(input.dims().begin(), input.dims().end(), std::back_inserter(outputDims)); - for (const auto axis:axes) { + std::copy(input.dims().begin(), + input.dims().end(), + std::back_inserter(outputDims)); + for (const auto axis : axes) { outputDims[axis] = 1; } if (outputDims.size() < 4) { @@ -114,39 +135,50 @@ void Aidge::ReduceSumImpl_cuda::forward_(const Tensor& input, const std::vector< outputStrides[i - 1] = product; product *= outputDims[i - 1]; } - CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(outputDesc, CudaContext::data_type<T>::value, outputDims.size(), outputDims.data(), outputStrides.data())); - - CHECK_CUDNN_STATUS(cudnnCreateReduceTensorDescriptor(&reduceDesc)); - CHECK_CUDNN_STATUS(cudnnSetReduceTensorDescriptor(reduceDesc, - CUDNN_REDUCE_TENSOR_ADD, - CudaContext::data_type<T>::value, - CUDNN_PROPAGATE_NAN, - CUDNN_REDUCE_TENSOR_NO_INDICES, - CUDNN_32BIT_INDICES)); + CHECK_CUDNN_STATUS( + cudnnSetTensorNdDescriptor(outputDesc, + CudaContext::data_type<T>::value, + outputDims.size(), + outputDims.data(), + outputStrides.data())); + CHECK_CUDNN_STATUS(cudnnCreateReduceTensorDescriptor(&reduceDesc)); + CHECK_CUDNN_STATUS( + cudnnSetReduceTensorDescriptor(reduceDesc, + CUDNN_REDUCE_TENSOR_ADD, + CudaContext::data_type<T>::value, + CUDNN_PROPAGATE_NAN, + CUDNN_REDUCE_TENSOR_NO_INDICES, + CUDNN_32BIT_INDICES)); size_t workspaceSize; - CHECK_CUDNN_STATUS(cudnnGetReductionWorkspaceSize(CudaContext::cudnnHandle(), - reduceDesc, - 
std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), - outputDesc, - &workspaceSize)); + CHECK_CUDNN_STATUS(cudnnGetReductionWorkspaceSize( + CudaContext::cudnnHandle(), + reduceDesc, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl()) + ->getCudnnTensorDesc(input), + outputDesc, + &workspaceSize)); void *d_workspace; CHECK_CUDA_STATUS(cudaMalloc(&d_workspace, workspaceSize)); - CHECK_CUDNN_STATUS(cudnnReduceTensor(CudaContext::cudnnHandle(), - reduceDesc, - NULL, - 0, - d_workspace, - workspaceSize, - &alpha, - std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), - input.getImpl()->rawPtr(), - &beta, - outputDesc, - std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr())); + CHECK_CUDNN_STATUS(cudnnReduceTensor( + CudaContext::cudnnHandle(), + reduceDesc, + NULL, + 0, + d_workspace, + workspaceSize, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl()) + ->getCudnnTensorDesc(input), + input.getImpl()->rawPtr(), + &beta, + outputDesc, + std::static_pointer_cast<Tensor>(op.getRawOutput(0)) + ->getImpl() + ->rawPtr())); CHECK_CUDNN_STATUS(cudnnDestroyReduceTensorDescriptor(reduceDesc)); CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(outputDesc)); @@ -154,46 +186,56 @@ void Aidge::ReduceSumImpl_cuda::forward_(const Tensor& input, const std::vector< } void Aidge::ReduceSumImpl_cuda::backward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); - AIDGE_ASSERT(op.getOutput(0)->grad(), "missing outputGrad in ReduceSum operator"); - AIDGE_ASSERT(op.getOutput(0)->grad()->hasImpl(), "cannot run ReduceSum backward because the output grad has no implementation."); - - const auto& outGrad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getInput(0)->grad()); - - const ReduceSum_Op& rmOp = static_cast<const ReduceSum_Op&>(mOp); - auto axes = rmOp.axes(); - switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { - case DataType::Float64: - backward_<double>(outGrad, axes); - break; - case DataType::Float32: - backward_<float>(outGrad, axes); - break; - case DataType::Float16: - backward_<half>(outGrad, axes); - break; - default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); + AIDGE_ASSERT(op.getOutput(0)->grad(), + "missing outputGrad in ReduceSum operator"); + AIDGE_ASSERT(op.getOutput(0)->grad()->hasImpl(), + "cannot run ReduceSum backward because the output grad has " + "no implementation."); + + const auto &outGrad = + op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, + *op.getInput(0)->grad()); + + const ReduceSum_Op &rmOp = static_cast<const ReduceSum_Op &>(mOp); + auto axes = rmOp.axes(); + switch ( + std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + backward_<double>(outGrad, axes); + break; + case DataType::Float32: + backward_<float>(outGrad, axes); + break; + case DataType::Float16: + backward_<half>(outGrad, axes); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, + "Data type is not supported by Backend Cuda"); } } template <class T> -void Aidge::ReduceSumImpl_cuda::backward_(const Tensor& outGrad, const std::vector<int>& axes) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +void Aidge::ReduceSumImpl_cuda::backward_(const Tensor &outGrad, + const std::vector<int> &axes) { + const OperatorTensor &op = 
static_cast<const OperatorTensor &>(mOp); - const T * outputGrad = static_cast<const T*>(op.getOutput(0)->grad()->getImpl()->rawPtr()); - T * inputGrad = static_cast<T*>(op.getInput(0)->grad()->getImpl()->rawPtr()); + const T *outputGrad = + static_cast<const T *>(op.getOutput(0)->grad()->getImpl()->rawPtr()); + T *inputGrad = + static_cast<T *>(op.getInput(0)->grad()->getImpl()->rawPtr()); std::vector<std::size_t> factors; - for (auto axis:axes) { + for (auto axis : axes) { factors.push_back(op.getInput(0)->grad()->dims()[axis]); } - + Aidge::ReduceBackward(outputGrad, - inputGrad, - outGrad.dims(), - op.getInput(0)->grad()->dims(), - axes, - factors, - static_cast<int>(op.getInput(0)->grad()->size())); + inputGrad, + outGrad.dims(), + op.getInput(0)->grad()->dims(), + axes, + factors, + static_cast<int>(op.getInput(0)->grad()->size())); } diff --git a/src/operator/ReshapeImpl.cpp b/src/operator/ReshapeImpl.cpp index 783e244057b0fc42a782fd363c3a99aa6d73b46b..159550fff7b23448bcee7c0f0ecf14412d6910d4 100644 --- a/src/operator/ReshapeImpl.cpp +++ b/src/operator/ReshapeImpl.cpp @@ -22,20 +22,29 @@ #include "aidge/utils/Types.h" void Aidge::ReshapeImpl_cuda::forward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); // FIXME: uncomment the following code once memory handling will work assert(mOp.getRawInput(0) && "missing input #0"); - const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); + const auto &input = + op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); - std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))-> getImpl() -> setRawPtr(input.getImpl()->rawPtr(), input.getImpl()->size()); + std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)) + ->getImpl() + ->setRawPtr(input.getImpl()->rawPtr(), input.getImpl()->size()); } void Aidge::ReshapeImpl_cuda::backward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); AIDGE_ASSERT(op.getOutput(0)->grad(), "missing output grad #0"); - const auto& output_grad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getOutput(0)->grad()); + const auto &output_grad = + op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, + *op.getOutput(0)->grad()); - std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->grad() -> getImpl() -> setRawPtr(output_grad.getImpl()->rawPtr(), output_grad.getImpl()->size()); + std::static_pointer_cast<Tensor>(mOp.getRawInput(0)) + ->grad() + ->getImpl() + ->setRawPtr(output_grad.getImpl()->rawPtr(), + output_grad.getImpl()->size()); } diff --git a/src/operator/ShiftGELUImpl.cpp b/src/operator/ShiftGELUImpl.cpp index c2774804d04a422aefd0c66ed0d1fc1d949b1f06..4e4329af7cabecb60db2b776812b0fd516c90939 100644 --- a/src/operator/ShiftGELUImpl.cpp +++ b/src/operator/ShiftGELUImpl.cpp @@ -11,14 +11,14 @@ * ********************************************************************************/ +#include <algorithm> // For std::max #include <cassert> -#include <chrono> // std::chrono::milliseconds -#include <numeric> // std::accumulate -#include <thread> // std::this_thread::sleep_for -#include <vector> -#include <algorithm> // For std::max -#include <cmath> // For pow +#include <chrono> // std::chrono::milliseconds +#include <cmath> // For pow +#include <numeric> // std::accumulate +#include <thread> // std::this_thread::sleep_for #include <typeinfo> +#include <vector> #include 
"aidge/backend/cuda/data/TensorImpl.hpp" #include "aidge/backend/cuda/operator/ShiftGELUImpl.hpp" @@ -30,42 +30,48 @@ void Aidge::ShiftGELUImpl_cuda::forward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); assert(mOp.getRawInput(0) && "missing input #0"); - const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); - - switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { - case DataType::Float64: - forward_<double>(input); - break; - case DataType::Float32: - forward_<float>(input); - break; - default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + const auto &input = + op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); + + switch ( + std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + forward_<double>(input); + break; + case DataType::Float32: + forward_<float>(input); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, + "Data type is not supported by Backend Cuda"); } } -template<class T> -void Aidge::ShiftGELUImpl_cuda::forward_(const Tensor& input) -{ - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); - const T * input_raw = static_cast<const T*>(input.getImpl()->rawPtr()); - T * output = static_cast<T*>(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->rawPtr()); +template <class T> +void Aidge::ShiftGELUImpl_cuda::forward_(const Tensor &input) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); + const T *input_raw = static_cast<const T *>(input.getImpl()->rawPtr()); + T *output = + static_cast<T *>(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)) + ->getImpl() + ->rawPtr()); int N = 15; int output_bits = 8; size_t size = input.size(); std::vector<DimSize_t> dims_input = input.dims(); - // maybe find a most efficient way to compute scaling factor (a max and min function could help to retrieve scaling factor value) + // maybe find a most efficient way to compute scaling factor (a max and min + // function could help to retrieve scaling factor value) double min = std::numeric_limits<double>::max(); double max = std::numeric_limits<double>::min(); - for(std::size_t i = 0; i < dims_input[0]; i++) { - for(std::size_t j = 0; j < dims_input[1]; j++) { - for(std::size_t k = 0; k < dims_input[2]; k++) { - for(std::size_t l = 0; l < dims_input[3]; l++) { + for (std::size_t i = 0; i < dims_input[0]; i++) { + for (std::size_t j = 0; j < dims_input[1]; j++) { + for (std::size_t k = 0; k < dims_input[2]; k++) { + for (std::size_t l = 0; l < dims_input[3]; l++) { std::vector<std::size_t> coordIdx = {i, j, k, l}; std::size_t newFlatIdx = input.getIdx(coordIdx); if (newFlatIdx < min) { @@ -74,46 +80,57 @@ void Aidge::ShiftGELUImpl_cuda::forward_(const Tensor& input) if (newFlatIdx > max) { max = newFlatIdx; } - } - } + } + } } } double m = std::max(std::abs(min), std::abs(max)); - double normalization_factor = static_cast<double>(1 << (output_bits - 1)) - 1; - double scaling_factor = m / normalization_factor; - - // The new scaling factor that we can use to dequantify the returned tensor (not used here) - // double new_SF = 1/std::pow(2,2*output_bits-1); - - ShiftGELUforward(input_raw, output, scaling_factor,N, output_bits, size, dims_input); + double normalization_factor = + static_cast<double>(1 << (output_bits - 1)) - 1; + double scaling_factor = m / normalization_factor; + + // The 
new scaling factor that we can use to dequantify the returned tensor + // (not used here) double new_SF = 1/std::pow(2,2*output_bits-1); + + ShiftGELUforward(input_raw, + output, + scaling_factor, + N, + output_bits, + size, + dims_input); } void Aidge::ShiftGELUImpl_cuda::backward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); assert(op.getOutput(0)->grad() && "missing output #0"); - const auto& output_grad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getOutput(0)->grad()); + const auto &output_grad = + op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, + *op.getOutput(0)->grad()); if (op.getInput(0)->grad()->dataType() == DataType::Float64) { backward_<double>(output_grad); - } - else { + } else { backward_<float>(output_grad); } } template <class T> -void Aidge::ShiftGELUImpl_cuda::backward_(const Tensor& output_grad) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); - const T * input = static_cast<const T*>(std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr()); +void Aidge::ShiftGELUImpl_cuda::backward_(const Tensor &output_grad) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); + const T *input = static_cast<const T *>( + std::static_pointer_cast<Tensor>(op.getRawOutput(0)) + ->getImpl() + ->rawPtr()); size_t size = output_grad.size(); - T * output = static_cast<T*>(op.getInput(0)->grad()->getImpl()->rawPtr()); + T *output = static_cast<T *>(op.getInput(0)->grad()->getImpl()->rawPtr()); - const T * output_grad_raw = static_cast<const T*>(output_grad.getImpl()->rawPtr()); + const T *output_grad_raw = + static_cast<const T *>(output_grad.getImpl()->rawPtr()); ShiftGELUbackward(input, output_grad_raw, output, size); - } \ No newline at end of file diff --git a/src/operator/ShiftMaxImpl.cpp b/src/operator/ShiftMaxImpl.cpp index 1134cc5d6b99e53eb492c82e32d811bc0bcba0e0..2abb85ef435b86fd7c016cc9ae6ed5c83c5cee51 100644 --- a/src/operator/ShiftMaxImpl.cpp +++ b/src/operator/ShiftMaxImpl.cpp @@ -11,14 +11,14 @@ * ********************************************************************************/ +#include <algorithm> // For std::max #include <cassert> -#include <chrono> // std::chrono::milliseconds -#include <numeric> // std::accumulate -#include <thread> // std::this_thread::sleep_for -#include <vector> -#include <algorithm> // For std::max -#include <cmath> // For pow +#include <chrono> // std::chrono::milliseconds +#include <cmath> // For pow +#include <numeric> // std::accumulate +#include <thread> // std::this_thread::sleep_for #include <typeinfo> +#include <vector> #include "aidge/backend/cuda/data/TensorImpl.hpp" #include "aidge/backend/cuda/operator/ShiftMaxImpl.hpp" @@ -30,42 +30,48 @@ void Aidge::ShiftMaxImpl_cuda::forward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); assert(mOp.getRawInput(0) && "missing input #0"); - const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); - - switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { - case DataType::Float64: - forward_<double>(input); - break; - case DataType::Float32: - forward_<float>(input); - break; - default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + const auto &input = + op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); + + switch ( + 
std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + forward_<double>(input); + break; + case DataType::Float32: + forward_<float>(input); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, + "Data type is not supported by Backend Cuda"); } } -template<class T> -void Aidge::ShiftMaxImpl_cuda::forward_(const Tensor& input) -{ - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); - const T * input_raw = static_cast<const T*>(input.getImpl()->rawPtr()); - T * output = static_cast<T*>(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->rawPtr()); +template <class T> +void Aidge::ShiftMaxImpl_cuda::forward_(const Tensor &input) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); + const T *input_raw = static_cast<const T *>(input.getImpl()->rawPtr()); + T *output = + static_cast<T *>(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)) + ->getImpl() + ->rawPtr()); int N = 15; int output_bits = 8; size_t size = input.size(); std::vector<DimSize_t> dims_input = input.dims(); - // maybe find a most efficient way to compute scaling factor (a max and min function could help to retrieve scaling factor value) + // maybe find a most efficient way to compute scaling factor (a max and min + // function could help to retrieve scaling factor value) double min = std::numeric_limits<double>::max(); double max = std::numeric_limits<double>::min(); - for(std::size_t i = 0; i < dims_input[0]; i++) { - for(std::size_t j = 0; j < dims_input[1]; j++) { - for(std::size_t k = 0; k < dims_input[2]; k++) { - for(std::size_t l = 0; l < dims_input[3]; l++) { + for (std::size_t i = 0; i < dims_input[0]; i++) { + for (std::size_t j = 0; j < dims_input[1]; j++) { + for (std::size_t k = 0; k < dims_input[2]; k++) { + for (std::size_t l = 0; l < dims_input[3]; l++) { std::vector<std::size_t> coordIdx = {i, j, k, l}; std::size_t newFlatIdx = input.getIdx(coordIdx); if (newFlatIdx < min) { @@ -74,48 +80,63 @@ void Aidge::ShiftMaxImpl_cuda::forward_(const Tensor& input) if (newFlatIdx > max) { max = newFlatIdx; } - } - } + } + } } } double m = std::max(std::abs(min), std::abs(max)); - double normalization_factor = static_cast<double>(1 << (output_bits - 1)) - 1; - double scaling_factor = m / normalization_factor; - - // The new scaling factor that we can use to dequantify the returned tensor (not used here) - // double new_SF = 1/std::pow(2,2*output_bits-1); - - ShiftMaxforward(input_raw, output, scaling_factor,N, output_bits, size, dims_input); + double normalization_factor = + static_cast<double>(1 << (output_bits - 1)) - 1; + double scaling_factor = m / normalization_factor; + + // The new scaling factor that we can use to dequantify the returned tensor + // (not used here) double new_SF = 1/std::pow(2,2*output_bits-1); + + ShiftMaxforward(input_raw, + output, + scaling_factor, + N, + output_bits, + size, + dims_input); } - void Aidge::ShiftMaxImpl_cuda::backward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); assert(op.getOutput(0)->grad() && "missing output #0"); - const auto& output_grad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getOutput(0)->grad()); + const auto &output_grad = + op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, + *op.getOutput(0)->grad()); if (op.getInput(0)->grad()->dataType() == DataType::Float64) { backward_<double>(output_grad); - } - else { + } else { 
backward_<float>(output_grad); } } template <class T> -void Aidge::ShiftMaxImpl_cuda::backward_(const Tensor& output_grad) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); - const T * output_tensor = static_cast<const T*>(std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr()); +void Aidge::ShiftMaxImpl_cuda::backward_(const Tensor &output_grad) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); + const T *output_tensor = static_cast<const T *>( + std::static_pointer_cast<Tensor>(op.getRawOutput(0)) + ->getImpl() + ->rawPtr()); size_t size = output_grad.size(); std::vector<DimSize_t> dims_output = output_grad.dims(); - T * input_grad = static_cast<T*>(op.getInput(0)->grad()->getImpl()->rawPtr()); - - const T * output_grad_raw = static_cast<const T*>(output_grad.getImpl()->rawPtr()); - ShiftMaxbackward(output_tensor, output_grad_raw, input_grad, size, dims_output); + T *input_grad = + static_cast<T *>(op.getInput(0)->grad()->getImpl()->rawPtr()); + const T *output_grad_raw = + static_cast<const T *>(output_grad.getImpl()->rawPtr()); + ShiftMaxbackward(output_tensor, + output_grad_raw, + input_grad, + size, + dims_output); } \ No newline at end of file diff --git a/src/operator/SigmoidImpl.cpp b/src/operator/SigmoidImpl.cpp index 386cd9d821b3019cf8f0de2cc757ae514446f1a6..348d64076bab8d459a3920b35ea1bb750234722e 100644 --- a/src/operator/SigmoidImpl.cpp +++ b/src/operator/SigmoidImpl.cpp @@ -20,21 +20,25 @@ #include "aidge/utils/Types.h" void Aidge::SigmoidImpl_cuda::forward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); assert(mOp.getRawInput(0) && "missing input #0"); - const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); + const auto &input = + op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); // Lazy-initialize CuDNN Sigmoid descriptor if (mSigmoidDesc == nullptr) { - #if CUDNN_VERSION >= 5000 - CHECK_CUDNN_STATUS(cudnnCreateActivationDescriptor(&mSigmoidDesc)); - CHECK_CUDNN_STATUS(cudnnSetActivationDescriptor( - mSigmoidDesc, CUDNN_ACTIVATION_SIGMOID, CUDNN_NOT_PROPAGATE_NAN, 0.0)); - #else - mSigmoidDesc = CUDNN_ACTIVATION_SIGMOID; - #endif +#if CUDNN_VERSION >= 5000 + CHECK_CUDNN_STATUS(cudnnCreateActivationDescriptor(&mSigmoidDesc)); + CHECK_CUDNN_STATUS( + cudnnSetActivationDescriptor(mSigmoidDesc, + CUDNN_ACTIVATION_SIGMOID, + CUDNN_NOT_PROPAGATE_NAN, + 0.0)); +#else + mSigmoidDesc = CUDNN_ACTIVATION_SIGMOID; +#endif } // Do the actual forward computation @@ -42,44 +46,52 @@ void Aidge::SigmoidImpl_cuda::forward() { // excepted when the convolution is performed in double precision. 
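    // The Sigmoid implementation follows the same structure as the ReLU one
    // earlier in this patch: a lazily created cuDNN activation descriptor
    // (configured with CUDNN_ACTIVATION_SIGMOID) and a dispatch on the output
    // data type to run the computation in double or float precision.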
if (op.getOutput(0)->dataType() == DataType::Float64) { forward_<double>(input); - } - else { + } else { forward_<float>(input); } } template <class T> -void Aidge::SigmoidImpl_cuda::forward_(const Tensor& input) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +void Aidge::SigmoidImpl_cuda::forward_(const Tensor &input) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; - CHECK_CUDNN_STATUS( - cudnnActivationForward(CudaContext::cudnnHandle(), - mSigmoidDesc, - &alpha, - std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), - input.getImpl()->rawPtr(), - &beta, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), - std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr())); + CHECK_CUDNN_STATUS(cudnnActivationForward( + CudaContext::cudnnHandle(), + mSigmoidDesc, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl()) + ->getCudnnTensorDesc(input), + input.getImpl()->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), + std::static_pointer_cast<Tensor>(op.getRawOutput(0)) + ->getImpl() + ->rawPtr())); } void Aidge::SigmoidImpl_cuda::backward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); assert(op.getOutput(0)->grad() && "missing output #0"); - const auto& output_grad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getOutput(0)->grad()); + const auto &output_grad = + op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, + *op.getOutput(0)->grad()); // Lazy-initialize CuDNN Sigmoid descriptor if (mSigmoidDesc == nullptr) { - #if CUDNN_VERSION >= 5000 - CHECK_CUDNN_STATUS(cudnnCreateActivationDescriptor(&mSigmoidDesc)); - CHECK_CUDNN_STATUS(cudnnSetActivationDescriptor( - mSigmoidDesc, CUDNN_ACTIVATION_SIGMOID, CUDNN_NOT_PROPAGATE_NAN, 0.0)); - #else - mSigmoidDesc = CUDNN_ACTIVATION_SIGMOID; - #endif +#if CUDNN_VERSION >= 5000 + CHECK_CUDNN_STATUS(cudnnCreateActivationDescriptor(&mSigmoidDesc)); + CHECK_CUDNN_STATUS( + cudnnSetActivationDescriptor(mSigmoidDesc, + CUDNN_ACTIVATION_SIGMOID, + CUDNN_NOT_PROPAGATE_NAN, + 0.0)); +#else + mSigmoidDesc = CUDNN_ACTIVATION_SIGMOID; +#endif } // Do the actual backward computation @@ -87,37 +99,44 @@ void Aidge::SigmoidImpl_cuda::backward() { // excepted when the convolution is performed in double precision. 
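// For reference, a condensed sketch of the cudnnReduceTensor workspace pattern
// used by the ReduceMean/ReduceSum hunks earlier in this patch and by the
// broadcast branch of Sub::backward further down: query the workspace size,
// allocate it, run the reduction, release it. The helper below is
// hypothetical; it assumes float data, descriptors already configured by the
// caller, and CUDNN_REDUCE_TENSOR_AVG (the sum/sub cases use
// CUDNN_REDUCE_TENSOR_ADD instead).
#include <cuda_runtime.h>
#include <cudnn.h>

static cudnnStatus_t reduceAvgSketch(cudnnHandle_t handle,
                                     cudnnTensorDescriptor_t inDesc,
                                     const float *in,
                                     cudnnTensorDescriptor_t outDesc,
                                     float *out) {
    cudnnReduceTensorDescriptor_t reduceDesc;
    cudnnStatus_t st = cudnnCreateReduceTensorDescriptor(&reduceDesc);
    if (st != CUDNN_STATUS_SUCCESS)
        return st;
    st = cudnnSetReduceTensorDescriptor(reduceDesc,
                                        CUDNN_REDUCE_TENSOR_AVG,
                                        CUDNN_DATA_FLOAT,
                                        CUDNN_PROPAGATE_NAN,
                                        CUDNN_REDUCE_TENSOR_NO_INDICES,
                                        CUDNN_32BIT_INDICES);

    // Ask cuDNN how much scratch memory this reduction needs.
    size_t workspaceSize = 0;
    if (st == CUDNN_STATUS_SUCCESS)
        st = cudnnGetReductionWorkspaceSize(handle, reduceDesc,
                                            inDesc, outDesc, &workspaceSize);

    void *workspace = nullptr;
    if (st == CUDNN_STATUS_SUCCESS)
        cudaMalloc(&workspace, workspaceSize); // allocation check omitted for brevity

    // out = alpha * reduce(in) + beta * out
    const float alpha = 1.0f, beta = 0.0f;
    if (st == CUDNN_STATUS_SUCCESS)
        st = cudnnReduceTensor(handle, reduceDesc,
                               nullptr, 0, // no indices requested
                               workspace, workspaceSize,
                               &alpha, inDesc, in,
                               &beta, outDesc, out);

    cudaFree(workspace); // release the temporary workspace
    cudnnDestroyReduceTensorDescriptor(reduceDesc);
    return st;
}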
if (op.getInput(0)->grad()->dataType() == DataType::Float64) { backward_<double>(output_grad); - } - else { + } else { backward_<float>(output_grad); } } template <class T> -void Aidge::SigmoidImpl_cuda::backward_(const Tensor& output_grad) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +void Aidge::SigmoidImpl_cuda::backward_(const Tensor &output_grad) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; - CHECK_CUDNN_STATUS( - cudnnActivationBackward(CudaContext::cudnnHandle(), - mSigmoidDesc, - &alpha, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), - std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr(), - std::dynamic_pointer_cast<TensorImpl_cuda_>(output_grad.getImpl())->getCudnnTensorDesc(output_grad), - output_grad.getImpl()->rawPtr(), - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->getImpl())->getCudnnTensorDesc(*op.getInput(0)), - std::static_pointer_cast<Tensor>(op.getRawInput(0))->getImpl()->rawPtr(), - &beta, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->grad()->getImpl())->getCudnnTensorDesc(*op.getInput(0)->grad()), - op.getInput(0)->grad()->getImpl()->rawPtr())); + CHECK_CUDNN_STATUS(cudnnActivationBackward( + CudaContext::cudnnHandle(), + mSigmoidDesc, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), + std::static_pointer_cast<Tensor>(op.getRawOutput(0)) + ->getImpl() + ->rawPtr(), + std::dynamic_pointer_cast<TensorImpl_cuda_>(output_grad.getImpl()) + ->getCudnnTensorDesc(output_grad), + output_grad.getImpl()->rawPtr(), + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getInput(0)), + std::static_pointer_cast<Tensor>(op.getRawInput(0)) + ->getImpl() + ->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>( + op.getInput(0)->grad()->getImpl()) + ->getCudnnTensorDesc(*op.getInput(0)->grad()), + op.getInput(0)->grad()->getImpl()->rawPtr())); } Aidge::SigmoidImpl_cuda::~SigmoidImpl_cuda() { if (mSigmoidDesc != nullptr) { - #if CUDNN_VERSION >= 5000 - cudnnDestroyActivationDescriptor(mSigmoidDesc); - #endif +#if CUDNN_VERSION >= 5000 + cudnnDestroyActivationDescriptor(mSigmoidDesc); +#endif } } - diff --git a/src/operator/SubImpl.cpp b/src/operator/SubImpl.cpp index a04a1c3018b0c9ba455d21ba563253eb3e004e10..e63b9c04f89cd7cf1919006106dfe97b1833b935 100644 --- a/src/operator/SubImpl.cpp +++ b/src/operator/SubImpl.cpp @@ -22,27 +22,39 @@ #include "aidge/utils/Types.h" void Aidge::SubImpl_cuda::forward() { - const Sub_Op& op = static_cast<const Sub_Op&>(mOp); + const Sub_Op &op = static_cast<const Sub_Op &>(mOp); // Check inputs AIDGE_ASSERT(op.getInput(0), "missing input in Sub operator"); - AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run Sub forward because the 0-th input has no implementation."); + AIDGE_ASSERT(op.getInput(0)->hasImpl(), + "cannot run Sub forward because the 0-th input has no " + "implementation."); DataType datatypeFirstInput = op.getInput(0)->dataType(); for (IOIndex_t i = 1; i < op.nbInputs(); ++i) { AIDGE_ASSERT(op.getInput(i), "missing input in Sub operator"); - AIDGE_ASSERT(op.getInput(i)->hasImpl(), "cannot run Sub forward because the {}-th input has no implementation.", i); - AIDGE_ASSERT(op.getInput(i)->dataType() == 
datatypeFirstInput, "Cannot add inputs with two differents data type."); + AIDGE_ASSERT(op.getInput(i)->hasImpl(), + "cannot run Sub forward because the {}-th input has no " + "implementation.", + i); + AIDGE_ASSERT(op.getInput(i)->dataType() == datatypeFirstInput, + "Cannot add inputs with two differents data type."); } std::vector<std::shared_ptr<Tensor>> inputFallbacks(op.nbInputs()); std::vector<Tensor> inputs(op.nbInputs()); std::vector<std::vector<int>> dims(op.nbInputs()); // For broadcasted dims - std::vector<std::vector<int>> strides(op.nbInputs()); // For the cooresponding strides + std::vector<std::vector<int>> strides( + op.nbInputs()); // For the cooresponding strides for (IOIndex_t i = 0; i < op.nbInputs(); ++i) { - inputs[i] = op.getInput(i)->refCastFrom(inputFallbacks[i], *op.getOutput(0)); + inputs[i] = + op.getInput(i)->refCastFrom(inputFallbacks[i], *op.getOutput(0)); // Get tensor dims and broadcast them - std::copy(inputs[i].dims().begin(), inputs[i].dims().end(), std::back_inserter(dims[i])); - dims[i].insert(dims[i].cbegin(), op.getOutput(0)->nbDims() - dims[i].size(), int(1)); + std::copy(inputs[i].dims().begin(), + inputs[i].dims().end(), + std::back_inserter(dims[i])); + dims[i].insert(dims[i].cbegin(), + op.getOutput(0)->nbDims() - dims[i].size(), + int(1)); if (dims[i].size() < 4) { dims[i].resize(4, 1); @@ -58,76 +70,106 @@ void Aidge::SubImpl_cuda::forward() { strides[i] = tensorStrides; } - switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { - case DataType::Float64: - forward_<double>(inputs, dims, strides); - break; - case DataType::Float32: - forward_<float>(inputs, dims, strides); - break; - case DataType::Float16: - forward_<half>(inputs, dims, strides); - break; - default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + switch ( + std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + forward_<double>(inputs, dims, strides); + break; + case DataType::Float32: + forward_<float>(inputs, dims, strides); + break; + case DataType::Float16: + forward_<half>(inputs, dims, strides); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, + "Data type is not supported by Backend Cuda"); } } template <class T> -void Aidge::SubImpl_cuda::forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +void Aidge::SubImpl_cuda::forward_( + const std::vector<Tensor> &inputs, + const std::vector<std::vector<int>> &inputsDims, + const std::vector<std::vector<int>> &inputsStrides) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; const typename Cuda::cudnn_scaling_type<T>::type gamma = -1.0f; // Create a Tensor descriptor with the broadcasted dims and strides cudnnTensorDescriptor_t tensorDesc; CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&tensorDesc)); - CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc, CudaContext::data_type<T>::value, inputsDims[0].size(), inputsDims[0].data(), inputsStrides[0].data())); - // Add first input to the output CHECK_CUDNN_STATUS( - cudnnAddTensor(CudaContext::cudnnHandle(), - &alpha, - tensorDesc, - inputs[0].getImpl()->rawPtr(), - &beta, - 
std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), - std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr()) - ); + cudnnSetTensorNdDescriptor(tensorDesc, + CudaContext::data_type<T>::value, + inputsDims[0].size(), + inputsDims[0].data(), + inputsStrides[0].data())); + // Add first input to the output + CHECK_CUDNN_STATUS(cudnnAddTensor( + CudaContext::cudnnHandle(), + &alpha, + tensorDesc, + inputs[0].getImpl()->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), + std::static_pointer_cast<Tensor>(op.getRawOutput(0)) + ->getImpl() + ->rawPtr())); // Substract other inputs if there are any - for (size_t i = 1; i < op.nbInputs(); ++i) - { - CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc, CudaContext::data_type<T>::value, inputsDims[i].size(), inputsDims[i].data(), inputsStrides[i].data())); + for (size_t i = 1; i < op.nbInputs(); ++i) { + CHECK_CUDNN_STATUS( + cudnnSetTensorNdDescriptor(tensorDesc, + CudaContext::data_type<T>::value, + inputsDims[i].size(), + inputsDims[i].data(), + inputsStrides[i].data())); CHECK_CUDNN_STATUS( cudnnAddTensor(CudaContext::cudnnHandle(), - &gamma, - tensorDesc, - inputs[i].getImpl()->rawPtr(), - &alpha, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), - std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr()) - ); + &gamma, + tensorDesc, + inputs[i].getImpl()->rawPtr(), + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>( + op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), + std::static_pointer_cast<Tensor>(op.getRawOutput(0)) + ->getImpl() + ->rawPtr())); } CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(tensorDesc)); } void Aidge::SubImpl_cuda::backward() { - const Sub_Op& op = static_cast<const Sub_Op&>(mOp); + const Sub_Op &op = static_cast<const Sub_Op &>(mOp); // Check output - AIDGE_ASSERT(op.getOutput(0)->grad(), "missing output gradient in Sub operator"); - AIDGE_ASSERT(op.getOutput(0)->grad()->hasImpl(), "cannot run Sub backward because the output gradient has no implementation."); + AIDGE_ASSERT(op.getOutput(0)->grad(), + "missing output gradient in Sub operator"); + AIDGE_ASSERT(op.getOutput(0)->grad()->hasImpl(), + "cannot run Sub backward because the output gradient has no " + "implementation."); std::shared_ptr<Tensor> outputGradFallback; - const auto& outputGrad = op.getOutput(0)->grad()->refCastFrom(outputGradFallback, *op.getOutput(0)->grad()); + const auto &outputGrad = + op.getOutput(0)->grad()->refCastFrom(outputGradFallback, + *op.getOutput(0)->grad()); std::vector<std::vector<int>> dims(op.nbInputs()); // For broadcasted dims - std::vector<std::vector<int>> strides(op.nbInputs()); // For the cooresponding strides + std::vector<std::vector<int>> strides( + op.nbInputs()); // For the cooresponding strides for (IOIndex_t i = 0; i < op.nbInputs(); ++i) { std::shared_ptr<Tensor> inputFallback; - const Tensor input = op.getInput(i)->refCastFrom(inputFallback, *op.getOutput(0)); + const Tensor input = + op.getInput(i)->refCastFrom(inputFallback, *op.getOutput(0)); // Get tensor dims and broadcast them - std::copy(input.dims().begin(), input.dims().end(), std::back_inserter(dims[i])); - dims[i].insert(dims[i].cbegin(), op.getOutput(0)->nbDims() - dims[i].size(), int(1)); + std::copy(input.dims().begin(), + input.dims().end(), + std::back_inserter(dims[i])); + 
dims[i].insert(dims[i].cbegin(), + op.getOutput(0)->nbDims() - dims[i].size(), + int(1)); // Compute the corresponding strides std::vector<int> tensorStrides(dims[i].size()); @@ -139,83 +181,97 @@ void Aidge::SubImpl_cuda::backward() { strides[i] = tensorStrides; } - switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { - case DataType::Float64: - backward_<double>(outputGrad, dims, strides); - break; - case DataType::Float32: - backward_<float>(outputGrad, dims, strides); - break; - case DataType::Float16: - backward_<half>(outputGrad, dims, strides); - break; - default: - AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda"); + switch ( + std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) { + case DataType::Float64: + backward_<double>(outputGrad, dims, strides); + break; + case DataType::Float32: + backward_<float>(outputGrad, dims, strides); + break; + case DataType::Float16: + backward_<half>(outputGrad, dims, strides); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, + "Data type is not supported by Backend Cuda"); } } template <class T> -void Aidge::SubImpl_cuda::backward_(const Tensor& outputGrad, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +void Aidge::SubImpl_cuda::backward_( + const Tensor &outputGrad, + const std::vector<std::vector<int>> &inputsDims, + const std::vector<std::vector<int>> &inputsStrides) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; const typename Cuda::cudnn_scaling_type<T>::type gamma = -1.0f; - for (std::size_t i = 0; i < inputsDims.size(); i++) - { - if (op.getInput(i)->size() == op.getOutput(0)->size()) - { + for (std::size_t i = 0; i < inputsDims.size(); i++) { + if (op.getInput(i)->size() == op.getOutput(0)->size()) { CHECK_CUDNN_STATUS( - cudnnAddTensor(CudaContext::cudnnHandle(), - i==0 ? &alpha: &gamma, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), - outputGrad.getImpl()->rawPtr(), - &beta, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(i)->getImpl())->getCudnnTensorDesc(*op.getInput(i)), - op.getInput(i)->grad()->getImpl()->rawPtr())); - } - else // In case of broadcasting + cudnnAddTensor(CudaContext::cudnnHandle(), + i == 0 ? 
&alpha : &gamma, + std::dynamic_pointer_cast<TensorImpl_cuda_>( + op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), + outputGrad.getImpl()->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>( + op.getInput(i)->getImpl()) + ->getCudnnTensorDesc(*op.getInput(i)), + op.getInput(i)->grad()->getImpl()->rawPtr())); + } else // In case of broadcasting { - // Gradient with respect to input_i: sum outputGrad over the broadcasted dimensions using cudnnReduceTensor + // Gradient with respect to input_i: sum outputGrad over the + // broadcasted dimensions using cudnnReduceTensor cudnnReduceTensorDescriptor_t reduceDesc; CHECK_CUDNN_STATUS(cudnnCreateReduceTensorDescriptor(&reduceDesc)); - CHECK_CUDNN_STATUS(cudnnSetReduceTensorDescriptor(reduceDesc, - CUDNN_REDUCE_TENSOR_ADD, - CudaContext::data_type<T>::value, - CUDNN_PROPAGATE_NAN, - CUDNN_REDUCE_TENSOR_NO_INDICES, - CUDNN_32BIT_INDICES)); - - cudnnTensorDescriptor_t outputDesc = std::dynamic_pointer_cast<TensorImpl_cuda_>(outputGrad.getImpl())->getCudnnTensorDesc(*op.getOutput(0)); + CHECK_CUDNN_STATUS(cudnnSetReduceTensorDescriptor( + reduceDesc, + CUDNN_REDUCE_TENSOR_ADD, + CudaContext::data_type<T>::value, + CUDNN_PROPAGATE_NAN, + CUDNN_REDUCE_TENSOR_NO_INDICES, + CUDNN_32BIT_INDICES)); + + cudnnTensorDescriptor_t outputDesc = + std::dynamic_pointer_cast<TensorImpl_cuda_>( + outputGrad.getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)); // Create a Tensor descriptor with the broadcasted dims and strides cudnnTensorDescriptor_t tensorDesc; CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&tensorDesc)); - CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(tensorDesc, - CudaContext::data_type<T>::value, - inputsDims[i].size(), - inputsDims[i].data(), - inputsStrides[i].data())); + CHECK_CUDNN_STATUS( + cudnnSetTensorNdDescriptor(tensorDesc, + CudaContext::data_type<T>::value, + inputsDims[i].size(), + inputsDims[i].data(), + inputsStrides[i].data())); size_t workspaceSize; - CHECK_CUDNN_STATUS(cudnnGetReductionWorkspaceSize(CudaContext::cudnnHandle(), - reduceDesc, - outputDesc, - tensorDesc, - &workspaceSize)); + CHECK_CUDNN_STATUS( + cudnnGetReductionWorkspaceSize(CudaContext::cudnnHandle(), + reduceDesc, + outputDesc, + tensorDesc, + &workspaceSize)); void *d_workspace; CHECK_CUDA_STATUS(cudaMalloc(&d_workspace, workspaceSize)); - CHECK_CUDNN_STATUS(cudnnReduceTensor(CudaContext::cudnnHandle(), - reduceDesc, - NULL, - 0, - d_workspace, - workspaceSize, - i==0 ? &alpha: &gamma, - outputDesc, - outputGrad.getImpl()->rawPtr(), - &beta, - tensorDesc, - op.getInput(i)->grad()->getImpl()->rawPtr())); + CHECK_CUDNN_STATUS(cudnnReduceTensor( + CudaContext::cudnnHandle(), + reduceDesc, + NULL, + 0, + d_workspace, + workspaceSize, + i == 0 ? 
&alpha : &gamma, + outputDesc, + outputGrad.getImpl()->rawPtr(), + &beta, + tensorDesc, + op.getInput(i)->grad()->getImpl()->rawPtr())); CHECK_CUDNN_STATUS(cudnnDestroyTensorDescriptor(tensorDesc)); } diff --git a/src/operator/TanhImpl.cpp b/src/operator/TanhImpl.cpp index 96c0330febba35cfea04bbbac97d9308195d6309..f217ed867c1357b977cf9a3cf367ab493641b257 100644 --- a/src/operator/TanhImpl.cpp +++ b/src/operator/TanhImpl.cpp @@ -20,21 +20,25 @@ #include "aidge/utils/Types.h" void Aidge::TanhImpl_cuda::forward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); assert(mOp.getRawInput(0) && "missing input #0"); - const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); + const auto &input = + op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0)); // Lazy-initialize CuDNN Tanh descriptor if (mTanhDesc == nullptr) { - #if CUDNN_VERSION >= 5000 - CHECK_CUDNN_STATUS(cudnnCreateActivationDescriptor(&mTanhDesc)); - CHECK_CUDNN_STATUS(cudnnSetActivationDescriptor( - mTanhDesc, CUDNN_ACTIVATION_TANH, CUDNN_NOT_PROPAGATE_NAN, 0.0)); - #else - mTanhDesc = CUDNN_ACTIVATION_TANH; - #endif +#if CUDNN_VERSION >= 5000 + CHECK_CUDNN_STATUS(cudnnCreateActivationDescriptor(&mTanhDesc)); + CHECK_CUDNN_STATUS( + cudnnSetActivationDescriptor(mTanhDesc, + CUDNN_ACTIVATION_TANH, + CUDNN_NOT_PROPAGATE_NAN, + 0.0)); +#else + mTanhDesc = CUDNN_ACTIVATION_TANH; +#endif } // Do the actual forward computation @@ -42,44 +46,51 @@ void Aidge::TanhImpl_cuda::forward() { // excepted when the convolution is performed in double precision. if (op.getOutput(0)->dataType() == DataType::Float64) { forward_<double>(input); - } - else { + } else { forward_<float>(input); } } -template <class T> -void Aidge::TanhImpl_cuda::forward_(const Tensor& input) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +template <class T> void Aidge::TanhImpl_cuda::forward_(const Tensor &input) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; - CHECK_CUDNN_STATUS( - cudnnActivationForward(CudaContext::cudnnHandle(), - mTanhDesc, - &alpha, - std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl())->getCudnnTensorDesc(input), - input.getImpl()->rawPtr(), - &beta, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), - std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr())); + CHECK_CUDNN_STATUS(cudnnActivationForward( + CudaContext::cudnnHandle(), + mTanhDesc, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(input.getImpl()) + ->getCudnnTensorDesc(input), + input.getImpl()->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), + std::static_pointer_cast<Tensor>(op.getRawOutput(0)) + ->getImpl() + ->rawPtr())); } void Aidge::TanhImpl_cuda::backward() { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); assert(op.getOutput(0)->grad() && "missing output #0"); - const auto& output_grad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getOutput(0)->grad()); + const auto &output_grad = + op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, + *op.getOutput(0)->grad()); // Lazy-initialize 
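
The broadcasting branch of SubImpl_cuda::backward_ above reduces the output gradient over the broadcast dimensions with cudnnReduceTensor. Below is a minimal standalone sketch of that pattern; it is not part of the patch, the helper name and the srcDesc/dstDesc descriptors are placeholders, and it assumes float data plus the backend's CudaContext and CHECK_* macros. The sketch also releases the workspace and the reduce descriptor, which any complete implementation needs to do once the reduction has run.

#include <cstddef>
#include <cuda_runtime.h>
#include <cudnn.h>
#include "aidge/backend/cuda/utils/CudaContext.hpp" // CudaContext::cudnnHandle()
#include "aidge/backend/cuda/utils/CudaUtils.hpp"   // CHECK_CUDA_STATUS / CHECK_CUDNN_STATUS

// Hypothetical helper: accumulate `srcGrad` (full output-gradient layout,
// described by srcDesc) into `dstGrad` (broadcasted input layout, described
// by dstDesc), scaled by +1 for the first Sub input and -1 for the others.
static void reduceBroadcastGrad(cudnnTensorDescriptor_t srcDesc,
                                const void *srcGrad,
                                cudnnTensorDescriptor_t dstDesc,
                                void *dstGrad,
                                float scale) {
    cudnnReduceTensorDescriptor_t reduceDesc;
    CHECK_CUDNN_STATUS(cudnnCreateReduceTensorDescriptor(&reduceDesc));
    CHECK_CUDNN_STATUS(
        cudnnSetReduceTensorDescriptor(reduceDesc,
                                       CUDNN_REDUCE_TENSOR_ADD,
                                       CUDNN_DATA_FLOAT,
                                       CUDNN_PROPAGATE_NAN,
                                       CUDNN_REDUCE_TENSOR_NO_INDICES,
                                       CUDNN_32BIT_INDICES));

    // Query and allocate the scratch space the reduction needs.
    size_t wsSize = 0;
    CHECK_CUDNN_STATUS(
        cudnnGetReductionWorkspaceSize(CudaContext::cudnnHandle(),
                                       reduceDesc,
                                       srcDesc,
                                       dstDesc,
                                       &wsSize));
    void *ws = nullptr;
    CHECK_CUDA_STATUS(cudaMalloc(&ws, wsSize));

    const float beta = 0.0f; // overwrite dstGrad
    CHECK_CUDNN_STATUS(cudnnReduceTensor(CudaContext::cudnnHandle(),
                                         reduceDesc,
                                         NULL,
                                         0,
                                         ws,
                                         wsSize,
                                         &scale,
                                         srcDesc,
                                         srcGrad,
                                         &beta,
                                         dstDesc,
                                         dstGrad));

    // Release the temporaries once the reduction has completed.
    CHECK_CUDA_STATUS(cudaFree(ws));
    CHECK_CUDNN_STATUS(cudnnDestroyReduceTensorDescriptor(reduceDesc));
}
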
CuDNN Tanh descriptor if (mTanhDesc == nullptr) { - #if CUDNN_VERSION >= 5000 - CHECK_CUDNN_STATUS(cudnnCreateActivationDescriptor(&mTanhDesc)); - CHECK_CUDNN_STATUS(cudnnSetActivationDescriptor( - mTanhDesc, CUDNN_ACTIVATION_SIGMOID, CUDNN_NOT_PROPAGATE_NAN, 0.0)); - #else - mTanhDesc = CUDNN_ACTIVATION_SIGMOID; - #endif +#if CUDNN_VERSION >= 5000 + CHECK_CUDNN_STATUS(cudnnCreateActivationDescriptor(&mTanhDesc)); + CHECK_CUDNN_STATUS( + cudnnSetActivationDescriptor(mTanhDesc, + CUDNN_ACTIVATION_SIGMOID, + CUDNN_NOT_PROPAGATE_NAN, + 0.0)); +#else + mTanhDesc = CUDNN_ACTIVATION_SIGMOID; +#endif } // Do the actual backward computation @@ -87,37 +98,44 @@ void Aidge::TanhImpl_cuda::backward() { // excepted when the convolution is performed in double precision. if (op.getInput(0)->grad()->dataType() == DataType::Float64) { backward_<double>(output_grad); - } - else { + } else { backward_<float>(output_grad); } } template <class T> -void Aidge::TanhImpl_cuda::backward_(const Tensor& output_grad) { - const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp); +void Aidge::TanhImpl_cuda::backward_(const Tensor &output_grad) { + const OperatorTensor &op = static_cast<const OperatorTensor &>(mOp); const typename Cuda::cudnn_scaling_type<T>::type alpha = 1.0f; const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f; - CHECK_CUDNN_STATUS( - cudnnActivationBackward(CudaContext::cudnnHandle(), - mTanhDesc, - &alpha, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl())->getCudnnTensorDesc(*op.getOutput(0)), - std::static_pointer_cast<Tensor>(op.getRawOutput(0))->getImpl()->rawPtr(), - std::dynamic_pointer_cast<TensorImpl_cuda_>(output_grad.getImpl())->getCudnnTensorDesc(output_grad), - output_grad.getImpl()->rawPtr(), - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->getImpl())->getCudnnTensorDesc(*op.getInput(0)), - std::static_pointer_cast<Tensor>(op.getRawInput(0))->getImpl()->rawPtr(), - &beta, - std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->grad()->getImpl())->getCudnnTensorDesc(*op.getInput(0)->grad()), - op.getInput(0)->grad()->getImpl()->rawPtr())); + CHECK_CUDNN_STATUS(cudnnActivationBackward( + CudaContext::cudnnHandle(), + mTanhDesc, + &alpha, + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getOutput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getOutput(0)), + std::static_pointer_cast<Tensor>(op.getRawOutput(0)) + ->getImpl() + ->rawPtr(), + std::dynamic_pointer_cast<TensorImpl_cuda_>(output_grad.getImpl()) + ->getCudnnTensorDesc(output_grad), + output_grad.getImpl()->rawPtr(), + std::dynamic_pointer_cast<TensorImpl_cuda_>(op.getInput(0)->getImpl()) + ->getCudnnTensorDesc(*op.getInput(0)), + std::static_pointer_cast<Tensor>(op.getRawInput(0)) + ->getImpl() + ->rawPtr(), + &beta, + std::dynamic_pointer_cast<TensorImpl_cuda_>( + op.getInput(0)->grad()->getImpl()) + ->getCudnnTensorDesc(*op.getInput(0)->grad()), + op.getInput(0)->grad()->getImpl()->rawPtr())); } Aidge::TanhImpl_cuda::~TanhImpl_cuda() { if (mTanhDesc != nullptr) { - #if CUDNN_VERSION >= 5000 - cudnnDestroyActivationDescriptor(mTanhDesc); - #endif +#if CUDNN_VERSION >= 5000 + cudnnDestroyActivationDescriptor(mTanhDesc); +#endif } } - diff --git a/src/utils/CudaUtils.cpp b/src/utils/CudaUtils.cpp index ca3263a282322e70157b7537c502a63a3edb526f..7e6abadde7b465829b8e49365c4065759e9e850e 100644 --- a/src/utils/CudaUtils.cpp +++ b/src/utils/CudaUtils.cpp @@ -1,7 +1,6 @@ #include "aidge/backend/cuda/utils/CudaUtils.hpp" -const char* 
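
One observation on the TanhImpl.cpp hunks above: backward() lazily creates its activation descriptor with CUDNN_ACTIVATION_SIGMOID (carried over unchanged from the pre-existing code), which for a Tanh operator presumably should be CUDNN_ACTIVATION_TANH, since cudnnActivationBackward applies the derivative selected by the descriptor mode. A minimal sketch of the intended descriptor lifecycle, assuming CUDNN_VERSION >= 5000 and the backend's CHECK_CUDNN_STATUS macro (names are illustrative, not part of the patch):

#include <cudnn.h>
#include "aidge/backend/cuda/utils/CudaUtils.hpp" // CHECK_CUDNN_STATUS

// A single Tanh descriptor can serve both cudnnActivationForward and
// cudnnActivationBackward; only the mode it encodes matters.
static cudnnActivationDescriptor_t tanhDesc = nullptr;

static void ensureTanhDesc() {
    if (tanhDesc == nullptr) {
        CHECK_CUDNN_STATUS(cudnnCreateActivationDescriptor(&tanhDesc));
        CHECK_CUDNN_STATUS(
            cudnnSetActivationDescriptor(tanhDesc,
                                         CUDNN_ACTIVATION_TANH, // not SIGMOID
                                         CUDNN_NOT_PROPAGATE_NAN,
                                         0.0));
    }
}

static void releaseTanhDesc() {
    if (tanhDesc != nullptr) {
        cudnnDestroyActivationDescriptor(tanhDesc);
        tanhDesc = nullptr;
    }
}
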
Aidge::Cuda::cublasGetErrorString(cublasStatus_t error) -{ +const char *Aidge::Cuda::cublasGetErrorString(cublasStatus_t error) { switch (error) { case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS"; @@ -28,19 +27,24 @@ const char* Aidge::Cuda::cublasGetErrorString(cublasStatus_t error) return "<unknown>"; } -void Aidge::Cuda::setMultiDevicePeerAccess(unsigned int size, unsigned int* devices) -{ +void Aidge::Cuda::setMultiDevicePeerAccess(unsigned int size, + unsigned int *devices) { for (unsigned int i = 0; i < size; ++i) { for (unsigned int j = 0; j < size; ++j) { if (i != j) { int canAccessPeer = 0; CHECK_CUDA_STATUS(cudaDeviceCanAccessPeer(&canAccessPeer, - devices[j], devices[i])); + devices[j], + devices[i])); if (canAccessPeer) { CHECK_CUDA_STATUS(cudaSetDevice(devices[j])); - const cudaError_t status = cudaDeviceEnablePeerAccess(devices[i], 0); + const cudaError_t status = + cudaDeviceEnablePeerAccess(devices[i], 0); if (status == cudaErrorPeerAccessAlreadyEnabled) { - fmt::print("Peer access already enabled between device {} and device {}\n", devices[j], devices[i]); + fmt::print("Peer access already enabled between " + "device {} and device {}\n", + devices[j], + devices[i]); } else { CHECK_CUDA_STATUS(status); } diff --git a/unit_tests/Test_AddImpl.cpp b/unit_tests/Test_AddImpl.cpp index dffabe6aab92bdfdd0c79b61ab59e9bc6efb9d94..3993cdf3f60dd1eaab11b87ede06b53b59d06832 100644 --- a/unit_tests/Test_AddImpl.cpp +++ b/unit_tests/Test_AddImpl.cpp @@ -9,9 +9,9 @@ * ********************************************************************************/ -#include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution #include <catch2/catch_test_macros.hpp> +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution #include "aidge/backend/cpu.hpp" #include "aidge/backend/cuda.hpp" @@ -23,59 +23,60 @@ using namespace Aidge; TEST_CASE("[gpu/operator] Add(forward)", "[Add][GPU]") { SECTION("Same input") { - std::shared_ptr<Tensor> input1 = std::make_shared<Tensor>(Array4D<float,3,3,3,2> { - { // - { // - {{20, 47},{21, 48},{22, 49}}, // - {{23, 50},{24, 51},{25, 52}}, // - {{26, 53},{27, 54},{28, 55}} // - }, // - { // - {{29, 56},{30, 57},{31, 58}}, // - {{32, 59},{33, 60},{34, 61}}, // - {{35, 62},{36, 63},{37, 64}} // - }, // - { // - {{38, 65},{39, 66},{40, 67}}, // - {{41, 68},{42, 69},{43, 70}}, // - {{44, 71},{45, 72},{46, 73}} // - } // - } // - }); // - input1->setBackend("cuda"); - std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<float,3,3,3,2> { - { + std::shared_ptr<Tensor> input1 = + std::make_shared<Tensor>(Array4D<float, 3, 3, 3, 2>{ { - {{40, 94},{42, 96},{44, 98}}, - {{46, 100},{48, 102},{50, 104}}, - {{52, 106},{54, 108},{56, 110}} - }, - { - {{58, 112},{60, 114},{62, 116}}, - {{64, 118},{66, 120},{68, 122}}, - {{70, 124},{72, 126},{74, 128}} - }, - { - {{76, 130},{78, 132},{80, 134}}, - {{82, 136},{84, 138},{86, 140}}, - {{88, 142},{90, 144},{92, 146}} - } - } - }); + // + { + // + {{20, 47}, {21, 48}, {22, 49}}, // + {{23, 50}, {24, 51}, {25, 52}}, // + {{26, 53}, {27, 54}, {28, 55}} // + }, // + { + // + {{29, 56}, {30, 57}, {31, 58}}, // + {{32, 59}, {33, 60}, {34, 61}}, // + {{35, 62}, {36, 63}, {37, 64}} // + }, // + { + // + {{38, 65}, {39, 66}, {40, 67}}, // + {{41, 68}, {42, 69}, {43, 70}}, // + {{44, 71}, {45, 72}, {46, 73}} // + } // + } // + }); // + input1->setBackend("cuda"); + std::shared_ptr<Tensor> 
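
For the CudaUtils.cpp hunk above, a short usage sketch of setMultiDevicePeerAccess; the device IDs are placeholders and a real caller would pick devices appropriate to its setup:

#include <cuda_runtime.h>
#include "aidge/backend/cuda/utils/CudaUtils.hpp"

int main() {
    int deviceCount = 0;
    CHECK_CUDA_STATUS(cudaGetDeviceCount(&deviceCount));
    if (deviceCount >= 2) {
        // Enable peer access in both directions between the first two GPUs;
        // pairs that are already enabled are simply reported and skipped.
        unsigned int devices[2] = {0, 1};
        Aidge::Cuda::setMultiDevicePeerAccess(2, devices);
    }
    return 0;
}
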
expectedOutput = std::make_shared<Tensor>( + Array4D<float, 3, 3, 3, 2>{{{{{40, 94}, {42, 96}, {44, 98}}, + {{46, 100}, {48, 102}, {50, 104}}, + {{52, 106}, {54, 108}, {56, 110}}}, + {{{58, 112}, {60, 114}, {62, 116}}, + {{64, 118}, {66, 120}, {68, 122}}, + {{70, 124}, {72, 126}, {74, 128}}}, + {{{76, 130}, {78, 132}, {80, 134}}, + {{82, 136}, {84, 138}, {86, 140}}, + {{88, 142}, {90, 144}, {92, 146}}}}}); std::shared_ptr<Node> myAdd = Add(); - auto op = std::static_pointer_cast<OperatorTensor>(myAdd -> getOperator()); + auto op = + std::static_pointer_cast<OperatorTensor>(myAdd->getOperator()); op->associateInput(0, input1); op->associateInput(1, input1); op->setBackend("cuda"); op->setDataType(DataType::Float32); myAdd->forward(); - float* computedOutput = new float[input1->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * expectedOutput->size(), cudaMemcpyDeviceToHost); + float *computedOutput = new float[input1->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * expectedOutput->size(), + cudaMemcpyDeviceToHost); - for(int i = 0; i < expectedOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(expectedOutput->getImpl()->rawPtr()) + i); + for (int i = 0; i < expectedOutput->size(); i++) { + const float targetOutput = *( + static_cast<float *>(expectedOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } @@ -83,99 +84,131 @@ TEST_CASE("[gpu/operator] Add(forward)", "[Add][GPU]") { } SECTION("Broadcasting") { - std::shared_ptr<Tensor> input_0 = std::make_shared<Tensor>(Array4D<float,3,1,3,2> { - { // - { // - {{0, 1},{2, 3},{4, 5}} // - }, // - { // - {{6, 7},{8, 9},{10, 11}} // - }, // - { // - {{12, 13},{14, 15},{16, 17}} // - } // - } // - }); // - std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array4D<float,1,3,3,2> { - { // - { // - {{20, 21},{22, 23},{24, 25}}, // - {{26, 27},{28, 29},{30, 31}}, // - {{32, 33},{34, 35},{36, 37}} // - } // - } // - }); // - - std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<float,2> {{100,200}}); + std::shared_ptr<Tensor> input_0 = + std::make_shared<Tensor>(Array4D<float, 3, 1, 3, 2>{ + { + // + { + // + {{0, 1}, {2, 3}, {4, 5}} // + }, // + { + // + {{6, 7}, {8, 9}, {10, 11}} // + }, // + { + // + {{12, 13}, {14, 15}, {16, 17}} // + } // + } // + }); // + std::shared_ptr<Tensor> input_1 = + std::make_shared<Tensor>(Array4D<float, 1, 3, 3, 2>{ + { + // + { + // + {{20, 21}, {22, 23}, {24, 25}}, // + {{26, 27}, {28, 29}, {30, 31}}, // + {{32, 33}, {34, 35}, {36, 37}} // + } // + } // + }); // + + std::shared_ptr<Tensor> input_2 = + std::make_shared<Tensor>(Array1D<float, 2>{{100, 200}}); input_0->setBackend("cuda"); input_1->setBackend("cuda"); input_2->setBackend("cuda"); - /// Input0(d0, 1, d2, d3) + Input1(1, d1, d2, d3) = Output(d0, d1, d2, d3) - std::shared_ptr<Tensor> expectedOutput0 = std::make_shared<Tensor>(Array4D<float,3,3,3,2> { - { // - { // - {{ 20, 22},{ 24, 26},{ 28, 30}}, // - {{ 26, 28},{ 30, 32},{ 34, 36}}, // - {{ 32, 34},{ 36, 38},{ 40, 42}} // - }, // - { // - {{ 26, 28},{ 30, 32},{ 34, 36}}, // - {{ 32, 34},{ 36, 38},{ 40, 42}}, // - {{ 38, 40},{ 42, 44},{ 46, 48}} // - }, // - { // - {{ 32, 34},{ 36, 38},{40, 42}}, // - {{ 38, 40},{ 42, 44},{46, 48}}, // - {{ 44, 46},{ 48, 50},{52, 54}} // - } // - } // - }); // + /// Input0(d0, 1, d2, d3) + Input1(1, d1, d2, d3) = Output(d0, d1, d2, + /// d3) + std::shared_ptr<Tensor> expectedOutput0 = + 
std::make_shared<Tensor>(Array4D<float, 3, 3, 3, 2>{ + { + // + { + // + {{20, 22}, {24, 26}, {28, 30}}, // + {{26, 28}, {30, 32}, {34, 36}}, // + {{32, 34}, {36, 38}, {40, 42}} // + }, // + { + // + {{26, 28}, {30, 32}, {34, 36}}, // + {{32, 34}, {36, 38}, {40, 42}}, // + {{38, 40}, {42, 44}, {46, 48}} // + }, // + { + // + {{32, 34}, {36, 38}, {40, 42}}, // + {{38, 40}, {42, 44}, {46, 48}}, // + {{44, 46}, {48, 50}, {52, 54}} // + } // + } // + }); // std::shared_ptr<Node> myAdd0 = Add(); - auto op0 = std::static_pointer_cast<OperatorTensor>(myAdd0 -> getOperator()); + auto op0 = + std::static_pointer_cast<OperatorTensor>(myAdd0->getOperator()); op0->associateInput(0, input_0); op0->associateInput(1, input_1); op0->setDataType(DataType::Float32); op0->setBackend("cuda"); myAdd0->forward(); - float* computedOutput0 = new float[expectedOutput0->size()](); - cudaMemcpy(computedOutput0, op0->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * expectedOutput0->size(), cudaMemcpyDeviceToHost); + float *computedOutput0 = new float[expectedOutput0->size()](); + cudaMemcpy(computedOutput0, + op0->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * expectedOutput0->size(), + cudaMemcpyDeviceToHost); - for(int i = 0; i < expectedOutput0->size(); i++){ - const float targetOutput = *(static_cast<float*>(expectedOutput0->getImpl()->rawPtr()) + i); + for (int i = 0; i < expectedOutput0->size(); i++) { + const float targetOutput = + *(static_cast<float *>(expectedOutput0->getImpl()->rawPtr()) + + i); REQUIRE(fabs(computedOutput0[i] - targetOutput) < 1e-6); } delete[] computedOutput0; /// Input0(d0, d1, d2, d3) + Input1(d3) = Output(d0, d1, d2, d3) - std::shared_ptr<Tensor> expectedOutput1 = std::make_shared<Tensor>(Array4D<float,3,1,3,2> { - { // - { // - {{100, 201},{102, 203},{104, 205}} // - }, // - { // - {{106, 207},{108, 209},{110, 211}} // - }, // - { // - {{112, 213},{114, 215},{116, 217}} // - } // - } // - }); // + std::shared_ptr<Tensor> expectedOutput1 = + std::make_shared<Tensor>(Array4D<float, 3, 1, 3, 2>{ + { + // + { + // + {{100, 201}, {102, 203}, {104, 205}} // + }, // + { + // + {{106, 207}, {108, 209}, {110, 211}} // + }, // + { + // + {{112, 213}, {114, 215}, {116, 217}} // + } // + } // + }); // std::shared_ptr<Node> myAdd1 = Add(); - auto op1 = std::static_pointer_cast<OperatorTensor>(myAdd1 -> getOperator()); + auto op1 = + std::static_pointer_cast<OperatorTensor>(myAdd1->getOperator()); op1->associateInput(0, input_0); op1->associateInput(1, input_2); op1->setDataType(DataType::Float32); op1->setBackend("cuda"); myAdd1->forward(); - float* computedOutput1 = new float[expectedOutput1->size()](); - cudaMemcpy(computedOutput1, op1->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * expectedOutput1->size(), cudaMemcpyDeviceToHost); - - for(int i = 0; i < expectedOutput1->size(); i++){ - const float targetOutput = *(static_cast<float*>(expectedOutput1->getImpl()->rawPtr()) + i); + float *computedOutput1 = new float[expectedOutput1->size()](); + cudaMemcpy(computedOutput1, + op1->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * expectedOutput1->size(), + cudaMemcpyDeviceToHost); + + for (int i = 0; i < expectedOutput1->size(); i++) { + const float targetOutput = + *(static_cast<float *>(expectedOutput1->getImpl()->rawPtr()) + + i); REQUIRE(fabs(computedOutput1[i] - targetOutput) < 1e-6); } @@ -188,26 +221,30 @@ TEST_CASE("[gpu/operator] Add(forward)", "[Add][GPU]") { std::random_device rd; std::mt19937 gen(rd()); std::uniform_real_distribution<float> valueDist( - 0.1f, 1.1f); // 
Random float distribution between 0 and 1 - std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1), - std::size_t(10)); - std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(4), std::size_t(5)); - std::uniform_int_distribution<int> boolDist(0,1); + 0.1f, + 1.1f); // Random float distribution between 0 and 1 + std::uniform_int_distribution<std::size_t> dimSizeDist( + std::size_t(1), + std::size_t(10)); + std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(4), + std::size_t(5)); + std::uniform_int_distribution<int> boolDist(0, 1); // To measure execution time of 'forward()' std::chrono::time_point<std::chrono::system_clock> start; std::chrono::time_point<std::chrono::system_clock> end; std::chrono::duration<double, std::micro> duration{}; std::size_t number_of_operation = 0; - for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) - { + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { // Create Add Operator CUDA std::shared_ptr<Node> myAddCUDA = Add("myaddcuda"); - auto op_cuda = std::static_pointer_cast<OperatorTensor>(myAddCUDA -> getOperator()); + auto op_cuda = std::static_pointer_cast<OperatorTensor>( + myAddCUDA->getOperator()); // Create Add Operator CPU std::shared_ptr<Node> myAddCPU = Add("myaddcpu"); - auto op_cpu = std::static_pointer_cast<OperatorTensor>(myAddCPU -> getOperator()); + auto op_cpu = std::static_pointer_cast<OperatorTensor>( + myAddCPU->getOperator()); op_cpu->setDataType(DataType::Float32); op_cpu->setBackend("cpu"); @@ -218,23 +255,35 @@ TEST_CASE("[gpu/operator] Add(forward)", "[Add][GPU]") { // To test broadcasting, set some dims to 1 if (boolDist(gen)) { dims0.push_back(1); - }else{ + } else { dims0.push_back(dim); } if (boolDist(gen)) { dims1.push_back(1); - }else{ + } else { dims1.push_back(dim); } dims.push_back(std::max(dims0[i], dims1[i])); } - const std::size_t nb_elements0 = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); - const std::size_t nb_elements1 = std::accumulate(dims1.cbegin(), dims1.cend(), std::size_t(1), std::multiplies<std::size_t>()); - const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t nb_elements0 = + std::accumulate(dims0.cbegin(), + dims0.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); + const std::size_t nb_elements1 = + std::accumulate(dims1.cbegin(), + dims1.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); + const std::size_t nb_elements = + std::accumulate(dims.cbegin(), + dims.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); number_of_operation += nb_elements; - float* array0 = new float[nb_elements0]; - float* array1 = new float[nb_elements1]; + float *array0 = new float[nb_elements0]; + float *array1 = new float[nb_elements1]; for (std::size_t i = 0; i < nb_elements0; ++i) { array0[i] = valueDist(gen); @@ -244,23 +293,27 @@ TEST_CASE("[gpu/operator] Add(forward)", "[Add][GPU]") { } // input0 CUDA - float* array0_d, *array1_d; + float *array0_d, *array1_d; std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); T0_cuda->setDataType(DataType::Float32); T0_cuda->setBackend("cuda"); T0_cuda->resize(dims0); op_cuda->associateInput(0, T0_cuda); - cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements0); - cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements0, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&array0_d), + sizeof(float) * nb_elements0); + 
cudaMemcpy(array0_d, + array0, + sizeof(float) * nb_elements0, + cudaMemcpyHostToDevice); T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements0); // input0 CPU std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(0,T0_cpu); + op_cpu->associateInput(0, T0_cpu); T0_cpu->setDataType(DataType::Float32); T0_cpu->setBackend("cpu"); T0_cpu->resize(dims0); - T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements0); + T0_cpu->getImpl()->setRawPtr(array0, nb_elements0); // input1 CUDA std::shared_ptr<Tensor> T1_cuda = std::make_shared<Tensor>(); @@ -268,17 +321,21 @@ TEST_CASE("[gpu/operator] Add(forward)", "[Add][GPU]") { T1_cuda->setBackend("cuda"); T1_cuda->resize(dims1); op_cuda->associateInput(1, T1_cuda); - cudaMalloc(reinterpret_cast<void **>(&array1_d), sizeof(float) * nb_elements1); - cudaMemcpy(array1_d, array1, sizeof(float) * nb_elements1, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&array1_d), + sizeof(float) * nb_elements1); + cudaMemcpy(array1_d, + array1, + sizeof(float) * nb_elements1, + cudaMemcpyHostToDevice); T1_cuda->getImpl()->setRawPtr(array1_d, nb_elements1); // input1 CPU std::shared_ptr<Tensor> T1_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(1,T1_cpu); + op_cpu->associateInput(1, T1_cpu); T1_cpu->setDataType(DataType::Float32); T1_cpu->setBackend("cpu"); T1_cpu->resize(dims1); - T1_cpu -> getImpl() -> setRawPtr(array1, nb_elements1); + T1_cpu->getImpl()->setRawPtr(array1, nb_elements1); // forward CUDA op_cuda->setDataType(DataType::Float32); @@ -286,14 +343,19 @@ TEST_CASE("[gpu/operator] Add(forward)", "[Add][GPU]") { start = std::chrono::system_clock::now(); op_cuda->forward(); end = std::chrono::system_clock::now(); - duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + duration += std::chrono::duration_cast<std::chrono::microseconds>( + end - start); float *computedOutput = new float[nb_elements](); - cudaMemcpy(computedOutput, op_cuda->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * nb_elements, cudaMemcpyDeviceToHost); + cudaMemcpy(computedOutput, + op_cuda->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * nb_elements, + cudaMemcpyDeviceToHost); // forward CPU op_cpu->forward(); - float *computedCPU = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr()); + float *computedCPU = static_cast<float *>( + op_cpu->getOutput(0)->getImpl()->rawPtr()); REQUIRE(approxEq<float>(*computedOutput, *computedCPU)); delete[] array0; @@ -301,107 +363,134 @@ TEST_CASE("[gpu/operator] Add(forward)", "[Add][GPU]") { delete[] computedOutput; cudaFree(array0_d); cudaFree(array1_d); - } } - } TEST_CASE("[gpu/operator] Add(backward)", "[Add][GPU]") { - std::shared_ptr<Tensor> input_0 = std::make_shared<Tensor>(Array4D<float,3,1,3,2> { - { // - { // - {{0, 1},{2, 3},{4, 5}} // - }, // - { // - {{6, 7},{8, 9},{10, 11}} // - }, // - { // - {{12, 13},{14, 15},{16, 17}} // - } // - } // - }); // - std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array4D<float,1,3,3,2> { - { // - { // - {{20, 21},{22, 23},{24, 25}}, // - {{26, 27},{28, 29},{30, 31}}, // - {{32, 33},{34, 35},{36, 37}} // - } // - } // - }); // - - input_0->setBackend("cuda"); - input_1->setBackend("cuda"); - std::shared_ptr<Node> myAdd = Add(); - auto op = std::static_pointer_cast<OperatorTensor>(myAdd -> getOperator()); - op->associateInput(0, input_0); - op->associateInput(1, input_1); - op->setDataType(DataType::Float32); - op->setBackend("cuda"); - myAdd->forward(); - - // Run and test backward operation - 
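
All of the GPU tests in this patch verify results the same way: run forward() on the cuda backend, cudaMemcpy the raw output back to the host, and compare element-wise against a reference (either hard-coded or produced by the cpu backend). A hypothetical helper condensing that pattern is sketched below; it assumes float outputs and the headers the test files already include, plus <vector> and <cmath>. Using std::vector also sidesteps the manual new[]/delete[] pairs the tests currently carry.

// Illustrative only -- not part of the patch.
static void requireCudaOutputMatches(const std::shared_ptr<Tensor> &gpuOut,
                                     const std::shared_ptr<Tensor> &expected) {
    // Bring the device buffer back to the host.
    std::vector<float> host(gpuOut->size());
    CHECK_CUDA_STATUS(cudaMemcpy(host.data(),
                                 gpuOut->getImpl()->rawPtr(),
                                 sizeof(float) * gpuOut->size(),
                                 cudaMemcpyDeviceToHost));

    // Element-wise comparison with the same tolerance the tests use.
    const float *ref =
        static_cast<const float *>(expected->getImpl()->rawPtr());
    for (std::size_t i = 0; i < gpuOut->size(); ++i) {
        REQUIRE(std::fabs(host[i] - ref[i]) < 1e-6f);
    }
}

A test would then end with, e.g., requireCudaOutputMatches(op->getOutput(0), expectedOutput); instead of the repeated cudaMemcpy/loop/delete[] block.
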
std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array4D<float,3,3,3,2> { - { // - { // - {{ 1, 2},{ 3, 4},{ 5, 6}}, // - {{ 7, 8},{ 9, 10},{ 11, 12}}, // - {{ 13, 14},{ 15, 16},{ 17, 18}} // - }, // - { // - {{ 19, 20},{ 21, 22},{ 23, 24}}, // - {{ 25, 26},{ 27, 28},{ 29, 30}}, // - {{ 31, 32},{ 33, 34},{ 35, 36}} // - }, // - { // - {{ 37, 38},{ 39, 40},{41, 42}}, // - {{ 43, 44},{ 45, 46},{47, 48}}, // - {{ 49, 50},{ 51, 52},{53, 54}} // - } // - } // - }); // - myOutputGrad->setBackend("cuda"); - op->getOutput(0)->setGrad(myOutputGrad); - REQUIRE_NOTHROW(myAdd->backward()); - - std::shared_ptr<Tensor> expectedInput1Grad = std::make_shared<Tensor>(Array4D<float,3,1,3,2> { - { // - { // - {{21, 24},{27, 30},{33, 36}} // - }, // - { // - {{75, 78},{81, 84},{87, 90}} // - }, // - { // - {{129, 132},{135, 138},{141, 144}}// - } // - } // - }); // - std::shared_ptr<Tensor> expectedInput2Grad = std::make_shared<Tensor>(Array4D<float,1,3,3,2> { - { // - { // - {{57, 60},{63, 66},{69, 72}}, // - {{75, 78},{81, 84},{87, 90}}, // - {{93, 96},{99, 102},{105, 108}} // - } // - } // - }); // - - float *computedGrad1Cuda = new float[expectedInput1Grad->size()](); - cudaMemcpy(computedGrad1Cuda, op->getInput(0)->grad()->getImpl()->rawPtr(), sizeof(float) * expectedInput1Grad->size(), cudaMemcpyDeviceToHost); - float *computedGrad2Cuda = new float[expectedInput2Grad->size()](); - cudaMemcpy(computedGrad2Cuda, op->getInput(1)->grad()->getImpl()->rawPtr(), sizeof(float) * expectedInput2Grad->size(), cudaMemcpyDeviceToHost); - - for(int i = 0; i < expectedInput1Grad->size(); i++){ - const float targetOutput = *(static_cast<float*>(expectedInput1Grad->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedGrad1Cuda[i] - targetOutput) < 1e-6); - } - for(int i = 0; i < expectedInput2Grad->size(); i++){ - const float targetOutput = *(static_cast<float*>(expectedInput2Grad->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedGrad2Cuda[i] - targetOutput) < 1e-6); - } + std::shared_ptr<Tensor> input_0 = + std::make_shared<Tensor>(Array4D<float, 3, 1, 3, 2>{ + { + // + { + // + {{0, 1}, {2, 3}, {4, 5}} // + }, // + { + // + {{6, 7}, {8, 9}, {10, 11}} // + }, // + { + // + {{12, 13}, {14, 15}, {16, 17}} // + } // + } // + }); // + std::shared_ptr<Tensor> input_1 = + std::make_shared<Tensor>(Array4D<float, 1, 3, 3, 2>{ + { + // + { + // + {{20, 21}, {22, 23}, {24, 25}}, // + {{26, 27}, {28, 29}, {30, 31}}, // + {{32, 33}, {34, 35}, {36, 37}} // + } // + } // + }); // + + input_0->setBackend("cuda"); + input_1->setBackend("cuda"); + std::shared_ptr<Node> myAdd = Add(); + auto op = std::static_pointer_cast<OperatorTensor>(myAdd->getOperator()); + op->associateInput(0, input_0); + op->associateInput(1, input_1); + op->setDataType(DataType::Float32); + op->setBackend("cuda"); + myAdd->forward(); + + // Run and test backward operation + std::shared_ptr<Tensor> myOutputGrad = + std::make_shared<Tensor>(Array4D<float, 3, 3, 3, 2>{ + { + // + { + // + {{1, 2}, {3, 4}, {5, 6}}, // + {{7, 8}, {9, 10}, {11, 12}}, // + {{13, 14}, {15, 16}, {17, 18}} // + }, // + { + // + {{19, 20}, {21, 22}, {23, 24}}, // + {{25, 26}, {27, 28}, {29, 30}}, // + {{31, 32}, {33, 34}, {35, 36}} // + }, // + { + // + {{37, 38}, {39, 40}, {41, 42}}, // + {{43, 44}, {45, 46}, {47, 48}}, // + {{49, 50}, {51, 52}, {53, 54}} // + } // + } // + }); // + myOutputGrad->setBackend("cuda"); + op->getOutput(0)->setGrad(myOutputGrad); + REQUIRE_NOTHROW(myAdd->backward()); + + std::shared_ptr<Tensor> expectedInput1Grad = + 
std::make_shared<Tensor>(Array4D<float, 3, 1, 3, 2>{ + { + // + { + // + {{21, 24}, {27, 30}, {33, 36}} // + }, // + { + // + {{75, 78}, {81, 84}, {87, 90}} // + }, // + { + // + {{129, 132}, {135, 138}, {141, 144}} // + } // + } // + }); // + std::shared_ptr<Tensor> expectedInput2Grad = + std::make_shared<Tensor>(Array4D<float, 1, 3, 3, 2>{ + { + // + { + // + {{57, 60}, {63, 66}, {69, 72}}, // + {{75, 78}, {81, 84}, {87, 90}}, // + {{93, 96}, {99, 102}, {105, 108}} // + } // + } // + }); // + + float *computedGrad1Cuda = new float[expectedInput1Grad->size()](); + cudaMemcpy(computedGrad1Cuda, + op->getInput(0)->grad()->getImpl()->rawPtr(), + sizeof(float) * expectedInput1Grad->size(), + cudaMemcpyDeviceToHost); + float *computedGrad2Cuda = new float[expectedInput2Grad->size()](); + cudaMemcpy(computedGrad2Cuda, + op->getInput(1)->grad()->getImpl()->rawPtr(), + sizeof(float) * expectedInput2Grad->size(), + cudaMemcpyDeviceToHost); + + for (int i = 0; i < expectedInput1Grad->size(); i++) { + const float targetOutput = *( + static_cast<float *>(expectedInput1Grad->getImpl()->rawPtr()) + i); + REQUIRE(fabs(computedGrad1Cuda[i] - targetOutput) < 1e-6); + } + for (int i = 0; i < expectedInput2Grad->size(); i++) { + const float targetOutput = *( + static_cast<float *>(expectedInput2Grad->getImpl()->rawPtr()) + i); + REQUIRE(fabs(computedGrad2Cuda[i] - targetOutput) < 1e-6); + } - delete[] computedGrad1Cuda; - delete[] computedGrad2Cuda; + delete[] computedGrad1Cuda; + delete[] computedGrad2Cuda; } \ No newline at end of file diff --git a/unit_tests/Test_AndImpl.cpp b/unit_tests/Test_AndImpl.cpp index 66de926088bb47c06ea1f9f10655730404787149..1e6ee3396b7ce729c73a6792cf9698cf15cd1013 100644 --- a/unit_tests/Test_AndImpl.cpp +++ b/unit_tests/Test_AndImpl.cpp @@ -9,8 +9,8 @@ * ********************************************************************************/ -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution #include <catch2/catch_test_macros.hpp> +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution #include "aidge/backend/cpu.hpp" #include "aidge/backend/cuda.hpp" @@ -21,104 +21,113 @@ using namespace Aidge; TEST_CASE("[gpu/operator] And(forward)", "[And][GPU]") { SECTION("Same size inputs") { - std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array4D<float,3,3,3,2> { - { // - { // - {{20, 15},{31, 11},{22, 49}}, // - {{41, 10},{24, 51},{27, 52}}, // - {{26, 53},{27, 54},{28, 55}} // - }, // - { // - {{29, 56},{30, 57},{31, 58}}, // - {{32, 59},{33, 60},{34, 61}}, // - {{35, 62},{36, 63},{37, 64}} // - }, // - { // - {{38, 65},{39, 66},{40, 67}}, // - {{41, 68},{42, 69},{43, 70}}, // - {{44, 71},{45, 72},{46, 73}} // - } // - } // - }); // - input_1->setBackend("cuda"); - std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array4D<float,3,3,3,2> { - { // - { // - {{20, 47},{21, 48},{22, 49}}, // - {{23, 50},{24, 51},{25, 52}}, // - {{17, 53},{27, 26},{14, 33}} // - }, // - { // - {{29, 56},{30, 57},{31, 58}}, // - {{72, 44},{33, 20},{27, 55}}, // - {{35, 24},{25, 63},{28, 64}} // - }, // - { // - {{32, 65},{39, 66},{40, 70}}, // - {{41, 53},{42, 60},{34, 70}}, // - {{44, 71},{30, 12},{46, 73}} // - } // - } // - }); // - input_2->setBackend("cuda"); - const Tensor myOutput = Tensor(Array4D<float,3,3,3,2> { - { + std::shared_ptr<Tensor> input_1 = + std::make_shared<Tensor>(Array4D<float, 3, 3, 3, 2>{ { - {{1, 0},{0, 0},{1, 1}}, - {{0, 0},{1, 1},{0, 1}}, - {{0, 1},{1, 0},{0, 0}} - }, - { - {{1, 1},{1, 1},{1, 1}}, - 
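
As a worked check of the Add(backward) expectations in Test_AddImpl.cpp above: input_0 has shape (3, 1, 3, 2) and is broadcast along axis 1, so its gradient sums myOutputGrad over that axis, e.g. 1 + 7 + 13 = 21 for the first entry of expectedInput1Grad; input_1 has shape (1, 3, 3, 2) and is broadcast along axis 0, giving 1 + 19 + 37 = 57 for the first entry of expectedInput2Grad. This is the same cudnnReduceTensor accumulation sketched after the SubImpl.cpp hunks earlier in this patch, with +1 scaling for both inputs since Add has no sign flip.
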
{{0, 0},{1, 0},{0, 0}}, - {{1, 0},{0, 1},{0, 1}} - }, + // + { + // + {{20, 15}, {31, 11}, {22, 49}}, // + {{41, 10}, {24, 51}, {27, 52}}, // + {{26, 53}, {27, 54}, {28, 55}} // + }, // + { + // + {{29, 56}, {30, 57}, {31, 58}}, // + {{32, 59}, {33, 60}, {34, 61}}, // + {{35, 62}, {36, 63}, {37, 64}} // + }, // + { + // + {{38, 65}, {39, 66}, {40, 67}}, // + {{41, 68}, {42, 69}, {43, 70}}, // + {{44, 71}, {45, 72}, {46, 73}} // + } // + } // + }); // + input_1->setBackend("cuda"); + std::shared_ptr<Tensor> input_2 = + std::make_shared<Tensor>(Array4D<float, 3, 3, 3, 2>{ { - {{0, 1},{1, 1},{1, 0}}, - {{1, 0},{1, 0},{0, 1}}, - {{1, 1},{0, 0},{1, 1}} - } - } - }); + // + { + // + {{20, 47}, {21, 48}, {22, 49}}, // + {{23, 50}, {24, 51}, {25, 52}}, // + {{17, 53}, {27, 26}, {14, 33}} // + }, // + { + // + {{29, 56}, {30, 57}, {31, 58}}, // + {{72, 44}, {33, 20}, {27, 55}}, // + {{35, 24}, {25, 63}, {28, 64}} // + }, // + { + // + {{32, 65}, {39, 66}, {40, 70}}, // + {{41, 53}, {42, 60}, {34, 70}}, // + {{44, 71}, {30, 12}, {46, 73}} // + } // + } // + }); // + input_2->setBackend("cuda"); + const Tensor myOutput = + Tensor(Array4D<float, 3, 3, 3, 2>{{{{{1, 0}, {0, 0}, {1, 1}}, + {{0, 0}, {1, 1}, {0, 1}}, + {{0, 1}, {1, 0}, {0, 0}}}, + {{{1, 1}, {1, 1}, {1, 1}}, + {{0, 0}, {1, 0}, {0, 0}}, + {{1, 0}, {0, 1}, {0, 1}}}, + {{{0, 1}, {1, 1}, {1, 0}}, + {{1, 0}, {1, 0}, {0, 1}}, + {{1, 1}, {0, 0}, {1, 1}}}}}); std::shared_ptr<Node> myAnd = And(); - auto op = std::static_pointer_cast<OperatorTensor>(myAnd -> getOperator()); + auto op = + std::static_pointer_cast<OperatorTensor>(myAnd->getOperator()); op->associateInput(0, input_1); op->associateInput(1, input_2); op->setBackend("cuda"); op->setDataType(DataType::Float32); myAnd->forward(); - std::shared_ptr<Tensor> outputFallback; - const auto& cudaOutput = op->getOutput(0)->refCastFrom(outputFallback, myOutput); + const auto &cudaOutput = + op->getOutput(0)->refCastFrom(outputFallback, myOutput); REQUIRE(approxEq<float>(cudaOutput, myOutput)); } SECTION("Broadcasting") { - std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array4D<float,1,3,3,2> { - { // - { // - {{10, 20},{22, 23},{20, 20}}, // - {{10, 15},{10, 29},{20, 20}}, // - {{26, 25},{33, 20},{10, 20}} // - } // - } // - }); // + std::shared_ptr<Tensor> input_1 = + std::make_shared<Tensor>(Array4D<float, 1, 3, 3, 2>{ + { + // + { + // + {{10, 20}, {22, 23}, {20, 20}}, // + {{10, 15}, {10, 29}, {20, 20}}, // + {{26, 25}, {33, 20}, {10, 20}} // + } // + } // + }); // input_1->setBackend("cuda"); - std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<float,2> {{10, 20}}); - const Tensor myOutput = Tensor(Array4D<float,1,3,3,2> { - { // - { // - {{ 1, 1},{ 0, 0},{ 0, 1}}, // - {{ 1, 0},{ 1, 0},{ 0, 1}}, // - {{ 0, 0},{ 0, 1},{ 1, 1}} // - } // - } // - }); // + std::shared_ptr<Tensor> input_2 = + std::make_shared<Tensor>(Array1D<float, 2>{{10, 20}}); + const Tensor myOutput = Tensor(Array4D<float, 1, 3, 3, 2>{ + { + // + { + // + {{1, 1}, {0, 0}, {0, 1}}, // + {{1, 0}, {1, 0}, {0, 1}}, // + {{0, 0}, {0, 1}, {1, 1}} // + } // + } // + }); // input_2->setBackend("cuda"); std::shared_ptr<Node> myAnd = And(); - auto op = std::static_pointer_cast<OperatorTensor>(myAnd -> getOperator()); + auto op = + std::static_pointer_cast<OperatorTensor>(myAnd->getOperator()); op->associateInput(0, input_1); op->associateInput(1, input_2); op->setDataType(DataType::Float32); @@ -126,7 +135,8 @@ TEST_CASE("[gpu/operator] And(forward)", "[And][GPU]") { myAnd->forward(); std::shared_ptr<Tensor> 
outputFallback; - const auto& cudaOutput = op->getOutput(0)->refCastFrom(outputFallback, myOutput); + const auto &cudaOutput = + op->getOutput(0)->refCastFrom(outputFallback, myOutput); REQUIRE(approxEq<float>(cudaOutput, myOutput)); } } \ No newline at end of file diff --git a/unit_tests/Test_ArgMaxImpl.cpp b/unit_tests/Test_ArgMaxImpl.cpp index d123b5bd3376c7169b2e003d8b366bb9045fe3e1..0fe7927fdb8ed9513f0e894aef428ead9abc238d 100644 --- a/unit_tests/Test_ArgMaxImpl.cpp +++ b/unit_tests/Test_ArgMaxImpl.cpp @@ -9,9 +9,9 @@ * ********************************************************************************/ -#include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution #include <catch2/catch_test_macros.hpp> +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution #include "aidge/backend/cpu.hpp" #include "aidge/backend/cuda.hpp" @@ -23,133 +23,104 @@ using namespace Aidge; TEST_CASE("[cpu/operator] ArgMax(forward)", "[ArgMax][CPU]") { SECTION("3D Tensor") { - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,2,3,4> { - { - { - { 1.0, 2.0, 3.0, 4.0}, - { 8.0, 0.0, 17.0, 1.0}, - { 5.0, 10.0, 6.0, 0.0} - }, - { - { 7.0, 1.0, 9.0, 4.0}, - { 0.0, 8.0, 4.0, 2.0}, - { 9.0, 2.0, 0.0, 5.0} - } - } - }); - myInput->setBackend("cuda"); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>( + Array3D<float, 2, 3, 4>{{{{1.0, 2.0, 3.0, 4.0}, + {8.0, 0.0, 17.0, 1.0}, + {5.0, 10.0, 6.0, 0.0}}, + {{7.0, 1.0, 9.0, 4.0}, + {0.0, 8.0, 4.0, 2.0}, + {9.0, 2.0, 0.0, 5.0}}}}); + myInput->setBackend("cuda"); SECTION("Axis 2") { - const Tensor myOutput = Tensor(Array3D<float,2,3, 1> { - { - { - {3.0}, - {2.0}, - {1.0} - }, - { - {2.0}, - {1.0}, - {0.0} - } - } - }); + const Tensor myOutput = Tensor(Array3D<float, 2, 3, 1>{ + {{{3.0}, {2.0}, {1.0}}, {{2.0}, {1.0}, {0.0}}}}); std::shared_ptr<Node> myArgMax = ArgMax(2); - auto op = std::static_pointer_cast<OperatorTensor>(myArgMax -> getOperator()); - op->associateInput(0,myInput); + auto op = std::static_pointer_cast<OperatorTensor>( + myArgMax->getOperator()); + op->associateInput(0, myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); myArgMax->forward(); std::shared_ptr<Tensor> outputFallback; - const auto& cudaOutput = op->getOutput(0)->refCastFrom(outputFallback, myOutput); + const auto &cudaOutput = + op->getOutput(0)->refCastFrom(outputFallback, myOutput); REQUIRE(approxEq<float>(cudaOutput, myOutput)); - } SECTION("Axis 2 with keep_dims false") { - const Tensor myOutput = Tensor(Array2D<float,2,3> { - { - { 3.0, 2.0, 1.0 }, - { 2.0, 1.0, 0.0 } - } - }); + const Tensor myOutput = Tensor( + Array2D<float, 2, 3>{{{3.0, 2.0, 1.0}, {2.0, 1.0, 0.0}}}); - std::shared_ptr<Node> myArgMax = ArgMax(2,0); - auto op = std::static_pointer_cast<OperatorTensor>(myArgMax -> getOperator()); - op->associateInput(0,myInput); + std::shared_ptr<Node> myArgMax = ArgMax(2, 0); + auto op = std::static_pointer_cast<OperatorTensor>( + myArgMax->getOperator()); + op->associateInput(0, myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); myArgMax->forward(); std::shared_ptr<Tensor> outputFallback; - const auto& cudaOutput = op->getOutput(0)->refCastFrom(outputFallback, myOutput); + const auto &cudaOutput = + op->getOutput(0)->refCastFrom(outputFallback, myOutput); REQUIRE(approxEq<float>(cudaOutput, myOutput)); } SECTION("Axis 1") { - const Tensor myOutput = Tensor(Array3D<float,2,1,4> { - { - { 
- { 1.0, 2.0, 1.0, 0.0 } - }, - { - { 2.0, 1.0, 0.0, 2.0 } - } - } - }); + const Tensor myOutput = Tensor(Array3D<float, 2, 1, 4>{ + {{{1.0, 2.0, 1.0, 0.0}}, {{2.0, 1.0, 0.0, 2.0}}}}); std::shared_ptr<Node> myArgMax = ArgMax(1); - auto op = std::static_pointer_cast<OperatorTensor>(myArgMax -> getOperator()); - op->associateInput(0,myInput); + auto op = std::static_pointer_cast<OperatorTensor>( + myArgMax->getOperator()); + op->associateInput(0, myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); myArgMax->forward(); std::shared_ptr<Tensor> outputFallback; - const auto& cudaOutput = op->getOutput(0)->refCastFrom(outputFallback, myOutput); + const auto &cudaOutput = + op->getOutput(0)->refCastFrom(outputFallback, myOutput); REQUIRE(approxEq<float>(cudaOutput, myOutput)); } SECTION("Axis 0") { - const Tensor myOutput = Tensor(Array3D<float,1,3,4> { - { - { - { 1.0, 0.0, 1.0, 0.0 }, - { 0.0, 1.0, 0.0, 1.0 }, - { 1.0, 0.0, 0.0, 1.0 } - } - } - }); + const Tensor myOutput = + Tensor(Array3D<float, 1, 3, 4>{{{{1.0, 0.0, 1.0, 0.0}, + {0.0, 1.0, 0.0, 1.0}, + {1.0, 0.0, 0.0, 1.0}}}}); std::shared_ptr<Node> myArgMax = ArgMax(0); - auto op = std::static_pointer_cast<OperatorTensor>(myArgMax -> getOperator()); - op->associateInput(0,myInput); + auto op = std::static_pointer_cast<OperatorTensor>( + myArgMax->getOperator()); + op->associateInput(0, myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); myArgMax->forward(); std::shared_ptr<Tensor> outputFallback; - const auto& cudaOutput = op->getOutput(0)->refCastFrom(outputFallback, myOutput); + const auto &cudaOutput = + op->getOutput(0)->refCastFrom(outputFallback, myOutput); REQUIRE(approxEq<float>(cudaOutput, myOutput)); } } SECTION("Select_Last_Index") { - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array1D<float,10> { - { - 1.0, 5.0, 9.0, 0.0, 6.0, 2.0, 9.0, 4.0, 3.0, 9.0 - } - }); - const Tensor myOutput = Tensor(Array1D<float,1> {{9}}); + std::shared_ptr<Tensor> myInput = + std::make_shared<Tensor>(Array1D<float, 10>{ + {1.0, 5.0, 9.0, 0.0, 6.0, 2.0, 9.0, 4.0, 3.0, 9.0}}); + const Tensor myOutput = Tensor(Array1D<float, 1>{{9}}); std::shared_ptr<Node> myArgMax = ArgMax(0, 1, 1); - auto op = std::static_pointer_cast<OperatorTensor>(myArgMax -> getOperator()); - op->associateInput(0,myInput); + auto op = + std::static_pointer_cast<OperatorTensor>(myArgMax->getOperator()); + op->associateInput(0, myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); myArgMax->forward(); std::shared_ptr<Tensor> outputFallback; - const auto& cudaOutput = op->getOutput(0)->refCastFrom(outputFallback, myOutput); + const auto &cudaOutput = + op->getOutput(0)->refCastFrom(outputFallback, myOutput); REQUIRE(approxEq<float>(cudaOutput, myOutput)); } } \ No newline at end of file diff --git a/unit_tests/Test_AvgPoolingImpl.cpp b/unit_tests/Test_AvgPoolingImpl.cpp index 3dccd6b7f909a9e9b4f8affb151898b77d94a7cf..965585244939792a2500d73b334c0dd2f421c934 100644 --- a/unit_tests/Test_AvgPoolingImpl.cpp +++ b/unit_tests/Test_AvgPoolingImpl.cpp @@ -11,8 +11,8 @@ #include <array> #include <cuda_fp16.h> // half type -#include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution #include <catch2/catch_test_macros.hpp> @@ -24,116 +24,141 @@ using namespace Aidge; -TEST_CASE("[gpu/operator] AvgPooling(forward)", "[AvgPooling][GPU]") -{ - 
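
A note on the ArgMax sections above: the tests read the factory as ArgMax(axis, keep_dims, select_last_index), so ArgMax(2, 0) reduces axis 2 and drops it from the output shape, while ArgMax(0, 1, 1) keeps the reduced axis and, when the maximum occurs several times, reports the last position. For the 1-D input {1, 5, 9, 0, 6, 2, 9, 4, 3, 9} the maximum 9 appears at indices 2, 6 and 9, hence the expected output {9}.
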
std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float, 2, 2, 5, 5>{// NCHW - { - {{{0, 1, 2, 3, 4}, - {5, 6, 7, 8, 9}, - {10, 11, 12, 13, 14}, - {15, 16, 17, 18, 19}, - {20, 21, 22, 23, 24}}, - - {{25, 26, 27, 28, 29}, - {30, 31, 32, 33, 34}, - {35, 36, 37, 38, 39}, - {40, 41, 42, 43, 44}, - {45, 46, 47, 48, 49}}}, - {{{100, 101, 102, 103, 104}, - {105, 106, 107, 108, 109}, - {110, 111, 112, 113, 114}, - {115, 116, 117, 118, 119}, - {120, 121, 122, 123, 124}}, - - {{125, 126, 127, 128, 129}, - {130, 131, 132, 133, 134}, - {135, 136, 137, 138, 139}, - {140, 141, 142, 143, 144}, - {145, 146, 147, 148, 149}}}}}); - SECTION("Stride") - { - std::shared_ptr<Node> myAvgPool = AvgPooling({2, 2}, "myAvgPool", {2, 2}); - auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool->getOperator()); - - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float, 2, 2, 2, 2>{ - {{{{3, 5}, - {13, 15}}, - {{28, 30}, - {38, 40}}}, - {{{103, 105}, - {113, 115}}, - {{128, 130}, - {138, 140}}}}}); +TEST_CASE("[gpu/operator] AvgPooling(forward)", "[AvgPooling][GPU]") { + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>( + Array4D<float, 2, 2, 5, 5>{// NCHW + {{{{0, 1, 2, 3, 4}, + {5, 6, 7, 8, 9}, + {10, 11, 12, 13, 14}, + {15, 16, 17, 18, 19}, + {20, 21, 22, 23, 24}}, + + {{25, 26, 27, 28, 29}, + {30, 31, 32, 33, 34}, + {35, 36, 37, 38, 39}, + {40, 41, 42, 43, 44}, + {45, 46, 47, 48, 49}}}, + {{{100, 101, 102, 103, 104}, + {105, 106, 107, 108, 109}, + {110, 111, 112, 113, 114}, + {115, 116, 117, 118, 119}, + {120, 121, 122, 123, 124}}, + + {{125, 126, 127, 128, 129}, + {130, 131, 132, 133, 134}, + {135, 136, 137, 138, 139}, + {140, 141, 142, 143, 144}, + {145, 146, 147, 148, 149}}}}}); + SECTION("Stride") { + std::shared_ptr<Node> myAvgPool = + AvgPooling({2, 2}, "myAvgPool", {2, 2}); + auto op = + std::static_pointer_cast<OperatorTensor>(myAvgPool->getOperator()); + + std::shared_ptr<Tensor> myOutput = + std::make_shared<Tensor>(Array4D<float, 2, 2, 2, 2>{ + {{{{3, 5}, {13, 15}}, {{28, 30}, {38, 40}}}, + {{{103, 105}, {113, 115}}, {{128, 130}, {138, 140}}}}}); op->associateInput(0, myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); myAvgPool->forward(); float *computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); - - for (int i = 0; i < myOutput->size(); i++) - { - const float targetOutput = *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); + + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } delete[] computedOutput; } - SECTION("Stride >= feature dim") - { - std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(Array4D<float, 1, 1, 3, 3>{// NCHW - { - {{{0.3745, 0.9507, 0.7320}, - {0.5987, 0.1560, 0.1560}, - {0.0581, 0.8662, 0.6011}}}}}); - std::shared_ptr<Node> myAvgPool = AvgPooling({3, 3}, "myAvgPool", {3, 3}); - auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool->getOperator()); - - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float, 1, 1, 1, 1>{ - {{{{(0.3745 + 0.9507 + 0.7320 + 0.5987 + 0.1560 + 0.1560 + 0.0581 + 0.8662 + 0.6011) / 9.0}}}}}); + SECTION("Stride >= feature dim") { + 
std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>( + Array4D<float, 1, 1, 3, 3>{// NCHW + {{{{0.3745, 0.9507, 0.7320}, + {0.5987, 0.1560, 0.1560}, + {0.0581, 0.8662, 0.6011}}}}}); + std::shared_ptr<Node> myAvgPool = + AvgPooling({3, 3}, "myAvgPool", {3, 3}); + auto op = + std::static_pointer_cast<OperatorTensor>(myAvgPool->getOperator()); + + std::shared_ptr<Tensor> myOutput = + std::make_shared<Tensor>(Array4D<float, 1, 1, 1, 1>{ + {{{{(0.3745 + 0.9507 + 0.7320 + 0.5987 + 0.1560 + 0.1560 + + 0.0581 + 0.8662 + 0.6011) / + 9.0}}}}}); op->associateInput(0, myInput2); op->setDataType(DataType::Float32); op->setBackend("cuda"); myAvgPool->forward(); float *computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); - - for (int i = 0; i < myOutput->size(); i++) - { - const float targetOutput = *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); + + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } delete[] computedOutput; } - SECTION("half") - { - std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(Array4D<half_float::half, 1, 1, 3, 3>{// NCHW - { - {{{half_float::half(0.3745), half_float::half(0.9507), half_float::half(0.7320)}, - {half_float::half(0.5987), half_float::half(0.1560), half_float::half(0.1560)}, - {half_float::half(0.0581), half_float::half(0.8662), half_float::half(0.6011)}}}}}); + SECTION("half") { + std::shared_ptr<Tensor> myInput2 = + std::make_shared<Tensor>(Array4D<half_float::half, 1, 1, 3, 3>{ + // NCHW + {{{{half_float::half(0.3745), + half_float::half(0.9507), + half_float::half(0.7320)}, + {half_float::half(0.5987), + half_float::half(0.1560), + half_float::half(0.1560)}, + {half_float::half(0.0581), + half_float::half(0.8662), + half_float::half(0.6011)}}}}}); myInput2->setBackend("cuda"); - std::shared_ptr<Node> myAvgPool = AvgPooling({3, 3}, "myAvgPoolcdw", {3, 3}); - auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool->getOperator()); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<half_float::half, 1, 1, 1, 1>{ - {{{{(half_float::half(0.3745) + half_float::half(0.9507) + half_float::half(0.7320) + half_float::half(0.5987) + half_float::half(0.1560) + half_float::half(0.1560) + half_float::half(0.0581) + half_float::half(0.8662) + half_float::half(0.6011)) / half_float::half(9.0)}}}}}); + std::shared_ptr<Node> myAvgPool = + AvgPooling({3, 3}, "myAvgPoolcdw", {3, 3}); + auto op = + std::static_pointer_cast<OperatorTensor>(myAvgPool->getOperator()); + std::shared_ptr<Tensor> myOutput = + std::make_shared<Tensor>(Array4D<half_float::half, 1, 1, 1, 1>{ + {{{{(half_float::half(0.3745) + half_float::half(0.9507) + + half_float::half(0.7320) + half_float::half(0.5987) + + half_float::half(0.1560) + half_float::half(0.1560) + + half_float::half(0.0581) + half_float::half(0.8662) + + half_float::half(0.6011)) / + half_float::half(9.0)}}}}}); op->associateInput(0, myInput2); op->setDataType(DataType::Float16); op->setBackend("cuda"); myAvgPool->forward(); - half_float::half *computedOutput = new half_float::half[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(half_float::half) * 
myOutput->size(), cudaMemcpyDeviceToHost); - - for (int i = 0; i < myOutput->size(); i++) - { - const half_float::half targetOutput = *(static_cast<half_float::half *>(myOutput->getImpl()->rawPtr()) + i); + half_float::half *computedOutput = + new half_float::half[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(half_float::half) * myOutput->size(), + cudaMemcpyDeviceToHost); + + for (int i = 0; i < myOutput->size(); i++) { + const half_float::half targetOutput = + *(static_cast<half_float::half *>( + myOutput->getImpl()->rawPtr()) + + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } @@ -148,149 +173,150 @@ TEST_CASE("[gpu/operator] AvgPooling(forward)", "[AvgPooling][GPU]") std::random_device rd; std::mt19937 gen(rd()); std::uniform_real_distribution<float> valueDist( - 0.1f, 1.1f); // Random float distribution between 0 and 1 - std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(kernel), - std::size_t(10)); + 0.1f, + 1.1f); // Random float distribution between 0 and 1 + std::uniform_int_distribution<std::size_t> dimSizeDist( + std::size_t(kernel), + std::size_t(10)); // To measure execution time of 'AveragePooling_Op::forward()' std::chrono::time_point<std::chrono::system_clock> start; std::chrono::time_point<std::chrono::system_clock> end; std::chrono::duration<double, std::micro> duration{}; std::size_t number_of_operation = 0; - for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) - { + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { // Create AveragePooling Operator CUDA - std::shared_ptr<Node> myAvgPoolCuda = AvgPooling({kernel, kernel}, "myAvgPoolCuda", {stride, stride}); - auto op_cuda = std::static_pointer_cast<OperatorTensor>(myAvgPoolCuda->getOperator()); + std::shared_ptr<Node> myAvgPoolCuda = AvgPooling({kernel, kernel}, + "myAvgPoolCuda", + {stride, stride}); + auto op_cuda = std::static_pointer_cast<OperatorTensor>( + myAvgPoolCuda->getOperator()); op_cuda->setDataType(DataType::Float32); op_cuda->setBackend("cuda"); // Create AveragePooling Operator CUDA - std::shared_ptr<Node> myAvgPoolCpu = AvgPooling({kernel, kernel}, "myAvgPoolCpu", {stride, stride}); - auto op_cpu = std::static_pointer_cast<OperatorTensor>(myAvgPoolCpu->getOperator()); + std::shared_ptr<Node> myAvgPoolCpu = + AvgPooling({kernel, kernel}, "myAvgPoolCpu", {stride, stride}); + auto op_cpu = std::static_pointer_cast<OperatorTensor>( + myAvgPoolCpu->getOperator()); op_cpu->setDataType(DataType::Float32); op_cpu->setBackend("cpu"); // generate a random Tensor const std::size_t nbDims = 4; std::vector<std::size_t> dims; - for (std::size_t i = 0; i < nbDims; ++i) - { + for (std::size_t i = 0; i < nbDims; ++i) { dims.push_back(dimSizeDist(gen)); } - const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t nb_elements = + std::accumulate(dims.cbegin(), + dims.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); number_of_operation += nb_elements; // Fill input tensor float *array0 = new float[nb_elements]; - for (std::size_t i = 0; i < nb_elements; ++i) - { + for (std::size_t i = 0; i < nb_elements; ++i) { array0[i] = valueDist(gen); } // input0 CUDA - float* array0_d; + float *array0_d; std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); T0_cuda->setDataType(DataType::Float32); T0_cuda->setBackend("cuda"); T0_cuda->resize(dims); op_cuda->associateInput(0, T0_cuda); - cudaMalloc(reinterpret_cast<void 
**>(&array0_d), sizeof(float) * nb_elements); - cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&array0_d), + sizeof(float) * nb_elements); + cudaMemcpy(array0_d, + array0, + sizeof(float) * nb_elements, + cudaMemcpyHostToDevice); T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements); // input0 CPU std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(0,T0_cpu); + op_cpu->associateInput(0, T0_cpu); T0_cpu->setDataType(DataType::Float32); T0_cpu->setBackend("cpu"); T0_cpu->resize(dims); - T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements); + T0_cpu->getImpl()->setRawPtr(array0, nb_elements); - // Run inference + // Run inference start = std::chrono::system_clock::now(); op_cuda->forward(); end = std::chrono::system_clock::now(); - duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + duration += std::chrono::duration_cast<std::chrono::microseconds>( + end - start); - const std::size_t outSize = op_cuda->getOutput(0)->size(); + const std::size_t outSize = op_cuda->getOutput(0)->size(); float *computed_cuda = new float[outSize](); - cudaMemcpy(computed_cuda, op_cuda->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * outSize, cudaMemcpyDeviceToHost); + cudaMemcpy(computed_cuda, + op_cuda->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * outSize, + cudaMemcpyDeviceToHost); // forward CPU op_cpu->forward(); - float *computed_cpu = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr()); + float *computed_cpu = static_cast<float *>( + op_cpu->getOutput(0)->getImpl()->rawPtr()); REQUIRE(approxEq<float>(*computed_cuda, *computed_cpu)); delete[] computed_cuda; delete[] array0; cudaFree(array0_d); } - std::cout << "number of elements over time spent: " << (number_of_operation / duration.count()) << std::endl; + std::cout << "number of elements over time spent: " + << (number_of_operation / duration.count()) << std::endl; std::cout << "total time: " << duration.count() << "μs" << std::endl; } } -TEST_CASE("[gpu/operator] AvgPooling(backward)", "[AvgPooling][GPU]") -{ +TEST_CASE("[gpu/operator] AvgPooling(backward)", "[AvgPooling][GPU]") { // Run forward operation - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float, 1, 1, 4, 4> {// NCHW - { - { - { - {1, 2, 3, 4}, - {5, 6, 7, 8}, - {9, 10, 11, 12}, - {13, 14, 15, 16} - } - } - } - }); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>( + Array4D<float, 1, 1, 4, 4>{// NCHW + {{{{1, 2, 3, 4}, + {5, 6, 7, 8}, + {9, 10, 11, 12}, + {13, 14, 15, 16}}}}}); myInput->setBackend("cuda"); std::shared_ptr<Node> myAvgPool = AvgPooling({2, 2}, "myAvgPool", {2, 2}); - auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool->getOperator()); + auto op = + std::static_pointer_cast<OperatorTensor>(myAvgPool->getOperator()); op->associateInput(0, myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); myAvgPool->forward(); // Run and test backward operation - std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array4D<float, 1,1,2,2> { - { - { - { - {1, 2}, - {3, 4} - } - } - } - }); + std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>( + Array4D<float, 1, 1, 2, 2>{{{{{1, 2}, {3, 4}}}}}); myOutputGrad->setBackend("cuda"); std::shared_ptr<Tensor> predictedOutput = op->getOutput(0); std::shared_ptr<Tensor> input = op->getInput(0); predictedOutput->setGrad(myOutputGrad); REQUIRE_NOTHROW(myAvgPool->backward()); - std::shared_ptr<Tensor> expectedInputGrad 
= std::make_shared<Tensor>(Array4D<float, 1, 1, 4, 4>{ - { - { - { - {0.25, 0.25, 0.5, 0.5}, - {0.25, 0.25, 0.5, 0.5}, - {0.75, 0.75, 1, 1}, - {0.75, 0.75, 1, 1} - } - } - } - }); + std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>( + Array4D<float, 1, 1, 4, 4>{{{{{0.25, 0.25, 0.5, 0.5}, + {0.25, 0.25, 0.5, 0.5}, + {0.75, 0.75, 1, 1}, + {0.75, 0.75, 1, 1}}}}}); float *computedGradCuda = new float[expectedInputGrad->size()](); - cudaMemcpy(computedGradCuda, input->grad()->getImpl()->rawPtr(), sizeof(float) * expectedInputGrad->size(), cudaMemcpyDeviceToHost); - - for(int i = 0; i < expectedInputGrad->size(); i++){ - const float targetOutput = *(static_cast<float*>(expectedInputGrad->getImpl()->rawPtr()) + i); + cudaMemcpy(computedGradCuda, + input->grad()->getImpl()->rawPtr(), + sizeof(float) * expectedInputGrad->size(), + cudaMemcpyDeviceToHost); + + for (int i = 0; i < expectedInputGrad->size(); i++) { + const float targetOutput = *( + static_cast<float *>(expectedInputGrad->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedGradCuda[i] - targetOutput) < 1e-6); } diff --git a/unit_tests/Test_BatchNormImpl.cpp b/unit_tests/Test_BatchNormImpl.cpp index c83624020d86a2eb786d249c5ee664ca3bfdde3b..d1c8be720a5f76005d37e98604035c793b34103f 100644 --- a/unit_tests/Test_BatchNormImpl.cpp +++ b/unit_tests/Test_BatchNormImpl.cpp @@ -11,108 +11,116 @@ #include <array> #include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution #include <catch2/catch_test_macros.hpp> +#include "Test_cuda.hpp" #include "aidge/backend/cpu.hpp" #include "aidge/backend/cuda.hpp" #include "aidge/data/Tensor.hpp" #include "aidge/utils/TensorUtils.hpp" -#include "Test_cuda.hpp" using namespace Aidge; TEST_CASE("[gpu/operator] BatchNorm(forward)") { SECTION("Static Input") { - std::shared_ptr<Node> myBatchNorm = BatchNorm<2>(3, 0.00001F, 0.1F, "mybatchnorm"); - auto op = std::static_pointer_cast<OperatorTensor>(myBatchNorm -> getOperator()); + std::shared_ptr<Node> myBatchNorm = + BatchNorm<2>(3, 0.00001F, 0.1F, "mybatchnorm"); + auto op = std::static_pointer_cast<OperatorTensor>( + myBatchNorm->getOperator()); op->setDataType(DataType::Float32); op->setBackend("cuda"); - std::shared_ptr<Tensor> myWeights= std::make_shared<Tensor>(Array1D<float,3> {{0.9159252643585205, 0.18772238492965698, 0.4479946792125702}}); - std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float,3> {{0.33898890018463135, 0.3167555630207062, 0.7047033309936523}}); - std::shared_ptr<Tensor> myMean = std::make_shared<Tensor>(Array1D<float,3> {{0.45547693967819214, 0.22650663554668427, 0.6612948179244995}}); - std::shared_ptr<Tensor> myVar = std::make_shared<Tensor>(Array1D<float,3> {{0.02570258639752865, 0.026536229997873306, 0.15111008286476135}}); - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,3,3,3> { //NCHW - { - { - {{0.12943482, 0.6451229 , 0.24979436}, - {0.7551012, 0.32007095, 0.89463896}, - {0.7087448, 0.6266124, 0.4782957 }}, - - {{0.13796203, 0.9950787, 0.71555305}, - {0.01347321, 0.4395316, 0.43097174}, - {0.6056306 , 0.9561122 , 0.5783939 }}, - - {{0.7174486 , 0.503465 , 0.23695093}, - {0.5145477, 0.39576462, 0.02779444}, - {0.60789394 ,0.14119725 ,0.20753163}} - }, - - - {{{0.74452287, 0.5354875 , 0.8148496 }, - {0.73356223, 0.4304034 , 0.11783765}, - {0.8966221, 0.41049036, 0.95982736}}, - - {{0.03161403, 0.71250844, 0.14337301}, - 
{0.5338889 , 0.13484782, 0.8055851 }, - {0.71784616 ,0.8349626 , 0.10107189}}, - - {{0.85701346, 0.58286697, 0.9836816 }, - {0.36061534, 0.03660944, 0.7375317 }, - {0.6977233, 0.51965624, 0.29440993}} - } - } - }); - - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,3,3,3> { - { - { - {{-1.5233592, 1.4222438, -0.83586717}, - { 2.0504384, -0.43444824, 2.847476 }, - { 1.7856512, 1.3165123, 0.46932936}}, - - {{ 0.21473758 , 1.2022772, 0.8802177 }, - { 0.07130594 , 0.5621954, 0.55233306}, - { 0.7535689 , 1.1573814, 0.72218764}}, - - {{ 0.7694162 , 0.52281666, 0.2156798 }, - { 0.5355886 , 0.3987003, -0.02535689}, - { 0.6431629 , 0.10533108 , 0.18177633}}}, - - - {{{ 1.990015, 0.7960079, 2.3917203 }, - { 1.9274082, 0.19576907, -1.5896021 }, - { 2.8588037 , 0.08202624 , 3.2198315 }}, - - {{ 0.09220716, 0.8767097, 0.22097193}, - { 0.6709106 , 0.2111495, 0.9839494 }, - { 0.8828597 , 1.0177971 , 0.17223406}}, - - {{ 0.9302539 , 0.6143213 , 1.0762292 }, - { 0.35819346, -0.01519828, 0.79256046}, - { 0.7466844 , 0.5414758 , 0.28189686}} - } - } - }); + std::shared_ptr<Tensor> myWeights = + std::make_shared<Tensor>(Array1D<float, 3>{{0.9159252643585205, + 0.18772238492965698, + 0.4479946792125702}}); + std::shared_ptr<Tensor> myBias = + std::make_shared<Tensor>(Array1D<float, 3>{{0.33898890018463135, + 0.3167555630207062, + 0.7047033309936523}}); + std::shared_ptr<Tensor> myMean = + std::make_shared<Tensor>(Array1D<float, 3>{{0.45547693967819214, + 0.22650663554668427, + 0.6612948179244995}}); + std::shared_ptr<Tensor> myVar = + std::make_shared<Tensor>(Array1D<float, 3>{{0.02570258639752865, + 0.026536229997873306, + 0.15111008286476135}}); + std::shared_ptr<Tensor> myInput = + std::make_shared<Tensor>(Array4D<float, 2, 3, 3, 3>{ + // NCHW + {{{{0.12943482, 0.6451229, 0.24979436}, + {0.7551012, 0.32007095, 0.89463896}, + {0.7087448, 0.6266124, 0.4782957}}, + + {{0.13796203, 0.9950787, 0.71555305}, + {0.01347321, 0.4395316, 0.43097174}, + {0.6056306, 0.9561122, 0.5783939}}, + + {{0.7174486, 0.503465, 0.23695093}, + {0.5145477, 0.39576462, 0.02779444}, + {0.60789394, 0.14119725, 0.20753163}}}, + + {{{0.74452287, 0.5354875, 0.8148496}, + {0.73356223, 0.4304034, 0.11783765}, + {0.8966221, 0.41049036, 0.95982736}}, + + {{0.03161403, 0.71250844, 0.14337301}, + {0.5338889, 0.13484782, 0.8055851}, + {0.71784616, 0.8349626, 0.10107189}}, + + {{0.85701346, 0.58286697, 0.9836816}, + {0.36061534, 0.03660944, 0.7375317}, + {0.6977233, 0.51965624, 0.29440993}}}}}); + + std::shared_ptr<Tensor> myOutput = + std::make_shared<Tensor>(Array4D<float, 2, 3, 3, 3>{ + {{{{-1.5233592, 1.4222438, -0.83586717}, + {2.0504384, -0.43444824, 2.847476}, + {1.7856512, 1.3165123, 0.46932936}}, + + {{0.21473758, 1.2022772, 0.8802177}, + {0.07130594, 0.5621954, 0.55233306}, + {0.7535689, 1.1573814, 0.72218764}}, + + {{0.7694162, 0.52281666, 0.2156798}, + {0.5355886, 0.3987003, -0.02535689}, + {0.6431629, 0.10533108, 0.18177633}}}, + + {{{1.990015, 0.7960079, 2.3917203}, + {1.9274082, 0.19576907, -1.5896021}, + {2.8588037, 0.08202624, 3.2198315}}, + + {{0.09220716, 0.8767097, 0.22097193}, + {0.6709106, 0.2111495, 0.9839494}, + {0.8828597, 1.0177971, 0.17223406}}, + + {{0.9302539, 0.6143213, 1.0762292}, + {0.35819346, -0.01519828, 0.79256046}, + {0.7466844, 0.5414758, 0.28189686}}}}}); myInput->setBackend("cuda"); myWeights->setBackend("cuda"); myBias->setBackend("cuda"); myMean->setBackend("cuda"); myVar->setBackend("cuda"); - op->associateInput(0,myInput); - op->associateInput(1,myWeights); - 
op->associateInput(2,myBias); - op->associateInput(3,myMean); - op->associateInput(4,myVar); + op->associateInput(0, myInput); + op->associateInput(1, myWeights); + op->associateInput(2, myBias); + op->associateInput(3, myMean); + op->associateInput(4, myVar); op->forward(); - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); + float *computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-5); } @@ -127,44 +135,50 @@ TEST_CASE("[gpu/operator] BatchNorm(forward)") { std::random_device rd; std::mt19937 gen(rd()); std::uniform_real_distribution<float> valueDist( - 0.1f, 1.1f); // Random float distribution between 0 and 1 - std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1), - std::size_t(10)); + 0.1f, + 1.1f); // Random float distribution between 0 and 1 + std::uniform_int_distribution<std::size_t> dimSizeDist( + std::size_t(1), + std::size_t(10)); // To measure execution time of 'forward()' std::chrono::time_point<std::chrono::system_clock> start; std::chrono::time_point<std::chrono::system_clock> end; std::chrono::duration<double, std::micro> duration{}; - for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) - { + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { // generate a random Tensor const std::size_t nbDims = 4; std::vector<std::size_t> dims; - for (std::size_t i = 0; i < nbDims; ++i) - { + for (std::size_t i = 0; i < nbDims; ++i) { dims.push_back(dimSizeDist(gen)); } - const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t nb_elements = + std::accumulate(dims.cbegin(), + dims.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); const std::size_t nbChannels = dims[1]; - // Create BatchNorm Operator Cuda - std::shared_ptr<Node> myBatchNormCuda = BatchNorm<2>(nbChannels, epsilon, momentum, "mybatchnormcuda"); - auto op_cuda = std::static_pointer_cast<OperatorTensor>(myBatchNormCuda -> getOperator()); + std::shared_ptr<Node> myBatchNormCuda = + BatchNorm<2>(nbChannels, epsilon, momentum, "mybatchnormcuda"); + auto op_cuda = std::static_pointer_cast<OperatorTensor>( + myBatchNormCuda->getOperator()); op_cuda->setDataType(DataType::Float32); op_cuda->setBackend("cuda"); // Create BatchNorm Operator CPU - std::shared_ptr<Node> myBatchNormCpu = BatchNorm<2>(nbChannels, epsilon, momentum, "mybatchnormcuda"); - auto op_cpu = std::static_pointer_cast<OperatorTensor>(myBatchNormCpu -> getOperator()); + std::shared_ptr<Node> myBatchNormCpu = + BatchNorm<2>(nbChannels, epsilon, momentum, "mybatchnormcuda"); + auto op_cpu = std::static_pointer_cast<OperatorTensor>( + myBatchNormCpu->getOperator()); op_cpu->setDataType(DataType::Float32); op_cpu->setBackend("cpu"); - float* array0 = new float[nb_elements]; - float* weights = new float[nbChannels]; - float* bias = new float[nbChannels]; - float* mean = new float[nbChannels]; - float* var = new float[nbChannels]; - + float *array0 = new 
float[nb_elements]; + float *weights = new float[nbChannels]; + float *bias = new float[nbChannels]; + float *mean = new float[nbChannels]; + float *var = new float[nbChannels]; for (std::size_t i = 0; i < nb_elements; ++i) { array0[i] = valueDist(gen); @@ -177,23 +191,27 @@ TEST_CASE("[gpu/operator] BatchNorm(forward)") { } // input0 CUDA - float* array0_d, *weight_d, *bias_d, *mean_d, *var_d; + float *array0_d, *weight_d, *bias_d, *mean_d, *var_d; std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); T0_cuda->setDataType(DataType::Float32); T0_cuda->setBackend("cuda"); T0_cuda->resize(dims); op_cuda->associateInput(0, T0_cuda); - cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements); - cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&array0_d), + sizeof(float) * nb_elements); + cudaMemcpy(array0_d, + array0, + sizeof(float) * nb_elements, + cudaMemcpyHostToDevice); T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements); // input0 CPU std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(0,T0_cpu); + op_cpu->associateInput(0, T0_cpu); T0_cpu->setDataType(DataType::Float32); T0_cpu->setBackend("cpu"); T0_cpu->resize(dims); - T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements); + T0_cpu->getImpl()->setRawPtr(array0, nb_elements); // weight CUDA std::shared_ptr<Tensor> Tw_cuda = std::make_shared<Tensor>(); @@ -201,17 +219,21 @@ TEST_CASE("[gpu/operator] BatchNorm(forward)") { Tw_cuda->setBackend("cuda"); Tw_cuda->resize({nbChannels}); op_cuda->associateInput(1, Tw_cuda); - cudaMalloc(reinterpret_cast<void **>(&weight_d), sizeof(float) * nbChannels); - cudaMemcpy(weight_d, weights, sizeof(float) * nbChannels, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&weight_d), + sizeof(float) * nbChannels); + cudaMemcpy(weight_d, + weights, + sizeof(float) * nbChannels, + cudaMemcpyHostToDevice); Tw_cuda->getImpl()->setRawPtr(weight_d, nbChannels); // weight CPU std::shared_ptr<Tensor> Tw_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(1,Tw_cpu); + op_cpu->associateInput(1, Tw_cpu); Tw_cpu->setDataType(DataType::Float32); Tw_cpu->setBackend("cpu"); Tw_cpu->resize({nbChannels}); - Tw_cpu -> getImpl() -> setRawPtr(weights, nbChannels); + Tw_cpu->getImpl()->setRawPtr(weights, nbChannels); // bias CUDA std::shared_ptr<Tensor> Tb_cuda = std::make_shared<Tensor>(); @@ -219,17 +241,21 @@ TEST_CASE("[gpu/operator] BatchNorm(forward)") { Tb_cuda->setBackend("cuda"); Tb_cuda->resize({nbChannels}); op_cuda->associateInput(2, Tb_cuda); - cudaMalloc(reinterpret_cast<void **>(&bias_d), sizeof(float) * nbChannels); - cudaMemcpy(bias_d, bias, sizeof(float) * nbChannels, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&bias_d), + sizeof(float) * nbChannels); + cudaMemcpy(bias_d, + bias, + sizeof(float) * nbChannels, + cudaMemcpyHostToDevice); Tb_cuda->getImpl()->setRawPtr(bias_d, nbChannels); // bias CPU std::shared_ptr<Tensor> Tb_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(2,Tb_cpu); + op_cpu->associateInput(2, Tb_cpu); Tb_cpu->setDataType(DataType::Float32); Tb_cpu->setBackend("cpu"); Tb_cpu->resize({nbChannels}); - Tb_cpu -> getImpl() -> setRawPtr(bias, nbChannels); + Tb_cpu->getImpl()->setRawPtr(bias, nbChannels); // mean CUDA std::shared_ptr<Tensor> Tm_cuda = std::make_shared<Tensor>(); @@ -237,17 +263,21 @@ TEST_CASE("[gpu/operator] BatchNorm(forward)") { Tm_cuda->setBackend("cuda"); Tm_cuda->resize({nbChannels}); 
op_cuda->associateInput(3, Tm_cuda); - cudaMalloc(reinterpret_cast<void **>(&mean_d), sizeof(float) * nbChannels); - cudaMemcpy(mean_d, mean, sizeof(float) * nbChannels, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&mean_d), + sizeof(float) * nbChannels); + cudaMemcpy(mean_d, + mean, + sizeof(float) * nbChannels, + cudaMemcpyHostToDevice); Tm_cuda->getImpl()->setRawPtr(mean_d, nbChannels); // mean CPU std::shared_ptr<Tensor> Tm_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(3,Tm_cpu); + op_cpu->associateInput(3, Tm_cpu); Tm_cpu->setDataType(DataType::Float32); Tm_cpu->setBackend("cpu"); Tm_cpu->resize({nbChannels}); - Tm_cpu -> getImpl() -> setRawPtr(mean, nbChannels); + Tm_cpu->getImpl()->setRawPtr(mean, nbChannels); // var CUDA std::shared_ptr<Tensor> Tv_cuda = std::make_shared<Tensor>(); @@ -255,31 +285,40 @@ TEST_CASE("[gpu/operator] BatchNorm(forward)") { Tv_cuda->setBackend("cuda"); Tv_cuda->resize({nbChannels}); op_cuda->associateInput(4, Tv_cuda); - cudaMalloc(reinterpret_cast<void **>(&var_d), sizeof(float) * nbChannels); - cudaMemcpy(var_d, var, sizeof(float) * nbChannels, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&var_d), + sizeof(float) * nbChannels); + cudaMemcpy(var_d, + var, + sizeof(float) * nbChannels, + cudaMemcpyHostToDevice); Tv_cuda->getImpl()->setRawPtr(var_d, nbChannels); // var CPU std::shared_ptr<Tensor> Tv_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(4,Tv_cpu); + op_cpu->associateInput(4, Tv_cpu); Tv_cpu->setDataType(DataType::Float32); Tv_cpu->setBackend("cpu"); Tv_cpu->resize({nbChannels}); - Tv_cpu -> getImpl() -> setRawPtr(var, nbChannels); + Tv_cpu->getImpl()->setRawPtr(var, nbChannels); // forward CUDA start = std::chrono::system_clock::now(); op_cuda->forward(); end = std::chrono::system_clock::now(); - duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + duration += std::chrono::duration_cast<std::chrono::microseconds>( + end - start); - const std::size_t outSize = op_cuda->getOutput(0)->size(); + const std::size_t outSize = op_cuda->getOutput(0)->size(); float *computed_cuda = new float[outSize](); - cudaMemcpy(computed_cuda, op_cuda->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * outSize, cudaMemcpyDeviceToHost); + cudaMemcpy(computed_cuda, + op_cuda->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * outSize, + cudaMemcpyDeviceToHost); // forward CPU op_cpu->forward(); - float *computed_cpu = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr()); + float *computed_cpu = static_cast<float *>( + op_cpu->getOutput(0)->getImpl()->rawPtr()); REQUIRE(approxEq<float>(*computed_cuda, *computed_cpu)); delete[] array0; @@ -295,52 +334,34 @@ TEST_CASE("[gpu/operator] BatchNorm(forward)") { cudaFree(var_d); } std::cout << "total time: " << duration.count() << "μs" << std::endl; - } } TEST_CASE("[gpu/operator] BatchNorm(backward)") { SECTION("Static Input") { - std::shared_ptr<Node> myBatchNorm = BatchNorm<2>(3, 0.00001F, 0.1F, "mybatchnorm"); - auto op = std::static_pointer_cast<OperatorTensor>(myBatchNorm -> getOperator()); + std::shared_ptr<Node> myBatchNorm = + BatchNorm<2>(3, 0.00001F, 0.1F, "mybatchnorm"); + auto op = std::static_pointer_cast<OperatorTensor>( + myBatchNorm->getOperator()); op->setDataType(DataType::Float32); op->setBackend("cuda"); // Forward - std::shared_ptr<Tensor> myWeights= std::make_shared<Tensor>(Array1D<float,3> {{-1.58390772, -0.48463920, 1.30413496}}); - std::shared_ptr<Tensor> myBias = 
std::make_shared<Tensor>(Array1D<float,3> {{0.06150287, -0.03140282, -0.49673468}}); - std::shared_ptr<Tensor> myMean = std::make_shared<Tensor>(Array1D<float,3> {{0.68328333, -0.47286209, 1.11688483}}); - std::shared_ptr<Tensor> myVar = std::make_shared<Tensor>(Array1D<float,3> {{0.84838068, 1.05930495, 0.53670371}}); - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,3,2,2> { //NCHW - { - { - { - {1.46650600, 1.24083233}, - {-0.33106008, -0.15137172} - }, - { - { 0.06625678, -1.83266091}, - { 0.53444749, -0.05167147} - }, - { - { 0.41069385, -0.70850474}, - { 0.23363227, 0.06111236} - } - }, - { - { - { 0.16707586, 1.07217050}, - { 1.18544745, 0.03441877} - }, - { - { 0.88106865, 0.33312374}, - { 0.87147945, 1.46628737} - }, - { - { 0.23930393, -0.94172227}, - { 1.48735642, 0.46449399} - } - } - } - }); + std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>( + Array1D<float, 3>{{-1.58390772, -0.48463920, 1.30413496}}); + std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>( + Array1D<float, 3>{{0.06150287, -0.03140282, -0.49673468}}); + std::shared_ptr<Tensor> myMean = std::make_shared<Tensor>( + Array1D<float, 3>{{0.68328333, -0.47286209, 1.11688483}}); + std::shared_ptr<Tensor> myVar = std::make_shared<Tensor>( + Array1D<float, 3>{{0.84838068, 1.05930495, 0.53670371}}); + std::shared_ptr<Tensor> myInput = + std::make_shared<Tensor>(Array4D<float, 2, 3, 2, 2>{ + // NCHW + {{{{1.46650600, 1.24083233}, {-0.33106008, -0.15137172}}, + {{0.06625678, -1.83266091}, {0.53444749, -0.05167147}}, + {{0.41069385, -0.70850474}, {0.23363227, 0.06111236}}}, + {{{0.16707586, 1.07217050}, {1.18544745, 0.03441877}}, + {{0.88106865, 0.33312374}, {0.87147945, 1.46628737}}, + {{0.23930393, -0.94172227}, {1.48735642, 0.46449399}}}}}); myInput->setBackend("cuda"); myWeights->setBackend("cuda"); @@ -348,47 +369,24 @@ TEST_CASE("[gpu/operator] BatchNorm(backward)") { myMean->setBackend("cuda"); myVar->setBackend("cuda"); - op->associateInput(0,myInput); - op->associateInput(1,myWeights); - op->associateInput(2,myBias); - op->associateInput(3,myMean); - op->associateInput(4,myVar); + op->associateInput(0, myInput); + op->associateInput(1, myWeights); + op->associateInput(2, myBias); + op->associateInput(3, myMean); + op->associateInput(4, myVar); op->forward(); // Backward - std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array4D<float,2,3,2,2> { - { - { - { - { 1.34347093, 0.90813798}, - { 0.39607167, 1.20428133} - }, - { - { 0.16845724, 0.48487359}, - { 0.40748054, -0.21790814} - }, - { - {-1.83932650, -0.42746788}, - { 0.97129798, 2.04073548} - } - }, - { - { - {-0.95714629, 0.18446854}, - { 1.14551663, -1.38118088} - }, - { - {-0.44466951, 2.73914146}, - { 0.57898718, 2.23699141} - }, - { - { 0.25004527, -0.18481003}, - {-0.72439206, 0.87744337} - } - - } - } - }); + std::shared_ptr<Tensor> myOutputGrad = + std::make_shared<Tensor>(Array4D<float, 2, 3, 2, 2>{ + {{{{1.34347093, 0.90813798}, {0.39607167, 1.20428133}}, + {{0.16845724, 0.48487359}, {0.40748054, -0.21790814}}, + {{-1.83932650, -0.42746788}, {0.97129798, 2.04073548}}}, + {{{-0.95714629, 0.18446854}, {1.14551663, -1.38118088}}, + {{-0.44466951, 2.73914146}, {0.57898718, 2.23699141}}, + {{0.25004527, -0.18481003}, {-0.72439206, 0.87744337}} + + }}}); myOutputGrad->setBackend("cuda"); std::shared_ptr<Tensor> predictedOutput = op->getOutput(0); @@ -398,38 +396,30 @@ TEST_CASE("[gpu/operator] BatchNorm(backward)") { predictedOutput->setGrad(myOutputGrad); REQUIRE_NOTHROW(myBatchNorm->backward()); - 
std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>(Array4D<float, 2, 3, 2, 2>{ - { - { - { - {-0.92418045, -0.26092845}, - {-1.53920066, -3.14756274}}, + std::shared_ptr<Tensor> expectedInputGrad = + std::make_shared<Tensor>(Array4D<float, 2, 3, 2, 2>{ + {{{{-0.92418045, -0.26092845}, {-1.53920066, -3.14756274}}, - {{ 0.26948565, -0.18548687}, - { 0.21506749, 0.45458069}}, + {{0.26948565, -0.18548687}, {0.21506749, 0.45458069}}, - {{-3.57358932, -1.30609703}, - { 1.61337423, 3.55250096}}}, + {{-3.57358932, -1.30609703}, {1.61337423, 3.55250096}}}, + {{{2.41264391, 1.16695499}, {-0.90373814, 3.19601130}}, - {{{ 2.41264391, 1.16695499}, - {-0.90373814, 3.19601130}}, + {{0.71554798, -1.04076481}, {0.17618656, -0.60461664}}, - {{ 0.71554798, -1.04076481}, - { 0.17618656, -0.60461664}}, - - {{ 0.26926503, -0.92978811}, - {-1.13964832, 1.51398242} - } - } - } - }); + {{0.26926503, -0.92978811}, {-1.13964832, 1.51398242}}}}}); float *computedGradCuda = new float[expectedInputGrad->size()](); - cudaMemcpy(computedGradCuda, input->grad()->getImpl()->rawPtr(), sizeof(float) * expectedInputGrad->size(), cudaMemcpyDeviceToHost); - - for(int i = 0; i < expectedInputGrad->size(); i++){ - const float targetOutput = *(static_cast<float*>(expectedInputGrad->getImpl()->rawPtr()) + i); + cudaMemcpy(computedGradCuda, + input->grad()->getImpl()->rawPtr(), + sizeof(float) * expectedInputGrad->size(), + cudaMemcpyDeviceToHost); + + for (int i = 0; i < expectedInputGrad->size(); i++) { + const float targetOutput = *( + static_cast<float *>(expectedInputGrad->getImpl()->rawPtr()) + + i); REQUIRE(fabs(computedGradCuda[i] - targetOutput) < 1e-6); } diff --git a/unit_tests/Test_CastMove.cpp b/unit_tests/Test_CastMove.cpp index c96600f79967c69e43b3c334d3624f6514b6f936..ef5a334402f9ebc95d4121353261e67a888b7e3f 100644 --- a/unit_tests/Test_CastMove.cpp +++ b/unit_tests/Test_CastMove.cpp @@ -14,63 +14,66 @@ #include <string> #include "aidge/data/Tensor.hpp" -#include "aidge/utils/TensorUtils.hpp" -#include "aidge/graph/Node.hpp" #include "aidge/graph/GraphView.hpp" +#include "aidge/graph/Node.hpp" #include "aidge/graph/OpArgs.hpp" -#include "aidge/scheduler/SequentialScheduler.hpp" #include "aidge/recipes/Recipes.hpp" +#include "aidge/scheduler/SequentialScheduler.hpp" +#include "aidge/utils/TensorUtils.hpp" #include "aidge/backend/cuda.hpp" using namespace Aidge; TEST_CASE("[cuda/castmove] CastMove(forward)") { - std::shared_ptr<Tensor> inputTensor = - std::make_shared<Tensor>(Array4D<int, 2, 1, 5, 5>{{{{{0, 1, 2, 3, 4}, - {5, 6, 7, 8, 9}, - {10, 11, 12, 13, 14}, - {15, 16, 17, 18, 19}, - {20, 21, 22, 23, 24}}}, - {{{25, 26, 27, 28, 29}, - {30, 31, 32, 33, 34}, - {35, 36, 37, 38, 39}, - {40, 41, 42, 43, 44}, - {45, 46, 47, 48, 49}}}}}); - - std::shared_ptr<Tensor> weight1 = std::make_shared<Tensor>( - Array4D<int, 3, 1, 3, 3>{{{{{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}}, - {{{10, 11, 12}, {13, 14, 15}, {16, 17, 18}}}, - {{{19, 20, 21}, {22, 23, 24}, {25, 26, 27}}}}}); - - std::shared_ptr<Tensor> bias1 = std::make_shared<Tensor>(Array1D<int, 3>{{1, 2, 3}}); + std::shared_ptr<Tensor> inputTensor = std::make_shared<Tensor>( + Array4D<int, 2, 1, 5, 5>{{{{{0, 1, 2, 3, 4}, + {5, 6, 7, 8, 9}, + {10, 11, 12, 13, 14}, + {15, 16, 17, 18, 19}, + {20, 21, 22, 23, 24}}}, + {{{25, 26, 27, 28, 29}, + {30, 31, 32, 33, 34}, + {35, 36, 37, 38, 39}, + {40, 41, 42, 43, 44}, + {45, 46, 47, 48, 49}}}}}); + + std::shared_ptr<Tensor> weight1 = + std::make_shared<Tensor>(Array4D<int, 3, 1, 3, 3>{ + {{{{1, 2, 3}, {4, 5, 6}, {7, 8, 
9}}}, + {{{10, 11, 12}, {13, 14, 15}, {16, 17, 18}}}, + {{{19, 20, 21}, {22, 23, 24}, {25, 26, 27}}}}}); + + std::shared_ptr<Tensor> bias1 = + std::make_shared<Tensor>(Array1D<int, 3>{{1, 2, 3}}); SECTION("Test implicit") { std::shared_ptr<GraphView> g = - Sequential({ - Conv(1, 3, {3, 3}, "conv1"), - Conv(3, 4, {1, 1}, "conv2"), - Conv(4, 3, {1, 1}, "conv3")}); + Sequential({Conv(1, 3, {3, 3}, "conv1"), + Conv(3, 4, {1, 1}, "conv2"), + Conv(4, 3, {1, 1}, "conv3")}); g->getNode("conv1")->getOperator()->setInput(0, inputTensor); g->getNode("conv1")->getOperator()->setInput(1, weight1); g->getNode("conv1")->getOperator()->setInput(2, bias1); - std::shared_ptr<Tensor> weight2 = - std::make_shared<Tensor>(Array4D<int, 4, 3, 1, 1>{{{{{1}}, {{2}}, {{3}}}, - {{{4}}, {{5}}, {{6}}}, - {{{7}}, {{8}}, {{9}}}, - {{{10}}, {{11}}, {{12}}}}}); - std::shared_ptr<Tensor> bias2 = std::make_shared<Tensor>(Array1D<int, 4>{{1, 2, 3, 4}}); + std::shared_ptr<Tensor> weight2 = std::make_shared<Tensor>( + Array4D<int, 4, 3, 1, 1>{{{{{1}}, {{2}}, {{3}}}, + {{{4}}, {{5}}, {{6}}}, + {{{7}}, {{8}}, {{9}}}, + {{{10}}, {{11}}, {{12}}}}}); + std::shared_ptr<Tensor> bias2 = + std::make_shared<Tensor>(Array1D<int, 4>{{1, 2, 3, 4}}); g->getNode("conv2")->getOperator()->setInput(1, weight2); g->getNode("conv2")->getOperator()->setInput(2, bias2); // *(g->getNode("conv2")->getOperator()->input(1, weight2); std::shared_ptr<Tensor> weight3 = std::make_shared<Tensor>( - Array4D<int, 3, 4, 1, 1>{{{{{1}}, {{2}}, {{3}}, {{4}}}, - {{{5}}, {{6}}, {{7}}, {{8}}}, - {{{9}}, {{10}}, {{11}}, {{12}}}}}); - std::shared_ptr<Tensor> bias3 = std::make_shared<Tensor>(Array1D<int, 3>{{1, 2, 3}}); + Array4D<int, 3, 4, 1, 1>{{{{{1}}, {{2}}, {{3}}, {{4}}}, + {{{5}}, {{6}}, {{7}}, {{8}}}, + {{{9}}, {{10}}, {{11}}, {{12}}}}}); + std::shared_ptr<Tensor> bias3 = + std::make_shared<Tensor>(Array1D<int, 3>{{1, 2, 3}}); g->getNode("conv3")->getOperator()->setInput(1, weight3); g->getNode("conv3")->getOperator()->setInput(2, bias3); @@ -85,77 +88,121 @@ TEST_CASE("[cuda/castmove] CastMove(forward)") { REQUIRE_NOTHROW(scheduler.forward()); scheduler.saveSchedulingDiagram("schedulingSequential"); - std::shared_ptr<Tensor> expectedOutput1 = std::make_shared<Tensor>(Array4D<int, 2, 3, 3, 3>{ - {{{{367, 412, 457}, {592, 637, 682}, {817, 862, 907}}, - {{854, 980, 1106}, {1484, 1610, 1736}, {2114, 2240, 2366}}, - {{1341, 1548, 1755}, {2376, 2583, 2790}, {3411, 3618, 3825}}}, - {{{1492, 1537, 1582}, {1717, 1762, 1807}, {1942, 1987, 2032}}, - {{4004, 4130, 4256}, {4634, 4760, 4886}, {5264, 5390, 5516}}, - {{6516, 6723, 6930}, {7551, 7758, 7965}, {8586, 8793, 9000}}}}}); - - std::shared_ptr<Tensor> expectedOutput2 = std::make_shared<Tensor>(Array4D<int, 2, 4, 3, 3>{ - {{{{6099, 7017, 7935}, {10689, 11607, 12525}, {15279, 16197, 17115}}, - {{13786, 15838, 17890}, {24046, 26098, 28150}, {34306, 36358, 38410}}, - {{21473, 24659, 27845}, {37403, 40589, 43775}, {53333, 56519, 59705}}, - {{29160, 33480, 37800}, {50760, 55080, 59400}, {72360, 76680, 81000}}}, - {{{29049, 29967, 30885}, {33639, 34557, 35475}, {38229, 39147, 40065}}, - {{65086, 67138, 69190}, {75346, 77398, 79450}, {85606, 87658, 89710}}, - {{101123, 104309, 107495}, {117053, 120239, 123425}, {132983, 136169, 139355}}, - {{137160, 141480, 145800}, {158760, 163080, 167400}, {180360, 184680, 189000}}}}}); - - std::shared_ptr<Tensor> expectedOutput3 = std::make_shared<Tensor>(Array4D<int, 2, 3, 3, 3>{ - {{{{214731, 246591, 278451}, {374031, 405891, 437751}, {533331, 565191, 597051}}, - {{496804, 570568, 
644332}, {865624, 939388, 1013152}, {1234444, 1308208, 1381972}}, - {{778877, 894545, 1010213}, {1357217, 1472885, 1588553}, {1935557, 2051225, 2166893}}}, - {{{1011231, 1043091, 1074951}, {1170531, 1202391, 1234251}, {1329831, 1361691, 1393551}}, - {{2340904, 2414668, 2488432}, {2709724, 2783488, 2857252}, {3078544, 3152308, 3226072}}, - {{3670577, 3786245, 3901913}, {4248917, 4364585, 4480253}, {4827257, 4942925, 5058593}}}}}); - - std::shared_ptr<Tensor> other1 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv1")->getOperator())->getOutput(0); + std::shared_ptr<Tensor> expectedOutput1 = std::make_shared< + Tensor>(Array4D<int, 2, 3, 3, 3>{ + {{{{367, 412, 457}, {592, 637, 682}, {817, 862, 907}}, + {{854, 980, 1106}, {1484, 1610, 1736}, {2114, 2240, 2366}}, + {{1341, 1548, 1755}, {2376, 2583, 2790}, {3411, 3618, 3825}}}, + {{{1492, 1537, 1582}, {1717, 1762, 1807}, {1942, 1987, 2032}}, + {{4004, 4130, 4256}, {4634, 4760, 4886}, {5264, 5390, 5516}}, + {{6516, 6723, 6930}, {7551, 7758, 7965}, {8586, 8793, 9000}}}}}); + + std::shared_ptr<Tensor> expectedOutput2 = std::make_shared<Tensor>( + Array4D<int, 2, 4, 3, 3>{{{{{6099, 7017, 7935}, + {10689, 11607, 12525}, + {15279, 16197, 17115}}, + {{13786, 15838, 17890}, + {24046, 26098, 28150}, + {34306, 36358, 38410}}, + {{21473, 24659, 27845}, + {37403, 40589, 43775}, + {53333, 56519, 59705}}, + {{29160, 33480, 37800}, + {50760, 55080, 59400}, + {72360, 76680, 81000}}}, + {{{29049, 29967, 30885}, + {33639, 34557, 35475}, + {38229, 39147, 40065}}, + {{65086, 67138, 69190}, + {75346, 77398, 79450}, + {85606, 87658, 89710}}, + {{101123, 104309, 107495}, + {117053, 120239, 123425}, + {132983, 136169, 139355}}, + {{137160, 141480, 145800}, + {158760, 163080, 167400}, + {180360, 184680, 189000}}}}}); + + std::shared_ptr<Tensor> expectedOutput3 = std::make_shared<Tensor>( + Array4D<int, 2, 3, 3, 3>{{{{{214731, 246591, 278451}, + {374031, 405891, 437751}, + {533331, 565191, 597051}}, + {{496804, 570568, 644332}, + {865624, 939388, 1013152}, + {1234444, 1308208, 1381972}}, + {{778877, 894545, 1010213}, + {1357217, 1472885, 1588553}, + {1935557, 2051225, 2166893}}}, + {{{1011231, 1043091, 1074951}, + {1170531, 1202391, 1234251}, + {1329831, 1361691, 1393551}}, + {{2340904, 2414668, 2488432}, + {2709724, 2783488, 2857252}, + {3078544, 3152308, 3226072}}, + {{3670577, 3786245, 3901913}, + {4248917, 4364585, 4480253}, + {4827257, 4942925, 5058593}}}}}); + + std::shared_ptr<Tensor> other1 = + std::static_pointer_cast<OperatorTensor>( + g->getNode("conv1")->getOperator()) + ->getOutput(0); Tensor hostOther1(other1->dataType()); hostOther1.setBackend("cpu"); hostOther1.copyCastFrom(*other1); - REQUIRE(approxEq<half_float::half, int>(hostOther1, *expectedOutput1, 0.001, 0.0)); - - std::shared_ptr<Tensor> other2 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv2")->getOperator())->getOutput(0); + REQUIRE(approxEq<half_float::half, int>(hostOther1, + *expectedOutput1, + 0.001, + 0.0)); + + std::shared_ptr<Tensor> other2 = + std::static_pointer_cast<OperatorTensor>( + g->getNode("conv2")->getOperator()) + ->getOutput(0); Tensor hostOther2(other2->dataType()); hostOther2.setBackend("cpu"); hostOther2.copyCastFrom(*other2); - REQUIRE(approxEq<float, int>(hostOther2, *expectedOutput2, 0.001, 0.0)); + REQUIRE( + approxEq<float, int>(hostOther2, *expectedOutput2, 0.001, 0.0)); - std::shared_ptr<Tensor> other3 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv3")->getOperator())->getOutput(0); + std::shared_ptr<Tensor> other3 = + 
std::static_pointer_cast<OperatorTensor>( + g->getNode("conv3")->getOperator()) + ->getOutput(0); Tensor hostOther3(other3->dataType()); hostOther3.setBackend("cpu"); hostOther3.copyCastFrom(*other3); - REQUIRE(approxEq<double, int>(hostOther3, *expectedOutput3, 0.001, 0.0)); + REQUIRE( + approxEq<double, int>(hostOther3, *expectedOutput3, 0.001, 0.0)); } SECTION("Test explicit") { std::shared_ptr<GraphView> g = - Sequential({ - Conv(1, 3, {3, 3}, "conv1"), - Conv(3, 4, {1, 1}, "conv2"), - Conv(4, 3, {1, 1}, "conv3")}); + Sequential({Conv(1, 3, {3, 3}, "conv1"), + Conv(3, 4, {1, 1}, "conv2"), + Conv(4, 3, {1, 1}, "conv3")}); g->getNode("conv1")->getOperator()->setInput(0, inputTensor); g->getNode("conv1")->getOperator()->setInput(1, weight1); g->getNode("conv1")->getOperator()->setInput(2, bias1); - std::shared_ptr<Tensor> weight2 = - std::make_shared<Tensor>(Array4D<int, 4, 3, 1, 1>{{{{{1}}, {{2}}, {{3}}}, - {{{4}}, {{5}}, {{6}}}, - {{{7}}, {{8}}, {{9}}}, - {{{10}}, {{11}}, {{12}}}}}); - std::shared_ptr<Tensor> bias2 = std::make_shared<Tensor>(Array1D<int, 4>{{1, 2, 3, 4}}); + std::shared_ptr<Tensor> weight2 = std::make_shared<Tensor>( + Array4D<int, 4, 3, 1, 1>{{{{{1}}, {{2}}, {{3}}}, + {{{4}}, {{5}}, {{6}}}, + {{{7}}, {{8}}, {{9}}}, + {{{10}}, {{11}}, {{12}}}}}); + std::shared_ptr<Tensor> bias2 = + std::make_shared<Tensor>(Array1D<int, 4>{{1, 2, 3, 4}}); g->getNode("conv2")->getOperator()->setInput(1, weight2); g->getNode("conv2")->getOperator()->setInput(2, bias2); // *(g->getNode("conv2")->getOperator()->input(1, weight2); std::shared_ptr<Tensor> weight3 = std::make_shared<Tensor>( - Array4D<int, 3, 4, 1, 1>{{{{{1}}, {{2}}, {{3}}, {{4}}}, - {{{5}}, {{6}}, {{7}}, {{8}}}, - {{{9}}, {{10}}, {{11}}, {{12}}}}}); - std::shared_ptr<Tensor> bias3 = std::make_shared<Tensor>(Array1D<int, 3>{{1, 2, 3}}); + Array4D<int, 3, 4, 1, 1>{{{{{1}}, {{2}}, {{3}}, {{4}}}, + {{{5}}, {{6}}, {{7}}, {{8}}}, + {{{9}}, {{10}}, {{11}}, {{12}}}}}); + std::shared_ptr<Tensor> bias3 = + std::make_shared<Tensor>(Array1D<int, 3>{{1, 2, 3}}); g->getNode("conv3")->getOperator()->setInput(1, weight3); g->getNode("conv3")->getOperator()->setInput(2, bias3); @@ -172,48 +219,91 @@ TEST_CASE("[cuda/castmove] CastMove(forward)") { REQUIRE_NOTHROW(scheduler.forward()); scheduler.saveSchedulingDiagram("schedulingSequential"); - std::shared_ptr<Tensor> expectedOutput1 = std::make_shared<Tensor>(Array4D<int, 2, 3, 3, 3>{ - {{{{367, 412, 457}, {592, 637, 682}, {817, 862, 907}}, - {{854, 980, 1106}, {1484, 1610, 1736}, {2114, 2240, 2366}}, - {{1341, 1548, 1755}, {2376, 2583, 2790}, {3411, 3618, 3825}}}, - {{{1492, 1537, 1582}, {1717, 1762, 1807}, {1942, 1987, 2032}}, - {{4004, 4130, 4256}, {4634, 4760, 4886}, {5264, 5390, 5516}}, - {{6516, 6723, 6930}, {7551, 7758, 7965}, {8586, 8793, 9000}}}}}); - - std::shared_ptr<Tensor> expectedOutput2 = std::make_shared<Tensor>(Array4D<int, 2, 4, 3, 3>{ - {{{{6099, 7017, 7935}, {10689, 11607, 12525}, {15279, 16197, 17115}}, - {{13786, 15838, 17890}, {24046, 26098, 28150}, {34306, 36358, 38410}}, - {{21473, 24659, 27845}, {37403, 40589, 43775}, {53333, 56519, 59705}}, - {{29160, 33480, 37800}, {50760, 55080, 59400}, {72360, 76680, 81000}}}, - {{{29049, 29967, 30885}, {33639, 34557, 35475}, {38229, 39147, 40065}}, - {{65086, 67138, 69190}, {75346, 77398, 79450}, {85606, 87658, 89710}}, - {{101123, 104309, 107495}, {117053, 120239, 123425}, {132983, 136169, 139355}}, - {{137160, 141480, 145800}, {158760, 163080, 167400}, {180360, 184680, 189000}}}}}); - - std::shared_ptr<Tensor> expectedOutput3 = 
std::make_shared<Tensor>(Array4D<int, 2, 3, 3, 3>{ - {{{{214731, 246591, 278451}, {374031, 405891, 437751}, {533331, 565191, 597051}}, - {{496804, 570568, 644332}, {865624, 939388, 1013152}, {1234444, 1308208, 1381972}}, - {{778877, 894545, 1010213}, {1357217, 1472885, 1588553}, {1935557, 2051225, 2166893}}}, - {{{1011231, 1043091, 1074951}, {1170531, 1202391, 1234251}, {1329831, 1361691, 1393551}}, - {{2340904, 2414668, 2488432}, {2709724, 2783488, 2857252}, {3078544, 3152308, 3226072}}, - {{3670577, 3786245, 3901913}, {4248917, 4364585, 4480253}, {4827257, 4942925, 5058593}}}}}); - - std::shared_ptr<Tensor> other1 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv1")->getOperator())->getOutput(0); + std::shared_ptr<Tensor> expectedOutput1 = std::make_shared< + Tensor>(Array4D<int, 2, 3, 3, 3>{ + {{{{367, 412, 457}, {592, 637, 682}, {817, 862, 907}}, + {{854, 980, 1106}, {1484, 1610, 1736}, {2114, 2240, 2366}}, + {{1341, 1548, 1755}, {2376, 2583, 2790}, {3411, 3618, 3825}}}, + {{{1492, 1537, 1582}, {1717, 1762, 1807}, {1942, 1987, 2032}}, + {{4004, 4130, 4256}, {4634, 4760, 4886}, {5264, 5390, 5516}}, + {{6516, 6723, 6930}, {7551, 7758, 7965}, {8586, 8793, 9000}}}}}); + + std::shared_ptr<Tensor> expectedOutput2 = std::make_shared<Tensor>( + Array4D<int, 2, 4, 3, 3>{{{{{6099, 7017, 7935}, + {10689, 11607, 12525}, + {15279, 16197, 17115}}, + {{13786, 15838, 17890}, + {24046, 26098, 28150}, + {34306, 36358, 38410}}, + {{21473, 24659, 27845}, + {37403, 40589, 43775}, + {53333, 56519, 59705}}, + {{29160, 33480, 37800}, + {50760, 55080, 59400}, + {72360, 76680, 81000}}}, + {{{29049, 29967, 30885}, + {33639, 34557, 35475}, + {38229, 39147, 40065}}, + {{65086, 67138, 69190}, + {75346, 77398, 79450}, + {85606, 87658, 89710}}, + {{101123, 104309, 107495}, + {117053, 120239, 123425}, + {132983, 136169, 139355}}, + {{137160, 141480, 145800}, + {158760, 163080, 167400}, + {180360, 184680, 189000}}}}}); + + std::shared_ptr<Tensor> expectedOutput3 = std::make_shared<Tensor>( + Array4D<int, 2, 3, 3, 3>{{{{{214731, 246591, 278451}, + {374031, 405891, 437751}, + {533331, 565191, 597051}}, + {{496804, 570568, 644332}, + {865624, 939388, 1013152}, + {1234444, 1308208, 1381972}}, + {{778877, 894545, 1010213}, + {1357217, 1472885, 1588553}, + {1935557, 2051225, 2166893}}}, + {{{1011231, 1043091, 1074951}, + {1170531, 1202391, 1234251}, + {1329831, 1361691, 1393551}}, + {{2340904, 2414668, 2488432}, + {2709724, 2783488, 2857252}, + {3078544, 3152308, 3226072}}, + {{3670577, 3786245, 3901913}, + {4248917, 4364585, 4480253}, + {4827257, 4942925, 5058593}}}}}); + + std::shared_ptr<Tensor> other1 = + std::static_pointer_cast<OperatorTensor>( + g->getNode("conv1")->getOperator()) + ->getOutput(0); Tensor hostOther1(other1->dataType()); hostOther1.setBackend("cpu"); hostOther1.copyCastFrom(*other1); - REQUIRE(approxEq<half_float::half, int>(hostOther1, *expectedOutput1, 0.001, 0.0)); - - std::shared_ptr<Tensor> other2 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv2")->getOperator())->getOutput(0); + REQUIRE(approxEq<half_float::half, int>(hostOther1, + *expectedOutput1, + 0.001, + 0.0)); + + std::shared_ptr<Tensor> other2 = + std::static_pointer_cast<OperatorTensor>( + g->getNode("conv2")->getOperator()) + ->getOutput(0); Tensor hostOther2(other2->dataType()); hostOther2.setBackend("cpu"); hostOther2.copyCastFrom(*other2); - REQUIRE(approxEq<float, int>(hostOther2, *expectedOutput2, 0.001, 0.0)); + REQUIRE( + approxEq<float, int>(hostOther2, *expectedOutput2, 0.001, 0.0)); - 
std::shared_ptr<Tensor> other3 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv3")->getOperator())->getOutput(0); + std::shared_ptr<Tensor> other3 = + std::static_pointer_cast<OperatorTensor>( + g->getNode("conv3")->getOperator()) + ->getOutput(0); Tensor hostOther3(other3->dataType()); hostOther3.setBackend("cpu"); hostOther3.copyCastFrom(*other3); - REQUIRE(approxEq<double, int>(hostOther3, *expectedOutput3, 0.001, 0.0)); + REQUIRE( + approxEq<double, int>(hostOther3, *expectedOutput3, 0.001, 0.0)); } } diff --git a/unit_tests/Test_ConvDepthWiseImpl.cpp b/unit_tests/Test_ConvDepthWiseImpl.cpp index 4655de069cce86e80881a06673621c8159be18f6..130c634b0c0475ae371f8e135fbb66fdc641148a 100644 --- a/unit_tests/Test_ConvDepthWiseImpl.cpp +++ b/unit_tests/Test_ConvDepthWiseImpl.cpp @@ -11,7 +11,7 @@ #include <array> #include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution #include <catch2/catch_test_macros.hpp> @@ -24,143 +24,117 @@ using namespace Aidge; TEST_CASE("[cpu/operator] ConvDepthWise(forward)", "[ConvDepthWise][CPU]") { SECTION("Deterministic Input") { - std::shared_ptr<Node> myCDW = ConvDepthWise(4, {3,3}, "mycdw"); - auto op = std::static_pointer_cast<OperatorTensor>(myCDW -> getOperator()); - std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array4D<float,4,1,3,3> { - { - {{ - { 0, 1, 2}, - { 3, 4, 5}, - { 6, 7, 8} - - }}, - {{ - { 27, 28, 29}, - { 30, 31, 32}, - { 33, 34, 35} - - }}, - {{ - { 54, 55, 56}, - { 57, 58, 59}, - { 60, 61, 62} - }}, - {{ - { 81, 82, 83}, - { 84, 85, 86}, - { 87, 88, 89} - }} - } - }); - std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float,4> {{7,0,9,0}}); - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,4,5,5> { //NCHW - { - { - {{ 0, 1, 2, 3, 4}, - { 5, 6, 7, 8, 9}, - { 10, 11, 12, 13, 14}, - { 15, 16, 17, 18, 19}, - { 20, 21, 22, 23, 24}}, - - {{ 25, 26, 27, 28, 29}, - { 30, 31, 32, 33, 34}, - { 35, 36, 37, 38, 39}, - { 40, 41, 42, 43, 44}, - { 45, 46, 47, 48, 49}}, - - {{ 50, 51, 52, 53, 54}, - { 55, 56, 57, 58, 59}, - { 60, 61, 62, 63, 64}, - { 65, 66, 67, 68, 69}, - { 70, 71, 72, 73, 74}}, - - {{ 75, 76, 77, 78, 79}, - { 80, 81, 82, 83, 84}, - { 85, 86, 87, 88, 89}, - { 90, 91, 92, 93, 94}, - { 95, 96, 97, 98, 99}} - }, - { - {{100, 101, 102, 103, 104}, - {105, 106, 107, 108, 109}, - {110, 111, 112, 113, 114}, - {115, 116, 117, 118, 119}, - {120, 121, 122, 123, 124}}, - - {{125, 126, 127, 128, 129}, - {130, 131, 132, 133, 134}, - {135, 136, 137, 138, 139}, - {140, 141, 142, 143, 144}, - {145, 146, 147, 148, 149}}, - - {{150, 151, 152, 153, 154}, - {155, 156, 157, 158, 159}, - {160, 161, 162, 163, 164}, - {165, 166, 167, 168, 169}, - {170, 171, 172, 173, 174}}, - - {{175, 176, 177, 178, 179}, - {180, 181, 182, 183, 184}, - {185, 186, 187, 188, 189}, - {190, 191, 192, 193, 194}, - {195, 196, 197, 198, 199}} - } - } - }); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,4,3,3> { - { - { - {{ 319, 355, 391}, - { 499, 535, 571}, - { 679, 715, 751}}, - - {{ 8745, 9024, 9303}, - { 10140, 10419, 10698}, - { 11535, 11814, 12093}}, - - {{ 29337, 29859, 30381}, - { 31947, 32469, 32991}, - { 34557, 35079, 35601}}, - - {{ 62061, 62826, 63591}, - { 65886, 66651, 67416}, - { 69711, 70476, 71241}} - }, - { - {{ 3919, 3955, 3991}, - { 4099, 4135, 4171}, - { 4279, 4315, 4351}}, - - {{ 36645, 36924, 37203}, - { 38040, 
38319, 38598}, - { 39435, 39714, 39993}}, - - {{ 81537, 82059, 82581}, - { 84147, 84669, 85191}, - { 86757, 87279, 87801}}, - - {{138561, 139326, 140091}, - {142386, 143151, 143916}, - {146211, 146976, 147741}} - } - } - }); + std::shared_ptr<Node> myCDW = ConvDepthWise(4, {3, 3}, "mycdw"); + auto op = + std::static_pointer_cast<OperatorTensor>(myCDW->getOperator()); + std::shared_ptr<Tensor> myWeights = + std::make_shared<Tensor>(Array4D<float, 4, 1, 3, 3>{ + {{{{0, 1, 2}, {3, 4, 5}, {6, 7, 8} + + }}, + {{{27, 28, 29}, {30, 31, 32}, {33, 34, 35} + + }}, + {{{54, 55, 56}, {57, 58, 59}, {60, 61, 62}}}, + {{{81, 82, 83}, {84, 85, 86}, {87, 88, 89}}}}}); + std::shared_ptr<Tensor> myBias = + std::make_shared<Tensor>(Array1D<float, 4>{{7, 0, 9, 0}}); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>( + Array4D<float, 2, 4, 5, 5>{// NCHW + {{{{0, 1, 2, 3, 4}, + {5, 6, 7, 8, 9}, + {10, 11, 12, 13, 14}, + {15, 16, 17, 18, 19}, + {20, 21, 22, 23, 24}}, + + {{25, 26, 27, 28, 29}, + {30, 31, 32, 33, 34}, + {35, 36, 37, 38, 39}, + {40, 41, 42, 43, 44}, + {45, 46, 47, 48, 49}}, + + {{50, 51, 52, 53, 54}, + {55, 56, 57, 58, 59}, + {60, 61, 62, 63, 64}, + {65, 66, 67, 68, 69}, + {70, 71, 72, 73, 74}}, + + {{75, 76, 77, 78, 79}, + {80, 81, 82, 83, 84}, + {85, 86, 87, 88, 89}, + {90, 91, 92, 93, 94}, + {95, 96, 97, 98, 99}}}, + {{{100, 101, 102, 103, 104}, + {105, 106, 107, 108, 109}, + {110, 111, 112, 113, 114}, + {115, 116, 117, 118, 119}, + {120, 121, 122, 123, 124}}, + + {{125, 126, 127, 128, 129}, + {130, 131, 132, 133, 134}, + {135, 136, 137, 138, 139}, + {140, 141, 142, 143, 144}, + {145, 146, 147, 148, 149}}, + + {{150, 151, 152, 153, 154}, + {155, 156, 157, 158, 159}, + {160, 161, 162, 163, 164}, + {165, 166, 167, 168, 169}, + {170, 171, 172, 173, 174}}, + + {{175, 176, 177, 178, 179}, + {180, 181, 182, 183, 184}, + {185, 186, 187, 188, 189}, + {190, 191, 192, 193, 194}, + {195, 196, 197, 198, 199}}}}}); + std::shared_ptr<Tensor> myOutput = + std::make_shared<Tensor>(Array4D<float, 2, 4, 3, 3>{ + {{{{319, 355, 391}, {499, 535, 571}, {679, 715, 751}}, + + {{8745, 9024, 9303}, + {10140, 10419, 10698}, + {11535, 11814, 12093}}, + + {{29337, 29859, 30381}, + {31947, 32469, 32991}, + {34557, 35079, 35601}}, + + {{62061, 62826, 63591}, + {65886, 66651, 67416}, + {69711, 70476, 71241}}}, + {{{3919, 3955, 3991}, {4099, 4135, 4171}, {4279, 4315, 4351}}, + + {{36645, 36924, 37203}, + {38040, 38319, 38598}, + {39435, 39714, 39993}}, + + {{81537, 82059, 82581}, + {84147, 84669, 85191}, + {86757, 87279, 87801}}, + + {{138561, 139326, 140091}, + {142386, 143151, 143916}, + {146211, 146976, 147741}}}}}); myInput->setBackend("cuda"); myWeights->setBackend("cuda"); myBias->setBackend("cuda"); - op -> associateInput(0, myInput); - op -> associateInput(1, myWeights); - op -> associateInput(2, myBias); + op->associateInput(0, myInput); + op->associateInput(1, myWeights); + op->associateInput(2, myBias); op->setDataType(DataType::Float32); op->setBackend("cuda"); - myCDW -> forward(); + myCDW->forward(); - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); + float *computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + for 
(int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } @@ -173,50 +147,69 @@ TEST_CASE("[cpu/operator] ConvDepthWise(forward)", "[ConvDepthWise][CPU]") { std::random_device rd; std::mt19937 gen(rd()); std::uniform_real_distribution<float> valueDist( - 0.1f, 1.1f); // Random float distribution between 0 and 1 - std::uniform_int_distribution<std::size_t> kernelDist(1, std::size_t(5)); - std::uniform_int_distribution<std::size_t> dimSizeDist(1, std::size_t(10)); + 0.1f, + 1.1f); // Random float distribution between 0 and 1 + std::uniform_int_distribution<std::size_t> kernelDist(1, + std::size_t(5)); + std::uniform_int_distribution<std::size_t> dimSizeDist( + 1, + std::size_t(10)); // To measure execution time of 'forward()' std::chrono::time_point<std::chrono::system_clock> start; std::chrono::time_point<std::chrono::system_clock> end; std::chrono::duration<double, std::micro> duration{}; - for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) - { + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { const std::size_t kernel = kernelDist(gen); - std::uniform_int_distribution<std::size_t> resolutionDist(std::size_t(kernel+2), - std::size_t(10)); + std::uniform_int_distribution<std::size_t> resolutionDist( + std::size_t(kernel + 2), + std::size_t(10)); const std::size_t nbDims = 4; // input (batch, ch, Xin, Yin) // weight (outCh, ch, kernelX, kernelY) std::vector<std::size_t> dims; for (std::size_t i = 0; i < nbDims; ++i) { - if(i < 2) + if (i < 2) dims.push_back(dimSizeDist(gen)); else dims.push_back(resolutionDist(gen)); } - dims[1] = 1; // TODO FIX: ConvDepthWise doesn't give the same output in CUDA as in CPU unless channels is 1 + dims[1] = 1; // TODO FIX: ConvDepthWise doesn't give the same + // output in CUDA as in CPU unless channels is 1 const std::size_t nbChannels = dims[1]; - const std::vector<std::size_t> dimsW{nbChannels,nbChannels,kernel,kernel}; - - const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); - const std::size_t wieghtSize = std::accumulate(dimsW.cbegin(), dimsW.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::vector<std::size_t> dimsW{nbChannels, + nbChannels, + kernel, + kernel}; + + const std::size_t nb_elements = + std::accumulate(dims.cbegin(), + dims.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); + const std::size_t wieghtSize = + std::accumulate(dimsW.cbegin(), + dimsW.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); // Create ConvDepthWise Operator CUDA - std::shared_ptr<Node> myConvCUDA = ConvDepthWise(nbChannels,{kernel,kernel}, "myconvcuda"); - auto op_cuda = std::static_pointer_cast<OperatorTensor>(myConvCUDA -> getOperator()); + std::shared_ptr<Node> myConvCUDA = + ConvDepthWise(nbChannels, {kernel, kernel}, "myconvcuda"); + auto op_cuda = std::static_pointer_cast<OperatorTensor>( + myConvCUDA->getOperator()); // Create ConvDepthWise Operator CPU - std::shared_ptr<Node> myConvCPU = ConvDepthWise(nbChannels,{kernel,kernel}, "myconvcpu"); - auto op_cpu = std::static_pointer_cast<OperatorTensor>(myConvCPU -> getOperator()); + std::shared_ptr<Node> myConvCPU = + ConvDepthWise(nbChannels, {kernel, kernel}, "myconvcpu"); + auto op_cpu = std::static_pointer_cast<OperatorTensor>( + myConvCPU->getOperator()); op_cpu->setDataType(DataType::Float32); op_cpu->setBackend("cpu"); - - float* array0 = new 
float[nb_elements]; - float* weights = new float[wieghtSize]; - float* bias = new float[nbChannels]; + float *array0 = new float[nb_elements]; + float *weights = new float[wieghtSize]; + float *bias = new float[nbChannels]; for (std::size_t i = 0; i < nb_elements; ++i) { array0[i] = valueDist(gen); @@ -229,23 +222,27 @@ TEST_CASE("[cpu/operator] ConvDepthWise(forward)", "[ConvDepthWise][CPU]") { } // input0 CUDA - float* array0_d, *weight_d, *bias_d; + float *array0_d, *weight_d, *bias_d; std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); T0_cuda->setDataType(DataType::Float32); T0_cuda->setBackend("cuda"); T0_cuda->resize(dims); op_cuda->associateInput(0, T0_cuda); - cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements); - cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&array0_d), + sizeof(float) * nb_elements); + cudaMemcpy(array0_d, + array0, + sizeof(float) * nb_elements, + cudaMemcpyHostToDevice); T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements); // input0 CPU std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(0,T0_cpu); + op_cpu->associateInput(0, T0_cpu); T0_cpu->setDataType(DataType::Float32); T0_cpu->setBackend("cpu"); T0_cpu->resize(dims); - T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements); + T0_cpu->getImpl()->setRawPtr(array0, nb_elements); // weight CUDA std::shared_ptr<Tensor> Tw_cuda = std::make_shared<Tensor>(); @@ -253,17 +250,21 @@ TEST_CASE("[cpu/operator] ConvDepthWise(forward)", "[ConvDepthWise][CPU]") { Tw_cuda->setBackend("cuda"); Tw_cuda->resize(dimsW); op_cuda->associateInput(1, Tw_cuda); - cudaMalloc(reinterpret_cast<void **>(&weight_d), sizeof(float) * wieghtSize); - cudaMemcpy(weight_d, weights, sizeof(float) * wieghtSize, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&weight_d), + sizeof(float) * wieghtSize); + cudaMemcpy(weight_d, + weights, + sizeof(float) * wieghtSize, + cudaMemcpyHostToDevice); Tw_cuda->getImpl()->setRawPtr(weight_d, wieghtSize); // weight CPU std::shared_ptr<Tensor> Tw_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(1,Tw_cpu); + op_cpu->associateInput(1, Tw_cpu); Tw_cpu->setDataType(DataType::Float32); Tw_cpu->setBackend("cpu"); Tw_cpu->resize(dimsW); - Tw_cpu -> getImpl() -> setRawPtr(weights, wieghtSize); + Tw_cpu->getImpl()->setRawPtr(weights, wieghtSize); // bias CUDA std::shared_ptr<Tensor> Tb_cuda = std::make_shared<Tensor>(); @@ -271,17 +272,21 @@ TEST_CASE("[cpu/operator] ConvDepthWise(forward)", "[ConvDepthWise][CPU]") { Tb_cuda->setBackend("cuda"); Tb_cuda->resize({nbChannels}); op_cuda->associateInput(2, Tb_cuda); - cudaMalloc(reinterpret_cast<void **>(&bias_d), sizeof(float) * nbChannels); - cudaMemcpy(bias_d, bias, sizeof(float) * nbChannels, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&bias_d), + sizeof(float) * nbChannels); + cudaMemcpy(bias_d, + bias, + sizeof(float) * nbChannels, + cudaMemcpyHostToDevice); Tb_cuda->getImpl()->setRawPtr(bias_d, nbChannels); // bias CPU std::shared_ptr<Tensor> Tb_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(2,Tb_cpu); + op_cpu->associateInput(2, Tb_cpu); Tb_cpu->setDataType(DataType::Float32); Tb_cpu->setBackend("cpu"); Tb_cpu->resize({nbChannels}); - Tb_cpu -> getImpl() -> setRawPtr(bias, nbChannels); + Tb_cpu->getImpl()->setRawPtr(bias, nbChannels); // forward CUDA op_cuda->setDataType(DataType::Float32); @@ -289,15 +294,20 @@ TEST_CASE("[cpu/operator] ConvDepthWise(forward)", 
"[ConvDepthWise][CPU]") { start = std::chrono::system_clock::now(); op_cuda->forward(); end = std::chrono::system_clock::now(); - duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + duration += std::chrono::duration_cast<std::chrono::microseconds>( + end - start); - const std::size_t outSize = op_cuda->getOutput(0)->size(); + const std::size_t outSize = op_cuda->getOutput(0)->size(); float *computed_cuda = new float[outSize](); - cudaMemcpy(computed_cuda, op_cuda->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * outSize, cudaMemcpyDeviceToHost); + cudaMemcpy(computed_cuda, + op_cuda->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * outSize, + cudaMemcpyDeviceToHost); // forward CPU op_cpu->forward(); - float *computed_cpu = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr()); + float *computed_cpu = static_cast<float *>( + op_cpu->getOutput(0)->getImpl()->rawPtr()); REQUIRE(approxEq<float>(*computed_cuda, *computed_cpu)); diff --git a/unit_tests/Test_ConvImpl.cpp b/unit_tests/Test_ConvImpl.cpp index 72a4040a8ecbd091e24f8441d9c29970ea82c606..5c5d9dafd543fe22f437fb6f382857fcbca9ce06 100644 --- a/unit_tests/Test_ConvImpl.cpp +++ b/unit_tests/Test_ConvImpl.cpp @@ -11,7 +11,7 @@ #include <array> #include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution #include <catch2/catch_test_macros.hpp> @@ -24,43 +24,38 @@ using namespace Aidge; TEST_CASE("[gpu/operator] Conv(forward)") { SECTION("Simple Conv no bias") { - std::shared_ptr<Node> myConv = Conv(1,1,{3,3}, "myconv"); - auto op = std::static_pointer_cast<OperatorTensor>(myConv->getOperator()); + std::shared_ptr<Node> myConv = Conv(1, 1, {3, 3}, "myconv"); + auto op = + std::static_pointer_cast<OperatorTensor>(myConv->getOperator()); op->setDataType(DataType::Float32); op->setBackend("cuda"); - std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array4D<float,1,1,3,3> { - { - { - {{ 0, 1, 2}, - { 3, 4, 5}, - { 6, 7, 8}} - } - } - }); - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,1,1,3,3> { //NCHW - { - { - {{ 0, 1, 2}, - { 3, 4, 5}, - { 6, 7, 8}} - } - } - }); - const float myOutput = 0*0+1*1+2*2+3*3+4*4+5*5+6*6+7*7+8*8; + std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>( + Array4D<float, 1, 1, 3, 3>{{{{{0, 1, 2}, {3, 4, 5}, {6, 7, 8}}}}}); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>( + Array4D<float, 1, 1, 3, 3>{// NCHW + {{{{0, 1, 2}, {3, 4, 5}, {6, 7, 8}}}}}); + const float myOutput = 0 * 0 + 1 * 1 + 2 * 2 + 3 * 3 + 4 * 4 + 5 * 5 + + 6 * 6 + 7 * 7 + 8 * 8; myInput->setBackend("cuda"); myWeights->setBackend("cuda"); - op->associateInput(0,myInput); - op->associateInput(1,myWeights); + op->associateInput(0, myInput); + op->associateInput(1, myWeights); myConv->forward(); REQUIRE(op->getOutput(0)->size() == 1); std::array<float, 9> kernel; - cudaMemcpy(&kernel[0], myWeights->getImpl()->rawPtr(), 9 * sizeof(float), cudaMemcpyDeviceToHost); + cudaMemcpy(&kernel[0], + myWeights->getImpl()->rawPtr(), + 9 * sizeof(float), + cudaMemcpyDeviceToHost); std::array<float, 9> input; - cudaMemcpy(&input[0], myInput->getImpl()->rawPtr(), 9 * sizeof(float), cudaMemcpyDeviceToHost); + cudaMemcpy(&input[0], + myInput->getImpl()->rawPtr(), + 9 * sizeof(float), + cudaMemcpyDeviceToHost); for (int i = 0; i < 9; ++i) { REQUIRE(kernel[i] == i); @@ -68,155 +63,117 @@ TEST_CASE("[gpu/operator] 
Conv(forward)") { } float computedOutput; - cudaMemcpy(&computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float), cudaMemcpyDeviceToHost); + cudaMemcpy(&computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float), + cudaMemcpyDeviceToHost); REQUIRE(fabs(computedOutput - myOutput) < 1e-6); } SECTION("Classic Conv") { - std::shared_ptr<Node> myConv = Conv(3,4,{3,3}, "myconv"); - auto op = std::static_pointer_cast<OperatorTensor>(myConv->getOperator()); + std::shared_ptr<Node> myConv = Conv(3, 4, {3, 3}, "myconv"); + auto op = + std::static_pointer_cast<OperatorTensor>(myConv->getOperator()); op->setDataType(DataType::Float32); op->setBackend("cuda"); - std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array4D<float,4,3,3,3> { - { - { - {{ 0, 1, 2}, - { 3, 4, 5}, - { 6, 7, 8}}, - {{ 9, 10, 11}, - { 12, 13, 14}, - { 15, 16, 17}}, - {{ 18, 19, 20}, - { 21, 22, 23}, - { 24, 25, 26}} - }, - { - {{ 27, 28, 29}, - { 30, 31, 32}, - { 33, 34, 35}}, - {{ 36, 37, 38}, - { 39, 40, 41}, - { 42, 43, 44}}, - {{ 45, 46, 47}, - { 48, 49, 50}, - { 51, 52, 53}} - }, - { - {{ 54, 55, 56}, - { 57, 58, 59}, - { 60, 61, 62}}, - {{ 63, 64, 65}, - { 66, 67, 68}, - { 69, 70, 71}}, - {{ 72, 73, 74}, - { 75, 76, 77}, - { 78, 79, 80}} - }, - { - {{ 81, 82, 83}, - { 84, 85, 86}, - { 87, 88, 89}}, - {{ 90, 91, 92}, - { 93, 94, 95}, - { 96, 97, 98}}, - {{ 99, 100, 101}, - {102, 103, 104}, - {105, 106, 107}} - } - } - }); - std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float,4> {{7,0,9,0}}); - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,3,5,5> { //NCHW - { - { - {{ 0, 1, 2, 3, 4}, - { 5, 6, 7, 8, 9}, - { 10, 11, 12, 13, 14}, - { 15, 16, 17, 18, 19}, - { 20, 21, 22, 23, 24}}, - - {{ 25, 26, 27, 28, 29}, - { 30, 31, 32, 33, 34}, - { 35, 36, 37, 38, 39}, - { 40, 41, 42, 43, 44}, - { 45, 46, 47, 48, 49}}, - - {{ 50, 51, 52, 53, 54}, - { 55, 56, 57, 58, 59}, - { 60, 61, 62, 63, 64}, - { 65, 66, 67, 68, 69}, - { 70, 71, 72, 73, 74}} - }, - { - {{ 75, 76, 77, 78, 79}, - { 80, 81, 82, 83, 84}, - { 85, 86, 87, 88, 89}, - { 90, 91, 92, 93, 94}, - { 95, 96, 97, 98, 99}}, - - {{100, 101, 102, 103, 104}, - {105, 106, 107, 108, 109}, - {110, 111, 112, 113, 114}, - {115, 116, 117, 118, 119}, - {120, 121, 122, 123, 124}}, - - {{125, 126, 127, 128, 129}, - {130, 131, 132, 133, 134}, - {135, 136, 137, 138, 139}, - {140, 141, 142, 143, 144}, - {145, 146, 147, 148, 149}} - } - } - }); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,4,3,3> { - { - { - {{ 15226, 15577, 15928}, - { 16981, 17332, 17683}, - { 18736, 19087, 19438}}, - {{ 37818, 38898, 39978}, - { 43218, 44298, 45378}, - { 48618, 49698, 50778}}, - {{ 60426, 62235, 64044}, - { 69471, 71280, 73089}, - { 78516, 80325, 82134}}, - {{ 83016, 85554, 88092}, - { 95706, 98244, 100782}, - {108396, 110934, 113472}} - }, - { - {{ 41551, 41902, 42253}, - { 43306, 43657, 44008}, - { 45061, 45412, 45763}}, - {{118818, 119898, 120978}, - {124218, 125298, 126378}, - {129618, 130698, 131778}}, - {{196101, 197910, 199719}, - {205146, 206955, 208764}, - {214191, 216000, 217809}}, - {{273366, 275904, 278442}, - {286056, 288594, 291132}, - {298746, 301284, 303822}} - } - } - }); + std::shared_ptr<Tensor> myWeights = + std::make_shared<Tensor>(Array4D<float, 4, 3, 3, 3>{ + {{{{0, 1, 2}, {3, 4, 5}, {6, 7, 8}}, + {{9, 10, 11}, {12, 13, 14}, {15, 16, 17}}, + {{18, 19, 20}, {21, 22, 23}, {24, 25, 26}}}, + {{{27, 28, 29}, {30, 31, 32}, {33, 34, 35}}, + {{36, 37, 38}, {39, 40, 41}, {42, 43, 44}}, 
+ {{45, 46, 47}, {48, 49, 50}, {51, 52, 53}}}, + {{{54, 55, 56}, {57, 58, 59}, {60, 61, 62}}, + {{63, 64, 65}, {66, 67, 68}, {69, 70, 71}}, + {{72, 73, 74}, {75, 76, 77}, {78, 79, 80}}}, + {{{81, 82, 83}, {84, 85, 86}, {87, 88, 89}}, + {{90, 91, 92}, {93, 94, 95}, {96, 97, 98}}, + {{99, 100, 101}, {102, 103, 104}, {105, 106, 107}}}}}); + std::shared_ptr<Tensor> myBias = + std::make_shared<Tensor>(Array1D<float, 4>{{7, 0, 9, 0}}); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>( + Array4D<float, 2, 3, 5, 5>{// NCHW + {{{{0, 1, 2, 3, 4}, + {5, 6, 7, 8, 9}, + {10, 11, 12, 13, 14}, + {15, 16, 17, 18, 19}, + {20, 21, 22, 23, 24}}, + + {{25, 26, 27, 28, 29}, + {30, 31, 32, 33, 34}, + {35, 36, 37, 38, 39}, + {40, 41, 42, 43, 44}, + {45, 46, 47, 48, 49}}, + + {{50, 51, 52, 53, 54}, + {55, 56, 57, 58, 59}, + {60, 61, 62, 63, 64}, + {65, 66, 67, 68, 69}, + {70, 71, 72, 73, 74}}}, + {{{75, 76, 77, 78, 79}, + {80, 81, 82, 83, 84}, + {85, 86, 87, 88, 89}, + {90, 91, 92, 93, 94}, + {95, 96, 97, 98, 99}}, + + {{100, 101, 102, 103, 104}, + {105, 106, 107, 108, 109}, + {110, 111, 112, 113, 114}, + {115, 116, 117, 118, 119}, + {120, 121, 122, 123, 124}}, + + {{125, 126, 127, 128, 129}, + {130, 131, 132, 133, 134}, + {135, 136, 137, 138, 139}, + {140, 141, 142, 143, 144}, + {145, 146, 147, 148, 149}}}}}); + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>( + Array4D<float, 2, 4, 3, 3>{{{{{15226, 15577, 15928}, + {16981, 17332, 17683}, + {18736, 19087, 19438}}, + {{37818, 38898, 39978}, + {43218, 44298, 45378}, + {48618, 49698, 50778}}, + {{60426, 62235, 64044}, + {69471, 71280, 73089}, + {78516, 80325, 82134}}, + {{83016, 85554, 88092}, + {95706, 98244, 100782}, + {108396, 110934, 113472}}}, + {{{41551, 41902, 42253}, + {43306, 43657, 44008}, + {45061, 45412, 45763}}, + {{118818, 119898, 120978}, + {124218, 125298, 126378}, + {129618, 130698, 131778}}, + {{196101, 197910, 199719}, + {205146, 206955, 208764}, + {214191, 216000, 217809}}, + {{273366, 275904, 278442}, + {286056, 288594, 291132}, + {298746, 301284, 303822}}}}}); myInput->setBackend("cuda"); myWeights->setBackend("cuda"); myBias->setBackend("cuda"); - op->associateInput(0,myInput); - op->associateInput(1,myWeights); - op->associateInput(2,myBias); + op->associateInput(0, myInput); + op->associateInput(1, myWeights); + op->associateInput(2, myBias); myConv->forward(); // op->getOutput(0)->print(); - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); + float *computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } @@ -229,48 +186,66 @@ TEST_CASE("[gpu/operator] Conv(forward)") { std::random_device rd; std::mt19937 gen(rd()); std::uniform_real_distribution<float> valueDist( - 0.1f, 1.1f); // Random float distribution between 0 and 1 - std::uniform_int_distribution<std::size_t> kernelDist(1, std::size_t(5)); - std::uniform_int_distribution<std::size_t> dimSizeDist(1, std::size_t(10)); + 0.1f, + 1.1f); // Random float distribution between 0 and 1 + 
std::uniform_int_distribution<std::size_t> kernelDist(1, + std::size_t(5)); + std::uniform_int_distribution<std::size_t> dimSizeDist( + 1, + std::size_t(10)); // To measure execution time of 'forward()' std::chrono::time_point<std::chrono::system_clock> start; std::chrono::time_point<std::chrono::system_clock> end; std::chrono::duration<double, std::micro> duration{}; - for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) - { + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { const std::size_t kernel = kernelDist(gen); - std::uniform_int_distribution<std::size_t> resolutionDist(std::size_t(kernel), - std::size_t(10)); + std::uniform_int_distribution<std::size_t> resolutionDist( + std::size_t(kernel), + std::size_t(10)); const std::size_t nbDims = 4; std::vector<std::size_t> dims; for (std::size_t i = 0; i < nbDims; ++i) { - if(i < 2) + if (i < 2) dims.push_back(dimSizeDist(gen)); else dims.push_back(resolutionDist(gen)); } const std::size_t outChannels = dimSizeDist(gen); - const std::vector<std::size_t> dimsW{outChannels,dims[1],kernel,kernel}; + const std::vector<std::size_t> dimsW{outChannels, + dims[1], + kernel, + kernel}; const std::size_t inChannels = dims[1]; - const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); - const std::size_t wieghtSize = std::accumulate(dimsW.cbegin(), dimsW.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t nb_elements = + std::accumulate(dims.cbegin(), + dims.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); + const std::size_t wieghtSize = + std::accumulate(dimsW.cbegin(), + dimsW.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); // Create Conv Operator CUDA - std::shared_ptr<Node> myConvCUDA = Conv(inChannels,outChannels,{kernel,kernel}, "myconvcuda"); - auto op_cuda = std::static_pointer_cast<OperatorTensor>(myConvCUDA -> getOperator()); + std::shared_ptr<Node> myConvCUDA = + Conv(inChannels, outChannels, {kernel, kernel}, "myconvcuda"); + auto op_cuda = std::static_pointer_cast<OperatorTensor>( + myConvCUDA->getOperator()); // Create Conv Operator CPU - std::shared_ptr<Node> myConvCPU = Conv(inChannels,outChannels,{kernel,kernel}, "myconvcpu"); - auto op_cpu = std::static_pointer_cast<OperatorTensor>(myConvCPU -> getOperator()); + std::shared_ptr<Node> myConvCPU = + Conv(inChannels, outChannels, {kernel, kernel}, "myconvcpu"); + auto op_cpu = std::static_pointer_cast<OperatorTensor>( + myConvCPU->getOperator()); op_cpu->setDataType(DataType::Float32); op_cpu->setBackend("cpu"); - - float* array0 = new float[nb_elements]; - float* weights = new float[wieghtSize]; - float* bias = new float[outChannels]; + float *array0 = new float[nb_elements]; + float *weights = new float[wieghtSize]; + float *bias = new float[outChannels]; for (std::size_t i = 0; i < nb_elements; ++i) { array0[i] = valueDist(gen); @@ -283,23 +258,27 @@ TEST_CASE("[gpu/operator] Conv(forward)") { } // input0 CUDA - float* array0_d, *weight_d, *bias_d; + float *array0_d, *weight_d, *bias_d; std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); T0_cuda->setDataType(DataType::Float32); T0_cuda->setBackend("cuda"); T0_cuda->resize(dims); op_cuda->associateInput(0, T0_cuda); - cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements); - cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&array0_d), + sizeof(float) * nb_elements); + cudaMemcpy(array0_d, + array0, 
+ sizeof(float) * nb_elements, + cudaMemcpyHostToDevice); T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements); // input0 CPU std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(0,T0_cpu); + op_cpu->associateInput(0, T0_cpu); T0_cpu->setDataType(DataType::Float32); T0_cpu->setBackend("cpu"); T0_cpu->resize(dims); - T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements); + T0_cpu->getImpl()->setRawPtr(array0, nb_elements); // weight CUDA std::shared_ptr<Tensor> Tw_cuda = std::make_shared<Tensor>(); @@ -307,17 +286,21 @@ TEST_CASE("[gpu/operator] Conv(forward)") { Tw_cuda->setBackend("cuda"); Tw_cuda->resize(dimsW); op_cuda->associateInput(1, Tw_cuda); - cudaMalloc(reinterpret_cast<void **>(&weight_d), sizeof(float) * wieghtSize); - cudaMemcpy(weight_d, weights, sizeof(float) * wieghtSize, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&weight_d), + sizeof(float) * wieghtSize); + cudaMemcpy(weight_d, + weights, + sizeof(float) * wieghtSize, + cudaMemcpyHostToDevice); Tw_cuda->getImpl()->setRawPtr(weight_d, wieghtSize); // weight CPU std::shared_ptr<Tensor> Tw_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(1,Tw_cpu); + op_cpu->associateInput(1, Tw_cpu); Tw_cpu->setDataType(DataType::Float32); Tw_cpu->setBackend("cpu"); Tw_cpu->resize(dimsW); - Tw_cpu -> getImpl() -> setRawPtr(weights, wieghtSize); + Tw_cpu->getImpl()->setRawPtr(weights, wieghtSize); // bias CUDA std::shared_ptr<Tensor> Tb_cuda = std::make_shared<Tensor>(); @@ -325,17 +308,21 @@ TEST_CASE("[gpu/operator] Conv(forward)") { Tb_cuda->setBackend("cuda"); Tb_cuda->resize({outChannels}); op_cuda->associateInput(2, Tb_cuda); - cudaMalloc(reinterpret_cast<void **>(&bias_d), sizeof(float) * outChannels); - cudaMemcpy(bias_d, bias, sizeof(float) * outChannels, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&bias_d), + sizeof(float) * outChannels); + cudaMemcpy(bias_d, + bias, + sizeof(float) * outChannels, + cudaMemcpyHostToDevice); Tb_cuda->getImpl()->setRawPtr(bias_d, outChannels); // bias CPU std::shared_ptr<Tensor> Tb_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(2,Tb_cpu); + op_cpu->associateInput(2, Tb_cpu); Tb_cpu->setDataType(DataType::Float32); Tb_cpu->setBackend("cpu"); Tb_cpu->resize({outChannels}); - Tb_cpu -> getImpl() -> setRawPtr(bias, outChannels); + Tb_cpu->getImpl()->setRawPtr(bias, outChannels); // forward CUDA op_cuda->setDataType(DataType::Float32); @@ -343,16 +330,22 @@ TEST_CASE("[gpu/operator] Conv(forward)") { start = std::chrono::system_clock::now(); op_cuda->forward(); end = std::chrono::system_clock::now(); - duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + duration += std::chrono::duration_cast<std::chrono::microseconds>( + end - start); - const std::size_t outSize = op_cuda->getOutput(0)->size(); + const std::size_t outSize = op_cuda->getOutput(0)->size(); float *computed_cuda = new float[outSize](); - cudaMemcpy(computed_cuda, op_cuda->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * outSize, cudaMemcpyDeviceToHost); + cudaMemcpy(computed_cuda, + op_cuda->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * outSize, + cudaMemcpyDeviceToHost); // forward CPU op_cpu->forward(); std::shared_ptr<Tensor> outputFallback; - const auto& cudaOutput = op_cuda->getOutput(0)->refCastFrom(outputFallback, *op_cpu->getOutput(0)); + const auto &cudaOutput = + op_cuda->getOutput(0)->refCastFrom(outputFallback, + *op_cpu->getOutput(0)); REQUIRE(approxEq<float>(cudaOutput, *(op_cpu->getOutput(0)))); 
delete[] array0; @@ -365,5 +358,4 @@ TEST_CASE("[gpu/operator] Conv(forward)") { } std::cout << "total time: " << duration.count() << "μs" << std::endl; } - } diff --git a/unit_tests/Test_DivImpl.cpp b/unit_tests/Test_DivImpl.cpp index 07cde5d6acb8eeeff2667e5c67aedb87b893e84c..1a7d2719f0c5ddbe85bb6564f6b553c6fd716c8b 100644 --- a/unit_tests/Test_DivImpl.cpp +++ b/unit_tests/Test_DivImpl.cpp @@ -11,7 +11,7 @@ #include <array> #include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution #include <catch2/catch_test_macros.hpp> @@ -23,118 +23,145 @@ namespace Aidge { TEST_CASE("[gpu/operator] Div", "[Div][GPU]") { -constexpr std::uint16_t NBTRIALS = 10; - // Create a random number generator - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_real_distribution<float> valueDist( - 0.1f, 1.1f); // Random float distribution between 0 and 1 - std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1), - std::size_t(10)); - std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(4), std::size_t(5)); - std::uniform_int_distribution<int> boolDist(0,1); - - // To measure execution time of 'forward()' - std::chrono::time_point<std::chrono::system_clock> start; - std::chrono::time_point<std::chrono::system_clock> end; - std::chrono::duration<double, std::micro> duration{}; - std::size_t number_of_operation = 0; - for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) - { - // Create Div Operator CUDA - std::shared_ptr<Node> myDivCUDA = Div(); - auto op_cuda = std::static_pointer_cast<OperatorTensor>(myDivCUDA -> getOperator()); - - // Create Div Operator CPU - std::shared_ptr<Node> myDivCPU = Div(); - auto op_cpu = std::static_pointer_cast<OperatorTensor>(myDivCPU -> getOperator()); - op_cpu->setDataType(DataType::Float32); - op_cpu->setBackend("cpu"); - - const std::size_t nbDims = nbDimsDist(gen); - std::vector<std::size_t> dims0, dims1, dims; - for (std::size_t i = 0; i < nbDims; ++i) { - const std::size_t dim = dimSizeDist(gen); - dims0.push_back(dim); - if (boolDist(gen)) { - dims1.push_back(1); - }else{ - dims1.push_back(dim); - } - dims.push_back(std::max(dims0[i], dims1[i])); - } - - const std::size_t nb_elements0 = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); - const std::size_t nb_elements1 = std::accumulate(dims1.cbegin(), dims1.cend(), std::size_t(1), std::multiplies<std::size_t>()); - const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); - number_of_operation += nb_elements; - float* array0 = new float[nb_elements0]; - float* array1 = new float[nb_elements1]; - - for (std::size_t i = 0; i < nb_elements0; ++i) { - array0[i] = valueDist(gen); - } - for (std::size_t i = 0; i < nb_elements1; ++i) { - array1[i] = valueDist(gen); + constexpr std::uint16_t NBTRIALS = 10; + // Create a random number generator + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<float> valueDist( + 0.1f, + 1.1f); // Random float distribution between 0 and 1 + std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1), + std::size_t(10)); + std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(4), + std::size_t(5)); + std::uniform_int_distribution<int> boolDist(0, 1); + + // To measure execution time of 'forward()' + 
std::chrono::time_point<std::chrono::system_clock> start; + std::chrono::time_point<std::chrono::system_clock> end; + std::chrono::duration<double, std::micro> duration{}; + std::size_t number_of_operation = 0; + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { + // Create Div Operator CUDA + std::shared_ptr<Node> myDivCUDA = Div(); + auto op_cuda = + std::static_pointer_cast<OperatorTensor>(myDivCUDA->getOperator()); + + // Create Div Operator CPU + std::shared_ptr<Node> myDivCPU = Div(); + auto op_cpu = + std::static_pointer_cast<OperatorTensor>(myDivCPU->getOperator()); + op_cpu->setDataType(DataType::Float32); + op_cpu->setBackend("cpu"); + + const std::size_t nbDims = nbDimsDist(gen); + std::vector<std::size_t> dims0, dims1, dims; + for (std::size_t i = 0; i < nbDims; ++i) { + const std::size_t dim = dimSizeDist(gen); + dims0.push_back(dim); + if (boolDist(gen)) { + dims1.push_back(1); + } else { + dims1.push_back(dim); } + dims.push_back(std::max(dims0[i], dims1[i])); + } - // input0 CUDA - float* array0_d, *array1_d; - std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); - T0_cuda->setDataType(DataType::Float32); - T0_cuda->setBackend("cuda"); - T0_cuda->resize(dims0); - op_cuda->associateInput(0, T0_cuda); - cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements0); - cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements0, cudaMemcpyHostToDevice); - T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements0); - - // input0 CPU - std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(0,T0_cpu); - T0_cpu->setDataType(DataType::Float32); - T0_cpu->setBackend("cpu"); - T0_cpu->resize(dims0); - T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements0); - - // input1 CUDA - std::shared_ptr<Tensor> T1_cuda = std::make_shared<Tensor>(); - T1_cuda->setDataType(DataType::Float32); - T1_cuda->setBackend("cuda"); - T1_cuda->resize(dims1); - op_cuda->associateInput(1, T1_cuda); - cudaMalloc(reinterpret_cast<void **>(&array1_d), sizeof(float) * nb_elements1); - cudaMemcpy(array1_d, array1, sizeof(float) * nb_elements1, cudaMemcpyHostToDevice); - T1_cuda->getImpl()->setRawPtr(array1_d, nb_elements1); - - // input1 CPU - std::shared_ptr<Tensor> T1_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(1,T1_cpu); - T1_cpu->setDataType(DataType::Float32); - T1_cpu->setBackend("cpu"); - T1_cpu->resize(dims1); - T1_cpu -> getImpl() -> setRawPtr(array1, nb_elements1); - - // forward CUDA - op_cuda->setDataType(DataType::Float32); - op_cuda->setBackend("cuda"); - start = std::chrono::system_clock::now(); - op_cuda->forward(); - end = std::chrono::system_clock::now(); - duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); - - // forward CPU - op_cpu->forward(); - float *computedCPU = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr()); - - std::shared_ptr<Tensor> outputFallback; - const auto& cudaOutput = op_cuda->getOutput(0)->refCastFrom(outputFallback, *op_cpu->getOutput(0)); - REQUIRE(approxEq<float>(cudaOutput, *(op_cpu->getOutput(0)))); - - delete[] array0; - delete[] array1; - cudaFree(array0_d); - cudaFree(array1_d); + const std::size_t nb_elements0 = + std::accumulate(dims0.cbegin(), + dims0.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); + const std::size_t nb_elements1 = + std::accumulate(dims1.cbegin(), + dims1.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); + const std::size_t nb_elements = + std::accumulate(dims.cbegin(), + dims.cend(), + std::size_t(1), + 
std::multiplies<std::size_t>()); + number_of_operation += nb_elements; + float *array0 = new float[nb_elements0]; + float *array1 = new float[nb_elements1]; + + for (std::size_t i = 0; i < nb_elements0; ++i) { + array0[i] = valueDist(gen); + } + for (std::size_t i = 0; i < nb_elements1; ++i) { + array1[i] = valueDist(gen); } + + // input0 CUDA + float *array0_d, *array1_d; + std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); + T0_cuda->setDataType(DataType::Float32); + T0_cuda->setBackend("cuda"); + T0_cuda->resize(dims0); + op_cuda->associateInput(0, T0_cuda); + cudaMalloc(reinterpret_cast<void **>(&array0_d), + sizeof(float) * nb_elements0); + cudaMemcpy(array0_d, + array0, + sizeof(float) * nb_elements0, + cudaMemcpyHostToDevice); + T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements0); + + // input0 CPU + std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); + op_cpu->associateInput(0, T0_cpu); + T0_cpu->setDataType(DataType::Float32); + T0_cpu->setBackend("cpu"); + T0_cpu->resize(dims0); + T0_cpu->getImpl()->setRawPtr(array0, nb_elements0); + + // input1 CUDA + std::shared_ptr<Tensor> T1_cuda = std::make_shared<Tensor>(); + T1_cuda->setDataType(DataType::Float32); + T1_cuda->setBackend("cuda"); + T1_cuda->resize(dims1); + op_cuda->associateInput(1, T1_cuda); + cudaMalloc(reinterpret_cast<void **>(&array1_d), + sizeof(float) * nb_elements1); + cudaMemcpy(array1_d, + array1, + sizeof(float) * nb_elements1, + cudaMemcpyHostToDevice); + T1_cuda->getImpl()->setRawPtr(array1_d, nb_elements1); + + // input1 CPU + std::shared_ptr<Tensor> T1_cpu = std::make_shared<Tensor>(); + op_cpu->associateInput(1, T1_cpu); + T1_cpu->setDataType(DataType::Float32); + T1_cpu->setBackend("cpu"); + T1_cpu->resize(dims1); + T1_cpu->getImpl()->setRawPtr(array1, nb_elements1); + + // forward CUDA + op_cuda->setDataType(DataType::Float32); + op_cuda->setBackend("cuda"); + start = std::chrono::system_clock::now(); + op_cuda->forward(); + end = std::chrono::system_clock::now(); + duration += + std::chrono::duration_cast<std::chrono::microseconds>(end - start); + + // forward CPU + op_cpu->forward(); + float *computedCPU = + static_cast<float *>(op_cpu->getOutput(0)->getImpl()->rawPtr()); + + std::shared_ptr<Tensor> outputFallback; + const auto &cudaOutput = + op_cuda->getOutput(0)->refCastFrom(outputFallback, + *op_cpu->getOutput(0)); + REQUIRE(approxEq<float>(cudaOutput, *(op_cpu->getOutput(0)))); + + delete[] array0; + delete[] array1; + cudaFree(array0_d); + cudaFree(array1_d); + } } } // namespace Aidge diff --git a/unit_tests/Test_FCImpl.cpp b/unit_tests/Test_FCImpl.cpp index 472fd273b1b5eff49e0d05ebd499afdb1435770c..b95151d26866a689af3aa525350953472e87453c 100644 --- a/unit_tests/Test_FCImpl.cpp +++ b/unit_tests/Test_FCImpl.cpp @@ -11,7 +11,7 @@ #include <array> #include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution #include <catch2/catch_test_macros.hpp> @@ -24,106 +24,128 @@ using namespace Aidge; TEST_CASE("[gpu/operator] FC(forward)", "[FC][GPU]") { SECTION("Static Input") { - std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array2D<float, 5, 75>{ - {{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, - 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, - 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, - 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, - {1, 2, 3, 
4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, - 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, - 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, - 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, - {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, - 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, - 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, - 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, - {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, - 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, - 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, - 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, - {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, - 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, - 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, - 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}}}); - std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float, 5>{{1, 2, 3, 4, 5}}); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float, 2, 5>{ - {{23601, 23602, 23603, 23604, 23605}, {68601, 68602, 68603, 68604, 68605}}}); + std::shared_ptr<Tensor> myWeights = + std::make_shared<Tensor>(Array2D<float, 5, 75>{ + {{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}}}); + std::shared_ptr<Tensor> myBias = + std::make_shared<Tensor>(Array1D<float, 5>{{1, 2, 3, 4, 5}}); + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>( + Array2D<float, 2, 5>{{{23601, 23602, 23603, 23604, 23605}, + {68601, 68602, 68603, 68604, 68605}}}); myWeights->setBackend("cuda"); myBias->setBackend("cuda"); std::shared_ptr<Node> myFC = FC(75, 5, false, "myfc"); - auto op = std::static_pointer_cast<OperatorTensor>(myFC -> getOperator()); - op -> associateInput(1, myWeights); - op -> associateInput(2, myBias); + auto op = + std::static_pointer_cast<OperatorTensor>(myFC->getOperator()); + op->associateInput(1, myWeights); + op->associateInput(2, myBias); SECTION("2D input") { - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array2D<float, 2, 75>{ - {{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 
10, 11, 12, 13, 14, 15, 16, 17, 18, - 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, - 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, - 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74}, - {75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, - 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, - 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, - 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, - 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149}}}); + std::shared_ptr<Tensor> myInput = + std::make_shared<Tensor>(Array2D<float, 2, 75>{ + {{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, + 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, + 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, + 65, 66, 67, 68, 69, 70, 71, 72, 73, 74}, + {75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, + 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, + 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, + 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, + 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, + 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, + 141, 142, 143, 144, 145, 146, 147, 148, 149}}}); myInput->setBackend("cuda"); op->associateInput(0, myInput); - op -> setDataType(DataType::Float32); - op -> setBackend("cuda"); + op->setDataType(DataType::Float32); + op->setBackend("cuda"); myFC->forward(); - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); + float *computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } delete[] computedOutput; } SECTION("4D input") { - std::shared_ptr<Tensor> myInput = - std::make_shared<Tensor>(Array4D<float, 2, 3, 5, 5>{{{{{0, 1, 2, 3, 4}, - {5, 6, 7, 8, 9}, - {10, 11, 12, 13, 14}, - {15, 16, 17, 18, 19}, - {20, 21, 22, 23, 24}}, - {{25, 26, 27, 28, 29}, - {30, 31, 32, 33, 34}, - {35, 36, 37, 38, 39}, - {40, 41, 42, 43, 44}, - {45, 46, 47, 48, 49}}, - {{50, 51, 52, 53, 54}, - {55, 56, 57, 58, 59}, - {60, 61, 62, 63, 64}, - {65, 66, 67, 68, 69}, - {70, 71, 72, 73, 74}}}, - {{{75, 76, 77, 78, 79}, - {80, 81, 82, 83, 84}, - {85, 86, 87, 88, 89}, - {90, 91, 92, 93, 94}, - {95, 96, 97, 98, 99}}, - {{100, 101, 102, 103, 104}, - {105, 106, 107, 108, 109}, - {110, 111, 112, 113, 114}, - {115, 116, 117, 118, 119}, - {120, 121, 122, 123, 124}}, - {{125, 126, 127, 128, 129}, - {130, 131, 132, 133, 134}, - {135, 136, 137, 138, 139}, - {140, 141, 142, 143, 144}, - {145, 146, 147, 148, 149}}}}}); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>( + Array4D<float, 2, 3, 5, 5>{{{{{0, 1, 2, 3, 4}, + {5, 6, 7, 8, 9}, + {10, 11, 12, 13, 14}, + {15, 16, 17, 18, 19}, + {20, 21, 22, 23, 24}}, + {{25, 26, 27, 28, 29}, + {30, 31, 32, 33, 34}, + {35, 36, 37, 38, 39}, + {40, 41, 42, 43, 44}, + {45, 46, 47, 48, 49}}, + {{50, 
51, 52, 53, 54}, + {55, 56, 57, 58, 59}, + {60, 61, 62, 63, 64}, + {65, 66, 67, 68, 69}, + {70, 71, 72, 73, 74}}}, + {{{75, 76, 77, 78, 79}, + {80, 81, 82, 83, 84}, + {85, 86, 87, 88, 89}, + {90, 91, 92, 93, 94}, + {95, 96, 97, 98, 99}}, + {{100, 101, 102, 103, 104}, + {105, 106, 107, 108, 109}, + {110, 111, 112, 113, 114}, + {115, 116, 117, 118, 119}, + {120, 121, 122, 123, 124}}, + {{125, 126, 127, 128, 129}, + {130, 131, 132, 133, 134}, + {135, 136, 137, 138, 139}, + {140, 141, 142, 143, 144}, + {145, 146, 147, 148, 149}}}}}); myInput->setBackend("cuda"); op->associateInput(0, myInput); - op -> setDataType(DataType::Float32); - op -> setBackend("cuda"); + op->setDataType(DataType::Float32); + op->setBackend("cuda"); myFC->forward(); - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); + float *computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } @@ -131,21 +153,23 @@ TEST_CASE("[gpu/operator] FC(forward)", "[FC][GPU]") { } } - SECTION("Random Input"){ + SECTION("Random Input") { constexpr std::uint16_t NBTRIALS = 10; // Create a random number generator std::random_device rd; std::mt19937 gen(rd()); std::uniform_real_distribution<float> valueDist( - 0.1f, 1.1f); // Random float distribution between 0 and 1 - std::uniform_int_distribution<std::size_t> dimSizeDist(1, std::size_t(10)); + 0.1f, + 1.1f); // Random float distribution between 0 and 1 + std::uniform_int_distribution<std::size_t> dimSizeDist( + 1, + std::size_t(10)); // To measure execution time of 'forward()' std::chrono::time_point<std::chrono::system_clock> start; std::chrono::time_point<std::chrono::system_clock> end; std::chrono::duration<double, std::micro> duration{}; - for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) - { + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { const std::size_t nbDims = 4; std::vector<std::size_t> dims; for (std::size_t i = 0; i < nbDims; ++i) { @@ -153,28 +177,35 @@ TEST_CASE("[gpu/operator] FC(forward)", "[FC][GPU]") { } const std::size_t outChannels = dimSizeDist(gen); - const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t nb_elements = + std::accumulate(dims.cbegin(), + dims.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); const std::size_t inChannels = nb_elements / dims[0]; const std::vector<std::size_t> dimsW{outChannels, inChannels}; const std::size_t wieghtSize = outChannels * inChannels; // Create FC Operator CUDA - std::shared_ptr<Node> myFCCUDA = FC(inChannels, outChannels, false, "myfccuda"); - auto op_cuda = std::static_pointer_cast<OperatorTensor>(myFCCUDA -> getOperator()); + std::shared_ptr<Node> myFCCUDA = + FC(inChannels, outChannels, false, "myfccuda"); + auto op_cuda = std::static_pointer_cast<OperatorTensor>( + myFCCUDA->getOperator()); op_cuda->setDataType(DataType::Float32); op_cuda->setBackend("cuda"); // Create FC Operator CPU - std::shared_ptr<Node> myFCCPU = 
FC(inChannels, outChannels, false, "myfccpu"); - auto op_cpu = std::static_pointer_cast<OperatorTensor>(myFCCPU -> getOperator()); + std::shared_ptr<Node> myFCCPU = + FC(inChannels, outChannels, false, "myfccpu"); + auto op_cpu = std::static_pointer_cast<OperatorTensor>( + myFCCPU->getOperator()); op_cpu->setDataType(DataType::Float32); op_cpu->setBackend("cpu"); - - float* array0 = new float[nb_elements]; - float* weights = new float[wieghtSize]; - float* bias = new float[outChannels]; + float *array0 = new float[nb_elements]; + float *weights = new float[wieghtSize]; + float *bias = new float[outChannels]; for (std::size_t i = 0; i < nb_elements; ++i) { array0[i] = valueDist(gen); @@ -187,23 +218,27 @@ TEST_CASE("[gpu/operator] FC(forward)", "[FC][GPU]") { } // input0 CUDA - float* array0_d, *weight_d, *bias_d; + float *array0_d, *weight_d, *bias_d; std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); T0_cuda->setDataType(DataType::Float32); T0_cuda->setBackend("cuda"); T0_cuda->resize(dims); op_cuda->associateInput(0, T0_cuda); - cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements); - cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&array0_d), + sizeof(float) * nb_elements); + cudaMemcpy(array0_d, + array0, + sizeof(float) * nb_elements, + cudaMemcpyHostToDevice); T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements); // input0 CPU std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(0,T0_cpu); + op_cpu->associateInput(0, T0_cpu); T0_cpu->setDataType(DataType::Float32); T0_cpu->setBackend("cpu"); T0_cpu->resize(dims); - T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements); + T0_cpu->getImpl()->setRawPtr(array0, nb_elements); // weight CUDA std::shared_ptr<Tensor> Tw_cuda = std::make_shared<Tensor>(); @@ -211,17 +246,21 @@ TEST_CASE("[gpu/operator] FC(forward)", "[FC][GPU]") { Tw_cuda->setBackend("cuda"); Tw_cuda->resize(dimsW); op_cuda->associateInput(1, Tw_cuda); - cudaMalloc(reinterpret_cast<void **>(&weight_d), sizeof(float) * wieghtSize); - cudaMemcpy(weight_d, weights, sizeof(float) * wieghtSize, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&weight_d), + sizeof(float) * wieghtSize); + cudaMemcpy(weight_d, + weights, + sizeof(float) * wieghtSize, + cudaMemcpyHostToDevice); Tw_cuda->getImpl()->setRawPtr(weight_d, wieghtSize); // weight CPU std::shared_ptr<Tensor> Tw_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(1,Tw_cpu); + op_cpu->associateInput(1, Tw_cpu); Tw_cpu->setDataType(DataType::Float32); Tw_cpu->setBackend("cpu"); Tw_cpu->resize(dimsW); - Tw_cpu -> getImpl() -> setRawPtr(weights, wieghtSize); + Tw_cpu->getImpl()->setRawPtr(weights, wieghtSize); // bias CUDA std::shared_ptr<Tensor> Tb_cuda = std::make_shared<Tensor>(); @@ -229,31 +268,40 @@ TEST_CASE("[gpu/operator] FC(forward)", "[FC][GPU]") { Tb_cuda->setBackend("cuda"); Tb_cuda->resize({outChannels}); op_cuda->associateInput(2, Tb_cuda); - cudaMalloc(reinterpret_cast<void **>(&bias_d), sizeof(float) * outChannels); - cudaMemcpy(bias_d, bias, sizeof(float) * outChannels, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&bias_d), + sizeof(float) * outChannels); + cudaMemcpy(bias_d, + bias, + sizeof(float) * outChannels, + cudaMemcpyHostToDevice); Tb_cuda->getImpl()->setRawPtr(bias_d, outChannels); // bias CPU std::shared_ptr<Tensor> Tb_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(2,Tb_cpu); + op_cpu->associateInput(2, 
Tb_cpu); Tb_cpu->setDataType(DataType::Float32); Tb_cpu->setBackend("cpu"); Tb_cpu->resize({outChannels}); - Tb_cpu -> getImpl() -> setRawPtr(bias, outChannels); + Tb_cpu->getImpl()->setRawPtr(bias, outChannels); // forward CUDA start = std::chrono::system_clock::now(); op_cuda->forward(); end = std::chrono::system_clock::now(); - duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + duration += std::chrono::duration_cast<std::chrono::microseconds>( + end - start); - const std::size_t outSize = op_cuda->getOutput(0)->size(); + const std::size_t outSize = op_cuda->getOutput(0)->size(); float *computed_cuda = new float[outSize](); - cudaMemcpy(computed_cuda, op_cuda->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * outSize, cudaMemcpyDeviceToHost); + cudaMemcpy(computed_cuda, + op_cuda->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * outSize, + cudaMemcpyDeviceToHost); // forward CPU op_cpu->forward(); - float *computed_cpu = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr()); + float *computed_cpu = static_cast<float *>( + op_cpu->getOutput(0)->getImpl()->rawPtr()); REQUIRE(approxEq<float>(*computed_cuda, *computed_cpu)); delete[] array0; @@ -270,80 +318,78 @@ TEST_CASE("[gpu/operator] FC(forward)", "[FC][GPU]") { TEST_CASE("[gpu/operator] FC(backward)", "[FC][GPU]") { SECTION("2D input") { - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array2D<float, 2, 3>{ - { - {0.1, 0.2, 0.3}, - {0.4, 0.5, 0.6} - }}); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>( + Array2D<float, 2, 3>{{{0.1, 0.2, 0.3}, {0.4, 0.5, 0.6}}}); myInput->setBackend("cuda"); - std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array2D<float, 2, 3>{ - {{0.1, 0.2, 0.3}, - {0.4, 0.5, 0.6}}}); + std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>( + Array2D<float, 2, 3>{{{0.1, 0.2, 0.3}, {0.4, 0.5, 0.6}}}); - std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float, 2>{{0.1, 0.2}}); + std::shared_ptr<Tensor> myBias = + std::make_shared<Tensor>(Array1D<float, 2>{{0.1, 0.2}}); myWeights->setBackend("cuda"); myBias->setBackend("cuda"); std::shared_ptr<Node> myFC = FC(3, 2, false, "myfc"); - auto op = std::static_pointer_cast<OperatorTensor>(myFC -> getOperator()); + auto op = + std::static_pointer_cast<OperatorTensor>(myFC->getOperator()); op->associateInput(0, myInput); - op -> associateInput(1, myWeights); - op -> associateInput(2, myBias); - op -> setDataType(DataType::Float32); - op -> setBackend("cuda"); + op->associateInput(1, myWeights); + op->associateInput(2, myBias); + op->setDataType(DataType::Float32); + op->setBackend("cuda"); myFC->forward(); // Run and test backward operation - std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array2D<float, 2, 2> { - { - {0.1, 0.2}, - {0.3, 0.4} - } - }); + std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>( + Array2D<float, 2, 2>{{{0.1, 0.2}, {0.3, 0.4}}}); myOutputGrad->setBackend("cuda"); std::shared_ptr<Tensor> predictedOutput = op->getOutput(0); std::shared_ptr<Tensor> input = op->getInput(0); predictedOutput->setGrad(myOutputGrad); REQUIRE_NOTHROW(myFC->backward()); - std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>(Array2D<float,2,3> { - { - {0.09, 0.12, 0.15}, - {0.19, 0.26, 0.33} - } - }); - std::shared_ptr<Tensor> expectedBiasGrad = std::make_shared<Tensor>(Array1D<float,2> { - {0.4, 0.6} - }); - std::shared_ptr<Tensor> expectedWeightsGrad = std::make_shared<Tensor>(Array2D<float,2,3> { - { - {0.13, 0.17, 0.21}, - 
{0.18, 0.24, 0.3 } - } - }); + std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>( + Array2D<float, 2, 3>{{{0.09, 0.12, 0.15}, {0.19, 0.26, 0.33}}}); + std::shared_ptr<Tensor> expectedBiasGrad = + std::make_shared<Tensor>(Array1D<float, 2>{{0.4, 0.6}}); + std::shared_ptr<Tensor> expectedWeightsGrad = std::make_shared<Tensor>( + Array2D<float, 2, 3>{{{0.13, 0.17, 0.21}, {0.18, 0.24, 0.3}}}); float *computedGradCuda = new float[expectedInputGrad->size()](); - cudaMemcpy(computedGradCuda, input->grad()->getImpl()->rawPtr(), sizeof(float) * expectedInputGrad->size(), cudaMemcpyDeviceToHost); + cudaMemcpy(computedGradCuda, + input->grad()->getImpl()->rawPtr(), + sizeof(float) * expectedInputGrad->size(), + cudaMemcpyDeviceToHost); float *computedGradWCuda = new float[expectedWeightsGrad->size()](); - cudaMemcpy(computedGradWCuda, op->getInput(1)->grad()->getImpl()->rawPtr(), sizeof(float) * expectedWeightsGrad->size(), cudaMemcpyDeviceToHost); + cudaMemcpy(computedGradWCuda, + op->getInput(1)->grad()->getImpl()->rawPtr(), + sizeof(float) * expectedWeightsGrad->size(), + cudaMemcpyDeviceToHost); float *computedGradBCuda = new float[expectedBiasGrad->size()](); - cudaMemcpy(computedGradBCuda, op->getInput(2)->grad()->getImpl()->rawPtr(), sizeof(float) * expectedBiasGrad->size(), cudaMemcpyDeviceToHost); - - for(int i = 0; i < expectedInputGrad->size(); i++){ - const float targetOutput = *(static_cast<float*>(expectedInputGrad->getImpl()->rawPtr()) + i); + cudaMemcpy(computedGradBCuda, + op->getInput(2)->grad()->getImpl()->rawPtr(), + sizeof(float) * expectedBiasGrad->size(), + cudaMemcpyDeviceToHost); + + for (int i = 0; i < expectedInputGrad->size(); i++) { + const float targetOutput = *( + static_cast<float *>(expectedInputGrad->getImpl()->rawPtr()) + + i); REQUIRE(fabs(computedGradCuda[i] - targetOutput) < 1e-6); } - for(int i = 0; i < expectedBiasGrad->size(); i++){ - const float targetOutput = *(static_cast<float*>(expectedBiasGrad->getImpl()->rawPtr()) + i); + for (int i = 0; i < expectedBiasGrad->size(); i++) { + const float targetOutput = + *(static_cast<float *>(expectedBiasGrad->getImpl()->rawPtr()) + + i); REQUIRE(fabs(computedGradBCuda[i] - targetOutput) < 1e-6); } - for(int i = 0; i < expectedWeightsGrad->size(); i++){ - const float targetOutput = *(static_cast<float*>(expectedWeightsGrad->getImpl()->rawPtr()) + i); + for (int i = 0; i < expectedWeightsGrad->size(); i++) { + const float targetOutput = + *(static_cast<float *>( + expectedWeightsGrad->getImpl()->rawPtr()) + + i); REQUIRE(fabs(computedGradWCuda[i] - targetOutput) < 1e-6); } - - - delete[] computedGradCuda; delete[] computedGradWCuda; delete[] computedGradBCuda; diff --git a/unit_tests/Test_GlobalAveragePoolingImpl.cpp b/unit_tests/Test_GlobalAveragePoolingImpl.cpp index 0a0f22ab60ced3a3f7648ce798484f72bd67839a..b6eac105e3ea700e13282f111a25a4a7b7a3e0bc 100644 --- a/unit_tests/Test_GlobalAveragePoolingImpl.cpp +++ b/unit_tests/Test_GlobalAveragePoolingImpl.cpp @@ -16,7 +16,7 @@ // #include <memory> #include <array> #include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution #include <catch2/catch_test_macros.hpp> @@ -28,56 +28,49 @@ namespace Aidge { TEST_CASE("[gpu/operator] GlobalAveragePooling", "[GlobalAveragePooling][GPU]") { - SECTION("4D-Tensor") - { - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,1,3,4,4> { //NCHW - { - 
{ - {{0, 1, 2, 3}, - {4, 5, 6, 7}, - {8, 9, 10, 11}, - {12, 13, 14, 15}}, - - {{16, 17, 18, 19}, - {20, 21, 22, 23}, - {24, 25, 26, 27}, - {28, 29, 30, 31}}, - - {{32, 33, 34, 35}, - {36, 37, 38, 39}, - {40, 41, 42, 43}, - {44, 45, 46, 47}} - } - } - }); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,1,3,1,1> { - { - { - {{ 7.5 }}, - {{ 23.5 }}, - {{ 39.5 }} - } + SECTION("4D-Tensor") { + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>( + Array4D<float, 1, 3, 4, 4>{// NCHW + {{{{0, 1, 2, 3}, + {4, 5, 6, 7}, + {8, 9, 10, 11}, + {12, 13, 14, 15}}, + + {{16, 17, 18, 19}, + {20, 21, 22, 23}, + {24, 25, 26, 27}, + {28, 29, 30, 31}}, + + {{32, 33, 34, 35}, + {36, 37, 38, 39}, + {40, 41, 42, 43}, + {44, 45, 46, 47}}}}}); + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>( + Array4D<float, 1, 3, 1, 1>{{{{{7.5}}, {{23.5}}, {{39.5}}}}}); + myInput->setBackend("cuda"); + myInput->setDataType(DataType::Float32); + // Create MyGlobalAveragePooling Operator + std::shared_ptr<Node> globAvgPool = GlobalAveragePooling(); + auto op = std::static_pointer_cast<OperatorTensor>( + globAvgPool->getOperator()); + op->setDataType(DataType::Float32); + op->setBackend("cuda"); + op->associateInput(0, myInput); + + globAvgPool->forward(); + float *computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); + + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); + REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } - }); - myInput->setBackend("cuda"); - myInput->setDataType(DataType::Float32); - // Create MyGlobalAveragePooling Operator - std::shared_ptr<Node> globAvgPool = GlobalAveragePooling(); - auto op = std::static_pointer_cast<OperatorTensor>(globAvgPool->getOperator()); - op->setDataType(DataType::Float32); - op->setBackend("cuda"); - op->associateInput(0, myInput); - - globAvgPool->forward(); - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); - - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); - } - - delete[] computedOutput; + + delete[] computedOutput; } SECTION("Random Input") { @@ -86,86 +79,101 @@ TEST_CASE("[gpu/operator] GlobalAveragePooling", std::random_device rd; std::mt19937 gen(rd()); std::uniform_real_distribution<float> valueDist( - 0.1f, 1.1f); // Random float distribution between 0 and 1 - std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1), - std::size_t(10)); + 0.1f, + 1.1f); // Random float distribution between 0 and 1 + std::uniform_int_distribution<std::size_t> dimSizeDist( + std::size_t(1), + std::size_t(10)); // To measure execution time of 'AveragePooling_Op::forward()' std::chrono::time_point<std::chrono::system_clock> start; std::chrono::time_point<std::chrono::system_clock> end; std::chrono::duration<double, std::micro> duration{}; std::size_t number_of_operation = 0; - for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) - { + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { // Create GlobalAveragePooling Operator CUDA std::shared_ptr<Node> myGAvgPoolCuda = GlobalAveragePooling(); - auto op_cuda = 
std::static_pointer_cast<OperatorTensor>(myGAvgPoolCuda->getOperator()); + auto op_cuda = std::static_pointer_cast<OperatorTensor>( + myGAvgPoolCuda->getOperator()); op_cuda->setDataType(DataType::Float32); op_cuda->setBackend("cuda"); // Create GlobalAveragePooling Operator CUDA std::shared_ptr<Node> myGAvgPoolCpu = GlobalAveragePooling(); - auto op_cpu = std::static_pointer_cast<OperatorTensor>(myGAvgPoolCpu->getOperator()); + auto op_cpu = std::static_pointer_cast<OperatorTensor>( + myGAvgPoolCpu->getOperator()); op_cpu->setDataType(DataType::Float32); op_cpu->setBackend("cpu"); // generate a random Tensor const std::size_t nbDims = 4; std::vector<std::size_t> dims; - for (std::size_t i = 0; i < nbDims; ++i) - { + for (std::size_t i = 0; i < nbDims; ++i) { dims.push_back(dimSizeDist(gen)); } - const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t nb_elements = + std::accumulate(dims.cbegin(), + dims.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); number_of_operation += nb_elements; // Fill input tensor float *array0 = new float[nb_elements]; - for (std::size_t i = 0; i < nb_elements; ++i) - { + for (std::size_t i = 0; i < nb_elements; ++i) { array0[i] = valueDist(gen); } // input0 CUDA - float* array0_d; + float *array0_d; std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); T0_cuda->setDataType(DataType::Float32); T0_cuda->setBackend("cuda"); T0_cuda->resize(dims); op_cuda->associateInput(0, T0_cuda); - cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements); - cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&array0_d), + sizeof(float) * nb_elements); + cudaMemcpy(array0_d, + array0, + sizeof(float) * nb_elements, + cudaMemcpyHostToDevice); T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements); // input0 CPU std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(0,T0_cpu); + op_cpu->associateInput(0, T0_cpu); T0_cpu->setDataType(DataType::Float32); T0_cpu->setBackend("cpu"); T0_cpu->resize(dims); - T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements); + T0_cpu->getImpl()->setRawPtr(array0, nb_elements); - // Run inference + // Run inference start = std::chrono::system_clock::now(); op_cuda->forward(); end = std::chrono::system_clock::now(); - duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + duration += std::chrono::duration_cast<std::chrono::microseconds>( + end - start); - const std::size_t outSize = op_cuda->getOutput(0)->size(); + const std::size_t outSize = op_cuda->getOutput(0)->size(); float *computed_cuda = new float[outSize](); - cudaMemcpy(computed_cuda, op_cuda->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * outSize, cudaMemcpyDeviceToHost); + cudaMemcpy(computed_cuda, + op_cuda->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * outSize, + cudaMemcpyDeviceToHost); // forward CPU op_cpu->forward(); - float *computed_cpu = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr()); + float *computed_cpu = static_cast<float *>( + op_cpu->getOutput(0)->getImpl()->rawPtr()); REQUIRE(approxEq<float>(*computed_cuda, *computed_cpu)); delete[] computed_cuda; delete[] array0; cudaFree(array0_d); } - std::cout << "number of elements over time spent: " << (number_of_operation / duration.count()) << std::endl; + std::cout << "number of elements over time spent: " + << (number_of_operation / duration.count()) 
<< std::endl; std::cout << "total time: " << duration.count() << "μs" << std::endl; } } diff --git a/unit_tests/Test_ILayerNormImpl.cpp b/unit_tests/Test_ILayerNormImpl.cpp index 0487b7c4716596e0d2e7bcbdaf812358be4de3bf..ad95d58c1982a2209c10f1da11c522b6d4695588 100644 --- a/unit_tests/Test_ILayerNormImpl.cpp +++ b/unit_tests/Test_ILayerNormImpl.cpp @@ -26,150 +26,230 @@ using namespace Aidge; TEST_CASE("[gpu/operator] ILayerNorm(forward)", "[ILayerNorm][GPU]") { SECTION("4D Tensor") { - std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array4D<float,2,2,2,10> { - { - { - { - {0.96, 0.48, 0.54, 0.49, 0.59, 0.93, 0.00, 0.00, 0.61, 0.61}, - {0.85, 0.06, 0.11, 0.87, 0.55, 0.12, 0.80, 0.48, 0.41, 0.16} - }, - { - {0.24, 0.46, 0.97, 0.19, 0.65, 0.12, 0.44, 1.00, 0.37, 0.09}, - {0.44, 0.64, 0.21, 0.58, 0.05, 0.24, 0.56, 0.07, 0.49, 0.79} - } - }, - { - { - {0.00, 0.13, 0.55, 0.42, 0.49, 0.28, 0.52, 0.55, 0.34, 0.85}, - {0.98, 0.32, 0.09, 0.05, 0.37, 0.47, 0.63, 0.13, 0.70, 0.02} - }, - { - {0.69, 0.13, 0.74, 0.61, 0.25, 0.87, 0.46, 0.40, 0.81, 0.06}, - {0.89, 0.32, 0.61, 0.24, 0.70, 0.23, 0.09, 0.03, 0.14, 0.80} - } - } - } - }); - - std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float, 10>{{0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}); - std::shared_ptr<Tensor> myWeight = std::make_shared<Tensor>(Array1D<float, 10>{{0.1617684f, 0.3833238f ,-0.6842308f ,-0.4342245f ,-0.4717381f ,-0.1776187f, -0.2728751f, -0.4638580f, 0.2936697f, -0.9011016f}}); + std::shared_ptr<Tensor> input0 = std::make_shared< + Tensor>(Array4D<float, 2, 2, 2, 10>{ + {{{{0.96, 0.48, 0.54, 0.49, 0.59, 0.93, 0.00, 0.00, 0.61, 0.61}, + {0.85, 0.06, 0.11, 0.87, 0.55, 0.12, 0.80, 0.48, 0.41, 0.16}}, + {{0.24, 0.46, 0.97, 0.19, 0.65, 0.12, 0.44, 1.00, 0.37, 0.09}, + {0.44, 0.64, 0.21, 0.58, 0.05, 0.24, 0.56, 0.07, 0.49, 0.79}}}, + {{{0.00, 0.13, 0.55, 0.42, 0.49, 0.28, 0.52, 0.55, 0.34, 0.85}, + {0.98, 0.32, 0.09, 0.05, 0.37, 0.47, 0.63, 0.13, 0.70, 0.02}}, + {{0.69, 0.13, 0.74, 0.61, 0.25, 0.87, 0.46, 0.40, 0.81, 0.06}, + {0.89, + 0.32, + 0.61, + 0.24, + 0.70, + 0.23, + 0.09, + 0.03, + 0.14, + 0.80}}}}}); + + std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>( + Array1D<float, 10>{{0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}); + std::shared_ptr<Tensor> myWeight = + std::make_shared<Tensor>(Array1D<float, 10>{{0.1617684f, + 0.3833238f, + -0.6842308f, + -0.4342245f, + -0.4717381f, + -0.1776187f, + -0.2728751f, + -0.4638580f, + 0.2936697f, + -0.9011016f}}); myWeight->setBackend("cuda"); myBias->setBackend("cuda"); std::shared_ptr<Node> myILayerNorm = ILayerNorm(); - auto op = std::static_pointer_cast<OperatorTensor>(myILayerNorm -> getOperator()); + auto op = std::static_pointer_cast<OperatorTensor>( + myILayerNorm->getOperator()); - op -> associateInput(1, myWeight); - op -> associateInput(2, myBias); + op->associateInput(1, myWeight); + op->associateInput(2, myBias); input0->setBackend("cuda"); - op -> associateInput(0,input0); + op->associateInput(0, input0); op->setDataType(DataType::Float32); op->setBackend("cuda"); op->forward(); // expected output - std::shared_ptr<Tensor> output_ilayernorm = std::make_shared<Tensor>(Array4D<float,2,2,2,10> { - { - { - { - {9.8821178e-02, 4.9410585e-02, 4.9410585e-02, 4.9410585e-02, 4.9410585e-02, 4.9410585e-02, 0.0000000e+00, 0.0000000e+00, 4.9410585e-02, 4.9410585e-02}, - {4.9410585e-02, 0.0000000e+00, 0.0000000e+00, 4.9410585e-02, 4.9410585e-02, 0.0000000e+00, 4.9410585e-02, 4.9410585e-02, 4.9410585e-02, 0.0000000e+00} - }, - { - {0.0000000e+00, 4.9410585e-02, 9.8821178e-02, 
0.0000000e+00, 4.9410585e-02, 0.0000000e+00, 4.9410585e-02, 9.8821178e-02, 4.9410585e-02, 0.0000000e+00}, - {4.9410585e-02, 4.9410585e-02, 0.0000000e+00, 4.9410585e-02, 0.0000000e+00, 0.0000000e+00, 4.9410585e-02, 0.0000000e+00, 4.9410585e-02, 4.9410585e-02} - } - }, - { - { - {0.0000000e+00, 0.0000000e+00, 4.9410585e-02, 4.9410585e-02, 4.9410585e-02, 0.0000000e+00, 4.9410585e-02, 4.9410585e-02, 4.9410585e-02, 4.9410585e-02}, - {9.8821178e-02, 4.9410585e-02, 0.0000000e+00, 0.0000000e+00, 4.9410585e-02, 4.9410585e-02, 4.9410585e-02, 0.0000000e+00, 4.9410585e-02, 0.0000000e+00} - }, - { - {4.9410585e-02, 0.0000000e+00, 4.9410585e-02, 4.9410585e-02, 0.0000000e+00, 4.9410585e-02, 4.9410585e-02, 4.9410585e-02, 4.9410585e-02, 0.0000000e+00}, - {4.9410585e-02, 4.9410585e-02, 4.9410585e-02, 0.0000000e+00, 4.9410585e-02, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 4.9410585e-02} - } - } - } - }); - - - float* computedOutput = new float[output_ilayernorm->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * output_ilayernorm->size(), cudaMemcpyDeviceToHost); - - //test if forward result are as expected - for(int i = 0; i < output_ilayernorm->size(); i++){ - const float targetOutput = *(static_cast<float*>(output_ilayernorm->getImpl()->rawPtr()) + i); + std::shared_ptr<Tensor> output_ilayernorm = std::make_shared<Tensor>( + Array4D<float, 2, 2, 2, 10>{{{{{9.8821178e-02, + 4.9410585e-02, + 4.9410585e-02, + 4.9410585e-02, + 4.9410585e-02, + 4.9410585e-02, + 0.0000000e+00, + 0.0000000e+00, + 4.9410585e-02, + 4.9410585e-02}, + {4.9410585e-02, + 0.0000000e+00, + 0.0000000e+00, + 4.9410585e-02, + 4.9410585e-02, + 0.0000000e+00, + 4.9410585e-02, + 4.9410585e-02, + 4.9410585e-02, + 0.0000000e+00}}, + {{0.0000000e+00, + 4.9410585e-02, + 9.8821178e-02, + 0.0000000e+00, + 4.9410585e-02, + 0.0000000e+00, + 4.9410585e-02, + 9.8821178e-02, + 4.9410585e-02, + 0.0000000e+00}, + {4.9410585e-02, + 4.9410585e-02, + 0.0000000e+00, + 4.9410585e-02, + 0.0000000e+00, + 0.0000000e+00, + 4.9410585e-02, + 0.0000000e+00, + 4.9410585e-02, + 4.9410585e-02}}}, + {{{0.0000000e+00, + 0.0000000e+00, + 4.9410585e-02, + 4.9410585e-02, + 4.9410585e-02, + 0.0000000e+00, + 4.9410585e-02, + 4.9410585e-02, + 4.9410585e-02, + 4.9410585e-02}, + {9.8821178e-02, + 4.9410585e-02, + 0.0000000e+00, + 0.0000000e+00, + 4.9410585e-02, + 4.9410585e-02, + 4.9410585e-02, + 0.0000000e+00, + 4.9410585e-02, + 0.0000000e+00}}, + {{4.9410585e-02, + 0.0000000e+00, + 4.9410585e-02, + 4.9410585e-02, + 0.0000000e+00, + 4.9410585e-02, + 4.9410585e-02, + 4.9410585e-02, + 4.9410585e-02, + 0.0000000e+00}, + {4.9410585e-02, + 4.9410585e-02, + 4.9410585e-02, + 0.0000000e+00, + 4.9410585e-02, + 0.0000000e+00, + 0.0000000e+00, + 0.0000000e+00, + 0.0000000e+00, + 4.9410585e-02}}}}}); + + float *computedOutput = new float[output_ilayernorm->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * output_ilayernorm->size(), + cudaMemcpyDeviceToHost); + + // test if forward result are as expected + for (int i = 0; i < output_ilayernorm->size(); i++) { + const float targetOutput = *( + static_cast<float *>(output_ilayernorm->getImpl()->rawPtr()) + + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } - - } - + } } TEST_CASE("[gpu/operator] ILayerNorm(backward)", "[ILayerNorm][GPU]") -{ - std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array4D<float,1,1,1,8> { //NCHW - { - { - { - {1.46650600, 1.24083233, -0.33106008, -0.15137172, 0.06625678, -1.8326609, 
0.53444749, -0.05167147}, - }, - }, - } - }); - - std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array4D<float,1,1,1,8> { //NCHW +{ + std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>( + Array4D<float, 1, 1, 1, 8>{// NCHW + { + { + { + {1.46650600, + 1.24083233, + -0.33106008, + -0.15137172, + 0.06625678, + -1.8326609, + 0.53444749, + -0.05167147}, + }, + }, + }}); + + std::shared_ptr<Tensor> myBias = + std::make_shared<Tensor>(Array4D<float, 1, 1, 1, 8>{ + // NCHW { + { { - { - {0.96, 0.54, 0.22, -0.15, 0.17, 0.26, -0.85, 0.5}, - }, + {0.96, 0.54, 0.22, -0.15, 0.17, 0.26, -0.85, 0.5}, }, - } - }); - - std::shared_ptr<Tensor> myWeight = std::make_shared<Tensor>(Array4D<float,1,1,1,8> { //NCHW + }, + }}); + + std::shared_ptr<Tensor> myWeight = + std::make_shared<Tensor>(Array4D<float, 1, 1, 1, 8>{ + // NCHW { + { { - { - {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}, - }, + {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}, }, - } - }); - + }, + }}); - myWeight->setBackend("cuda"); - myBias->setBackend("cuda"); + myWeight->setBackend("cuda"); + myBias->setBackend("cuda"); - std::shared_ptr<Node> myILayerNorm = ILayerNorm(); - auto op = std::static_pointer_cast<OperatorTensor>(myILayerNorm -> getOperator()); + std::shared_ptr<Node> myILayerNorm = ILayerNorm(); + auto op = + std::static_pointer_cast<OperatorTensor>(myILayerNorm->getOperator()); - op -> associateInput(1, myWeight); - op -> associateInput(2, myBias); + op->associateInput(1, myWeight); + op->associateInput(2, myBias); - input0->setBackend("cuda"); + input0->setBackend("cuda"); - op -> associateInput(0,input0); - op->setDataType(DataType::Float32); - op->setBackend("cuda"); - myILayerNorm->forward(); + op->associateInput(0, input0); + op->setDataType(DataType::Float32); + op->setBackend("cuda"); + myILayerNorm->forward(); - std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array4D<float,1,1,1,8> { + std::shared_ptr<Tensor> myOutputGrad = + std::make_shared<Tensor>(Array4D<float, 1, 1, 1, 8>{{ { { - { - { 1.34347093, 0.90813798, 0.39607167, 1.20428133, 0.16845724, 0.48487359, 0.40748054, -0.21790814}, - }, + {1.34347093, + 0.90813798, + 0.39607167, + 1.20428133, + 0.16845724, + 0.48487359, + 0.40748054, + -0.21790814}, }, - } - }); - + }, + }}); myOutputGrad->setBackend("cuda"); std::shared_ptr<Tensor> predictedOutput = op->getOutput(0); @@ -177,24 +257,35 @@ TEST_CASE("[gpu/operator] ILayerNorm(backward)", "[ILayerNorm][GPU]") predictedOutput->setGrad(myOutputGrad); REQUIRE_NOTHROW(myILayerNorm->backward()); - std::shared_ptr<Tensor> expectedInputGradILayerNorm = std::make_shared<Tensor>(Array4D<float,1,1,1,8> { + std::shared_ptr<Tensor> expectedInputGradILayerNorm = + std::make_shared<Tensor>(Array4D<float, 1, 1, 1, 8>{{ { { - { - { 0.467678, 0.310749, 0.1129, 0.351786, 0.0507252, 0.101587, 0.130249, -0.0646476}, - }, + {0.467678, + 0.310749, + 0.1129, + 0.351786, + 0.0507252, + 0.101587, + 0.130249, + -0.0646476}, }, - } - }); - + }, + }}); float *computedInputGradCuda = new float[myOutputGrad->size()](); - cudaMemcpy(computedInputGradCuda, op->getInput(0)->grad()->getImpl()->rawPtr(), sizeof(float) * myOutputGrad->size(), cudaMemcpyDeviceToHost); - - //test if backward result are as expected - for(int i = 0; i < expectedInputGradILayerNorm->size(); i++){ - const float targetOutput = *(static_cast<float*>(expectedInputGradILayerNorm->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedInputGradCuda[i] - targetOutput) < 2e-6); + cudaMemcpy(computedInputGradCuda, + op->getInput(0)->grad()->getImpl()->rawPtr(), + 
sizeof(float) * myOutputGrad->size(), + cudaMemcpyDeviceToHost); + + // test if backward result are as expected + for (int i = 0; i < expectedInputGradILayerNorm->size(); i++) { + const float targetOutput = + *(static_cast<float *>( + expectedInputGradILayerNorm->getImpl()->rawPtr()) + + i); + REQUIRE(fabs(computedInputGradCuda[i] - targetOutput) < 2e-6); } delete[] computedInputGradCuda; diff --git a/unit_tests/Test_LnImpl.cpp b/unit_tests/Test_LnImpl.cpp index 06e2205ba38ce0becd0326bf4d258b9f55a228bd..9933b4b3108cf467cfd601dfba62c20b88e0b44e 100644 --- a/unit_tests/Test_LnImpl.cpp +++ b/unit_tests/Test_LnImpl.cpp @@ -11,7 +11,7 @@ #include <array> #include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution #include <catch2/catch_test_macros.hpp> @@ -23,84 +23,99 @@ namespace Aidge { TEST_CASE("[gpu/operator] Ln", "[Ln][GPU]") { -constexpr std::uint16_t NBTRIALS = 10; - // Create a random number generator - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_real_distribution<float> valueDist( - 0.1f, 1.1f); // Random float distribution between 0 and 1 - std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1), - std::size_t(10)); - std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(8)); - - // To measure execution time of 'forward()' - std::chrono::time_point<std::chrono::system_clock> start; - std::chrono::time_point<std::chrono::system_clock> end; - std::chrono::duration<double, std::micro> duration{}; - std::size_t number_of_operation = 0; - for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) - { - // Create Ln Operator CUDA - std::shared_ptr<Node> myLnCUDA = Ln(); - auto op_cuda = std::static_pointer_cast<OperatorTensor>(myLnCUDA -> getOperator()); - - // Create Ln Operator CPU - std::shared_ptr<Node> myLnCPU = Ln(); - auto op_cpu = std::static_pointer_cast<OperatorTensor>(myLnCPU -> getOperator()); - op_cpu->setDataType(DataType::Float32); - op_cpu->setBackend("cpu"); - - const std::size_t nbDims = nbDimsDist(gen); - std::vector<std::size_t> dims; - for (std::size_t i = 0; i < nbDims; ++i) { - dims.push_back(dimSizeDist(gen)); - } - - const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); - number_of_operation += nb_elements; - float* array0 = new float[nb_elements]; - - for (std::size_t i = 0; i < nb_elements; ++i) { - array0[i] = valueDist(gen); - } - - // input0 CUDA - float* array0_d; - std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); - T0_cuda->setDataType(DataType::Float32); - T0_cuda->setBackend("cuda"); - T0_cuda->resize(dims); - op_cuda->associateInput(0, T0_cuda); - cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements); - cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements, cudaMemcpyHostToDevice); - T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements); - - // input0 CPU - std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(0,T0_cpu); - T0_cpu->setDataType(DataType::Float32); - T0_cpu->setBackend("cpu"); - T0_cpu->resize(dims); - T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements); - - // forward CUDA - op_cuda->setDataType(DataType::Float32); - op_cuda->setBackend("cuda"); - start = std::chrono::system_clock::now(); - op_cuda->forward(); - end = std::chrono::system_clock::now(); - duration += 
std::chrono::duration_cast<std::chrono::microseconds>(end - start); - - // forward CPU - op_cpu->forward(); - float *computedCPU = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr()); - - std::shared_ptr<Tensor> outputFallback; - const auto& cudaOutput = op_cuda->getOutput(0)->refCastFrom(outputFallback, *op_cpu->getOutput(0)); - REQUIRE(approxEq<float>(cudaOutput, *(op_cpu->getOutput(0)))); - - delete[] array0; - cudaFree(array0_d); + constexpr std::uint16_t NBTRIALS = 10; + // Create a random number generator + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<float> valueDist( + 0.1f, + 1.1f); // Random float distribution between 0 and 1 + std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1), + std::size_t(10)); + std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), + std::size_t(8)); + + // To measure execution time of 'forward()' + std::chrono::time_point<std::chrono::system_clock> start; + std::chrono::time_point<std::chrono::system_clock> end; + std::chrono::duration<double, std::micro> duration{}; + std::size_t number_of_operation = 0; + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { + // Create Ln Operator CUDA + std::shared_ptr<Node> myLnCUDA = Ln(); + auto op_cuda = + std::static_pointer_cast<OperatorTensor>(myLnCUDA->getOperator()); + + // Create Ln Operator CPU + std::shared_ptr<Node> myLnCPU = Ln(); + auto op_cpu = + std::static_pointer_cast<OperatorTensor>(myLnCPU->getOperator()); + op_cpu->setDataType(DataType::Float32); + op_cpu->setBackend("cpu"); + + const std::size_t nbDims = nbDimsDist(gen); + std::vector<std::size_t> dims; + for (std::size_t i = 0; i < nbDims; ++i) { + dims.push_back(dimSizeDist(gen)); } + + const std::size_t nb_elements = + std::accumulate(dims.cbegin(), + dims.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); + number_of_operation += nb_elements; + float *array0 = new float[nb_elements]; + + for (std::size_t i = 0; i < nb_elements; ++i) { + array0[i] = valueDist(gen); + } + + // input0 CUDA + float *array0_d; + std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); + T0_cuda->setDataType(DataType::Float32); + T0_cuda->setBackend("cuda"); + T0_cuda->resize(dims); + op_cuda->associateInput(0, T0_cuda); + cudaMalloc(reinterpret_cast<void **>(&array0_d), + sizeof(float) * nb_elements); + cudaMemcpy(array0_d, + array0, + sizeof(float) * nb_elements, + cudaMemcpyHostToDevice); + T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements); + + // input0 CPU + std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); + op_cpu->associateInput(0, T0_cpu); + T0_cpu->setDataType(DataType::Float32); + T0_cpu->setBackend("cpu"); + T0_cpu->resize(dims); + T0_cpu->getImpl()->setRawPtr(array0, nb_elements); + + // forward CUDA + op_cuda->setDataType(DataType::Float32); + op_cuda->setBackend("cuda"); + start = std::chrono::system_clock::now(); + op_cuda->forward(); + end = std::chrono::system_clock::now(); + duration += + std::chrono::duration_cast<std::chrono::microseconds>(end - start); + + // forward CPU + op_cpu->forward(); + float *computedCPU = + static_cast<float *>(op_cpu->getOutput(0)->getImpl()->rawPtr()); + + std::shared_ptr<Tensor> outputFallback; + const auto &cudaOutput = + op_cuda->getOutput(0)->refCastFrom(outputFallback, + *op_cpu->getOutput(0)); + REQUIRE(approxEq<float>(cudaOutput, *(op_cpu->getOutput(0)))); + + delete[] array0; + cudaFree(array0_d); + } } } // namespace Aidge diff --git a/unit_tests/Test_MaxPoolingImpl.cpp 
b/unit_tests/Test_MaxPoolingImpl.cpp index 99850a0715cf8feb3164d58c410a1ef689feece1..9990a4e60970c336be426bb8bfdc2c412bd85f29 100644 --- a/unit_tests/Test_MaxPoolingImpl.cpp +++ b/unit_tests/Test_MaxPoolingImpl.cpp @@ -11,7 +11,7 @@ #include <array> #include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution #include <catch2/catch_test_macros.hpp> @@ -22,68 +22,57 @@ using namespace Aidge; - TEST_CASE("[gpu/operator] MaxPooling(forward)", "[MaxPooling][GPU]") { - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,2,5,5> { //NCHW - { - { - {{-0.3848, 0.2166, -0.4373, 0.6142, 0.5277}, - {0.7995, 0.3638, -1.4589, -1.0843, 1.0918}, - {0.7147, 0.0936, -1.2902, 1.2037, 0.4874}, - {-0.5981, 2.1184, -0.9175, 1.3859, 0.3305}, - {-1.7700, 0.0563, -0.3914, 0.0538, -0.3955}}, - - {{-3.1409, -0.4554, 0.0524, 2.2291, 0.4859}, - {-0.7465, -0.6567, -2.3703, -0.6386, -1.4152}, - { 2.2329, -0.5850, 0.0700, 1.2838, -1.7363}, - { 0.2139, 0.0624, -1.0689, -0.8221, -0.8038}, - { 0.1886, -0.7840, -0.2313, 0.2651, -1.6244}} - }, - { - {{ 0.4371, 1.6417, 0.9129, 0.6325, 0.5438}, - {-2.3552, -0.8850, -0.0232, -0.5462, -1.2011}, - {1.7653, -1.6668, -1.0814, 0.6182, 1.2071}, - {0.9541, -0.5133, 0.8664, -0.8892, 1.4585}, - {1.0220, -0.5107, 0.1829, -0.2301, -0.4268}}, - - {{ 1.0429, 0.6279, -0.2875, 0.7187, -0.1500}, - {1.6041, 2.9635, 1.4172, -0.7517, 0.5441}, - {-0.2276, 0.0857, 0.6776, -0.1389, -0.0614}, - {-0.1547, -0.3435, 0.0650, -0.5095, -1.8073}, - {1.7217, 0.3999, -0.5953, 1.0604, -0.4126}} - } - } - }); + std::shared_ptr<Tensor> myInput = + std::make_shared<Tensor>(Array4D<float, 2, 2, 5, 5>{ + // NCHW + {{{{-0.3848, 0.2166, -0.4373, 0.6142, 0.5277}, + {0.7995, 0.3638, -1.4589, -1.0843, 1.0918}, + {0.7147, 0.0936, -1.2902, 1.2037, 0.4874}, + {-0.5981, 2.1184, -0.9175, 1.3859, 0.3305}, + {-1.7700, 0.0563, -0.3914, 0.0538, -0.3955}}, + + {{-3.1409, -0.4554, 0.0524, 2.2291, 0.4859}, + {-0.7465, -0.6567, -2.3703, -0.6386, -1.4152}, + {2.2329, -0.5850, 0.0700, 1.2838, -1.7363}, + {0.2139, 0.0624, -1.0689, -0.8221, -0.8038}, + {0.1886, -0.7840, -0.2313, 0.2651, -1.6244}}}, + {{{0.4371, 1.6417, 0.9129, 0.6325, 0.5438}, + {-2.3552, -0.8850, -0.0232, -0.5462, -1.2011}, + {1.7653, -1.6668, -1.0814, 0.6182, 1.2071}, + {0.9541, -0.5133, 0.8664, -0.8892, 1.4585}, + {1.0220, -0.5107, 0.1829, -0.2301, -0.4268}}, + + {{1.0429, 0.6279, -0.2875, 0.7187, -0.1500}, + {1.6041, 2.9635, 1.4172, -0.7517, 0.5441}, + {-0.2276, 0.0857, 0.6776, -0.1389, -0.0614}, + {-0.1547, -0.3435, 0.0650, -0.5095, -1.8073}, + {1.7217, 0.3999, -0.5953, 1.0604, -0.4126}}}}}); SECTION("Stride") { - std::shared_ptr<Node> myMaxPool = MaxPooling({2,2}, "mycdw", {2,2}); - auto op = std::static_pointer_cast<OperatorTensor>(myMaxPool -> getOperator()); - - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,2,2,2> { - { - { - {{ 0.7995, 0.6142}, - { 2.1184, 1.3859}}, - {{ -0.4554, 2.2291}, - { 2.2329, 1.2838}} - }, - { - {{1.6417, 0.9129}, - {1.7653, 0.8664}}, - {{2.9635, 1.4172}, - {0.0857, 0.6776}} - } - } - }); - myMaxPool->getOperator()->associateInput(0,myInput); + std::shared_ptr<Node> myMaxPool = MaxPooling({2, 2}, "mycdw", {2, 2}); + auto op = + std::static_pointer_cast<OperatorTensor>(myMaxPool->getOperator()); + + std::shared_ptr<Tensor> myOutput = + std::make_shared<Tensor>(Array4D<float, 2, 2, 2, 2>{ + {{{{0.7995, 0.6142}, {2.1184, 1.3859}}, + 
{{-0.4554, 2.2291}, {2.2329, 1.2838}}}, + {{{1.6417, 0.9129}, {1.7653, 0.8664}}, + {{2.9635, 1.4172}, {0.0857, 0.6776}}}}}); + myMaxPool->getOperator()->associateInput(0, myInput); myMaxPool->getOperator()->setDataType(DataType::Float32); myMaxPool->getOperator()->setBackend("cuda"); myMaxPool->forward(); - - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + float *computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); + + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } @@ -98,86 +87,104 @@ TEST_CASE("[gpu/operator] MaxPooling(forward)", "[MaxPooling][GPU]") { std::random_device rd; std::mt19937 gen(rd()); std::uniform_real_distribution<float> valueDist( - 0.1f, 1.1f); // Random float distribution between 0 and 1 - std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(kernel), - std::size_t(10)); + 0.1f, + 1.1f); // Random float distribution between 0 and 1 + std::uniform_int_distribution<std::size_t> dimSizeDist( + std::size_t(kernel), + std::size_t(10)); // To measure execution time of 'forward()' std::chrono::time_point<std::chrono::system_clock> start; std::chrono::time_point<std::chrono::system_clock> end; std::chrono::duration<double, std::micro> duration{}; std::size_t number_of_operation = 0; - for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) - { + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { // Create MaxPooling Operator CUDA - std::shared_ptr<Node> myMaxPoolCuda = MaxPooling({kernel, kernel}, "myMaxPoolCuda", {stride, stride}); - auto op_cuda = std::static_pointer_cast<OperatorTensor>(myMaxPoolCuda->getOperator()); + std::shared_ptr<Node> myMaxPoolCuda = MaxPooling({kernel, kernel}, + "myMaxPoolCuda", + {stride, stride}); + auto op_cuda = std::static_pointer_cast<OperatorTensor>( + myMaxPoolCuda->getOperator()); op_cuda->setDataType(DataType::Float32); op_cuda->setBackend("cuda"); // Create MaxPooling Operator CUDA - std::shared_ptr<Node> myMaxPoolCpu = MaxPooling({kernel, kernel}, "myMaxPoolCpu", {stride, stride}); - auto op_cpu = std::static_pointer_cast<OperatorTensor>(myMaxPoolCpu->getOperator()); + std::shared_ptr<Node> myMaxPoolCpu = + MaxPooling({kernel, kernel}, "myMaxPoolCpu", {stride, stride}); + auto op_cpu = std::static_pointer_cast<OperatorTensor>( + myMaxPoolCpu->getOperator()); op_cpu->setDataType(DataType::Float32); op_cpu->setBackend("cpu"); // generate a random Tensor const std::size_t nbDims = 4; std::vector<std::size_t> dims; - for (std::size_t i = 0; i < nbDims; ++i) - { + for (std::size_t i = 0; i < nbDims; ++i) { dims.push_back(dimSizeDist(gen)); } - const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t nb_elements = + std::accumulate(dims.cbegin(), + dims.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); number_of_operation += nb_elements; // Fill input tensor float *array0 = new float[nb_elements]; - for (std::size_t i = 0; i < nb_elements; ++i) - { + for (std::size_t i = 0; i < nb_elements; 
++i) { array0[i] = valueDist(gen); } // input0 CUDA - float* array0_d; + float *array0_d; std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); T0_cuda->setDataType(DataType::Float32); T0_cuda->setBackend("cuda"); T0_cuda->resize(dims); op_cuda->associateInput(0, T0_cuda); - cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements); - cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&array0_d), + sizeof(float) * nb_elements); + cudaMemcpy(array0_d, + array0, + sizeof(float) * nb_elements, + cudaMemcpyHostToDevice); T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements); // input0 CPU std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(0,T0_cpu); + op_cpu->associateInput(0, T0_cpu); T0_cpu->setDataType(DataType::Float32); T0_cpu->setBackend("cpu"); T0_cpu->resize(dims); - T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements); + T0_cpu->getImpl()->setRawPtr(array0, nb_elements); - // Run inference + // Run inference start = std::chrono::system_clock::now(); op_cuda->forward(); end = std::chrono::system_clock::now(); - duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + duration += std::chrono::duration_cast<std::chrono::microseconds>( + end - start); - const std::size_t outSize = op_cuda->getOutput(0)->size(); + const std::size_t outSize = op_cuda->getOutput(0)->size(); float *computed_cuda = new float[outSize](); - cudaMemcpy(computed_cuda, op_cuda->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * outSize, cudaMemcpyDeviceToHost); + cudaMemcpy(computed_cuda, + op_cuda->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * outSize, + cudaMemcpyDeviceToHost); // forward CPU op_cpu->forward(); - float *computed_cpu = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr()); + float *computed_cpu = static_cast<float *>( + op_cpu->getOutput(0)->getImpl()->rawPtr()); REQUIRE(approxEq<float>(*computed_cuda, *computed_cpu)); delete[] computed_cuda; delete[] array0; cudaFree(array0_d); } - std::cout << "number of elements over time spent: " << (number_of_operation / duration.count()) << std::endl; + std::cout << "number of elements over time spent: " + << (number_of_operation / duration.count()) << std::endl; std::cout << "total time: " << duration.count() << "μs" << std::endl; } } \ No newline at end of file diff --git a/unit_tests/Test_MulImpl.cpp b/unit_tests/Test_MulImpl.cpp index 9eaba6e80971a7075576cd3d4d409b79dac4eb0c..f4996f85269a6c1a777c2c3fa38ffcee1afc81b9 100644 --- a/unit_tests/Test_MulImpl.cpp +++ b/unit_tests/Test_MulImpl.cpp @@ -11,7 +11,7 @@ #include <array> #include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution #include <catch2/catch_test_macros.hpp> @@ -23,118 +23,145 @@ namespace Aidge { TEST_CASE("[gpu/operator] Mul", "[Mul][GPU]") { -constexpr std::uint16_t NBTRIALS = 10; - // Create a random number generator - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_real_distribution<float> valueDist( - 0.1f, 1.1f); // Random float distribution between 0 and 1 - std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1), - std::size_t(10)); - std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(4), std::size_t(5)); - std::uniform_int_distribution<int> boolDist(0,1); - - // To measure execution time of 'forward()' - 
std::chrono::time_point<std::chrono::system_clock> start; - std::chrono::time_point<std::chrono::system_clock> end; - std::chrono::duration<double, std::micro> duration{}; - std::size_t number_of_operation = 0; - for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) - { - // Create Mul Operator CUDA - std::shared_ptr<Node> myMulCUDA = Mul(); - auto op_cuda = std::static_pointer_cast<OperatorTensor>(myMulCUDA -> getOperator()); - - // Create Mul Operator CPU - std::shared_ptr<Node> myMulCPU = Mul(); - auto op_cpu = std::static_pointer_cast<OperatorTensor>(myMulCPU -> getOperator()); - op_cpu->setDataType(DataType::Float32); - op_cpu->setBackend("cpu"); - - const std::size_t nbDims = nbDimsDist(gen); - std::vector<std::size_t> dims0, dims1, dims; - for (std::size_t i = 0; i < nbDims; ++i) { - const std::size_t dim = dimSizeDist(gen); - dims0.push_back(dim); - if (boolDist(gen)) { - dims1.push_back(1); - }else{ - dims1.push_back(dim); - } - dims.push_back(std::max(dims0[i], dims1[i])); - } - - const std::size_t nb_elements0 = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); - const std::size_t nb_elements1 = std::accumulate(dims1.cbegin(), dims1.cend(), std::size_t(1), std::multiplies<std::size_t>()); - const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); - number_of_operation += nb_elements; - float* array0 = new float[nb_elements0]; - float* array1 = new float[nb_elements1]; - - for (std::size_t i = 0; i < nb_elements0; ++i) { - array0[i] = valueDist(gen); - } - for (std::size_t i = 0; i < nb_elements1; ++i) { - array1[i] = valueDist(gen); + constexpr std::uint16_t NBTRIALS = 10; + // Create a random number generator + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<float> valueDist( + 0.1f, + 1.1f); // Random float distribution between 0 and 1 + std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1), + std::size_t(10)); + std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(4), + std::size_t(5)); + std::uniform_int_distribution<int> boolDist(0, 1); + + // To measure execution time of 'forward()' + std::chrono::time_point<std::chrono::system_clock> start; + std::chrono::time_point<std::chrono::system_clock> end; + std::chrono::duration<double, std::micro> duration{}; + std::size_t number_of_operation = 0; + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { + // Create Mul Operator CUDA + std::shared_ptr<Node> myMulCUDA = Mul(); + auto op_cuda = + std::static_pointer_cast<OperatorTensor>(myMulCUDA->getOperator()); + + // Create Mul Operator CPU + std::shared_ptr<Node> myMulCPU = Mul(); + auto op_cpu = + std::static_pointer_cast<OperatorTensor>(myMulCPU->getOperator()); + op_cpu->setDataType(DataType::Float32); + op_cpu->setBackend("cpu"); + + const std::size_t nbDims = nbDimsDist(gen); + std::vector<std::size_t> dims0, dims1, dims; + for (std::size_t i = 0; i < nbDims; ++i) { + const std::size_t dim = dimSizeDist(gen); + dims0.push_back(dim); + if (boolDist(gen)) { + dims1.push_back(1); + } else { + dims1.push_back(dim); } + dims.push_back(std::max(dims0[i], dims1[i])); + } - // input0 CUDA - float* array0_d, *array1_d; - std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); - T0_cuda->setDataType(DataType::Float32); - T0_cuda->setBackend("cuda"); - T0_cuda->resize(dims0); - op_cuda->associateInput(0, T0_cuda); - cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements0); - 
cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements0, cudaMemcpyHostToDevice); - T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements0); - - // input0 CPU - std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(0,T0_cpu); - T0_cpu->setDataType(DataType::Float32); - T0_cpu->setBackend("cpu"); - T0_cpu->resize(dims0); - T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements0); - - // input1 CUDA - std::shared_ptr<Tensor> T1_cuda = std::make_shared<Tensor>(); - T1_cuda->setDataType(DataType::Float32); - T1_cuda->setBackend("cuda"); - T1_cuda->resize(dims1); - op_cuda->associateInput(1, T1_cuda); - cudaMalloc(reinterpret_cast<void **>(&array1_d), sizeof(float) * nb_elements1); - cudaMemcpy(array1_d, array1, sizeof(float) * nb_elements1, cudaMemcpyHostToDevice); - T1_cuda->getImpl()->setRawPtr(array1_d, nb_elements1); - - // input1 CPU - std::shared_ptr<Tensor> T1_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(1,T1_cpu); - T1_cpu->setDataType(DataType::Float32); - T1_cpu->setBackend("cpu"); - T1_cpu->resize(dims1); - T1_cpu -> getImpl() -> setRawPtr(array1, nb_elements1); - - // forward CUDA - op_cuda->setDataType(DataType::Float32); - op_cuda->setBackend("cuda"); - start = std::chrono::system_clock::now(); - op_cuda->forward(); - end = std::chrono::system_clock::now(); - duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); - - // forward CPU - op_cpu->forward(); - float *computedCPU = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr()); - - std::shared_ptr<Tensor> outputFallback; - const auto& cudaOutput = op_cuda->getOutput(0)->refCastFrom(outputFallback, *op_cpu->getOutput(0)); - REQUIRE(approxEq<float>(cudaOutput, *(op_cpu->getOutput(0)))); - - delete[] array0; - delete[] array1; - cudaFree(array0_d); - cudaFree(array1_d); + const std::size_t nb_elements0 = + std::accumulate(dims0.cbegin(), + dims0.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); + const std::size_t nb_elements1 = + std::accumulate(dims1.cbegin(), + dims1.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); + const std::size_t nb_elements = + std::accumulate(dims.cbegin(), + dims.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); + number_of_operation += nb_elements; + float *array0 = new float[nb_elements0]; + float *array1 = new float[nb_elements1]; + + for (std::size_t i = 0; i < nb_elements0; ++i) { + array0[i] = valueDist(gen); + } + for (std::size_t i = 0; i < nb_elements1; ++i) { + array1[i] = valueDist(gen); } + + // input0 CUDA + float *array0_d, *array1_d; + std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); + T0_cuda->setDataType(DataType::Float32); + T0_cuda->setBackend("cuda"); + T0_cuda->resize(dims0); + op_cuda->associateInput(0, T0_cuda); + cudaMalloc(reinterpret_cast<void **>(&array0_d), + sizeof(float) * nb_elements0); + cudaMemcpy(array0_d, + array0, + sizeof(float) * nb_elements0, + cudaMemcpyHostToDevice); + T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements0); + + // input0 CPU + std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); + op_cpu->associateInput(0, T0_cpu); + T0_cpu->setDataType(DataType::Float32); + T0_cpu->setBackend("cpu"); + T0_cpu->resize(dims0); + T0_cpu->getImpl()->setRawPtr(array0, nb_elements0); + + // input1 CUDA + std::shared_ptr<Tensor> T1_cuda = std::make_shared<Tensor>(); + T1_cuda->setDataType(DataType::Float32); + T1_cuda->setBackend("cuda"); + T1_cuda->resize(dims1); + op_cuda->associateInput(1, T1_cuda); + cudaMalloc(reinterpret_cast<void 
**>(&array1_d), + sizeof(float) * nb_elements1); + cudaMemcpy(array1_d, + array1, + sizeof(float) * nb_elements1, + cudaMemcpyHostToDevice); + T1_cuda->getImpl()->setRawPtr(array1_d, nb_elements1); + + // input1 CPU + std::shared_ptr<Tensor> T1_cpu = std::make_shared<Tensor>(); + op_cpu->associateInput(1, T1_cpu); + T1_cpu->setDataType(DataType::Float32); + T1_cpu->setBackend("cpu"); + T1_cpu->resize(dims1); + T1_cpu->getImpl()->setRawPtr(array1, nb_elements1); + + // forward CUDA + op_cuda->setDataType(DataType::Float32); + op_cuda->setBackend("cuda"); + start = std::chrono::system_clock::now(); + op_cuda->forward(); + end = std::chrono::system_clock::now(); + duration += + std::chrono::duration_cast<std::chrono::microseconds>(end - start); + + // forward CPU + op_cpu->forward(); + float *computedCPU = + static_cast<float *>(op_cpu->getOutput(0)->getImpl()->rawPtr()); + + std::shared_ptr<Tensor> outputFallback; + const auto &cudaOutput = + op_cuda->getOutput(0)->refCastFrom(outputFallback, + *op_cpu->getOutput(0)); + REQUIRE(approxEq<float>(cudaOutput, *(op_cpu->getOutput(0)))); + + delete[] array0; + delete[] array1; + cudaFree(array0_d); + cudaFree(array1_d); + } } } // namespace Aidge diff --git a/unit_tests/Test_PadImpl.cpp b/unit_tests/Test_PadImpl.cpp index 4e799ea6b7d11c9b446e0e4c8b9d12beae24bb05..0f2488e11b4cf85b8e632e1b21ab7390d33c90f9 100644 --- a/unit_tests/Test_PadImpl.cpp +++ b/unit_tests/Test_PadImpl.cpp @@ -11,7 +11,7 @@ #include <array> #include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution #include <catch2/catch_test_macros.hpp> @@ -26,117 +26,113 @@ TEST_CASE("[gpu/operator] Pad(forward)", "[Pad][GPU]") { SECTION("Symmetric Pad") { const int pv = 0; // pad value - std::shared_ptr<Node> myPad = Pad<2>({1, 1, 1, 1}, "mypad", PadBorderType::Constant, static_cast<double>(pv)); - auto op = std::static_pointer_cast<OperatorTensor>(myPad -> getOperator()); - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,3,5,5> { //NCHW - { - { - {{ 0, 1, 2, 3, 4}, - { 5, 6, 7, 8, 9}, - { 10, 11, 12, 13, 14}, - { 15, 16, 17, 18, 19}, - { 20, 21, 22, 23, 24}}, - - {{ 25, 26, 27, 28, 29}, - { 30, 31, 32, 33, 34}, - { 35, 36, 37, 38, 39}, - { 40, 41, 42, 43, 44}, - { 45, 46, 47, 48, 49}}, - - {{ 50, 51, 52, 53, 54}, - { 55, 56, 57, 58, 59}, - { 60, 61, 62, 63, 64}, - { 65, 66, 67, 68, 69}, - { 70, 71, 72, 73, 74}} - }, - { - {{ 75, 76, 77, 78, 79}, - { 80, 81, 82, 83, 84}, - { 85, 86, 87, 88, 89}, - { 90, 91, 92, 93, 94}, - { 95, 96, 97, 98, 99}}, - - {{100, 101, 102, 103, 104}, - {105, 106, 107, 108, 109}, - {110, 111, 112, 113, 114}, - {115, 116, 117, 118, 119}, - {120, 121, 122, 123, 124}}, - - {{125, 126, 127, 128, 129}, - {130, 131, 132, 133, 134}, - {135, 136, 137, 138, 139}, - {140, 141, 142, 143, 144}, - {145, 146, 147, 148, 149}} - } - } - }); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,3,7,7> { //NCHW - { - { - {{ pv, pv, pv, pv, pv, pv, pv}, - { pv, 0, 1, 2, 3, 4, pv}, - { pv, 5, 6, 7, 8, 9, pv}, - { pv, 10, 11, 12, 13, 14, pv}, - { pv, 15, 16, 17, 18, 19, pv}, - { pv, 20, 21, 22, 23, 24, pv}, - { pv, pv, pv, pv, pv, pv, pv}}, - - {{ pv, pv, pv, pv, pv, pv, pv}, - { pv, 25, 26, 27, 28, 29, pv}, - { pv, 30, 31, 32, 33, 34, pv}, - { pv, 35, 36, 37, 38, 39, pv}, - { pv, 40, 41, 42, 43, 44, pv}, - { pv, 45, 46, 47, 48, 49, pv}, - { pv, pv, pv, pv, pv, pv, pv}}, - - {{ pv, pv, 
pv, pv, pv, pv, pv}, - { pv, 50, 51, 52, 53, 54, pv}, - { pv, 55, 56, 57, 58, 59, pv}, - { pv, 60, 61, 62, 63, 64, pv}, - { pv, 65, 66, 67, 68, 69, pv}, - { pv, 70, 71, 72, 73, 74, pv}, - { pv, pv, pv, pv, pv, pv, pv}} - }, - { - {{ pv, pv, pv, pv, pv, pv, pv}, - { pv, 75, 76, 77, 78, 79, pv}, - { pv, 80, 81, 82, 83, 84, pv}, - { pv, 85, 86, 87, 88, 89, pv}, - { pv, 90, 91, 92, 93, 94, pv}, - { pv, 95, 96, 97, 98, 99, pv}, - { pv, pv, pv, pv, pv, pv, pv}}, - - {{ pv, pv, pv, pv, pv, pv, pv}, - {pv, 100, 101, 102, 103, 104, pv}, - {pv, 105, 106, 107, 108, 109, pv}, - {pv, 110, 111, 112, 113, 114, pv}, - {pv, 115, 116, 117, 118, 119, pv}, - {pv, 120, 121, 122, 123, 124, pv}, - { pv, pv, pv, pv, pv, pv, pv}}, - - {{ pv, pv, pv, pv, pv, pv, pv}, - {pv, 125, 126, 127, 128, 129, pv}, - {pv, 130, 131, 132, 133, 134, pv}, - {pv, 135, 136, 137, 138, 139, pv}, - {pv, 140, 141, 142, 143, 144, pv}, - {pv, 145, 146, 147, 148, 149, pv}, - { pv, pv, pv, pv, pv, pv, pv}} - } - } - }); + std::shared_ptr<Node> myPad = Pad<2>({1, 1, 1, 1}, + "mypad", + PadBorderType::Constant, + static_cast<double>(pv)); + auto op = + std::static_pointer_cast<OperatorTensor>(myPad->getOperator()); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>( + Array4D<float, 2, 3, 5, 5>{// NCHW + {{{{0, 1, 2, 3, 4}, + {5, 6, 7, 8, 9}, + {10, 11, 12, 13, 14}, + {15, 16, 17, 18, 19}, + {20, 21, 22, 23, 24}}, + + {{25, 26, 27, 28, 29}, + {30, 31, 32, 33, 34}, + {35, 36, 37, 38, 39}, + {40, 41, 42, 43, 44}, + {45, 46, 47, 48, 49}}, + + {{50, 51, 52, 53, 54}, + {55, 56, 57, 58, 59}, + {60, 61, 62, 63, 64}, + {65, 66, 67, 68, 69}, + {70, 71, 72, 73, 74}}}, + {{{75, 76, 77, 78, 79}, + {80, 81, 82, 83, 84}, + {85, 86, 87, 88, 89}, + {90, 91, 92, 93, 94}, + {95, 96, 97, 98, 99}}, + + {{100, 101, 102, 103, 104}, + {105, 106, 107, 108, 109}, + {110, 111, 112, 113, 114}, + {115, 116, 117, 118, 119}, + {120, 121, 122, 123, 124}}, + + {{125, 126, 127, 128, 129}, + {130, 131, 132, 133, 134}, + {135, 136, 137, 138, 139}, + {140, 141, 142, 143, 144}, + {145, 146, 147, 148, 149}}}}}); + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>( + Array4D<float, 2, 3, 7, 7>{// NCHW + {{{{pv, pv, pv, pv, pv, pv, pv}, + {pv, 0, 1, 2, 3, 4, pv}, + {pv, 5, 6, 7, 8, 9, pv}, + {pv, 10, 11, 12, 13, 14, pv}, + {pv, 15, 16, 17, 18, 19, pv}, + {pv, 20, 21, 22, 23, 24, pv}, + {pv, pv, pv, pv, pv, pv, pv}}, + + {{pv, pv, pv, pv, pv, pv, pv}, + {pv, 25, 26, 27, 28, 29, pv}, + {pv, 30, 31, 32, 33, 34, pv}, + {pv, 35, 36, 37, 38, 39, pv}, + {pv, 40, 41, 42, 43, 44, pv}, + {pv, 45, 46, 47, 48, 49, pv}, + {pv, pv, pv, pv, pv, pv, pv}}, + + {{pv, pv, pv, pv, pv, pv, pv}, + {pv, 50, 51, 52, 53, 54, pv}, + {pv, 55, 56, 57, 58, 59, pv}, + {pv, 60, 61, 62, 63, 64, pv}, + {pv, 65, 66, 67, 68, 69, pv}, + {pv, 70, 71, 72, 73, 74, pv}, + {pv, pv, pv, pv, pv, pv, pv}}}, + {{{pv, pv, pv, pv, pv, pv, pv}, + {pv, 75, 76, 77, 78, 79, pv}, + {pv, 80, 81, 82, 83, 84, pv}, + {pv, 85, 86, 87, 88, 89, pv}, + {pv, 90, 91, 92, 93, 94, pv}, + {pv, 95, 96, 97, 98, 99, pv}, + {pv, pv, pv, pv, pv, pv, pv}}, + + {{pv, pv, pv, pv, pv, pv, pv}, + {pv, 100, 101, 102, 103, 104, pv}, + {pv, 105, 106, 107, 108, 109, pv}, + {pv, 110, 111, 112, 113, 114, pv}, + {pv, 115, 116, 117, 118, 119, pv}, + {pv, 120, 121, 122, 123, 124, pv}, + {pv, pv, pv, pv, pv, pv, pv}}, + + {{pv, pv, pv, pv, pv, pv, pv}, + {pv, 125, 126, 127, 128, 129, pv}, + {pv, 130, 131, 132, 133, 134, pv}, + {pv, 135, 136, 137, 138, 139, pv}, + {pv, 140, 141, 142, 143, 144, pv}, + {pv, 145, 146, 147, 148, 149, pv}, + {pv, pv, 
pv, pv, pv, pv, pv}}}}}); myInput->setBackend("cuda"); - myPad->getOperator()->associateInput(0,myInput); + myPad->getOperator()->associateInput(0, myInput); myPad->getOperator()->setDataType(DataType::Float32); myPad->getOperator()->setBackend("cuda"); myPad->forward(); - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); + float *computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } @@ -146,111 +142,107 @@ TEST_CASE("[gpu/operator] Pad(forward)", "[Pad][GPU]") { SECTION("Asymmetric Pad") { const int pv = 0; // pad value - std::shared_ptr<Node> myPad = Pad<2>({1, 0, 0, 1}, "mypad", PadBorderType::Constant, static_cast<double>(pv)); - auto op = std::static_pointer_cast<OperatorTensor>(myPad -> getOperator()); - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,3,5,5> { //NCHW - { - { - {{ 0, 1, 2, 3, 4}, - { 5, 6, 7, 8, 9}, - { 10, 11, 12, 13, 14}, - { 15, 16, 17, 18, 19}, - { 20, 21, 22, 23, 24}}, - - {{ 25, 26, 27, 28, 29}, - { 30, 31, 32, 33, 34}, - { 35, 36, 37, 38, 39}, - { 40, 41, 42, 43, 44}, - { 45, 46, 47, 48, 49}}, - - {{ 50, 51, 52, 53, 54}, - { 55, 56, 57, 58, 59}, - { 60, 61, 62, 63, 64}, - { 65, 66, 67, 68, 69}, - { 70, 71, 72, 73, 74}} - }, - { - {{ 75, 76, 77, 78, 79}, - { 80, 81, 82, 83, 84}, - { 85, 86, 87, 88, 89}, - { 90, 91, 92, 93, 94}, - { 95, 96, 97, 98, 99}}, - - {{100, 101, 102, 103, 104}, - {105, 106, 107, 108, 109}, - {110, 111, 112, 113, 114}, - {115, 116, 117, 118, 119}, - {120, 121, 122, 123, 124}}, - - {{125, 126, 127, 128, 129}, - {130, 131, 132, 133, 134}, - {135, 136, 137, 138, 139}, - {140, 141, 142, 143, 144}, - {145, 146, 147, 148, 149}} - } - } - }); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,3,6,6> { //NCHW - { - { - {{ pv, pv, pv, pv, pv, pv}, - { 0, 1, 2, 3, 4, pv}, - { 5, 6, 7, 8, 9, pv}, - { 10, 11, 12, 13, 14, pv}, - { 15, 16, 17, 18, 19, pv}, - { 20, 21, 22, 23, 24, pv}}, - - {{ pv, pv, pv, pv, pv, pv}, - { 25, 26, 27, 28, 29, pv}, - { 30, 31, 32, 33, 34, pv}, - { 35, 36, 37, 38, 39, pv}, - { 40, 41, 42, 43, 44, pv}, - { 45, 46, 47, 48, 49, pv}}, - - {{ pv, pv, pv, pv, pv, pv}, - { 50, 51, 52, 53, 54, pv}, - { 55, 56, 57, 58, 59, pv}, - { 60, 61, 62, 63, 64, pv}, - { 65, 66, 67, 68, 69, pv}, - { 70, 71, 72, 73, 74, pv}} - }, - { - {{ pv, pv, pv, pv, pv, pv}, - { 75, 76, 77, 78, 79, pv}, - { 80, 81, 82, 83, 84, pv}, - { 85, 86, 87, 88, 89, pv}, - { 90, 91, 92, 93, 94, pv}, - { 95, 96, 97, 98, 99, pv}}, - - {{ pv, pv, pv, pv, pv, pv}, - { 100, 101, 102, 103, 104, pv}, - { 105, 106, 107, 108, 109, pv}, - { 110, 111, 112, 113, 114, pv}, - { 115, 116, 117, 118, 119, pv}, - { 120, 121, 122, 123, 124, pv}}, - - {{ pv, pv, pv, pv, pv, pv}, - { 125, 126, 127, 128, 129, pv}, - { 130, 131, 132, 133, 134, pv}, - { 135, 136, 137, 138, 139, pv}, - { 140, 141, 142, 143, 144, pv}, - { 145, 146, 147, 148, 149, pv}} - } - } - }); + std::shared_ptr<Node> myPad = Pad<2>({1, 0, 0, 1}, + "mypad", + PadBorderType::Constant, + 
static_cast<double>(pv)); + auto op = + std::static_pointer_cast<OperatorTensor>(myPad->getOperator()); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>( + Array4D<float, 2, 3, 5, 5>{// NCHW + {{{{0, 1, 2, 3, 4}, + {5, 6, 7, 8, 9}, + {10, 11, 12, 13, 14}, + {15, 16, 17, 18, 19}, + {20, 21, 22, 23, 24}}, + + {{25, 26, 27, 28, 29}, + {30, 31, 32, 33, 34}, + {35, 36, 37, 38, 39}, + {40, 41, 42, 43, 44}, + {45, 46, 47, 48, 49}}, + + {{50, 51, 52, 53, 54}, + {55, 56, 57, 58, 59}, + {60, 61, 62, 63, 64}, + {65, 66, 67, 68, 69}, + {70, 71, 72, 73, 74}}}, + {{{75, 76, 77, 78, 79}, + {80, 81, 82, 83, 84}, + {85, 86, 87, 88, 89}, + {90, 91, 92, 93, 94}, + {95, 96, 97, 98, 99}}, + + {{100, 101, 102, 103, 104}, + {105, 106, 107, 108, 109}, + {110, 111, 112, 113, 114}, + {115, 116, 117, 118, 119}, + {120, 121, 122, 123, 124}}, + + {{125, 126, 127, 128, 129}, + {130, 131, 132, 133, 134}, + {135, 136, 137, 138, 139}, + {140, 141, 142, 143, 144}, + {145, 146, 147, 148, 149}}}}}); + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>( + Array4D<float, 2, 3, 6, 6>{// NCHW + {{{{pv, pv, pv, pv, pv, pv}, + {0, 1, 2, 3, 4, pv}, + {5, 6, 7, 8, 9, pv}, + {10, 11, 12, 13, 14, pv}, + {15, 16, 17, 18, 19, pv}, + {20, 21, 22, 23, 24, pv}}, + + {{pv, pv, pv, pv, pv, pv}, + {25, 26, 27, 28, 29, pv}, + {30, 31, 32, 33, 34, pv}, + {35, 36, 37, 38, 39, pv}, + {40, 41, 42, 43, 44, pv}, + {45, 46, 47, 48, 49, pv}}, + + {{pv, pv, pv, pv, pv, pv}, + {50, 51, 52, 53, 54, pv}, + {55, 56, 57, 58, 59, pv}, + {60, 61, 62, 63, 64, pv}, + {65, 66, 67, 68, 69, pv}, + {70, 71, 72, 73, 74, pv}}}, + {{{pv, pv, pv, pv, pv, pv}, + {75, 76, 77, 78, 79, pv}, + {80, 81, 82, 83, 84, pv}, + {85, 86, 87, 88, 89, pv}, + {90, 91, 92, 93, 94, pv}, + {95, 96, 97, 98, 99, pv}}, + + {{pv, pv, pv, pv, pv, pv}, + {100, 101, 102, 103, 104, pv}, + {105, 106, 107, 108, 109, pv}, + {110, 111, 112, 113, 114, pv}, + {115, 116, 117, 118, 119, pv}, + {120, 121, 122, 123, 124, pv}}, + + {{pv, pv, pv, pv, pv, pv}, + {125, 126, 127, 128, 129, pv}, + {130, 131, 132, 133, 134, pv}, + {135, 136, 137, 138, 139, pv}, + {140, 141, 142, 143, 144, pv}, + {145, 146, 147, 148, 149, pv}}}}}); myInput->setBackend("cuda"); - myPad->getOperator()->associateInput(0,myInput); + myPad->getOperator()->associateInput(0, myInput); myPad->getOperator()->setDataType(DataType::Float32); myPad->getOperator()->setBackend("cuda"); myPad->forward(); - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); + float *computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } @@ -258,115 +250,110 @@ TEST_CASE("[gpu/operator] Pad(forward)", "[Pad][GPU]") { } SECTION("Pad Edge") { - std::shared_ptr<Node> myPad = Pad<2>({1, 1, 1, 1}, "mypad", PadBorderType::Edge); - auto op = std::static_pointer_cast<OperatorTensor>(myPad -> getOperator()); - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,3,5,5> { //NCHW - { - { - {{ 0, 1, 2, 3, 4}, - { 5, 6, 7, 8, 9}, - { 10, 11, 12, 13, 14}, - { 
15, 16, 17, 18, 19}, - { 20, 21, 22, 23, 24}}, - - {{ 25, 26, 27, 28, 29}, - { 30, 31, 32, 33, 34}, - { 35, 36, 37, 38, 39}, - { 40, 41, 42, 43, 44}, - { 45, 46, 47, 48, 49}}, - - {{ 50, 51, 52, 53, 54}, - { 55, 56, 57, 58, 59}, - { 60, 61, 62, 63, 64}, - { 65, 66, 67, 68, 69}, - { 70, 71, 72, 73, 74}} - }, - { - {{ 75, 76, 77, 78, 79}, - { 80, 81, 82, 83, 84}, - { 85, 86, 87, 88, 89}, - { 90, 91, 92, 93, 94}, - { 95, 96, 97, 98, 99}}, - - {{100, 101, 102, 103, 104}, - {105, 106, 107, 108, 109}, - {110, 111, 112, 113, 114}, - {115, 116, 117, 118, 119}, - {120, 121, 122, 123, 124}}, - - {{125, 126, 127, 128, 129}, - {130, 131, 132, 133, 134}, - {135, 136, 137, 138, 139}, - {140, 141, 142, 143, 144}, - {145, 146, 147, 148, 149}} - } - } - }); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,3,7,7> { //NCHW - { - { - {{ 0, 0, 1, 2, 3, 4, 4}, - { 0, 0, 1, 2, 3, 4, 4}, - { 5, 5, 6, 7, 8, 9, 9}, - { 10, 10, 11, 12, 13, 14, 14}, - { 15, 15, 16, 17, 18, 19, 19}, - { 20, 20, 21, 22, 23, 24, 24}, - { 20, 20, 21, 22, 23, 24, 24}}, - - {{ 25, 25, 26, 27, 28, 29, 29}, - { 25, 25, 26, 27, 28, 29, 29}, - { 30, 30, 31, 32, 33, 34, 34}, - { 35, 35, 36, 37, 38, 39, 39}, - { 40, 40, 41, 42, 43, 44, 44}, - { 45, 45, 46, 47, 48, 49, 49}, - { 45, 45, 46, 47, 48, 49, 49}}, - - {{ 50, 50, 51, 52, 53, 54, 54}, - { 50, 50, 51, 52, 53, 54, 54}, - { 55, 55, 56, 57, 58, 59, 59}, - { 60, 60, 61, 62, 63, 64, 64}, - { 65, 65, 66, 67, 68, 69, 69}, - { 70, 70, 71, 72, 73, 74, 74}, - { 70, 70, 71, 72, 73, 74, 74}} - }, - { - {{ 75, 75, 76, 77, 78, 79, 79}, - { 75, 75, 76, 77, 78, 79, 79}, - { 80, 80, 81, 82, 83, 84, 84}, - { 85, 85, 86, 87, 88, 89, 89}, - { 90, 90, 91, 92, 93, 94, 94}, - { 95, 95, 96, 97, 98, 99, 99}, - { 95, 95, 96, 97, 98, 99, 99}}, - - {{100, 100, 101, 102, 103, 104, 104}, - {100, 100, 101, 102, 103, 104, 104}, - {105, 105, 106, 107, 108, 109, 109}, - {110, 110, 111, 112, 113, 114, 114}, - {115, 115, 116, 117, 118, 119, 119}, - {120, 120, 121, 122, 123, 124, 124}, - {120, 120, 121, 122, 123, 124, 124}}, - - {{125, 125, 126, 127, 128, 129, 129}, - {125, 125, 126, 127, 128, 129, 129}, - {130, 130, 131, 132, 133, 134, 134}, - {135, 135, 136, 137, 138, 139, 139}, - {140, 140, 141, 142, 143, 144, 144}, - {145, 145, 146, 147, 148, 149, 149}, - {145, 145, 146, 147, 148, 149, 149}} - } - } - }); + std::shared_ptr<Node> myPad = + Pad<2>({1, 1, 1, 1}, "mypad", PadBorderType::Edge); + auto op = + std::static_pointer_cast<OperatorTensor>(myPad->getOperator()); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>( + Array4D<float, 2, 3, 5, 5>{// NCHW + {{{{0, 1, 2, 3, 4}, + {5, 6, 7, 8, 9}, + {10, 11, 12, 13, 14}, + {15, 16, 17, 18, 19}, + {20, 21, 22, 23, 24}}, + + {{25, 26, 27, 28, 29}, + {30, 31, 32, 33, 34}, + {35, 36, 37, 38, 39}, + {40, 41, 42, 43, 44}, + {45, 46, 47, 48, 49}}, + + {{50, 51, 52, 53, 54}, + {55, 56, 57, 58, 59}, + {60, 61, 62, 63, 64}, + {65, 66, 67, 68, 69}, + {70, 71, 72, 73, 74}}}, + {{{75, 76, 77, 78, 79}, + {80, 81, 82, 83, 84}, + {85, 86, 87, 88, 89}, + {90, 91, 92, 93, 94}, + {95, 96, 97, 98, 99}}, + + {{100, 101, 102, 103, 104}, + {105, 106, 107, 108, 109}, + {110, 111, 112, 113, 114}, + {115, 116, 117, 118, 119}, + {120, 121, 122, 123, 124}}, + + {{125, 126, 127, 128, 129}, + {130, 131, 132, 133, 134}, + {135, 136, 137, 138, 139}, + {140, 141, 142, 143, 144}, + {145, 146, 147, 148, 149}}}}}); + std::shared_ptr<Tensor> myOutput = + std::make_shared<Tensor>(Array4D<float, 2, 3, 7, 7>{ + // NCHW + {{{{0, 0, 1, 2, 3, 4, 4}, + {0, 0, 1, 2, 3, 4, 4}, + {5, 
5, 6, 7, 8, 9, 9}, + {10, 10, 11, 12, 13, 14, 14}, + {15, 15, 16, 17, 18, 19, 19}, + {20, 20, 21, 22, 23, 24, 24}, + {20, 20, 21, 22, 23, 24, 24}}, + + {{25, 25, 26, 27, 28, 29, 29}, + {25, 25, 26, 27, 28, 29, 29}, + {30, 30, 31, 32, 33, 34, 34}, + {35, 35, 36, 37, 38, 39, 39}, + {40, 40, 41, 42, 43, 44, 44}, + {45, 45, 46, 47, 48, 49, 49}, + {45, 45, 46, 47, 48, 49, 49}}, + + {{50, 50, 51, 52, 53, 54, 54}, + {50, 50, 51, 52, 53, 54, 54}, + {55, 55, 56, 57, 58, 59, 59}, + {60, 60, 61, 62, 63, 64, 64}, + {65, 65, 66, 67, 68, 69, 69}, + {70, 70, 71, 72, 73, 74, 74}, + {70, 70, 71, 72, 73, 74, 74}}}, + {{{75, 75, 76, 77, 78, 79, 79}, + {75, 75, 76, 77, 78, 79, 79}, + {80, 80, 81, 82, 83, 84, 84}, + {85, 85, 86, 87, 88, 89, 89}, + {90, 90, 91, 92, 93, 94, 94}, + {95, 95, 96, 97, 98, 99, 99}, + {95, 95, 96, 97, 98, 99, 99}}, + + {{100, 100, 101, 102, 103, 104, 104}, + {100, 100, 101, 102, 103, 104, 104}, + {105, 105, 106, 107, 108, 109, 109}, + {110, 110, 111, 112, 113, 114, 114}, + {115, 115, 116, 117, 118, 119, 119}, + {120, 120, 121, 122, 123, 124, 124}, + {120, 120, 121, 122, 123, 124, 124}}, + + {{125, 125, 126, 127, 128, 129, 129}, + {125, 125, 126, 127, 128, 129, 129}, + {130, 130, 131, 132, 133, 134, 134}, + {135, 135, 136, 137, 138, 139, 139}, + {140, 140, 141, 142, 143, 144, 144}, + {145, 145, 146, 147, 148, 149, 149}, + {145, 145, 146, 147, 148, 149, 149}}}}}); myInput->setBackend("cuda"); - myPad->getOperator()->associateInput(0,myInput); + myPad->getOperator()->associateInput(0, myInput); myPad->getOperator()->setDataType(DataType::Float32); myPad->getOperator()->setBackend("cuda"); myPad->forward(); - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + float *computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } @@ -374,124 +361,107 @@ TEST_CASE("[gpu/operator] Pad(forward)", "[Pad][GPU]") { } SECTION("Pad Reflect") { - std::shared_ptr<Node> myPad = Pad<2>({1, 1, 1, 1}, "mypad", PadBorderType::Reflect); - auto op = std::static_pointer_cast<OperatorTensor>(myPad -> getOperator()); - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,3,5,5> { //NCHW - { - { - {{ 0, 1, 2, 3, 4}, - { 5, 6, 7, 8, 9}, - { 10, 11, 12, 13, 14}, - { 15, 16, 17, 18, 19}, - { 20, 21, 22, 23, 24}}, - - {{ 25, 26, 27, 28, 29}, - { 30, 31, 32, 33, 34}, - { 35, 36, 37, 38, 39}, - { 40, 41, 42, 43, 44}, - { 45, 46, 47, 48, 49}}, - - {{ 50, 51, 52, 53, 54}, - { 55, 56, 57, 58, 59}, - { 60, 61, 62, 63, 64}, - { 65, 66, 67, 68, 69}, - { 70, 71, 72, 73, 74}} - }, - { - {{ 75, 76, 77, 78, 79}, - { 80, 81, 82, 83, 84}, - { 85, 86, 87, 88, 89}, - { 90, 91, 92, 93, 94}, - { 95, 96, 97, 98, 99}}, - - {{100, 101, 102, 103, 104}, - {105, 106, 107, 108, 109}, - {110, 111, 112, 113, 114}, - {115, 116, 117, 118, 119}, - {120, 121, 122, 123, 124}}, - - {{125, 126, 127, 128, 129}, - {130, 131, 132, 133, 134}, - {135, 136, 137, 138, 139}, - {140, 141, 142, 143, 144}, - {145, 146, 147, 148, 149}} - } - } - }); - 
std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,3,7,7> { //NCHW - { - { - { - { 6, 5, 6, 7, 8, 9, 5}, - { 1, 0, 1, 2, 3, 4, 0}, - { 6, 5, 6, 7, 8, 9, 5}, - { 11, 10, 11, 12, 13, 14, 10}, - { 16, 15, 16, 17, 18, 19, 15}, - { 21, 20, 21, 22, 23, 24, 20}, - { 1, 0, 1, 2, 3, 4, 0} - }, - { - { 31, 30, 31, 32, 33, 34, 30}, - { 26, 25, 26, 27, 28, 29, 25}, - { 31, 30, 31, 32, 33, 34, 30}, - { 36, 35, 36, 37, 38, 39, 35}, - { 41, 40, 41, 42, 43, 44, 40}, - { 46, 45, 46, 47, 48, 49, 45}, - { 26, 25, 26, 27, 28, 29, 25} - }, - { - { 56, 55, 56, 57, 58, 59, 55}, - { 51, 50, 51, 52, 53, 54, 50}, - { 56, 55, 56, 57, 58, 59, 55}, - { 61, 60, 61, 62, 63, 64, 60}, - { 66, 65, 66, 67, 68, 69, 65}, - { 71, 70, 71, 72, 73, 74, 70}, - { 51, 50, 51, 52, 53, 54, 50} - } - }, - { - { - { 81, 80, 81, 82, 83, 84, 80}, - { 76, 75, 76, 77, 78, 79, 75}, - { 81, 80, 81, 82, 83, 84, 80}, - { 86, 85, 86, 87, 88, 89, 85}, - { 91, 90, 91, 92, 93, 94, 90}, - { 96, 95, 96, 97, 98, 99, 95}, - { 76, 75, 76, 77, 78, 79, 75} - }, - { - { 106, 105, 106, 107, 108, 109, 105}, - { 101, 100, 101, 102, 103, 104, 100}, - { 106, 105, 106, 107, 108, 109, 105}, - { 111, 110, 111, 112, 113, 114, 110}, - { 116, 115, 116, 117, 118, 119, 115}, - { 121, 120, 121, 122, 123, 124, 120}, - { 101, 100, 101, 102, 103, 104, 100} - }, - { - { 131, 130, 131, 132, 133, 134, 130}, - { 126, 125, 126, 127, 128, 129, 125}, - { 131, 130, 131, 132, 133, 134, 130}, - { 136, 135, 136, 137, 138, 139, 135}, - { 141, 140, 141, 142, 143, 144, 140}, - { 146, 145, 146, 147, 148, 149, 145}, - { 126, 125, 126, 127, 128, 129, 125} - } - } - } - }); + std::shared_ptr<Node> myPad = + Pad<2>({1, 1, 1, 1}, "mypad", PadBorderType::Reflect); + auto op = + std::static_pointer_cast<OperatorTensor>(myPad->getOperator()); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>( + Array4D<float, 2, 3, 5, 5>{// NCHW + {{{{0, 1, 2, 3, 4}, + {5, 6, 7, 8, 9}, + {10, 11, 12, 13, 14}, + {15, 16, 17, 18, 19}, + {20, 21, 22, 23, 24}}, + + {{25, 26, 27, 28, 29}, + {30, 31, 32, 33, 34}, + {35, 36, 37, 38, 39}, + {40, 41, 42, 43, 44}, + {45, 46, 47, 48, 49}}, + + {{50, 51, 52, 53, 54}, + {55, 56, 57, 58, 59}, + {60, 61, 62, 63, 64}, + {65, 66, 67, 68, 69}, + {70, 71, 72, 73, 74}}}, + {{{75, 76, 77, 78, 79}, + {80, 81, 82, 83, 84}, + {85, 86, 87, 88, 89}, + {90, 91, 92, 93, 94}, + {95, 96, 97, 98, 99}}, + + {{100, 101, 102, 103, 104}, + {105, 106, 107, 108, 109}, + {110, 111, 112, 113, 114}, + {115, 116, 117, 118, 119}, + {120, 121, 122, 123, 124}}, + + {{125, 126, 127, 128, 129}, + {130, 131, 132, 133, 134}, + {135, 136, 137, 138, 139}, + {140, 141, 142, 143, 144}, + {145, 146, 147, 148, 149}}}}}); + std::shared_ptr<Tensor> myOutput = + std::make_shared<Tensor>(Array4D<float, 2, 3, 7, 7>{ + // NCHW + {{{{6, 5, 6, 7, 8, 9, 5}, + {1, 0, 1, 2, 3, 4, 0}, + {6, 5, 6, 7, 8, 9, 5}, + {11, 10, 11, 12, 13, 14, 10}, + {16, 15, 16, 17, 18, 19, 15}, + {21, 20, 21, 22, 23, 24, 20}, + {1, 0, 1, 2, 3, 4, 0}}, + {{31, 30, 31, 32, 33, 34, 30}, + {26, 25, 26, 27, 28, 29, 25}, + {31, 30, 31, 32, 33, 34, 30}, + {36, 35, 36, 37, 38, 39, 35}, + {41, 40, 41, 42, 43, 44, 40}, + {46, 45, 46, 47, 48, 49, 45}, + {26, 25, 26, 27, 28, 29, 25}}, + {{56, 55, 56, 57, 58, 59, 55}, + {51, 50, 51, 52, 53, 54, 50}, + {56, 55, 56, 57, 58, 59, 55}, + {61, 60, 61, 62, 63, 64, 60}, + {66, 65, 66, 67, 68, 69, 65}, + {71, 70, 71, 72, 73, 74, 70}, + {51, 50, 51, 52, 53, 54, 50}}}, + {{{81, 80, 81, 82, 83, 84, 80}, + {76, 75, 76, 77, 78, 79, 75}, + {81, 80, 81, 82, 83, 84, 80}, + {86, 85, 86, 87, 88, 
89, 85}, + {91, 90, 91, 92, 93, 94, 90}, + {96, 95, 96, 97, 98, 99, 95}, + {76, 75, 76, 77, 78, 79, 75}}, + {{106, 105, 106, 107, 108, 109, 105}, + {101, 100, 101, 102, 103, 104, 100}, + {106, 105, 106, 107, 108, 109, 105}, + {111, 110, 111, 112, 113, 114, 110}, + {116, 115, 116, 117, 118, 119, 115}, + {121, 120, 121, 122, 123, 124, 120}, + {101, 100, 101, 102, 103, 104, 100}}, + {{131, 130, 131, 132, 133, 134, 130}, + {126, 125, 126, 127, 128, 129, 125}, + {131, 130, 131, 132, 133, 134, 130}, + {136, 135, 136, 137, 138, 139, 135}, + {141, 140, 141, 142, 143, 144, 140}, + {146, 145, 146, 147, 148, 149, 145}, + {126, 125, 126, 127, 128, 129, 125}}}}}); myInput->setBackend("cuda"); - myPad->getOperator()->associateInput(0,myInput); + myPad->getOperator()->associateInput(0, myInput); myPad->getOperator()->setDataType(DataType::Float32); myPad->getOperator()->setBackend("cuda"); myPad->forward(); - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); + float *computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } @@ -499,116 +469,111 @@ TEST_CASE("[gpu/operator] Pad(forward)", "[Pad][GPU]") { } SECTION("Pad Wrap") { - std::shared_ptr<Node> myPad = Pad<2>({1, 1, 1, 1}, "mypad", PadBorderType::Wrap); - auto op = std::static_pointer_cast<OperatorTensor>(myPad -> getOperator()); - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,3,5,5> { //NCHW - { - { - {{ 0, 1, 2, 3, 4}, - { 5, 6, 7, 8, 9}, - { 10, 11, 12, 13, 14}, - { 15, 16, 17, 18, 19}, - { 20, 21, 22, 23, 24}}, - - {{ 25, 26, 27, 28, 29}, - { 30, 31, 32, 33, 34}, - { 35, 36, 37, 38, 39}, - { 40, 41, 42, 43, 44}, - { 45, 46, 47, 48, 49}}, - - {{ 50, 51, 52, 53, 54}, - { 55, 56, 57, 58, 59}, - { 60, 61, 62, 63, 64}, - { 65, 66, 67, 68, 69}, - { 70, 71, 72, 73, 74}} - }, - { - {{ 75, 76, 77, 78, 79}, - { 80, 81, 82, 83, 84}, - { 85, 86, 87, 88, 89}, - { 90, 91, 92, 93, 94}, - { 95, 96, 97, 98, 99}}, - - {{100, 101, 102, 103, 104}, - {105, 106, 107, 108, 109}, - {110, 111, 112, 113, 114}, - {115, 116, 117, 118, 119}, - {120, 121, 122, 123, 124}}, - - {{125, 126, 127, 128, 129}, - {130, 131, 132, 133, 134}, - {135, 136, 137, 138, 139}, - {140, 141, 142, 143, 144}, - {145, 146, 147, 148, 149}} - } - } - }); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,3,7,7> { //NCHW - { - { - {{ 24, 20, 21, 22, 23, 24, 20}, - { 4, 0, 1, 2, 3, 4, 0}, - { 9, 5, 6, 7, 8, 9, 5}, - { 14, 10, 11, 12, 13, 14, 10}, - { 19, 15, 16, 17, 18, 19, 15}, - { 24, 20, 21, 22, 23, 24, 20}, - { 4, 0, 1, 2, 3, 4, 0}}, - - {{ 49, 45, 46, 47, 48, 49, 45}, - { 29, 25, 26, 27, 28, 29, 25}, - { 34, 30, 31, 32, 33, 34, 30}, - { 39, 35, 36, 37, 38, 39, 35}, - { 44, 40, 41, 42, 43, 44, 40}, - { 49, 45, 46, 47, 48, 49, 45}, - { 29, 25, 26, 27, 28, 29, 25}}, - - {{ 74, 70, 71, 72, 73, 74, 70}, - { 54, 50, 51, 52, 53, 54, 50}, - { 59, 55, 56, 57, 58, 59, 55}, - { 64, 60, 61, 62, 63, 64, 60}, - { 69, 65, 66, 67, 68, 69, 65}, - { 74, 70, 71, 72, 73, 74, 70}, - { 
54, 50, 51, 52, 53, 54, 50}} - }, - { - {{ 99, 95, 96, 97, 98, 99, 95}, - { 79, 75, 76, 77, 78, 79, 75}, - { 84, 80, 81, 82, 83, 84, 80}, - { 89, 85, 86, 87, 88, 89, 85}, - { 94, 90, 91, 92, 93, 94, 90}, - { 99, 95, 96, 97, 98, 99, 95}, - { 79, 75, 76, 77, 78, 79, 75}}, - - {{124, 120, 121, 122, 123, 124, 120}, - {104, 100, 101, 102, 103, 104, 100}, - {109, 105, 106, 107, 108, 109, 105}, - {114, 110, 111, 112, 113, 114, 110}, - {119, 115, 116, 117, 118, 119, 115}, - {124, 120, 121, 122, 123, 124, 120}, - {104, 100, 101, 102, 103, 104, 100}}, - - {{149, 145, 146, 147, 148, 149, 145}, - {129, 125, 126, 127, 128, 129, 125}, - {134, 130, 131, 132, 133, 134, 130}, - {139, 135, 136, 137, 138, 139, 135}, - {144, 140, 141, 142, 143, 144, 140}, - {149, 145, 146, 147, 148, 149, 145}, - {129, 125, 126, 127, 128, 129, 125}} - } - } - }); + std::shared_ptr<Node> myPad = + Pad<2>({1, 1, 1, 1}, "mypad", PadBorderType::Wrap); + auto op = + std::static_pointer_cast<OperatorTensor>(myPad->getOperator()); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>( + Array4D<float, 2, 3, 5, 5>{// NCHW + {{{{0, 1, 2, 3, 4}, + {5, 6, 7, 8, 9}, + {10, 11, 12, 13, 14}, + {15, 16, 17, 18, 19}, + {20, 21, 22, 23, 24}}, + + {{25, 26, 27, 28, 29}, + {30, 31, 32, 33, 34}, + {35, 36, 37, 38, 39}, + {40, 41, 42, 43, 44}, + {45, 46, 47, 48, 49}}, + + {{50, 51, 52, 53, 54}, + {55, 56, 57, 58, 59}, + {60, 61, 62, 63, 64}, + {65, 66, 67, 68, 69}, + {70, 71, 72, 73, 74}}}, + {{{75, 76, 77, 78, 79}, + {80, 81, 82, 83, 84}, + {85, 86, 87, 88, 89}, + {90, 91, 92, 93, 94}, + {95, 96, 97, 98, 99}}, + + {{100, 101, 102, 103, 104}, + {105, 106, 107, 108, 109}, + {110, 111, 112, 113, 114}, + {115, 116, 117, 118, 119}, + {120, 121, 122, 123, 124}}, + + {{125, 126, 127, 128, 129}, + {130, 131, 132, 133, 134}, + {135, 136, 137, 138, 139}, + {140, 141, 142, 143, 144}, + {145, 146, 147, 148, 149}}}}}); + std::shared_ptr<Tensor> myOutput = + std::make_shared<Tensor>(Array4D<float, 2, 3, 7, 7>{ + // NCHW + {{{{24, 20, 21, 22, 23, 24, 20}, + {4, 0, 1, 2, 3, 4, 0}, + {9, 5, 6, 7, 8, 9, 5}, + {14, 10, 11, 12, 13, 14, 10}, + {19, 15, 16, 17, 18, 19, 15}, + {24, 20, 21, 22, 23, 24, 20}, + {4, 0, 1, 2, 3, 4, 0}}, + + {{49, 45, 46, 47, 48, 49, 45}, + {29, 25, 26, 27, 28, 29, 25}, + {34, 30, 31, 32, 33, 34, 30}, + {39, 35, 36, 37, 38, 39, 35}, + {44, 40, 41, 42, 43, 44, 40}, + {49, 45, 46, 47, 48, 49, 45}, + {29, 25, 26, 27, 28, 29, 25}}, + + {{74, 70, 71, 72, 73, 74, 70}, + {54, 50, 51, 52, 53, 54, 50}, + {59, 55, 56, 57, 58, 59, 55}, + {64, 60, 61, 62, 63, 64, 60}, + {69, 65, 66, 67, 68, 69, 65}, + {74, 70, 71, 72, 73, 74, 70}, + {54, 50, 51, 52, 53, 54, 50}}}, + {{{99, 95, 96, 97, 98, 99, 95}, + {79, 75, 76, 77, 78, 79, 75}, + {84, 80, 81, 82, 83, 84, 80}, + {89, 85, 86, 87, 88, 89, 85}, + {94, 90, 91, 92, 93, 94, 90}, + {99, 95, 96, 97, 98, 99, 95}, + {79, 75, 76, 77, 78, 79, 75}}, + + {{124, 120, 121, 122, 123, 124, 120}, + {104, 100, 101, 102, 103, 104, 100}, + {109, 105, 106, 107, 108, 109, 105}, + {114, 110, 111, 112, 113, 114, 110}, + {119, 115, 116, 117, 118, 119, 115}, + {124, 120, 121, 122, 123, 124, 120}, + {104, 100, 101, 102, 103, 104, 100}}, + + {{149, 145, 146, 147, 148, 149, 145}, + {129, 125, 126, 127, 128, 129, 125}, + {134, 130, 131, 132, 133, 134, 130}, + {139, 135, 136, 137, 138, 139, 135}, + {144, 140, 141, 142, 143, 144, 140}, + {149, 145, 146, 147, 148, 149, 145}, + {129, 125, 126, 127, 128, 129, 125}}}}}); myInput->setBackend("cuda"); - myPad->getOperator()->associateInput(0,myInput); + 
myPad->getOperator()->associateInput(0, myInput); myPad->getOperator()->setDataType(DataType::Float32); myPad->getOperator()->setBackend("cuda"); myPad->forward(); - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); + float *computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } @@ -620,77 +585,103 @@ TEST_CASE("[gpu/operator] Pad(forward)", "[Pad][GPU]") { std::random_device rd; std::mt19937 gen(rd()); std::uniform_real_distribution<float> valueDist( - 0.1f, 1.1f); // Random float distribution between 0 and 1 - std::uniform_int_distribution<std::size_t> padTypeDist(std::size_t(0), std::size_t(1)); - // TODO: fix Reflect and Wrap Pad, cpu and gpu only five same results when padding = 1 - std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1), std::size_t(10)); - std::uniform_int_distribution<std::size_t> padSizeDist(std::size_t(0), std::size_t(5)); + 0.1f, + 1.1f); // Random float distribution between 0 and 1 + std::uniform_int_distribution<std::size_t> padTypeDist(std::size_t(0), + std::size_t(1)); + // TODO: fix Reflect and Wrap Pad, cpu and gpu only give the same results + // when padding = 1 + std::uniform_int_distribution<std::size_t> dimSizeDist( + std::size_t(1), + std::size_t(10)); + std::uniform_int_distribution<std::size_t> padSizeDist(std::size_t(0), + std::size_t(5)); // To measure execution time of 'forward()' std::chrono::time_point<std::chrono::system_clock> start; std::chrono::time_point<std::chrono::system_clock> end; std::chrono::duration<double, std::micro> duration{}; - for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) - { + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { const std::size_t nbDims = 4; std::vector<std::size_t> dims; for (std::size_t i = 0; i < nbDims; ++i) { dims.push_back(dimSizeDist(gen)); } - const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t nb_elements = + std::accumulate(dims.cbegin(), + dims.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); const std::size_t borderType = padTypeDist(gen); const std::size_t padding = padSizeDist(gen); // Create Pad Operator CUDA - std::shared_ptr<Node> myPadCUDA = Pad<2>({padding, padding, padding, padding}, "mypadcuda", static_cast<PadBorderType>(borderType)); - auto op_cuda = std::static_pointer_cast<OperatorTensor>(myPadCUDA -> getOperator()); + std::shared_ptr<Node> myPadCUDA = + Pad<2>({padding, padding, padding, padding}, + "mypadcuda", + static_cast<PadBorderType>(borderType)); + auto op_cuda = std::static_pointer_cast<OperatorTensor>( + myPadCUDA->getOperator()); op_cuda->setDataType(DataType::Float32); op_cuda->setBackend("cuda"); // Create Pad Operator CPU - std::shared_ptr<Node> myPadCPU = Pad<2>({padding, padding, padding, padding}, "mypadcpu", static_cast<PadBorderType>(borderType)); - auto op_cpu = std::static_pointer_cast<OperatorTensor>(myPadCPU -> getOperator()); + std::shared_ptr<Node> 
myPadCPU = + Pad<2>({padding, padding, padding, padding}, + "mypadcpu", + static_cast<PadBorderType>(borderType)); + auto op_cpu = std::static_pointer_cast<OperatorTensor>( + myPadCPU->getOperator()); op_cpu->setDataType(DataType::Float32); op_cpu->setBackend("cpu"); - float* array0 = new float[nb_elements]; + float *array0 = new float[nb_elements]; for (std::size_t i = 0; i < nb_elements; ++i) { array0[i] = valueDist(gen); } // input CUDA - float* array0_d; + float *array0_d; std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); T0_cuda->setDataType(DataType::Float32); T0_cuda->setBackend("cuda"); T0_cuda->resize(dims); op_cuda->associateInput(0, T0_cuda); - cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements); - cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&array0_d), + sizeof(float) * nb_elements); + cudaMemcpy(array0_d, + array0, + sizeof(float) * nb_elements, + cudaMemcpyHostToDevice); T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements); // input CPU std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(0,T0_cpu); + op_cpu->associateInput(0, T0_cpu); T0_cpu->setDataType(DataType::Float32); T0_cpu->setBackend("cpu"); T0_cpu->resize(dims); - T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements); + T0_cpu->getImpl()->setRawPtr(array0, nb_elements); // forward CUDA start = std::chrono::system_clock::now(); op_cuda->forward(); end = std::chrono::system_clock::now(); - duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + duration += std::chrono::duration_cast<std::chrono::microseconds>( + end - start); - const std::size_t outSize = op_cuda->getOutput(0)->size(); + const std::size_t outSize = op_cuda->getOutput(0)->size(); float *computed_cuda = new float[outSize](); - cudaMemcpy(computed_cuda, op_cuda->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * outSize, cudaMemcpyDeviceToHost); + cudaMemcpy(computed_cuda, + op_cuda->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * outSize, + cudaMemcpyDeviceToHost); // forward CPU op_cpu->forward(); - float *computed_cpu = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr()); + float *computed_cpu = static_cast<float *>( + op_cpu->getOutput(0)->getImpl()->rawPtr()); REQUIRE(approxEq<float>(*computed_cuda, *computed_cpu)); delete[] array0; @@ -702,80 +693,80 @@ TEST_CASE("[gpu/operator] Pad(forward)", "[Pad][GPU]") { } TEST_CASE("[gpu/operator] Pad(backward)", "[Pad][GPU]") { - SECTION("Symmetric Pad") { + SECTION("Symmetric Pad") { const int pv = 0; // pad value - std::shared_ptr<Node> myPad = Pad<2>({1, 1, 1, 1}, "mypad", PadBorderType::Constant, static_cast<double>(pv)); - auto op = std::static_pointer_cast<OperatorTensor>(myPad -> getOperator()); - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,1,3,5,5> { //NCHW - { - { - {{ 0, 1, 2, 3, 4}, - { 5, 6, 7, 8, 9}, - { 10, 11, 12, 13, 14}, - { 15, 16, 17, 18, 19}, - { 20, 21, 22, 23, 24}}, - - {{ 25, 26, 27, 28, 29}, - { 30, 31, 32, 33, 34}, - { 35, 36, 37, 38, 39}, - { 40, 41, 42, 43, 44}, - { 45, 46, 47, 48, 49}}, - - {{ 50, 51, 52, 53, 54}, - { 55, 56, 57, 58, 59}, - { 60, 61, 62, 63, 64}, - { 65, 66, 67, 68, 69}, - { 70, 71, 72, 73, 74}} - } - } - }); + std::shared_ptr<Node> myPad = Pad<2>({1, 1, 1, 1}, + "mypad", + PadBorderType::Constant, + static_cast<double>(pv)); + auto op = + std::static_pointer_cast<OperatorTensor>(myPad->getOperator()); + std::shared_ptr<Tensor> myInput = 
std::make_shared<Tensor>( + Array4D<float, 1, 3, 5, 5>{// NCHW + {{{{0, 1, 2, 3, 4}, + {5, 6, 7, 8, 9}, + {10, 11, 12, 13, 14}, + {15, 16, 17, 18, 19}, + {20, 21, 22, 23, 24}}, + + {{25, 26, 27, 28, 29}, + {30, 31, 32, 33, 34}, + {35, 36, 37, 38, 39}, + {40, 41, 42, 43, 44}, + {45, 46, 47, 48, 49}}, + + {{50, 51, 52, 53, 54}, + {55, 56, 57, 58, 59}, + {60, 61, 62, 63, 64}, + {65, 66, 67, 68, 69}, + {70, 71, 72, 73, 74}}}}}); myInput->setBackend("cuda"); - myPad->getOperator()->associateInput(0,myInput); + myPad->getOperator()->associateInput(0, myInput); myPad->getOperator()->setDataType(DataType::Float32); myPad->getOperator()->setBackend("cuda"); myPad->forward(); - std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array4D<float,1,3,7,7> { //NCHW - { - { - {{ pv, pv, pv, pv, pv, pv, pv}, - { pv, 0, 1, 2, 3, 4, pv}, - { pv, 5, 6, 7, 8, 9, pv}, - { pv, 10, 11, 12, 13, 14, pv}, - { pv, 15, 16, 17, 18, 19, pv}, - { pv, 20, 21, 22, 23, 24, pv}, - { pv, pv, pv, pv, pv, pv, pv}}, - - {{ pv, pv, pv, pv, pv, pv, pv}, - { pv, 25, 26, 27, 28, 29, pv}, - { pv, 30, 31, 32, 33, 34, pv}, - { pv, 35, 36, 37, 38, 39, pv}, - { pv, 40, 41, 42, 43, 44, pv}, - { pv, 45, 46, 47, 48, 49, pv}, - { pv, pv, pv, pv, pv, pv, pv}}, - - {{ pv, pv, pv, pv, pv, pv, pv}, - { pv, 50, 51, 52, 53, 54, pv}, - { pv, 55, 56, 57, 58, 59, pv}, - { pv, 60, 61, 62, 63, 64, pv}, - { pv, 65, 66, 67, 68, 69, pv}, - { pv, 70, 71, 72, 73, 74, pv}, - { pv, pv, pv, pv, pv, pv, pv}} - } - } - }); + std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>( + Array4D<float, 1, 3, 7, 7>{// NCHW + {{{{pv, pv, pv, pv, pv, pv, pv}, + {pv, 0, 1, 2, 3, 4, pv}, + {pv, 5, 6, 7, 8, 9, pv}, + {pv, 10, 11, 12, 13, 14, pv}, + {pv, 15, 16, 17, 18, 19, pv}, + {pv, 20, 21, 22, 23, 24, pv}, + {pv, pv, pv, pv, pv, pv, pv}}, + + {{pv, pv, pv, pv, pv, pv, pv}, + {pv, 25, 26, 27, 28, 29, pv}, + {pv, 30, 31, 32, 33, 34, pv}, + {pv, 35, 36, 37, 38, 39, pv}, + {pv, 40, 41, 42, 43, 44, pv}, + {pv, 45, 46, 47, 48, 49, pv}, + {pv, pv, pv, pv, pv, pv, pv}}, + + {{pv, pv, pv, pv, pv, pv, pv}, + {pv, 50, 51, 52, 53, 54, pv}, + {pv, 55, 56, 57, 58, 59, pv}, + {pv, 60, 61, 62, 63, 64, pv}, + {pv, 65, 66, 67, 68, 69, pv}, + {pv, 70, 71, 72, 73, 74, pv}, + {pv, pv, pv, pv, pv, pv, pv}}}}}); myOutputGrad->setBackend("cuda"); op->getOutput(0)->setGrad(myOutputGrad); REQUIRE_NOTHROW(myPad->backward()); float *computedGradCuda = new float[myInput->size()](); - cudaMemcpy(computedGradCuda, op->getInput(0)->grad()->getImpl()->rawPtr(), sizeof(float) * myInput->size(), cudaMemcpyDeviceToHost); + cudaMemcpy(computedGradCuda, + op->getInput(0)->grad()->getImpl()->rawPtr(), + sizeof(float) * myInput->size(), + cudaMemcpyDeviceToHost); myInput->setBackend("cpu"); - for(int i = 0; i < myInput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myInput->getImpl()->rawPtr()) + i); + for (int i = 0; i < myInput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myInput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedGradCuda[i] - targetOutput) < 1e-6); } diff --git a/unit_tests/Test_PowImpl.cpp b/unit_tests/Test_PowImpl.cpp index 49e65b46d7d85b7087c5c73151d643593d91e02e..ab419d0b07029365600d9056e17f84a5e8442b27 100644 --- a/unit_tests/Test_PowImpl.cpp +++ b/unit_tests/Test_PowImpl.cpp @@ -11,7 +11,7 @@ #include <array> #include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution 
#include <catch2/catch_test_macros.hpp> @@ -27,10 +27,14 @@ TEST_CASE("[gpu/operator] Pow", "[Pow][GPU]") { // Create a random number generator std::random_device rd; std::mt19937 gen(rd()); - std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // Random float distribution between 0 and 1 - std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10)); - std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5)); - std::uniform_int_distribution<int> boolDist(0,1); + std::uniform_real_distribution<float> valueDist( + 0.1f, + 1.1f); // Random float distribution between 0 and 1 + std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), + std::size_t(10)); + std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), + std::size_t(5)); + std::uniform_int_distribution<int> boolDist(0, 1); // To measure execution time of 'MatPow_Op::forward()' member function call std::chrono::time_point<std::chrono::system_clock> start; @@ -38,21 +42,19 @@ TEST_CASE("[gpu/operator] Pow", "[Pow][GPU]") { std::chrono::duration<double, std::micro> duration{}; SECTION("PowImpl::forward()") { - SECTION("Scalar / Scalar") { - - } - SECTION("Scalar / +1-D Tensor") { - - } + SECTION("Scalar / Scalar") {} + SECTION("Scalar / +1-D Tensor") {} SECTION("+1-D Tensor / +1-D Tensor - same dimensions") { // Create Pow Operator std::shared_ptr<Node> myPowCUDA = Pow(); - auto op_cuda = std::static_pointer_cast<OperatorTensor>(myPowCUDA-> getOperator()); + auto op_cuda = std::static_pointer_cast<OperatorTensor>( + myPowCUDA->getOperator()); op_cuda->setDataType(DataType::Float32); op_cuda->setBackend("cuda"); std::shared_ptr<Node> myPowCPU = Pow(); - auto op_cpu = std::static_pointer_cast<OperatorTensor>(myPowCPU-> getOperator()); + auto op_cpu = std::static_pointer_cast<OperatorTensor>( + myPowCPU->getOperator()); op_cpu->setDataType(DataType::Float32); op_cpu->setBackend("cpu"); @@ -65,12 +67,16 @@ TEST_CASE("[gpu/operator] Pow", "[Pow][GPU]") { for (std::size_t i = 0; i < nbDims; ++i) { dims.push_back(dimSizeDist(gen)); } - const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t nb_elements = + std::accumulate(dims.cbegin(), + dims.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); number_of_operation += nb_elements; // without broadcasting - float* array0 = new float[nb_elements]; - float* array1 = new float[nb_elements]; + float *array0 = new float[nb_elements]; + float *array1 = new float[nb_elements]; for (std::size_t i = 0; i < nb_elements; ++i) { array0[i] = valueDist(gen); @@ -78,14 +84,18 @@ TEST_CASE("[gpu/operator] Pow", "[Pow][GPU]") { } // input0 CUDA - float* array0_d, *array1_d; + float *array0_d, *array1_d; std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); T0_cuda->setDataType(DataType::Float32); T0_cuda->setBackend("cuda"); T0_cuda->resize(dims); op_cuda->associateInput(0, T0_cuda); - cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements); - cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&array0_d), + sizeof(float) * nb_elements); + cudaMemcpy(array0_d, + array0, + sizeof(float) * nb_elements, + cudaMemcpyHostToDevice); T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements); // input0 CPU @@ -93,8 +103,8 @@ TEST_CASE("[gpu/operator] Pow", "[Pow][GPU]") { T0_cpu->setDataType(DataType::Float32); T0_cpu->setBackend("cpu"); 
T0_cpu->resize(dims); - op_cpu->associateInput(0,T0_cpu); - T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements); + op_cpu->associateInput(0, T0_cpu); + T0_cpu->getImpl()->setRawPtr(array0, nb_elements); // input1 CUDA std::shared_ptr<Tensor> T1_cuda = std::make_shared<Tensor>(); @@ -102,8 +112,12 @@ TEST_CASE("[gpu/operator] Pow", "[Pow][GPU]") { T1_cuda->setBackend("cuda"); T1_cuda->resize(dims); op_cuda->associateInput(1, T1_cuda); - cudaMalloc(reinterpret_cast<void **>(&array1_d), sizeof(float) * nb_elements); - cudaMemcpy(array1_d, array1, sizeof(float) * nb_elements, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&array1_d), + sizeof(float) * nb_elements); + cudaMemcpy(array1_d, + array1, + sizeof(float) * nb_elements, + cudaMemcpyHostToDevice); T1_cuda->getImpl()->setRawPtr(array1_d, nb_elements); // input1 @@ -111,21 +125,25 @@ TEST_CASE("[gpu/operator] Pow", "[Pow][GPU]") { T1_cpu->setDataType(DataType::Float32); T1_cpu->setBackend("cpu"); T1_cpu->resize(dims); - op_cpu -> associateInput(1,T1_cpu); - T1_cpu -> getImpl() -> setRawPtr(array1, nb_elements); + op_cpu->associateInput(1, T1_cpu); + T1_cpu->getImpl()->setRawPtr(array1, nb_elements); op_cuda->forwardDims(); start = std::chrono::system_clock::now(); myPowCUDA->forward(); end = std::chrono::system_clock::now(); - duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + duration += + std::chrono::duration_cast<std::chrono::microseconds>( + end - start); // REQUIRE(false); op_cpu->forwardDims(); myPowCPU->forward(); std::shared_ptr<Tensor> outputFallback; - const auto& cudaOutput = op_cuda->getOutput(0)->refCastFrom(outputFallback, *op_cpu->getOutput(0)); + const auto &cudaOutput = + op_cuda->getOutput(0)->refCastFrom(outputFallback, + *op_cpu->getOutput(0)); REQUIRE(approxEq<float>(cudaOutput, *(op_cpu->getOutput(0)))); delete[] array0; @@ -133,26 +151,31 @@ TEST_CASE("[gpu/operator] Pow", "[Pow][GPU]") { cudaFree(array0_d); cudaFree(array1_d); } - std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl; - std::cout << "total time: " << duration.count() << "μs" << std::endl; + std::cout << "number of elements over time spent: " + << (number_of_operation / duration.count()) << std::endl; + std::cout << "total time: " << duration.count() << "μs" + << std::endl; } SECTION("+1-D Tensor / +1-D Tensor - broadcasting") { // Create Pow Operator std::shared_ptr<Node> myPowCUDA = Pow(); - auto op_cuda = std::static_pointer_cast<OperatorTensor>(myPowCUDA-> getOperator()); + auto op_cuda = std::static_pointer_cast<OperatorTensor>( + myPowCUDA->getOperator()); op_cuda->setDataType(DataType::Float32); op_cuda->setBackend("cuda"); std::shared_ptr<Node> myPowCPU = Pow(); - auto op_cpu = std::static_pointer_cast<OperatorTensor>(myPowCPU-> getOperator()); + auto op_cpu = std::static_pointer_cast<OperatorTensor>( + myPowCPU->getOperator()); op_cpu->setDataType(DataType::Float32); op_cpu->setBackend("cpu"); - + std::size_t number_of_operation = 0; for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { // generate 2 random Tensors - // handle dimensions, replace some dimensions with '1' to get broadcasting + // handle dimensions, replace some dimensions with '1' to get + // broadcasting constexpr std::size_t nbDims = 4; std::vector<std::size_t> dims; for (std::size_t i = 0; i < nbDims; ++i) { @@ -172,10 +195,18 @@ TEST_CASE("[gpu/operator] Pow", "[Pow][GPU]") { } // create arrays and fill them with random values - std::size_t array0_size = 
std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); - std::size_t array1_size = std::accumulate(dims1.cbegin(), dims1.cend(), std::size_t(1), std::multiplies<std::size_t>()); - float* array0 = new float[array0_size]; - float* array1 = new float[array1_size]; + std::size_t array0_size = + std::accumulate(dims0.cbegin(), + dims0.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); + std::size_t array1_size = + std::accumulate(dims1.cbegin(), + dims1.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); + float *array0 = new float[array0_size]; + float *array1 = new float[array1_size]; for (std::size_t i = 0; i < array0_size; ++i) { array0[i] = valueDist(gen); @@ -184,23 +215,27 @@ TEST_CASE("[gpu/operator] Pow", "[Pow][GPU]") { array1[i] = valueDist(gen); } // input0 CUDA - float* array0_d, *array1_d; + float *array0_d, *array1_d; std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); T0_cuda->setDataType(DataType::Float32); T0_cuda->setBackend("cuda"); T0_cuda->resize(dims0); op_cuda->associateInput(0, T0_cuda); - cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * array0_size); - cudaMemcpy(array0_d, array0, sizeof(float) * array0_size, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&array0_d), + sizeof(float) * array0_size); + cudaMemcpy(array0_d, + array0, + sizeof(float) * array0_size, + cudaMemcpyHostToDevice); T0_cuda->getImpl()->setRawPtr(array0_d, array0_size); // input0 CPU std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); T0_cpu->setDataType(DataType::Float32); T0_cpu->setBackend("cpu"); - op_cpu->associateInput(0,T0_cpu); + op_cpu->associateInput(0, T0_cpu); T0_cpu->resize(dims0); - T0_cpu -> getImpl() -> setRawPtr(array0, array0_size); + T0_cpu->getImpl()->setRawPtr(array0, array0_size); // input1 CUDA std::shared_ptr<Tensor> T1_cuda = std::make_shared<Tensor>(); @@ -208,8 +243,12 @@ TEST_CASE("[gpu/operator] Pow", "[Pow][GPU]") { T1_cuda->setBackend("cuda"); T1_cuda->resize(dims1); op_cuda->associateInput(1, T1_cuda); - cudaMalloc(reinterpret_cast<void **>(&array1_d), sizeof(float) * array1_size); - cudaMemcpy(array1_d, array1, sizeof(float) * array1_size, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&array1_d), + sizeof(float) * array1_size); + cudaMemcpy(array1_d, + array1, + sizeof(float) * array1_size, + cudaMemcpyHostToDevice); T1_cuda->getImpl()->setRawPtr(array1_d, array1_size); // input1 @@ -217,20 +256,24 @@ TEST_CASE("[gpu/operator] Pow", "[Pow][GPU]") { T1_cpu->setDataType(DataType::Float32); T1_cpu->setBackend("cpu"); T1_cpu->resize(dims1); - op_cpu -> associateInput(1,T1_cpu); - T1_cpu -> getImpl() -> setRawPtr(array1, array1_size); + op_cpu->associateInput(1, T1_cpu); + T1_cpu->getImpl()->setRawPtr(array1, array1_size); op_cuda->forwardDims(); start = std::chrono::system_clock::now(); myPowCUDA->forward(); end = std::chrono::system_clock::now(); - duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + duration += + std::chrono::duration_cast<std::chrono::microseconds>( + end - start); op_cpu->forwardDims(); myPowCPU->forward(); std::shared_ptr<Tensor> outputFallback; - const auto& cudaOutput = op_cuda->getOutput(0)->refCastFrom(outputFallback, *op_cpu->getOutput(0)); + const auto &cudaOutput = + op_cuda->getOutput(0)->refCastFrom(outputFallback, + *op_cpu->getOutput(0)); REQUIRE(approxEq<float>(cudaOutput, *(op_cpu->getOutput(0)))); delete[] array0; @@ -238,25 +281,35 @@ TEST_CASE("[gpu/operator] Pow", 
"[Pow][GPU]") { cudaFree(array0_d); cudaFree(array1_d); - const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t nb_elements = + std::accumulate(dimsOut.cbegin(), + dimsOut.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); number_of_operation += nb_elements; } - std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl; - std::cout << "total time: " << duration.count() << "μs" << std::endl; + std::cout << "number of elements over time spent: " + << (number_of_operation / duration.count()) << std::endl; + std::cout << "total time: " << duration.count() << "μs" + << std::endl; } SECTION("+1-D Tensor / 1-D Tensor") { // Create Pow Operator std::shared_ptr<Node> myPowCUDA = Pow(); - auto op_cuda = std::static_pointer_cast<OperatorTensor>(myPowCUDA-> getOperator()); + auto op_cuda = std::static_pointer_cast<OperatorTensor>( + myPowCUDA->getOperator()); op_cuda->setDataType(DataType::Float32); op_cuda->setBackend("cuda"); std::shared_ptr<Node> myPowCPU = Pow(); - auto op_cpu = std::static_pointer_cast<OperatorTensor>(myPowCPU-> getOperator()); + auto op_cpu = std::static_pointer_cast<OperatorTensor>( + myPowCPU->getOperator()); op_cpu->setDataType(DataType::Float32); op_cpu->setBackend("cpu"); std::size_t number_of_operation = 0; - std::uniform_int_distribution<std::size_t> nbRemovedDimsDist(std::size_t(1), std::size_t(3)); + std::uniform_int_distribution<std::size_t> nbRemovedDimsDist( + std::size_t(1), + std::size_t(3)); for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { // generate 2 random Tensors @@ -273,13 +326,22 @@ TEST_CASE("[gpu/operator] Pow", "[Pow][GPU]") { dims1[i] = 1; } } - dims1.erase(dims1.cbegin(), dims1.cbegin() + nbRemovedDimsDist(gen)); + dims1.erase(dims1.cbegin(), + dims1.cbegin() + nbRemovedDimsDist(gen)); // create arrays and fill them with random values - std::size_t array0_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); - float* array0 = new float[array0_size]; - std::size_t array1_size = std::accumulate(dims1.cbegin(), dims1.cend(), std::size_t(1), std::multiplies<std::size_t>()); - float* array1 = new float[array1_size]; + std::size_t array0_size = + std::accumulate(dims0.cbegin(), + dims0.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); + float *array0 = new float[array0_size]; + std::size_t array1_size = + std::accumulate(dims1.cbegin(), + dims1.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); + float *array1 = new float[array1_size]; for (std::size_t i = 0; i < array0_size; ++i) { array0[i] = valueDist(gen); @@ -289,23 +351,27 @@ TEST_CASE("[gpu/operator] Pow", "[Pow][GPU]") { } // input0 CUDA - float* array0_d, *array1_d; + float *array0_d, *array1_d; std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); T0_cuda->setDataType(DataType::Float32); T0_cuda->setBackend("cuda"); T0_cuda->resize(dims0); op_cuda->associateInput(0, T0_cuda); - cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * array0_size); - cudaMemcpy(array0_d, array0, sizeof(float) * array0_size, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&array0_d), + sizeof(float) * array0_size); + cudaMemcpy(array0_d, + array0, + sizeof(float) * array0_size, + cudaMemcpyHostToDevice); T0_cuda->getImpl()->setRawPtr(array0_d, array0_size); // input0 CPU std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); 
T0_cpu->setDataType(DataType::Float32); T0_cpu->setBackend("cpu"); - op_cpu->associateInput(0,T0_cpu); + op_cpu->associateInput(0, T0_cpu); T0_cpu->resize(dims0); - T0_cpu -> getImpl() -> setRawPtr(array0, array0_size); + T0_cpu->getImpl()->setRawPtr(array0, array0_size); // input1 CUDA std::shared_ptr<Tensor> T1_cuda = std::make_shared<Tensor>(); @@ -313,8 +379,12 @@ TEST_CASE("[gpu/operator] Pow", "[Pow][GPU]") { T1_cuda->setBackend("cuda"); T1_cuda->resize(dims1); op_cuda->associateInput(1, T1_cuda); - cudaMalloc(reinterpret_cast<void **>(&array1_d), sizeof(float) * array1_size); - cudaMemcpy(array1_d, array1, sizeof(float) * array1_size, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&array1_d), + sizeof(float) * array1_size); + cudaMemcpy(array1_d, + array1, + sizeof(float) * array1_size, + cudaMemcpyHostToDevice); T1_cuda->getImpl()->setRawPtr(array1_d, array1_size); // input1 @@ -322,20 +392,24 @@ TEST_CASE("[gpu/operator] Pow", "[Pow][GPU]") { T1_cpu->setDataType(DataType::Float32); T1_cpu->setBackend("cpu"); T1_cpu->resize(dims1); - op_cpu -> associateInput(1,T1_cpu); - T1_cpu -> getImpl() -> setRawPtr(array1, array1_size); + op_cpu->associateInput(1, T1_cpu); + T1_cpu->getImpl()->setRawPtr(array1, array1_size); op_cuda->forwardDims(); start = std::chrono::system_clock::now(); myPowCUDA->forward(); end = std::chrono::system_clock::now(); - duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + duration += + std::chrono::duration_cast<std::chrono::microseconds>( + end - start); op_cpu->forwardDims(); myPowCPU->forward(); std::shared_ptr<Tensor> outputFallback; - const auto& cudaOutput = op_cuda->getOutput(0)->refCastFrom(outputFallback, *op_cpu->getOutput(0)); + const auto &cudaOutput = + op_cuda->getOutput(0)->refCastFrom(outputFallback, + *op_cpu->getOutput(0)); REQUIRE(approxEq<float>(cudaOutput, *(op_cpu->getOutput(0)))); delete[] array0; @@ -343,12 +417,18 @@ TEST_CASE("[gpu/operator] Pow", "[Pow][GPU]") { cudaFree(array0_d); cudaFree(array1_d); - const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t nb_elements = + std::accumulate(dimsOut.cbegin(), + dimsOut.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); number_of_operation += nb_elements; } - std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl; - std::cout << "total time: " << duration.count() << "μs" << std::endl; + std::cout << "number of elements over time spent: " + << (number_of_operation / duration.count()) << std::endl; + std::cout << "total time: " << duration.count() << "μs" + << std::endl; } } } diff --git a/unit_tests/Test_ReLUImpl.cpp b/unit_tests/Test_ReLUImpl.cpp index 7ab38aa7def7f846555ae33ccd3871d6ee5a1539..dc4c918e6577bc0004de89ca722f48d530b95112 100644 --- a/unit_tests/Test_ReLUImpl.cpp +++ b/unit_tests/Test_ReLUImpl.cpp @@ -12,7 +12,7 @@ #include <array> #include <catch2/catch_test_macros.hpp> #include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution #include "aidge/backend/cpu.hpp" #include "aidge/backend/cuda.hpp" @@ -21,109 +21,91 @@ using namespace Aidge; - TEST_CASE("[gpu/operator] ReLU(forward)", "[ReLU][GPU]") { SECTION("Constant Input") { - std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array4D<float,2,2,2,10> { - { - { - { - { 
0, 1, 2,-3, 4,-5,-6, 7, 8, 9}, - {-5, 4, 2,-3, 4,-5,-6, 7,-1,10} - }, - { - { 0, 1, 2,-3, 4,-5,-6, 7, 8, 9}, - {-5, 4, 2,-3, 4,-5,-6, 7,-1,10} - } - }, - { - { - { 0, 1, 2,-3, 4,-5,-6, 7, 8, 9}, - {-5, 4, 2,-3, 4,-5,-6, 7,-1,10} - }, - { - { 0, 1, 2,-3, 4,-5,-6, 7, 8, 9}, - {-5, 4, 2,-3, 4,-5,-6, 7,-1,10} - } - } - } - }); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,2,2,10> { - { - { - { - { 0, 1, 2, 0, 4, 0, 0, 7, 8, 9}, - { 0, 4, 2, 0, 4, 0, 0, 7, 0,10} - }, - { - { 0, 1, 2, 0, 4, 0, 0, 7, 8, 9}, - { 0, 4, 2, 0, 4, 0, 0, 7, 0,10} - } - }, - { - { - { 0, 1, 2, 0, 4, 0, 0, 7, 8, 9}, - { 0, 4, 2, 0, 4, 0, 0, 7, 0,10} - }, - { - { 0, 1, 2, 0, 4, 0, 0, 7, 8, 9}, - { 0, 4, 2, 0, 4, 0, 0, 7, 0,10} - } - } - } - }); + std::shared_ptr<Tensor> input0 = + std::make_shared<Tensor>(Array4D<float, 2, 2, 2, 10>{ + {{{{0, 1, 2, -3, 4, -5, -6, 7, 8, 9}, + {-5, 4, 2, -3, 4, -5, -6, 7, -1, 10}}, + {{0, 1, 2, -3, 4, -5, -6, 7, 8, 9}, + {-5, 4, 2, -3, 4, -5, -6, 7, -1, 10}}}, + {{{0, 1, 2, -3, 4, -5, -6, 7, 8, 9}, + {-5, 4, 2, -3, 4, -5, -6, 7, -1, 10}}, + {{0, 1, 2, -3, 4, -5, -6, 7, 8, 9}, + {-5, 4, 2, -3, 4, -5, -6, 7, -1, 10}}}}}); + std::shared_ptr<Tensor> myOutput = + std::make_shared<Tensor>(Array4D<float, 2, 2, 2, 10>{ + {{{{0, 1, 2, 0, 4, 0, 0, 7, 8, 9}, + {0, 4, 2, 0, 4, 0, 0, 7, 0, 10}}, + {{0, 1, 2, 0, 4, 0, 0, 7, 8, 9}, + {0, 4, 2, 0, 4, 0, 0, 7, 0, 10}}}, + {{{0, 1, 2, 0, 4, 0, 0, 7, 8, 9}, + {0, 4, 2, 0, 4, 0, 0, 7, 0, 10}}, + {{0, 1, 2, 0, 4, 0, 0, 7, 8, 9}, + {0, 4, 2, 0, 4, 0, 0, 7, 0, 10}}}}}); std::shared_ptr<Node> myReLU = ReLU(); - auto op = std::static_pointer_cast<OperatorTensor>(myReLU -> getOperator()); - op->associateInput(0,input0); + auto op = + std::static_pointer_cast<OperatorTensor>(myReLU->getOperator()); + op->associateInput(0, input0); op->setDataType(DataType::Float32); op->setBackend("cuda"); op->forward(); - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); + float *computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } delete[] computedOutput; } - SECTION("Random Input") - { + SECTION("Random Input") { constexpr std::uint16_t NBTRIALS = 10; // Create a random number generator std::random_device rd; std::mt19937 gen(rd()); std::uniform_real_distribution<float> valueDist( - 0.1f, 1.1f); // Random float distribution between 0 and 1 - std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1), - std::size_t(10)); - - std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(8)); // Max nbDims supported by cudnn is 8 + 0.1f, + 1.1f); // Random float distribution between 0 and 1 + std::uniform_int_distribution<std::size_t> dimSizeDist( + std::size_t(1), + std::size_t(10)); + + std::uniform_int_distribution<std::size_t> nbDimsDist( + std::size_t(1), + std::size_t(8)); // Max nbDims supported by cudnn is 8 // To measure execution time of 'forward()' std::chrono::time_point<std::chrono::system_clock> start; 
std::chrono::time_point<std::chrono::system_clock> end; std::chrono::duration<double, std::micro> duration{}; std::size_t number_of_operation = 0; - for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) - { + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { // Create ReLU Operator std::shared_ptr<Node> myReLU = ReLU("myReLU"); - auto op = std::static_pointer_cast<OperatorTensor>(myReLU->getOperator()); + auto op = std::static_pointer_cast<OperatorTensor>( + myReLU->getOperator()); op->setDataType(DataType::Float32); op->setBackend("cuda"); // generate a random Tensor const std::size_t nbDims = nbDimsDist(gen); std::vector<std::size_t> dims; - for (std::size_t i = 0; i < nbDims; ++i) - { + for (std::size_t i = 0; i < nbDims; ++i) { dims.push_back(dimSizeDist(gen)); } - const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t nb_elements = + std::accumulate(dims.cbegin(), + dims.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); number_of_operation += nb_elements; // Create the input Tensor @@ -136,25 +118,32 @@ TEST_CASE("[gpu/operator] ReLU(forward)", "[ReLU][GPU]") { // Fill input tensor float *input_h = new float[nb_elements]; float *output_h = new float[nb_elements]; - for (std::size_t i = 0; i < nb_elements; ++i) - { + for (std::size_t i = 0; i < nb_elements; ++i) { float value = valueDist(gen); input_h[i] = value; - output_h[i] = value>=0?value:0.0f; + output_h[i] = value >= 0 ? value : 0.0f; } float *input_d; - cudaMalloc(reinterpret_cast<void **>(&input_d), sizeof(float) * nb_elements); - cudaMemcpy(input_d, input_h, sizeof(float) * nb_elements, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&input_d), + sizeof(float) * nb_elements); + cudaMemcpy(input_d, + input_h, + sizeof(float) * nb_elements, + cudaMemcpyHostToDevice); T0->getImpl()->setRawPtr(input_d, nb_elements); - // Run inference + // Run inference start = std::chrono::system_clock::now(); myReLU->forward(); end = std::chrono::system_clock::now(); - duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + duration += std::chrono::duration_cast<std::chrono::microseconds>( + end - start); float *computedOutput = new float[nb_elements](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * nb_elements, cudaMemcpyDeviceToHost); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * nb_elements, + cudaMemcpyDeviceToHost); REQUIRE(approxEq<float>(*computedOutput, *output_h)); @@ -162,8 +151,8 @@ TEST_CASE("[gpu/operator] ReLU(forward)", "[ReLU][GPU]") { delete[] input_h; cudaFree(input_d); } - std::cout << "number of elements over time spent: " << (number_of_operation / duration.count()) << std::endl; + std::cout << "number of elements over time spent: " + << (number_of_operation / duration.count()) << std::endl; std::cout << "total time: " << duration.count() << "μs" << std::endl; - } } diff --git a/unit_tests/Test_ReduceMeanImpl.cpp b/unit_tests/Test_ReduceMeanImpl.cpp index 041ad6e02d5f39fde22f34ce715d2b807e164b1a..6ed30534368a976cb666ec0dd93f1331bf2eb7ee 100644 --- a/unit_tests/Test_ReduceMeanImpl.cpp +++ b/unit_tests/Test_ReduceMeanImpl.cpp @@ -11,7 +11,7 @@ #include <array> #include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution #include 
<catch2/catch_test_macros.hpp> @@ -25,89 +25,69 @@ namespace Aidge { TEST_CASE("[gpu/operator] ReduceMean(forward)", "[ReduceMean][GPU]") { SECTION("KeepDims") { SECTION("test 1") { - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> { - { - { - { 5.0, 1.0 }, - { 20.0, 2.0 } - }, - { - { 30.0, 1.0 }, - { 40.0, 2.0 } - }, - { - { 55.0, 1.0 }, - { 60.0, 2.0 } - } - } - }); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>( + Array3D<float, 3, 2, 2>{{{{5.0, 1.0}, {20.0, 2.0}}, + {{30.0, 1.0}, {40.0, 2.0}}, + {{55.0, 1.0}, {60.0, 2.0}}}}); myInput->setBackend("cuda"); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array3D<float,3,1,2> { - { + std::shared_ptr<Tensor> myOutput = + std::make_shared<Tensor>(Array3D<float, 3, 1, 2>{{ - {{ 12.5, 1.5 }}, - {{ 35.0, 1.5 }}, - {{ 57.5, 1.5 }} - } - }); + {{12.5, 1.5}}, + {{35.0, 1.5}}, + {{57.5, 1.5}}}}); std::shared_ptr<Node> myReduceMean = ReduceMean({1}); - auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator()); - op->associateInput(0,myInput); + auto op = std::static_pointer_cast<OperatorTensor>( + myReduceMean->getOperator()); + op->associateInput(0, myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); myReduceMean->forward(); - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + float *computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } delete[] computedOutput; } SECTION("test 2") { - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,3,2> { - { - { - { 0.0, 0.0 }, - { 1.0, 1.0 }, - { 2.0, 2.0 } - }, - { - { 3.0, 3.0 }, - { 4.0, 4.0 }, - { 5.0, 5.0 } - }, - { - { 6.0, 6.0 }, - { 7.0, 7.0 }, - { 8.0, 8.0 } - } - } - }); + std::shared_ptr<Tensor> myInput = + std::make_shared<Tensor>(Array3D<float, 3, 3, 2>{ + {{{0.0, 0.0}, {1.0, 1.0}, {2.0, 2.0}}, + {{3.0, 3.0}, {4.0, 4.0}, {5.0, 5.0}}, + {{6.0, 6.0}, {7.0, 7.0}, {8.0, 8.0}}}}); myInput->setBackend("cuda"); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array3D<float,3,1,1> { - { + std::shared_ptr<Tensor> myOutput = + std::make_shared<Tensor>(Array3D<float, 3, 1, 1>{{ - {{ 1.0 }}, - {{ 4.0 }}, - {{ 7.0 }} - } - }); + {{1.0}}, + {{4.0}}, + {{7.0}}}}); std::shared_ptr<Node> myReduceMean = ReduceMean({1, 2}); - auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator()); - op->associateInput(0,myInput); + auto op = std::static_pointer_cast<OperatorTensor>( + myReduceMean->getOperator()); + op->associateInput(0, myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); myReduceMean->forward(); - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + float *computedOutput = new 
float[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } @@ -115,145 +95,122 @@ TEST_CASE("[gpu/operator] ReduceMean(forward)", "[ReduceMean][GPU]") { } } SECTION("not_KeepDims") { - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> { - { - { - { 5.0, 1.0 }, - { 20.0, 2.0 } - }, - { - { 30.0, 1.0 }, - { 40.0, 2.0 } - }, - { - { 55.0, 1.0 }, - { 60.0, 2.0 } - } - } - }); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>( + Array3D<float, 3, 2, 2>{{{{5.0, 1.0}, {20.0, 2.0}}, + {{30.0, 1.0}, {40.0, 2.0}}, + {{55.0, 1.0}, {60.0, 2.0}}}}); myInput->setBackend("cuda"); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float,3,2> { - { - { 12.5, 1.5 }, - { 35.0, 1.5 }, - { 57.5, 1.5 } - } - }); + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>( + Array2D<float, 3, 2>{{{12.5, 1.5}, {35.0, 1.5}, {57.5, 1.5}}}); std::shared_ptr<Node> myReduceMean = ReduceMean({1}, false); - auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator()); - op->associateInput(0,myInput); + auto op = std::static_pointer_cast<OperatorTensor>( + myReduceMean->getOperator()); + op->associateInput(0, myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); myReduceMean->forward(); - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); - std::cout << "computed: " << computedOutput[i] << ", target: " << targetOutput << std::endl; + float *computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); + std::cout << "computed: " << computedOutput[i] + << ", target: " << targetOutput << std::endl; REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } delete[] computedOutput; - } SECTION("all_axes") { SECTION("1") { - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> { - { - { - { 5.0, 1.0 }, - { 20.0, 2.0 } - }, - { - { 30.0, 1.0 }, - { 40.0, 2.0 } - }, - { - { 55.0, 1.0 }, - { 60.0, 2.0 } - } - } - }); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>( + Array3D<float, 3, 2, 2>{{{{5.0, 1.0}, {20.0, 2.0}}, + {{30.0, 1.0}, {40.0, 2.0}}, + {{55.0, 1.0}, {60.0, 2.0}}}}); myInput->setBackend("cuda"); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array1D<float,1> { - {18.25} - }); + std::shared_ptr<Tensor> myOutput = + std::make_shared<Tensor>(Array1D<float, 1>{{18.25}}); std::shared_ptr<Node> myReduceMean = ReduceMean({0, 1, 2}, false); - auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator()); - op->associateInput(0,myInput); + auto op = std::static_pointer_cast<OperatorTensor>( + myReduceMean->getOperator()); + op->associateInput(0, myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); myReduceMean->forward(); - float* 
computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + float *computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } delete[] computedOutput; } SECTION("2") { - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array2D<float,5,4> { - {{ 0.004232f, 0.105120f, 0.045124f, 0.009205f}, - { 0.000766f, 0.272162f, 0.503560f, 0.044163f}, - { 0.049755f, 0.000305f, 0.143634f, 0.013253f}, - { 0.096258f, 0.311231f, 0.358143f, 0.000452f}, - { 0.468617f, 0.015693f, 0.145316f, 0.000105f}} - }); + std::shared_ptr<Tensor> myInput = + std::make_shared<Tensor>(Array2D<float, 5, 4>{ + {{0.004232f, 0.105120f, 0.045124f, 0.009205f}, + {0.000766f, 0.272162f, 0.503560f, 0.044163f}, + {0.049755f, 0.000305f, 0.143634f, 0.013253f}, + {0.096258f, 0.311231f, 0.358143f, 0.000452f}, + {0.468617f, 0.015693f, 0.145316f, 0.000105f}}}); myInput->setBackend("cuda"); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array1D<float,1> { - {0.1293547f} - }); + std::shared_ptr<Tensor> myOutput = + std::make_shared<Tensor>(Array1D<float, 1>{{0.1293547f}}); std::shared_ptr<Node> myReduceMean = ReduceMean({0, 1}, false); - auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator()); - op->associateInput(0,myInput); + auto op = std::static_pointer_cast<OperatorTensor>( + myReduceMean->getOperator()); + op->associateInput(0, myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); myReduceMean->forward(); - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + float *computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } delete[] computedOutput; } SECTION("noop_with_empty_axes") { - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> { - { - { - { 5.0, 1.0 }, - { 20.0, 2.0 } - }, - { - { 30.0, 1.0 }, - { 40.0, 2.0 } - }, - { - { 55.0, 1.0 }, - { 60.0, 2.0 } - } - } - }); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>( + Array3D<float, 3, 2, 2>{{{{5.0, 1.0}, {20.0, 2.0}}, + {{30.0, 1.0}, {40.0, 2.0}}, + {{55.0, 1.0}, {60.0, 2.0}}}}); myInput->setBackend("cuda"); std::shared_ptr<Node> myReduceMean = ReduceMean({}, false, true); - auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator()); - op->associateInput(0,myInput); + auto op = std::static_pointer_cast<OperatorTensor>( + myReduceMean->getOperator()); + op->associateInput(0, myInput); op->setDataType(DataType::Float32); 
op->setBackend("cuda"); myReduceMean->forward(); - + myInput->setBackend("cpu"); - float* computedOutput = new float[myInput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myInput->size(), cudaMemcpyDeviceToHost); - for(int i = 0; i < myInput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myInput->getImpl()->rawPtr()) + i); + float *computedOutput = new float[myInput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myInput->size(), + cudaMemcpyDeviceToHost); + for (int i = 0; i < myInput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myInput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } @@ -264,70 +221,48 @@ TEST_CASE("[gpu/operator] ReduceMean(forward)", "[ReduceMean][GPU]") { TEST_CASE("[gpu/operator] ReduceMean(backward)", "[ReduceMean][GPU]") { SECTION("KeepDims") { - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> { - { - { - { 5.0, 1.0 }, - { 20.0, 2.0 } - }, - { - { 30.0, 1.0 }, - { 40.0, 2.0 } - }, - { - { 55.0, 1.0 }, - { 60.0, 2.0 } - } - } - }); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>( + Array3D<float, 3, 2, 2>{{{{5.0, 1.0}, {20.0, 2.0}}, + {{30.0, 1.0}, {40.0, 2.0}}, + {{55.0, 1.0}, {60.0, 2.0}}}}); myInput->setBackend("cuda"); - std::shared_ptr<Node> myReduceMean = ReduceMean({1}); - auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator()); - op->associateInput(0,myInput); + auto op = std::static_pointer_cast<OperatorTensor>( + myReduceMean->getOperator()); + op->associateInput(0, myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); myReduceMean->forward(); + std::shared_ptr<Tensor> myOutputGrad = + std::make_shared<Tensor>(Array3D<float, 3, 1, 2>{{ - std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array3D<float,3,1,2> { - { - - {{ 1.0, 2.0 }}, - {{ 3.0, 4.0 }}, - {{ 5.0, 6.0 }} - } - }); - std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>(Array3D<float,3,2,2> { - { - { - { 1.0, 2.0 }, - { 1.0, 2.0 } - }, - { - { 3.0, 4.0 }, - { 3.0, 4.0 } - }, - { - { 5.0, 6.0 }, - { 5.0, 6.0 } - } - } - }); + {{1.0, 2.0}}, + {{3.0, 4.0}}, + {{5.0, 6.0}}}}); + std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>( + Array3D<float, 3, 2, 2>{{{{1.0, 2.0}, {1.0, 2.0}}, + {{3.0, 4.0}, {3.0, 4.0}}, + {{5.0, 6.0}, {5.0, 6.0}}}}); myOutputGrad->setBackend("cuda"); op->getOutput(0)->setGrad(myOutputGrad); REQUIRE_NOTHROW(myReduceMean->backward()); float *computedGradCuda = new float[expectedInputGrad->size()](); - cudaMemcpy(computedGradCuda, op->getInput(0)->grad()->getImpl()->rawPtr(), sizeof(float) * expectedInputGrad->size(), cudaMemcpyDeviceToHost); - - for(int i = 0; i < expectedInputGrad->size(); i++){ - const float targetOutput = *(static_cast<float*>(expectedInputGrad->getImpl()->rawPtr()) + i); + cudaMemcpy(computedGradCuda, + op->getInput(0)->grad()->getImpl()->rawPtr(), + sizeof(float) * expectedInputGrad->size(), + cudaMemcpyDeviceToHost); + + for (int i = 0; i < expectedInputGrad->size(); i++) { + const float targetOutput = *( + static_cast<float *>(expectedInputGrad->getImpl()->rawPtr()) + + i); REQUIRE(fabs(computedGradCuda[i] - targetOutput) < 1e-6); } delete[] computedGradCuda; } } -} +} // namespace Aidge diff --git a/unit_tests/Test_ReduceSumImpl.cpp b/unit_tests/Test_ReduceSumImpl.cpp index 
d0d37754102331c8f91a1ce1c81d679761916339..a640d1f7e3585e7068dbd0ba935a31cb2e725df6 100644 --- a/unit_tests/Test_ReduceSumImpl.cpp +++ b/unit_tests/Test_ReduceSumImpl.cpp @@ -11,7 +11,7 @@ #include <array> #include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution #include <catch2/catch_test_macros.hpp> @@ -25,89 +25,68 @@ namespace Aidge { TEST_CASE("[gpu/operator] ReduceSum(forward)", "[ReduceSum][GPU]") { SECTION("KeepDims") { SECTION("test 1") { - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> { - { - { - { 5.0, 1.0 }, - { 20.0, 2.0 } - }, - { - { 30.0, 1.0 }, - { 40.0, 2.0 } - }, - { - { 55.0, 1.0 }, - { 60.0, 2.0 } - } - } - }); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>( + Array3D<float, 3, 2, 2>{{{{5.0, 1.0}, {20.0, 2.0}}, + {{30.0, 1.0}, {40.0, 2.0}}, + {{55.0, 1.0}, {60.0, 2.0}}}}); myInput->setBackend("cuda"); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array3D<float,3,1,2> { - { - {{ 25.0, 3.0 }}, - {{ 70.0, 3.0 }}, - {{ 115.0, 3.0 }} - } - }); + std::shared_ptr<Tensor> myOutput = + std::make_shared<Tensor>(Array3D<float, 3, 1, 2>{ + {{{25.0, 3.0}}, {{70.0, 3.0}}, {{115.0, 3.0}}}}); std::shared_ptr<Node> myReduceSum = ReduceSum({1}); - auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator()); - op->associateInput(0,myInput); + auto op = std::static_pointer_cast<OperatorTensor>( + myReduceSum->getOperator()); + op->associateInput(0, myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); myReduceSum->forward(); - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); - std::cout << "i: " << i << ", computed: " << computedOutput[i] << ", target: "<< targetOutput <<std::endl; + float *computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); + std::cout << "i: " << i << ", computed: " << computedOutput[i] + << ", target: " << targetOutput << std::endl; REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } delete[] computedOutput; } SECTION("test 2") { - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,3,2> { - { - { - { 0.0, 0.0 }, - { 1.0, 1.0 }, - { 2.0, 2.0 } - }, - { - { 3.0, 3.0 }, - { 4.0, 4.0 }, - { 5.0, 5.0 } - }, - { - { 6.0, 6.0 }, - { 7.0, 7.0 }, - { 8.0, 8.0 } - } - } - }); + std::shared_ptr<Tensor> myInput = + std::make_shared<Tensor>(Array3D<float, 3, 3, 2>{ + {{{0.0, 0.0}, {1.0, 1.0}, {2.0, 2.0}}, + {{3.0, 3.0}, {4.0, 4.0}, {5.0, 5.0}}, + {{6.0, 6.0}, {7.0, 7.0}, {8.0, 8.0}}}}); myInput->setBackend("cuda"); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array3D<float,3,1,1> { - { + std::shared_ptr<Tensor> myOutput = + std::make_shared<Tensor>(Array3D<float, 3, 1, 1>{{ - {{ 6.0 }}, - {{ 24.0 }}, - {{ 42.0 }} - } - }); + {{6.0}}, + {{24.0}}, + {{42.0}}}}); std::shared_ptr<Node> myReduceSum = ReduceSum({1, 2}); - auto op = 
std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator()); - op->associateInput(0,myInput); + auto op = std::static_pointer_cast<OperatorTensor>( + myReduceSum->getOperator()); + op->associateInput(0, myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); myReduceSum->forward(); - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + float *computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } @@ -115,109 +94,92 @@ TEST_CASE("[gpu/operator] ReduceSum(forward)", "[ReduceSum][GPU]") { } } SECTION("not_KeepDims") { - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> { - { - { - { 5.0, 1.0 }, - { 20.0, 2.0 } - }, - { - { 30.0, 1.0 }, - { 40.0, 2.0 } - }, - { - { 55.0, 1.0 }, - { 60.0, 2.0 } - } - } - }); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>( + Array3D<float, 3, 2, 2>{{{{5.0, 1.0}, {20.0, 2.0}}, + {{30.0, 1.0}, {40.0, 2.0}}, + {{55.0, 1.0}, {60.0, 2.0}}}}); myInput->setBackend("cuda"); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float,3,2> { - { - { 25.0, 3.0 }, - { 70.0, 3.0 }, - { 115.0, 3.0 } - } - }); + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>( + Array2D<float, 3, 2>{{{25.0, 3.0}, {70.0, 3.0}, {115.0, 3.0}}}); std::shared_ptr<Node> myReduceSum = ReduceSum({1}, false); - auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator()); - op->associateInput(0,myInput); + auto op = std::static_pointer_cast<OperatorTensor>( + myReduceSum->getOperator()); + op->associateInput(0, myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); myReduceSum->forward(); - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + float *computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } delete[] computedOutput; - } SECTION("all_axes") { SECTION("1") { - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> { - { - { - { 5.0, 1.0 }, - { 20.0, 2.0 } - }, - { - { 30.0, 1.0 }, - { 40.0, 2.0 } - }, - { - { 55.0, 1.0 }, - { 60.0, 2.0 } - } - } - }); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>( + Array3D<float, 3, 2, 2>{{{{5.0, 1.0}, {20.0, 2.0}}, + {{30.0, 1.0}, {40.0, 2.0}}, + {{55.0, 1.0}, {60.0, 2.0}}}}); myInput->setBackend("cuda"); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array1D<float,1> { - {219.0} - }); + std::shared_ptr<Tensor> myOutput = + 
std::make_shared<Tensor>(Array1D<float, 1>{{219.0}}); std::shared_ptr<Node> myReduceSum = ReduceSum({0, 1, 2}, false); - auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator()); - op->associateInput(0,myInput); + auto op = std::static_pointer_cast<OperatorTensor>( + myReduceSum->getOperator()); + op->associateInput(0, myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); myReduceSum->forward(); - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + float *computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } delete[] computedOutput; } SECTION("2") { - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array2D<float,5,4> { - {{ 0.004232f, 0.105120f, 0.045124f, 0.009205f}, - { 0.000766f, 0.272162f, 0.503560f, 0.044163f}, - { 0.049755f, 0.000305f, 0.143634f, 0.013253f}, - { 0.096258f, 0.311231f, 0.358143f, 0.000452f}, - { 0.468617f, 0.015693f, 0.145316f, 0.000105f}} - }); + std::shared_ptr<Tensor> myInput = + std::make_shared<Tensor>(Array2D<float, 5, 4>{ + {{0.004232f, 0.105120f, 0.045124f, 0.009205f}, + {0.000766f, 0.272162f, 0.503560f, 0.044163f}, + {0.049755f, 0.000305f, 0.143634f, 0.013253f}, + {0.096258f, 0.311231f, 0.358143f, 0.000452f}, + {0.468617f, 0.015693f, 0.145316f, 0.000105f}}}); myInput->setBackend("cuda"); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array1D<float,1> { - {2.587094f} - }); + std::shared_ptr<Tensor> myOutput = + std::make_shared<Tensor>(Array1D<float, 1>{{2.587094f}}); std::shared_ptr<Node> myReduceSum = ReduceSum({0, 1}, false); - auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator()); - op->associateInput(0,myInput); + auto op = std::static_pointer_cast<OperatorTensor>( + myReduceSum->getOperator()); + op->associateInput(0, myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); myReduceSum->forward(); - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + float *computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } @@ -228,70 +190,48 @@ TEST_CASE("[gpu/operator] ReduceSum(forward)", "[ReduceSum][GPU]") { TEST_CASE("[gpu/operator] ReduceSum(backward)", "[ReduceSum][GPU]") { SECTION("KeepDims") { - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> { - { - { - { 5.0, 1.0 }, - { 20.0, 2.0 } - }, - { - { 30.0, 1.0 }, - { 40.0, 2.0 } - }, - { - { 55.0, 1.0 }, - { 60.0, 2.0 } - } 
- } - }); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>( + Array3D<float, 3, 2, 2>{{{{5.0, 1.0}, {20.0, 2.0}}, + {{30.0, 1.0}, {40.0, 2.0}}, + {{55.0, 1.0}, {60.0, 2.0}}}}); myInput->setBackend("cuda"); - std::shared_ptr<Node> myReduceSum = ReduceSum({1}); - auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator()); - op->associateInput(0,myInput); + auto op = std::static_pointer_cast<OperatorTensor>( + myReduceSum->getOperator()); + op->associateInput(0, myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); myReduceSum->forward(); + std::shared_ptr<Tensor> myOutputGrad = + std::make_shared<Tensor>(Array3D<float, 3, 1, 2>{{ - std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array3D<float,3,1,2> { - { - - {{ 1.0, 2.0 }}, - {{ 3.0, 4.0 }}, - {{ 5.0, 6.0 }} - } - }); - std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>(Array3D<float,3,2,2> { - { - { - { 1.0, 2.0 }, - { 1.0, 2.0 } - }, - { - { 3.0, 4.0 }, - { 3.0, 4.0 } - }, - { - { 5.0, 6.0 }, - { 5.0, 6.0 } - } - } - }); + {{1.0, 2.0}}, + {{3.0, 4.0}}, + {{5.0, 6.0}}}}); + std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>( + Array3D<float, 3, 2, 2>{{{{1.0, 2.0}, {1.0, 2.0}}, + {{3.0, 4.0}, {3.0, 4.0}}, + {{5.0, 6.0}, {5.0, 6.0}}}}); myOutputGrad->setBackend("cuda"); op->getOutput(0)->setGrad(myOutputGrad); REQUIRE_NOTHROW(myReduceSum->backward()); float *computedGradCuda = new float[expectedInputGrad->size()](); - cudaMemcpy(computedGradCuda, op->getInput(0)->grad()->getImpl()->rawPtr(), sizeof(float) * expectedInputGrad->size(), cudaMemcpyDeviceToHost); - - for(int i = 0; i < expectedInputGrad->size(); i++){ - const float targetOutput = *(static_cast<float*>(expectedInputGrad->getImpl()->rawPtr()) + i); + cudaMemcpy(computedGradCuda, + op->getInput(0)->grad()->getImpl()->rawPtr(), + sizeof(float) * expectedInputGrad->size(), + cudaMemcpyDeviceToHost); + + for (int i = 0; i < expectedInputGrad->size(); i++) { + const float targetOutput = *( + static_cast<float *>(expectedInputGrad->getImpl()->rawPtr()) + + i); REQUIRE(fabs(computedGradCuda[i] - targetOutput) < 1e-6); } delete[] computedGradCuda; } } -} +} // namespace Aidge diff --git a/unit_tests/Test_ReshapeImpl.cpp b/unit_tests/Test_ReshapeImpl.cpp index df9a4dda6d59371c8dd07f8c4442e3a3bb4a7159..a8a03a3e427a3d760342053f2d0baa6b9128e4af 100644 --- a/unit_tests/Test_ReshapeImpl.cpp +++ b/unit_tests/Test_ReshapeImpl.cpp @@ -11,7 +11,7 @@ #include <array> #include <numeric> // std::accumulate, std::shuffle, std::transform -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution #include <catch2/catch_test_macros.hpp> @@ -22,250 +22,256 @@ using namespace Aidge; - TEST_CASE("[gpu/operator] Reshape(forward)") { SECTION("1D Tensor") { - std::shared_ptr<Tensor> input = std::make_shared<Tensor>(Array1D<float,6> { - {1.0, 2.0, 3.0, 4.0, 5.0, 6.0} - }); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float,2,3> { - { - {1.0, 2.0, 3.0}, - {4.0, 5.0, 6.0} - } - }); + std::shared_ptr<Tensor> input = std::make_shared<Tensor>( + Array1D<float, 6>{{1.0, 2.0, 3.0, 4.0, 5.0, 6.0}}); + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>( + Array2D<float, 2, 3>{{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}}}); std::shared_ptr<Node> myReshape = Reshape({2, 3}); - auto op = std::static_pointer_cast<OperatorTensor>(myReshape -> getOperator()); + auto op = + 
std::static_pointer_cast<OperatorTensor>(myReshape->getOperator()); op->associateInput(0, input); op->setDataType(DataType::Float32); op->setBackend("cuda"); myReshape->forward(); - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); + float *computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } delete[] computedOutput; } SECTION("2D Tensor") { - std::shared_ptr<Tensor> input = std::make_shared<Tensor>(Array2D<float,2,3> { - { - {1.0, 2.0, 3.0}, - {4.0, 5.0, 6.0} - } + std::shared_ptr<Tensor> input = std::make_shared<Tensor>( + Array2D<float, 2, 3>{{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}} - }); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float,3,2> { - { - {1.0, 2.0}, - {3.0, 4.0}, - {5.0, 6.0} - } - }); + }); + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>( + Array2D<float, 3, 2>{{{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}}}); std::shared_ptr<Node> myReshape = Reshape({3, 2}); - auto op = std::static_pointer_cast<OperatorTensor>(myReshape -> getOperator()); + auto op = + std::static_pointer_cast<OperatorTensor>(myReshape->getOperator()); op->associateInput(0, input); op->setDataType(DataType::Float32); op->setBackend("cuda"); myReshape->forward(); - float* computedOutput = new float[myOutput->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); + float *computedOutput = new float[myOutput->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * myOutput->size(), + cudaMemcpyDeviceToHost); - for(int i = 0; i < myOutput->size(); i++){ - const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + for (int i = 0; i < myOutput->size(); i++) { + const float targetOutput = + *(static_cast<float *>(myOutput->getImpl()->rawPtr()) + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } delete[] computedOutput; } - SECTION("Random Input") - { + SECTION("Random Input") { constexpr std::uint16_t NBTRIALS = 10; // Create a random number generator std::random_device rd; std::mt19937 gen(rd()); std::uniform_real_distribution<float> valueDist( - 0.1f, 1.1f); // Random float distribution between 0 and 1 - std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1), - std::size_t(10)); - - std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(CUDNN_DIM_MAX)); // Max nbDims supported by cudnn is 8 + 0.1f, + 1.1f); // Random float distribution between 0 and 1 + std::uniform_int_distribution<std::size_t> dimSizeDist( + std::size_t(1), + std::size_t(10)); + + std::uniform_int_distribution<std::size_t> nbDimsDist( + std::size_t(1), + std::size_t(CUDNN_DIM_MAX)); // Max nbDims supported by cudnn is 8 // To measure execution time of 'forward()' std::chrono::time_point<std::chrono::system_clock> start; std::chrono::time_point<std::chrono::system_clock> end; std::chrono::duration<double, std::micro> duration{}; 
std::size_t number_of_operation = 0; - for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) - { + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { // generate a random Tensor const std::size_t nbDims = nbDimsDist(gen); std::vector<std::size_t> dims, shuffeledDims; - for (std::size_t i = 0; i < nbDims; ++i) - { + for (std::size_t i = 0; i < nbDims; ++i) { dims.push_back(dimSizeDist(gen)); } shuffeledDims = dims; std::shuffle(shuffeledDims.begin(), shuffeledDims.end(), gen); std::vector<std::int64_t> shuffeledIntDims(shuffeledDims.size()); - std::transform(shuffeledDims.begin(), shuffeledDims.end(), shuffeledIntDims.begin(), - [](int value) { return static_cast<std::int64_t>(value); }); + std::transform( + shuffeledDims.begin(), + shuffeledDims.end(), + shuffeledIntDims.begin(), + [](int value) { return static_cast<std::int64_t>(value); }); // Create Reshape Operator CUDA - std::shared_ptr<Node> myReshapeCuda = Reshape(shuffeledIntDims, false,"myreshapecuda"); - auto op_cuda = std::static_pointer_cast<OperatorTensor>(myReshapeCuda->getOperator()); + std::shared_ptr<Node> myReshapeCuda = + Reshape(shuffeledIntDims, false, "myreshapecuda"); + auto op_cuda = std::static_pointer_cast<OperatorTensor>( + myReshapeCuda->getOperator()); op_cuda->setDataType(DataType::Float32); op_cuda->setBackend("cuda"); // Create Reshape Operator CPU - std::shared_ptr<Node> myReshapeCpu = Reshape(shuffeledIntDims, false,"myreshapecpu"); - auto op_cpu = std::static_pointer_cast<OperatorTensor>(myReshapeCpu->getOperator()); + std::shared_ptr<Node> myReshapeCpu = + Reshape(shuffeledIntDims, false, "myreshapecpu"); + auto op_cpu = std::static_pointer_cast<OperatorTensor>( + myReshapeCpu->getOperator()); op_cpu->setDataType(DataType::Float32); op_cpu->setBackend("cpu"); - const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t nb_elements = + std::accumulate(dims.cbegin(), + dims.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); number_of_operation += nb_elements; // Fill input tensor float *array0 = new float[nb_elements]; - for (std::size_t i = 0; i < nb_elements; ++i) - { + for (std::size_t i = 0; i < nb_elements; ++i) { array0[i] = valueDist(gen); } // input0 CUDA - float* array0_d; + float *array0_d; std::shared_ptr<Tensor> T0_cuda = std::make_shared<Tensor>(); T0_cuda->setDataType(DataType::Float32); T0_cuda->setBackend("cuda"); T0_cuda->resize(dims); op_cuda->associateInput(0, T0_cuda); - cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements); - cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements, cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast<void **>(&array0_d), + sizeof(float) * nb_elements); + cudaMemcpy(array0_d, + array0, + sizeof(float) * nb_elements, + cudaMemcpyHostToDevice); T0_cuda->getImpl()->setRawPtr(array0_d, nb_elements); // input0 CPU std::shared_ptr<Tensor> T0_cpu = std::make_shared<Tensor>(); - op_cpu->associateInput(0,T0_cpu); + op_cpu->associateInput(0, T0_cpu); T0_cpu->setDataType(DataType::Float32); T0_cpu->setBackend("cpu"); T0_cpu->resize(dims); - T0_cpu -> getImpl() -> setRawPtr(array0, nb_elements); + T0_cpu->getImpl()->setRawPtr(array0, nb_elements); - // Run inference + // Run inference start = std::chrono::system_clock::now(); op_cuda->forward(); end = std::chrono::system_clock::now(); - duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + duration += std::chrono::duration_cast<std::chrono::microseconds>( + 
end - start); float *computed_cuda = new float[nb_elements]; - cudaMemcpy(computed_cuda, op_cuda->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * nb_elements, cudaMemcpyDeviceToHost); + cudaMemcpy(computed_cuda, + op_cuda->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * nb_elements, + cudaMemcpyDeviceToHost); // forward CPU op_cpu->forward(); - float *computed_cpu = static_cast<float*>(op_cpu->getOutput(0)->getImpl()->rawPtr()); + float *computed_cpu = static_cast<float *>( + op_cpu->getOutput(0)->getImpl()->rawPtr()); REQUIRE(approxEq<float>(*computed_cuda, *computed_cpu)); delete[] computed_cuda; delete[] array0; cudaFree(array0_d); } - std::cout << "number of elements over time spent: " << (number_of_operation / duration.count()) << std::endl; + std::cout << "number of elements over time spent: " + << (number_of_operation / duration.count()) << std::endl; std::cout << "total time: " << duration.count() << "μs" << std::endl; - } } TEST_CASE("[gpu/operator] Reshape(backward)") { SECTION("1D Tensor") { - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array2D<float,2,3> { - { - {1.0, 2.0, 3.0}, - {4.0, 5.0, 6.0} - } - }); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>( + Array2D<float, 2, 3>{{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}}}); std::shared_ptr<Node> myReshape = Reshape({6}); - auto op = std::static_pointer_cast<OperatorTensor>(myReshape -> getOperator()); + auto op = + std::static_pointer_cast<OperatorTensor>(myReshape->getOperator()); op->associateInput(0, myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); myReshape->forward(); // Run and test backward operation - std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array1D<float, 6> { - {1, 2, 3, 4, 5, 6} - }); + std::shared_ptr<Tensor> myOutputGrad = + std::make_shared<Tensor>(Array1D<float, 6>{{1, 2, 3, 4, 5, 6}}); myOutputGrad->setBackend("cuda"); std::shared_ptr<Tensor> predictedOutput = op->getOutput(0); std::shared_ptr<Tensor> input = op->getInput(0); predictedOutput->setGrad(myOutputGrad); REQUIRE_NOTHROW(myReshape->backward()); - std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>(Array2D<float,2,3> { - { - {1.0, 2.0, 3.0}, - {4.0, 5.0, 6.0} - } - }); + std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>( + Array2D<float, 2, 3>{{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}}}); float *computedGradCuda = new float[expectedInputGrad->size()](); - cudaMemcpy(computedGradCuda, input->grad()->getImpl()->rawPtr(), sizeof(float) * expectedInputGrad->size(), cudaMemcpyDeviceToHost); - - for(int i = 0; i < expectedInputGrad->size(); i++){ - const float targetOutput = *(static_cast<float*>(expectedInputGrad->getImpl()->rawPtr()) + i); + cudaMemcpy(computedGradCuda, + input->grad()->getImpl()->rawPtr(), + sizeof(float) * expectedInputGrad->size(), + cudaMemcpyDeviceToHost); + + for (int i = 0; i < expectedInputGrad->size(); i++) { + const float targetOutput = *( + static_cast<float *>(expectedInputGrad->getImpl()->rawPtr()) + + i); REQUIRE(fabs(computedGradCuda[i] - targetOutput) < 1e-6); } delete[] computedGradCuda; } SECTION("2D Tensor") { - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array2D<float,2,3> { - { - {1.0, 2.0, 3.0}, - {4.0, 5.0, 6.0} - } - }); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>( + Array2D<float, 2, 3>{{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}}}); std::shared_ptr<Node> myReshape = Reshape({3, 2}); - auto op = std::static_pointer_cast<OperatorTensor>(myReshape -> getOperator()); + auto op = + 
std::static_pointer_cast<OperatorTensor>(myReshape->getOperator()); op->associateInput(0, myInput); op->setDataType(DataType::Float32); op->setBackend("cuda"); myReshape->forward(); // Run and test backward operation - std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array2D<float, 3, 2> { - { - {1.0, 2.0}, - {3.0, 4.0}, - {5.0, 6.0} - } - }); + std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>( + Array2D<float, 3, 2>{{{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}}}); myOutputGrad->setBackend("cuda"); std::shared_ptr<Tensor> predictedOutput = op->getOutput(0); std::shared_ptr<Tensor> input = op->getInput(0); predictedOutput->setGrad(myOutputGrad); REQUIRE_NOTHROW(myReshape->backward()); - std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>(Array2D<float,2,3> { - { - {1.0, 2.0, 3.0}, - {4.0, 5.0, 6.0} - } - }); + std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>( + Array2D<float, 2, 3>{{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}}}); float *computedGradCuda = new float[expectedInputGrad->size()](); - cudaMemcpy(computedGradCuda, input->grad()->getImpl()->rawPtr(), sizeof(float) * expectedInputGrad->size(), cudaMemcpyDeviceToHost); - - for(int i = 0; i < expectedInputGrad->size(); i++){ - const float targetOutput = *(static_cast<float*>(expectedInputGrad->getImpl()->rawPtr()) + i); + cudaMemcpy(computedGradCuda, + input->grad()->getImpl()->rawPtr(), + sizeof(float) * expectedInputGrad->size(), + cudaMemcpyDeviceToHost); + + for (int i = 0; i < expectedInputGrad->size(); i++) { + const float targetOutput = *( + static_cast<float *>(expectedInputGrad->getImpl()->rawPtr()) + + i); REQUIRE(fabs(computedGradCuda[i] - targetOutput) < 1e-6); } diff --git a/unit_tests/Test_ShiftGELUImpl.cpp b/unit_tests/Test_ShiftGELUImpl.cpp index 86e747e735eccb397caa8062f52c2561e8ef759d..99194c012fe8b2c15b5eb99f2bf1b2bd51d17cfc 100644 --- a/unit_tests/Test_ShiftGELUImpl.cpp +++ b/unit_tests/Test_ShiftGELUImpl.cpp @@ -26,103 +26,219 @@ using namespace Aidge; TEST_CASE("[gpu/operator] ShiftGELU(forward)", "[ShiftGELU][GPU]") { SECTION("4D Tensor") { - std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array4D<float,2,2,2,10> { - { - { - { - {0.96, 0.48, 0.54, 0.49, 0.59, 0.93, 0.00, 0.00, 0.61, 0.61}, - {0.85, 0.06, 0.11, 0.87, 0.55, 0.12, 0.80, 0.48, 0.41, 0.16} - }, - { - {0.24, 0.46, 0.97, 0.19, 0.65, 0.12, 0.44, 1.00, 0.37, 0.09}, - {0.44, 0.64, 0.21, 0.58, 0.05, 0.24, 0.56, 0.07, 0.49, 0.79} - } - }, - { - { - {0.00, 0.13, 0.55, 0.42, 0.49, 0.28, 0.52, 0.55, 0.34, 0.85}, - {0.98, 0.32, 0.09, 0.05, 0.37, 0.47, 0.63, 0.13, 0.70, 0.02} - }, - { - {0.69, 0.13, 0.74, 0.61, 0.25, 0.87, 0.46, 0.40, 0.81, 0.06}, - {0.89, 0.32, 0.61, 0.24, 0.70, 0.23, 0.09, 0.03, 0.14, 0.80} - } - } - } - }); - - //expected output of shiftgelu forward operator - std::shared_ptr<Tensor> output_shiftGELU = std::make_shared<Tensor>(Array4D<float,2,2,2,10> { - { - { - { - { 0.991388f, 0.413078f, 0.413078f, 0.413078f, 0.413078f, 0.413078f, 0.0f, 0.0f, 0.413078f, 0.413078f }, - { 0.413078f, 0.0f, 0.0f, 0.413078f, 0.413078f, 0.0f, 0.413078f, 0.413078f, 0.413078f, 0.0f } - }, - { - { 0.0f, 0.413078f, 0.991388f, 0.0f, 0.413078f, 0.0f, 0.413078f, 0.991388f, 0.413078f, 0.0f }, - { 0.413078f, 0.413078f, 0.0f, 0.413078f, 0.0f, 0.0f, 0.413078f, 0.0f, 0.413078f, 0.413078f } - } - }, - { - { - { 0.0f, 0.0f, 0.413078f, 0.413078f, 0.413078f, 0.0f, 0.413078f, 0.413078f, 0.413078f, 0.413078f }, - { 0.991388f, 0.413078f, 0.0f, 0.0f, 0.413078f, 0.413078f, 0.413078f, 0.0f, 0.413078f, 0.0f} - }, - { - { 
0.413078f, 0.0f, 0.413078f, 0.413078f, 0.0f, 0.413078f, 0.413078f, 0.413078f, 0.413078f, 0.0f }, - { 0.413078f, 0.413078f, 0.413078f, 0.0f, 0.413078f, 0.0f, 0.0f, 0.0f, 0.0f, 0.413078f } - } - } - } - }); - - //expected output of GELU forward operator (computed with PyTorch) - std::shared_ptr<Tensor> output_GELU = std::make_shared<Tensor>(Array4D<float, 2, 2, 2, 10> { - { - { - { - { 0.7982f, 0.3285f, 0.3809f, 0.3371f, 0.4262f, 0.7661f, 0.0000f, 0.0000f, 0.4447f, 0.4447f }, - { 0.6820f, 0.0314f, 0.0598f, 0.7028f, 0.3899f, 0.0657f, 0.6305f, 0.3285f, 0.2702f, 0.0902f } - }, - { - { 0.1428f, 0.3115f, 0.8090f, 0.1093f, 0.4824f, 0.0657f, 0.2948f, 0.8413f, 0.2384f, 0.0482f }, - { 0.2948f, 0.4729f, 0.1225f, 0.4170f, 0.0260f, 0.1428f, 0.3989f, 0.0370f, 0.3371f, 0.6203f } - } - }, - { - { - { 0.0000f, 0.0717f, 0.3899f, 0.2784f, 0.3371f, 0.1709f, 0.3632f, 0.3899f, 0.2152f, 0.6820f }, - { 0.8197f, 0.2002f, 0.0482f, 0.0260f, 0.2384f, 0.3200f, 0.4635f, 0.0717f, 0.5306f, 0.0102f } - }, - { - { 0.5209f, 0.0717f, 0.5701f, 0.4447f, 0.1497f, 0.7028f, 0.3115f, 0.2622f, 0.6407f, 0.0314f }, - { 0.7238f, 0.2002f, 0.4447f, 0.1428f, 0.5306f, 0.1359f, 0.0482f, 0.0154f, 0.0778f, 0.6305f } - } - } - } - }); + std::shared_ptr<Tensor> input0 = std::make_shared< + Tensor>(Array4D<float, 2, 2, 2, 10>{ + {{{{0.96, 0.48, 0.54, 0.49, 0.59, 0.93, 0.00, 0.00, 0.61, 0.61}, + {0.85, 0.06, 0.11, 0.87, 0.55, 0.12, 0.80, 0.48, 0.41, 0.16}}, + {{0.24, 0.46, 0.97, 0.19, 0.65, 0.12, 0.44, 1.00, 0.37, 0.09}, + {0.44, 0.64, 0.21, 0.58, 0.05, 0.24, 0.56, 0.07, 0.49, 0.79}}}, + {{{0.00, 0.13, 0.55, 0.42, 0.49, 0.28, 0.52, 0.55, 0.34, 0.85}, + {0.98, 0.32, 0.09, 0.05, 0.37, 0.47, 0.63, 0.13, 0.70, 0.02}}, + {{0.69, 0.13, 0.74, 0.61, 0.25, 0.87, 0.46, 0.40, 0.81, 0.06}, + {0.89, + 0.32, + 0.61, + 0.24, + 0.70, + 0.23, + 0.09, + 0.03, + 0.14, + 0.80}}}}}); + + // expected output of shiftgelu forward operator + std::shared_ptr<Tensor> output_shiftGELU = std::make_shared<Tensor>( + Array4D<float, 2, 2, 2, 10>{{{{{0.991388f, + 0.413078f, + 0.413078f, + 0.413078f, + 0.413078f, + 0.413078f, + 0.0f, + 0.0f, + 0.413078f, + 0.413078f}, + {0.413078f, + 0.0f, + 0.0f, + 0.413078f, + 0.413078f, + 0.0f, + 0.413078f, + 0.413078f, + 0.413078f, + 0.0f}}, + {{0.0f, + 0.413078f, + 0.991388f, + 0.0f, + 0.413078f, + 0.0f, + 0.413078f, + 0.991388f, + 0.413078f, + 0.0f}, + {0.413078f, + 0.413078f, + 0.0f, + 0.413078f, + 0.0f, + 0.0f, + 0.413078f, + 0.0f, + 0.413078f, + 0.413078f}}}, + {{{0.0f, + 0.0f, + 0.413078f, + 0.413078f, + 0.413078f, + 0.0f, + 0.413078f, + 0.413078f, + 0.413078f, + 0.413078f}, + {0.991388f, + 0.413078f, + 0.0f, + 0.0f, + 0.413078f, + 0.413078f, + 0.413078f, + 0.0f, + 0.413078f, + 0.0f}}, + {{0.413078f, + 0.0f, + 0.413078f, + 0.413078f, + 0.0f, + 0.413078f, + 0.413078f, + 0.413078f, + 0.413078f, + 0.0f}, + {0.413078f, + 0.413078f, + 0.413078f, + 0.0f, + 0.413078f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.413078f}}}}}); + + // expected output of GELU forward operator (computed with PyTorch) + std::shared_ptr<Tensor> output_GELU = std::make_shared<Tensor>( + Array4D<float, 2, 2, 2, 10>{{{{{0.7982f, + 0.3285f, + 0.3809f, + 0.3371f, + 0.4262f, + 0.7661f, + 0.0000f, + 0.0000f, + 0.4447f, + 0.4447f}, + {0.6820f, + 0.0314f, + 0.0598f, + 0.7028f, + 0.3899f, + 0.0657f, + 0.6305f, + 0.3285f, + 0.2702f, + 0.0902f}}, + {{0.1428f, + 0.3115f, + 0.8090f, + 0.1093f, + 0.4824f, + 0.0657f, + 0.2948f, + 0.8413f, + 0.2384f, + 0.0482f}, + {0.2948f, + 0.4729f, + 0.1225f, + 0.4170f, + 0.0260f, + 0.1428f, + 0.3989f, + 0.0370f, + 0.3371f, + 0.6203f}}}, + {{{0.0000f, + 
0.0717f, + 0.3899f, + 0.2784f, + 0.3371f, + 0.1709f, + 0.3632f, + 0.3899f, + 0.2152f, + 0.6820f}, + {0.8197f, + 0.2002f, + 0.0482f, + 0.0260f, + 0.2384f, + 0.3200f, + 0.4635f, + 0.0717f, + 0.5306f, + 0.0102f}}, + {{0.5209f, + 0.0717f, + 0.5701f, + 0.4447f, + 0.1497f, + 0.7028f, + 0.3115f, + 0.2622f, + 0.6407f, + 0.0314f}, + {0.7238f, + 0.2002f, + 0.4447f, + 0.1428f, + 0.5306f, + 0.1359f, + 0.0482f, + 0.0154f, + 0.0778f, + 0.6305f}}}}}); std::shared_ptr<Node> myShiftGELU = ShiftGELU(); - auto op = std::static_pointer_cast<OperatorTensor>(myShiftGELU -> getOperator()); - op->associateInput(0,input0); + auto op = std::static_pointer_cast<OperatorTensor>( + myShiftGELU->getOperator()); + op->associateInput(0, input0); op->setDataType(DataType::Float32); op->setBackend("cuda"); op->forward(); - - float* computedOutput = new float[output_shiftGELU->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * output_shiftGELU->size(), cudaMemcpyDeviceToHost); - //test if forward result are as expected - for(int i = 0; i < output_shiftGELU->size(); i++){ - const float targetOutput = *(static_cast<float*>(output_shiftGELU->getImpl()->rawPtr()) + i); + float *computedOutput = new float[output_shiftGELU->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * output_shiftGELU->size(), + cudaMemcpyDeviceToHost); + + // test if forward result are as expected + for (int i = 0; i < output_shiftGELU->size(); i++) { + const float targetOutput = + *(static_cast<float *>(output_shiftGELU->getImpl()->rawPtr()) + + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } - //measure difference between GELU and shiftgelu + // measure difference between GELU and shiftgelu float sum = 0.0; - for(int i = 0; i < output_GELU->size(); i++){ - const float targetOutput = *(static_cast<float*>(output_GELU->getImpl()->rawPtr()) + i); + for (int i = 0; i < output_GELU->size(); i++) { + const float targetOutput = + *(static_cast<float *>(output_GELU->getImpl()->rawPtr()) + i); sum += fabs(computedOutput[i] - targetOutput); } sum = sum / output_GELU->size(); @@ -130,42 +246,54 @@ TEST_CASE("[gpu/operator] ShiftGELU(forward)", "[ShiftGELU][GPU]") { delete[] computedOutput; } - } TEST_CASE("[gpu/operator] ShiftGELU(backward)", "[ShiftGELU][GPU]") { - std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array4D<float,1,1,1,8> { //NCHW - { - { - { - {1.46650600, 1.24083233, -0.33106008, -0.15137172, 0.06625678, -1.8326609, 0.53444749, -0.05167147}, - }, - }, - } - }); - + std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>( + Array4D<float, 1, 1, 1, 8>{// NCHW + { + { + { + {1.46650600, + 1.24083233, + -0.33106008, + -0.15137172, + 0.06625678, + -1.8326609, + 0.53444749, + -0.05167147}, + }, + }, + }}); + input0->setBackend("cuda"); std::shared_ptr<Node> myShiftGELU = ShiftGELU(); - auto op = std::static_pointer_cast<OperatorTensor>(myShiftGELU->getOperator()); + auto op = + std::static_pointer_cast<OperatorTensor>(myShiftGELU->getOperator()); op->associateInput(0, input0); op->setDataType(DataType::Float32); op->setBackend("cuda"); myShiftGELU->forward(); - std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array4D<float,1,1,1,8> { + std::shared_ptr<Tensor> myOutputGrad = + std::make_shared<Tensor>(Array4D<float, 1, 1, 1, 8>{{ { { - { - { 1.34347093, 0.90813798, 0.39607167, 1.20428133, 0.16845724, 0.48487359, 0.40748054, -0.21790814}, - }, + {1.34347093, + 0.90813798, + 0.39607167, + 1.20428133, + 0.16845724, + 0.48487359, 
+ 0.40748054, + -0.21790814}, }, - } - }); - + }, + }}); myOutputGrad->setBackend("cuda"); std::shared_ptr<Tensor> predictedOutput = op->getOutput(0); @@ -173,48 +301,66 @@ TEST_CASE("[gpu/operator] ShiftGELU(backward)", "[ShiftGELU][GPU]") predictedOutput->setGrad(myOutputGrad); REQUIRE_NOTHROW(myShiftGELU->backward()); - //expected output of shiftgelu backward operator - std::shared_ptr<Tensor> expectedInputGradShiftGELU = std::make_shared<Tensor>(Array4D<float,1,1,1,8> { + // expected output of shiftgelu backward operator + std::shared_ptr<Tensor> expectedInputGradShiftGELU = + std::make_shared<Tensor>(Array4D<float, 1, 1, 1, 8>{{ { { - { - { 1.88094, 1.09182, 0.134203, 0.439603, 0.0696628, 0.173469, 0.254718, -0.084009}, - }, + {1.88094, + 1.09182, + 0.134203, + 0.439603, + 0.0696628, + 0.173469, + 0.254718, + -0.084009}, }, - } - }); + }, + }}); - //expected output of gelu backward operator (computed with PyTorch) - std::shared_ptr<Tensor> expectedInputGradGELU = std::make_shared<Tensor>(Array4D<float,1,1,1,8> { + // expected output of gelu backward operator (computed with PyTorch) + std::shared_ptr<Tensor> expectedInputGradGELU = + std::make_shared<Tensor>(Array4D<float, 1, 1, 1, 8>{{ { { - { - { 1.5159, 1.0188, 0.0971, 0.4578, 0.0931, -0.0499, 0.3620, -0.1000}, - }, + {1.5159, + 1.0188, + 0.0971, + 0.4578, + 0.0931, + -0.0499, + 0.3620, + -0.1000}, }, - } - }); - + }, + }}); float *computedGradCuda = new float[myOutputGrad->size()](); - cudaMemcpy(computedGradCuda, input->grad()->getImpl()->rawPtr(), sizeof(float) * myOutputGrad->size(), cudaMemcpyDeviceToHost); + cudaMemcpy(computedGradCuda, + input->grad()->getImpl()->rawPtr(), + sizeof(float) * myOutputGrad->size(), + cudaMemcpyDeviceToHost); - //test if backward result are as expected - for(int i = 0; i < expectedInputGradShiftGELU->size(); i++){ - const float targetOutput = *(static_cast<float*>(expectedInputGradShiftGELU->getImpl()->rawPtr()) + i); - REQUIRE(fabs(computedGradCuda[i] - targetOutput) < 2e-6); + // test if backward result are as expected + for (int i = 0; i < expectedInputGradShiftGELU->size(); i++) { + const float targetOutput = + *(static_cast<float *>( + expectedInputGradShiftGELU->getImpl()->rawPtr()) + + i); + REQUIRE(fabs(computedGradCuda[i] - targetOutput) < 2e-6); } - //measure difference between gelu and shifgelu + // measure difference between gelu and shifgelu float sum = 0.0; - for(int i = 0; i < expectedInputGradGELU->size(); i++){ - const float targetOutput = *(static_cast<float*>(expectedInputGradGELU->getImpl()->rawPtr()) + i); - sum += fabs(computedGradCuda[i] - targetOutput); - } - sum = sum / expectedInputGradGELU->size(); - REQUIRE(sum < 2e-1); - + for (int i = 0; i < expectedInputGradGELU->size(); i++) { + const float targetOutput = *( + static_cast<float *>(expectedInputGradGELU->getImpl()->rawPtr()) + + i); + sum += fabs(computedGradCuda[i] - targetOutput); + } + sum = sum / expectedInputGradGELU->size(); + REQUIRE(sum < 2e-1); delete[] computedGradCuda; } diff --git a/unit_tests/Test_ShiftMaxImpl.cpp b/unit_tests/Test_ShiftMaxImpl.cpp index 2a94a23c3a04edd72cb535ebfb6e2c538e4aeee8..1ae24357398c7ad35f817937d9ffc993b82a8091 100644 --- a/unit_tests/Test_ShiftMaxImpl.cpp +++ b/unit_tests/Test_ShiftMaxImpl.cpp @@ -26,101 +26,217 @@ using namespace Aidge; TEST_CASE("[gpu/operator] ShiftMax(forward)", "[ShiftMax][GPU]") { SECTION("4D Tensor") { - std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array4D<float,2,2,2,10> { - { - { - { - {0.96, 0.48, 0.54, 0.49, 0.59, 0.93, 0.00, 0.00, 0.61, 
0.61}, - {0.85, 0.06, 0.11, 0.87, 0.55, 0.12, 0.80, 0.48, 0.41, 0.16} - }, - { - {0.24, 0.46, 0.97, 0.19, 0.65, 0.12, 0.44, 1.00, 0.37, 0.09}, - {0.44, 0.64, 0.21, 0.58, 0.05, 0.24, 0.56, 0.07, 0.49, 0.79} - } - }, - { - { - {0.00, 0.13, 0.55, 0.42, 0.49, 0.28, 0.52, 0.55, 0.34, 0.85}, - {0.98, 0.32, 0.09, 0.05, 0.37, 0.47, 0.63, 0.13, 0.70, 0.02} - }, - { - {0.69, 0.13, 0.74, 0.61, 0.25, 0.87, 0.46, 0.40, 0.81, 0.06}, - {0.89, 0.32, 0.61, 0.24, 0.70, 0.23, 0.09, 0.03, 0.14, 0.80} - } - } - } - }); - //expected output of shiftmax forward operator - std::shared_ptr<Tensor> output_shiftmax = std::make_shared<Tensor>(Array4D<float,2,2,2,10> { - { - { - { - { 0.111084f, 0.111084f, 0.111084f, 0.111084f, 0.111084f, 0.111084f, 0.055542f, 0.055542f, 0.111084f, 0.111084f }, - { 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f } - }, - { - { 0.0624695f, 0.124969f, 0.124969f, 0.0624695f, 0.124969f, 0.0624695f, 0.124969f, 0.124969f, 0.124969f, 0.0624695f }, - { 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f } - } - }, - { - { - { 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f }, - { 0.124969f, 0.124969f, 0.0624695f, 0.0624695f, 0.124969f, 0.124969f, 0.124969f, 0.0624695f, 0.124969f, 0.0624695f } - }, - { - { 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f }, - { 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f, 0.0999756f } - } - } - } - }); - //expected output of softmax forward operator (computed with PyTorch) - std::shared_ptr<Tensor> output_softmax = std::make_shared<Tensor>(Array4D<float, 2, 2, 2, 10> { - { - { - { - { 0.1484f, 0.0918f, 0.0975f, 0.0928f, 0.1025f, 0.1440f, 0.0568f, 0.0568f, 0.1046f, 0.1046f }, - { 0.1436f, 0.0652f, 0.0685f, 0.1465f, 0.1064f, 0.0692f, 0.1366f, 0.0992f, 0.0925f, 0.0721f } - }, - { - { 0.0768f, 0.0957f, 0.1593f, 0.0730f, 0.1157f, 0.0681f, 0.0938f, 0.1642f, 0.0874f, 0.0661f }, - { 0.1005f, 0.1227f, 0.0798f, 0.1156f, 0.0680f, 0.0823f, 0.1133f, 0.0694f, 0.1056f, 0.1426f } - } - }, - { - { - { 0.0645f, 0.0734f, 0.1118f, 0.0981f, 0.1052f, 0.0853f, 0.1085f, 0.1118f, 0.0906f, 0.1509f }, - { 0.1743f, 0.0901f, 0.0716f, 0.0688f, 0.0947f, 0.1047f, 0.1228f, 0.0745f, 0.1317f, 0.0667f } - }, - { - { 0.1164f, 0.0665f, 0.1224f, 0.1075f, 0.0750f, 0.1394f, 0.0925f, 0.0871f, 0.1313f, 0.0620f }, - { 0.1551f, 0.0877f, 0.1172f, 0.0810f, 0.1283f, 0.0802f, 0.0697f, 0.0656f, 0.0733f, 0.1418f } - } - } - } - }); + std::shared_ptr<Tensor> input0 = std::make_shared< + Tensor>(Array4D<float, 2, 2, 2, 10>{ + {{{{0.96, 0.48, 0.54, 0.49, 0.59, 0.93, 0.00, 0.00, 0.61, 0.61}, + {0.85, 0.06, 0.11, 0.87, 0.55, 0.12, 0.80, 0.48, 0.41, 0.16}}, + {{0.24, 0.46, 0.97, 0.19, 0.65, 0.12, 0.44, 1.00, 0.37, 0.09}, + {0.44, 0.64, 0.21, 0.58, 0.05, 0.24, 0.56, 0.07, 0.49, 0.79}}}, + {{{0.00, 0.13, 0.55, 0.42, 0.49, 0.28, 0.52, 0.55, 0.34, 0.85}, + {0.98, 0.32, 0.09, 0.05, 0.37, 0.47, 0.63, 0.13, 0.70, 0.02}}, + {{0.69, 0.13, 0.74, 0.61, 0.25, 0.87, 0.46, 0.40, 0.81, 0.06}, + {0.89, + 0.32, + 0.61, + 0.24, + 0.70, + 0.23, + 0.09, + 0.03, + 0.14, + 0.80}}}}}); + // expected output of shiftmax forward operator + std::shared_ptr<Tensor> output_shiftmax = std::make_shared<Tensor>( + Array4D<float, 2, 2, 2, 10>{{{{{0.111084f, + 0.111084f, + 0.111084f, + 0.111084f, + 0.111084f, + 
0.111084f, + 0.055542f, + 0.055542f, + 0.111084f, + 0.111084f}, + {0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f}}, + {{0.0624695f, + 0.124969f, + 0.124969f, + 0.0624695f, + 0.124969f, + 0.0624695f, + 0.124969f, + 0.124969f, + 0.124969f, + 0.0624695f}, + {0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f}}}, + {{{0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f}, + {0.124969f, + 0.124969f, + 0.0624695f, + 0.0624695f, + 0.124969f, + 0.124969f, + 0.124969f, + 0.0624695f, + 0.124969f, + 0.0624695f}}, + {{0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f}, + {0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f, + 0.0999756f}}}}}); + // expected output of softmax forward operator (computed with PyTorch) + std::shared_ptr<Tensor> output_softmax = std::make_shared<Tensor>( + Array4D<float, 2, 2, 2, 10>{{{{{0.1484f, + 0.0918f, + 0.0975f, + 0.0928f, + 0.1025f, + 0.1440f, + 0.0568f, + 0.0568f, + 0.1046f, + 0.1046f}, + {0.1436f, + 0.0652f, + 0.0685f, + 0.1465f, + 0.1064f, + 0.0692f, + 0.1366f, + 0.0992f, + 0.0925f, + 0.0721f}}, + {{0.0768f, + 0.0957f, + 0.1593f, + 0.0730f, + 0.1157f, + 0.0681f, + 0.0938f, + 0.1642f, + 0.0874f, + 0.0661f}, + {0.1005f, + 0.1227f, + 0.0798f, + 0.1156f, + 0.0680f, + 0.0823f, + 0.1133f, + 0.0694f, + 0.1056f, + 0.1426f}}}, + {{{0.0645f, + 0.0734f, + 0.1118f, + 0.0981f, + 0.1052f, + 0.0853f, + 0.1085f, + 0.1118f, + 0.0906f, + 0.1509f}, + {0.1743f, + 0.0901f, + 0.0716f, + 0.0688f, + 0.0947f, + 0.1047f, + 0.1228f, + 0.0745f, + 0.1317f, + 0.0667f}}, + {{0.1164f, + 0.0665f, + 0.1224f, + 0.1075f, + 0.0750f, + 0.1394f, + 0.0925f, + 0.0871f, + 0.1313f, + 0.0620f}, + {0.1551f, + 0.0877f, + 0.1172f, + 0.0810f, + 0.1283f, + 0.0802f, + 0.0697f, + 0.0656f, + 0.0733f, + 0.1418f}}}}}); std::shared_ptr<Node> myShiftMax = ShiftMax(); - auto op = std::static_pointer_cast<OperatorTensor>(myShiftMax -> getOperator()); - op->associateInput(0,input0); + auto op = std::static_pointer_cast<OperatorTensor>( + myShiftMax->getOperator()); + op->associateInput(0, input0); op->setDataType(DataType::Float32); op->setBackend("cuda"); op->forward(); - - float* computedOutput = new float[output_shiftmax->size()](); - cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * output_shiftmax->size(), cudaMemcpyDeviceToHost); - //test if forward result are as expected - for(int i = 0; i < output_shiftmax->size(); i++){ - const float targetOutput = *(static_cast<float*>(output_shiftmax->getImpl()->rawPtr()) + i); + float *computedOutput = new float[output_shiftmax->size()](); + cudaMemcpy(computedOutput, + op->getOutput(0)->getImpl()->rawPtr(), + sizeof(float) * output_shiftmax->size(), + cudaMemcpyDeviceToHost); + + // test if forward result are as expected + for (int i = 0; i < output_shiftmax->size(); i++) { + const float targetOutput = + *(static_cast<float *>(output_shiftmax->getImpl()->rawPtr()) + + i); REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); } - //measure difference between softmax and shiftmax + // measure difference between softmax and shiftmax float sum = 0.0; - for(int i = 0; i < output_softmax->size(); i++){ - const float targetOutput = 
*(static_cast<float*>(output_softmax->getImpl()->rawPtr()) + i); + for (int i = 0; i < output_softmax->size(); i++) { + const float targetOutput = *( + static_cast<float *>(output_softmax->getImpl()->rawPtr()) + i); sum += fabs(computedOutput[i] - targetOutput); } sum = sum / output_softmax->size(); @@ -128,42 +244,54 @@ TEST_CASE("[gpu/operator] ShiftMax(forward)", "[ShiftMax][GPU]") { delete[] computedOutput; } - } TEST_CASE("[gpu/operator] ShiftMax(backward)", "[ShiftMax][GPU]") { - std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array4D<float,1,1,1,8> { //NCHW - { - { - { - {1.46650600, 1.24083233, -0.33106008, -0.15137172, 0.06625678, -1.8326609, 0.53444749, -0.05167147}, - }, - }, - } - }); - + std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>( + Array4D<float, 1, 1, 1, 8>{// NCHW + { + { + { + {1.46650600, + 1.24083233, + -0.33106008, + -0.15137172, + 0.06625678, + -1.8326609, + 0.53444749, + -0.05167147}, + }, + }, + }}); + input0->setBackend("cuda"); std::shared_ptr<Node> myShiftMax = ShiftMax(); - auto op = std::static_pointer_cast<OperatorTensor>(myShiftMax->getOperator()); + auto op = + std::static_pointer_cast<OperatorTensor>(myShiftMax->getOperator()); op->associateInput(0, input0); op->setDataType(DataType::Float32); op->setBackend("cuda"); myShiftMax->forward(); - std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array4D<float,1,1,1,8> { + std::shared_ptr<Tensor> myOutputGrad = + std::make_shared<Tensor>(Array4D<float, 1, 1, 1, 8>{{ { { - { - { 1.34347093, 0.90813798, 0.39607167, 1.20428133, 0.16845724, 0.48487359, 0.40748054, -0.21790814}, - }, + {1.34347093, + 0.90813798, + 0.39607167, + 1.20428133, + 0.16845724, + 0.48487359, + 0.40748054, + -0.21790814}, }, - } - }); - + }, + }}); myOutputGrad->setBackend("cuda"); std::shared_ptr<Tensor> predictedOutput = op->getOutput(0); @@ -171,47 +299,67 @@ TEST_CASE("[gpu/operator] ShiftMax(backward)", "[ShiftMax][GPU]") predictedOutput->setGrad(myOutputGrad); REQUIRE_NOTHROW(myShiftMax->backward()); - //expected output of shiftmax backward operator - std::shared_ptr<Tensor> expectedInputGradShiftMax = std::make_shared<Tensor>(Array4D<float,1,1,1,8> { + // expected output of shiftmax backward operator + std::shared_ptr<Tensor> expectedInputGradShiftMax = + std::make_shared<Tensor>(Array4D<float, 1, 1, 1, 8>{{ { { - { - { 0.159378, 0.0249331, -0.0250217, 0.0262418, -0.0514701, -0.00459638, -0.0551896, -0.0739511}, - }, + {0.159378, + 0.0249331, + -0.0250217, + 0.0262418, + -0.0514701, + -0.00459638, + -0.0551896, + -0.0739511}, }, - } - }); + }, + }}); - //expected output of softmax backward operator (computed with PyTorch) - std::shared_ptr<Tensor> expectedInputGradSoftmax = std::make_shared<Tensor>(Array4D<float,1,1,1,8> { + // expected output of softmax backward operator (computed with PyTorch) + std::shared_ptr<Tensor> expectedInputGradSoftmax = + std::make_shared<Tensor>(Array4D<float, 1, 1, 1, 8>{{ { { - { - { 0.1672, 0.0198, -0.0236, 0.0241, -0.0535, -0.0042, -0.0547, -0.0752}, - }, + {0.1672, + 0.0198, + -0.0236, + 0.0241, + -0.0535, + -0.0042, + -0.0547, + -0.0752}, }, - } - }); - + }, + }}); float *computedGradCuda = new float[myOutputGrad->size()](); - cudaMemcpy(computedGradCuda, input->grad()->getImpl()->rawPtr(), sizeof(float) * myOutputGrad->size(), cudaMemcpyDeviceToHost); + cudaMemcpy(computedGradCuda, + input->grad()->getImpl()->rawPtr(), + sizeof(float) * myOutputGrad->size(), + cudaMemcpyDeviceToHost); - //test if backward result are as expected - for(int i = 0; i < 
expectedInputGradShiftMax->size(); i++){ - const float targetOutput = *(static_cast<float*>(expectedInputGradShiftMax->getImpl()->rawPtr()) + i); + // test if backward result are as expected + for (int i = 0; i < expectedInputGradShiftMax->size(); i++) { + const float targetOutput = + *(static_cast<float *>( + expectedInputGradShiftMax->getImpl()->rawPtr()) + + i); REQUIRE(fabs(computedGradCuda[i] - targetOutput) < 1e-6); } - //measure difference between softmax and shiftmax + // measure difference between softmax and shiftmax float sum = 0.0; - for(int i = 0; i < expectedInputGradSoftmax->size(); i++){ - const float targetOutput = *(static_cast<float*>(expectedInputGradSoftmax->getImpl()->rawPtr()) + i); - sum += fabs(computedGradCuda[i] - targetOutput); - } - sum = sum / expectedInputGradSoftmax->size(); - REQUIRE(sum < 4e-3); + for (int i = 0; i < expectedInputGradSoftmax->size(); i++) { + const float targetOutput = + *(static_cast<float *>( + expectedInputGradSoftmax->getImpl()->rawPtr()) + + i); + sum += fabs(computedGradCuda[i] - targetOutput); + } + sum = sum / expectedInputGradSoftmax->size(); + REQUIRE(sum < 4e-3); delete[] computedGradCuda; } diff --git a/unit_tests/Test_TensorImpl.cpp b/unit_tests/Test_TensorImpl.cpp index cb120a970c5310f80f8c62960c029a845937ba30..ed34394267165549421d2171f50fc93d0d5d4572 100644 --- a/unit_tests/Test_TensorImpl.cpp +++ b/unit_tests/Test_TensorImpl.cpp @@ -26,34 +26,34 @@ TEST_CASE("CUDA test") { const int N = 100; // Allocate host memory - float* a = new float[N](); - float* b = new float[N](); - float* out = new float[N](); + float *a = new float[N](); + float *b = new float[N](); + float *out = new float[N](); // Initialize host arrays - for(int i = 0; i < N; i++){ + for (int i = 0; i < N; i++) { a[i] = 1.0f; b[i] = 2.0f; } // Allocate device memory - float *d_a, *d_b, *d_out; - cudaMalloc(reinterpret_cast<void**>(&d_a), sizeof(float) * N); - cudaMalloc(reinterpret_cast<void**>(&d_b), sizeof(float) * N); - cudaMalloc(reinterpret_cast<void**>(&d_out), sizeof(float) * N); + float *d_a, *d_b, *d_out; + cudaMalloc(reinterpret_cast<void **>(&d_a), sizeof(float) * N); + cudaMalloc(reinterpret_cast<void **>(&d_b), sizeof(float) * N); + cudaMalloc(reinterpret_cast<void **>(&d_out), sizeof(float) * N); // Transfer data from host to device memory cudaMemcpy(d_a, a, sizeof(float) * N, cudaMemcpyHostToDevice); cudaMemcpy(d_b, b, sizeof(float) * N, cudaMemcpyHostToDevice); - // Executing kernel + // Executing kernel vector_add(d_out, d_a, d_b, N); - + // Transfer data back to host memory cudaMemcpy(out, d_out, sizeof(float) * N, cudaMemcpyDeviceToHost); // Verification - for(int i = 0; i < N; i++){ + for (int i = 0; i < N; i++) { REQUIRE(fabs(out[i] - a[i] - b[i]) < 1e-6); } @@ -72,17 +72,7 @@ TEST_CASE("Tensor creation", "[Connector]") { SECTION("from const array") { Tensor x; x.setBackend("cuda"); - x = Array3D<int,2,2,2>{ - { - { - {1, 2}, - {3, 4} - }, - { - {5, 6}, - {7, 8} - } - }}; + x = Array3D<int, 2, 2, 2>{{{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}}; REQUIRE(x.nbDims() == 3); REQUIRE(x.dims()[0] == 2); @@ -91,23 +81,16 @@ TEST_CASE("Tensor creation", "[Connector]") { REQUIRE(x.size() == 8); std::array<int, 8> val; - cudaMemcpy(&val[0], x.getImpl()->rawPtr(), 8 * sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(&val[0], + x.getImpl()->rawPtr(), + 8 * sizeof(int), + cudaMemcpyDeviceToHost); REQUIRE(val[0] == 1); REQUIRE(val[7] == 8); } SECTION("from const array before backend") { - Tensor x = Array3D<int,2,2,2>{ - { - { - {1, 2}, - {3, 4} - }, - { - {5, 
6}, - {7, 8} - } - }}; + Tensor x = Array3D<int, 2, 2, 2>{{{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}}; x.setBackend("cuda"); REQUIRE(x.nbDims() == 3); @@ -117,7 +100,10 @@ TEST_CASE("Tensor creation", "[Connector]") { REQUIRE(x.size() == 8); std::array<int, 8> val; - cudaMemcpy(&val[0], x.getImpl()->rawPtr(), 8 * sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(&val[0], + x.getImpl()->rawPtr(), + 8 * sizeof(int), + cudaMemcpyDeviceToHost); REQUIRE(val[0] == 1); REQUIRE(val[7] == 8); } @@ -127,37 +113,52 @@ TEST_CASE("Tensor Descriptor Update") { Tensor x; x.setBackend("cuda"); - std::vector<std::size_t> shapeA = { 7, 6, 5, 4, 3 }; + std::vector<std::size_t> shapeA = {7, 6, 5, 4, 3}; x.resize(shapeA); - cudnnTensorDescriptor_t desc = std::dynamic_pointer_cast<TensorImpl_cuda_>(x.getImpl())->getCudnnTensorDesc(x); + cudnnTensorDescriptor_t desc = + std::dynamic_pointer_cast<TensorImpl_cuda_>(x.getImpl()) + ->getCudnnTensorDesc(x); cudnnDataType_t currentDataType; int currentNbDims; std::vector<int> currentDimA(shapeA.size()); std::vector<int> currentStrideA(shapeA.size()); - REQUIRE_NOTHROW(cudnnGetTensorNdDescriptor(desc, shapeA.size(), ¤tDataType, ¤tNbDims, currentDimA.data(), currentStrideA.data())); + REQUIRE_NOTHROW(cudnnGetTensorNdDescriptor(desc, + shapeA.size(), + ¤tDataType, + ¤tNbDims, + currentDimA.data(), + currentStrideA.data())); - REQUIRE(std::equal(currentDimA.begin(), currentDimA.end(), shapeA.begin(), [](int a, std::size_t b) { - return static_cast<std::size_t>(a) == b; - } - ) - ); + REQUIRE(std::equal(currentDimA.begin(), + currentDimA.end(), + shapeA.begin(), + [](int a, std::size_t b) { + return static_cast<std::size_t>(a) == b; + })); // Change the tensor shape and check tensor descriptor - std::vector<std::size_t> shapeB = { 6, 5, 4 }; + std::vector<std::size_t> shapeB = {6, 5, 4}; x.resize(shapeB); std::vector<int> currentDimB(shapeB.size()); std::vector<int> currentStrideB(shapeB.size()); - desc = std::dynamic_pointer_cast<TensorImpl_cuda_>(x.getImpl())->getCudnnTensorDesc(x); - REQUIRE_NOTHROW(cudnnGetTensorNdDescriptor(desc, shapeB.size(), ¤tDataType, ¤tNbDims, currentDimB.data(), currentStrideB.data())); - - REQUIRE(std::equal(currentDimB.begin(), currentDimB.end(), shapeB.begin(), [](int a, std::size_t b) { - return static_cast<std::size_t>(a) == b; - } - ) - ); + desc = std::dynamic_pointer_cast<TensorImpl_cuda_>(x.getImpl()) + ->getCudnnTensorDesc(x); + REQUIRE_NOTHROW(cudnnGetTensorNdDescriptor(desc, + shapeB.size(), + ¤tDataType, + ¤tNbDims, + currentDimB.data(), + currentStrideB.data())); + + REQUIRE(std::equal(currentDimB.begin(), + currentDimB.end(), + shapeB.begin(), + [](int a, std::size_t b) { + return static_cast<std::size_t>(a) == b; + })); }