diff --git a/include/aidge/learning/optimizer/Adam.hpp b/include/aidge/learning/optimizer/Adam.hpp
index 8c89e53f94dade675d3e7da139554561929f5e3d..da2146932d3a09b9d38ea744b6e685ac0af46aec 100644
--- a/include/aidge/learning/optimizer/Adam.hpp
+++ b/include/aidge/learning/optimizer/Adam.hpp
@@ -58,23 +58,43 @@ public:
     }
 
     void update() override final {
+
+        auto backend = mParameters[0]->backend();
+        auto device = mParameters[0]->device();
+        auto dataType = mParameters[0]->dataType();
+
         float mBeta1Power = std::pow(this->getAttr<AdamAttr::Beta1>(), static_cast<float>(mLRScheduler.step() + 1));
         float mBeta2Power = std::pow(this->getAttr<AdamAttr::Beta2>(), static_cast<float>(mLRScheduler.step() + 1));
+
         float mReversedBeta1Power = 1.0f - mBeta1Power;
         float mSqrtReversedBeta2Power = std::sqrt(1.0f - mBeta2Power);
 
         Tensor alpha = Tensor(learningRate() * mSqrtReversedBeta2Power / mReversedBeta1Power);
-        alpha.setBackend(mParameters[0]->getImpl()->backend());
-        alpha.setDataType(mParameters[0]->dataType());
+        alpha.setBackend(backend, device);
+        alpha.setDataType(dataType);
 
         Tensor epsilon_hat = Tensor(this->getAttr<AdamAttr::Epsilon>() * mSqrtReversedBeta2Power);
-        epsilon_hat.setBackend(mParameters[0]->getImpl()->backend());
-        epsilon_hat.setDataType(mParameters[0]->dataType());
+        epsilon_hat.setBackend(backend, device);
+        epsilon_hat.setDataType(dataType);
+
+        mBeta1.setBackend(backend, device);
+        mBeta1.setDataType(dataType);
+        mReversedBeta1.setBackend(backend, device);
+        mReversedBeta1.setDataType(dataType);
+
+        mBeta2.setBackend(backend, device);
+        mBeta2.setDataType(dataType);
+        mReversedBeta2.setBackend(backend, device);
+        mReversedBeta2.setDataType(dataType);
 
         if (mLRScheduler.step() == 0) {
             for (std::size_t i = 0; i < mParameters.size(); ++i) {
                 mMomentum1[i].zeros();
-                mMomentum2[i].zeros();
+                mMomentum1[i].setBackend(backend, device);
+                mMomentum1[i].setDataType(dataType);
+                mMomentum2[i].zeros();
+                mMomentum2[i].setBackend(backend, device);
+                mMomentum2[i].setDataType(dataType);
             }
         }
 
@@ -88,25 +108,33 @@ public:
     }
 
     void setParameters(const std::vector<std::shared_ptr<Tensor>>& parameters) override final {
+
         Optimizer::setParameters(parameters);
         mMomentum1 = std::vector<Tensor>(parameters.size());
         mMomentum2 = std::vector<Tensor>(parameters.size());
+
         for (std::size_t i = 0; i < parameters.size(); ++i) {
+
             mMomentum1[i] = Tensor(parameters[i]->dims());
-            mMomentum1[i].setBackend(parameters[i]->getImpl()->backend());
+            mMomentum1[i].setBackend(parameters[i]->backend(), parameters[i]->device());
             mMomentum1[i].setDataType(parameters[i]->dataType());
+
             mMomentum2[i] = Tensor(parameters[i]->dims());
-            mMomentum2[i].setBackend(parameters[i]->getImpl()->backend());
+            mMomentum2[i].setBackend(parameters[i]->backend(), parameters[i]->device());
             mMomentum2[i].setDataType(parameters[i]->dataType());
         }
         if (parameters.size() > 0) {
-            mBeta1.setBackend(mParameters[0]->getImpl()->backend());
+
+            mBeta1.setBackend(mParameters[0]->backend(), mParameters[0]->device());
             mBeta1.setDataType(parameters[0]->dataType());
-            mReversedBeta1.setBackend(mParameters[0]->getImpl()->backend());
+
+            mReversedBeta1.setBackend(mParameters[0]->backend(), mParameters[0]->device());
             mReversedBeta1.setDataType(parameters[0]->dataType());
-            mBeta2.setBackend(mParameters[0]->getImpl()->backend());
+
+            mBeta2.setBackend(mParameters[0]->backend(), mParameters[0]->device());
             mBeta2.setDataType(parameters[0]->dataType());
-            mReversedBeta2.setBackend(mParameters[0]->getImpl()->backend());
+
+            mReversedBeta2.setBackend(mParameters[0]->backend(), mParameters[0]->device());
             mReversedBeta2.setDataType(parameters[0]->dataType());
         }
     }
diff --git a/include/aidge/learning/optimizer/SGD.hpp b/include/aidge/learning/optimizer/SGD.hpp
index da029b36fae81af32aad79e668fab1e98e1a0076..1c5f1a625dc777e44b367fe8b3851a4f0f75c483 100644
--- a/include/aidge/learning/optimizer/SGD.hpp
+++ b/include/aidge/learning/optimizer/SGD.hpp
@@ -56,16 +56,33 @@ public:
     }
 
     void update() override final {
+
+        auto backend  = mParameters[0]->backend();
+        auto device   = mParameters[0]->device();
+        auto dataType = mParameters[0]->dataType();
+
         mLR = Tensor(learningRate());
-        mLR.setBackend(mParameters[0]->getImpl()->backend());
-        mLR.setDataType(mParameters[0]->dataType());
-        mWeightDecay.setBackend(mParameters[0]->getImpl()->backend());
-        mWeightDecay.setDataType(mParameters[0]->dataType());
+
+        // Set backends / devices
+
+        mLR.setDataType(dataType);
+        mLR.setBackend(backend, device);
+
+        mWeightDecay.setDataType(dataType);
+        mWeightDecay.setBackend(backend, device);
+
+        mReversedDampening.setDataType(dataType);
+        mReversedDampening.setBackend(backend, device);
+
+        mMomentum.setDataType(dataType);
+        mMomentum.setBackend(backend, device);
+
+        // update loop
 
         if (mLRScheduler.step() == 0) {
             for (std::size_t i = 0; i < mParameters.size(); ++i) {
                 mGradientInertia[i] = mParameters[i]->grad()->clone();
-                *mParameters[i] -= mLR*mGradientInertia[i];
+                *mParameters[i] -= mLR * mGradientInertia[i];
             }
         } else {
             for (std::size_t i = 0; i < mParameters.size(); ++i) {
@@ -82,13 +99,13 @@ public:
         mGradientInertia = std::vector<Tensor>(parameters.size());
         for (std::size_t i = 0; i < parameters.size(); ++i) {
             mGradientInertia[i] = Tensor(parameters[i]->dims());
-            mGradientInertia[i].setBackend(parameters[i]->backend());
+            mGradientInertia[i].setBackend(parameters[i]->backend(), parameters[i]->device());
             mGradientInertia[i].setDataType(parameters[i]->dataType());
         }
         if (parameters.size() > 0) {
-            mReversedDampening.setBackend(mParameters[0]->getImpl()->backend());
+            mReversedDampening.setBackend(mParameters[0]->backend(), mParameters[0]->device());
             mReversedDampening.setDataType(parameters[0]->dataType());
-            mMomentum.setBackend(mParameters[0]->getImpl()->backend());
+            mMomentum.setBackend(mParameters[0]->backend(), mParameters[0]->device());
             mMomentum.setDataType(parameters[0]->dataType());
         }
     }
diff --git a/src/loss/classification/BCE.cpp b/src/loss/classification/BCE.cpp
index 4d5ce38bec984b9cf4faf4d80a494f16a48d72b1..c277658a44e11b7e80a1d6fd7836b3ce1a335d7d 100644
--- a/src/loss/classification/BCE.cpp
+++ b/src/loss/classification/BCE.cpp
@@ -43,14 +43,22 @@ Aidge::Tensor Aidge::loss::BCE(std::shared_ptr<Tensor>& prediction,
 
     AIDGE_ASSERT(target->dims().size() == 2,
                  "Label must have two dims: [BatchSize, NbChannel]");
+
     AIDGE_ASSERT(prediction->backend() == target->backend(),
                  "'prediction' and 'target' Tensors must be on the "
                  "same backend. Found {} and {}.\n",
                  prediction->backend(), target->backend());
+
+    AIDGE_ASSERT(prediction->device() == target->device(),
+                 "'prediction' and 'target' Tensors must be on the "
+                 "same device. Found {} and {}.\n",
+                 prediction->device(), target->device());
+
     AIDGE_ASSERT(prediction->dims() == target->dims(),
                  "'prediction' (shape {}) and 'target' (shape {}) Tensors must "
                  "have the same dimensions.\n",
                  prediction->dims(), target->dims());
+
     AIDGE_ASSERT(prediction->dataType() == target->dataType(),
                  "'prediction' (data type {}) and 'target' (data type {}) "
                  "Tensors must have the same data type.\n",
@@ -135,7 +143,7 @@ Aidge::Tensor Aidge::loss::BCE(std::shared_ptr<Tensor>& prediction,
                   ln1_node, ln2_node,
                   sub2_node, mul1_node, mul2_node, sub3_node, loss_node,
                   sub4_node, mul3_node, div1_node, gradient_node->getParent(1), gradient_node});
-    gv_loss->compile(prediction->getImpl()->backend(), prediction->dataType());
+    gv_loss->compile(prediction->backend(), prediction->dataType(), prediction->device());
 
     // Compute loss and gradient
     SequentialScheduler ss_loss{gv_loss};
diff --git a/src/loss/classification/CELoss.cpp b/src/loss/classification/CELoss.cpp
index 2d4b7f0a818abc524c8196dfa14810da7f19b523..2f2000585790a4781f91aa02236d9e3609f73680 100644
--- a/src/loss/classification/CELoss.cpp
+++ b/src/loss/classification/CELoss.cpp
@@ -37,14 +37,22 @@ Aidge::Tensor Aidge::loss::CELoss(std::shared_ptr<Tensor>& prediction,
 {
     AIDGE_ASSERT(prediction->nbDims() == 2,
                  "Label must have two dims: [BatchSize, NbChannel]");
+
     AIDGE_ASSERT(prediction->backend() == target->backend(),
                  "'prediction' and 'target' Tensors must be on the "
                  "same backend. Found {} and {}.\n",
                  prediction->backend(), target->backend());
+
+    AIDGE_ASSERT(prediction->device() == target->device(),
+                 "'prediction' and 'target' Tensors must be on the "
+                 "same device. Found {} and {}.\n",
+                 prediction->device(), target->device());
+
     AIDGE_ASSERT(prediction->dims() == target->dims(),
                  "'prediction' (shape {}) and 'target' (shape {}) Tensors must "
                  "have the same dimensions.\n",
                  prediction->dims(), target->dims());
+
     AIDGE_ASSERT(prediction->dataType() == target->dataType(),
                  "'prediction' (data type {}) and 'target' (data type {}) "
                  "Tensors must have the same data type.\n",
@@ -52,12 +60,13 @@ Aidge::Tensor Aidge::loss::CELoss(std::shared_ptr<Tensor>& prediction,
 
     auto backend  = prediction->backend();
     auto dataType = prediction->dataType();
+    auto device   = prediction->device();
 
     // Compute the predicition SoftMax
 
     auto softmaxOp = Softmax_Op(1);
     softmaxOp.setDataType(dataType);
-    softmaxOp.setBackend(backend);
+    softmaxOp.setBackend(backend, device);
 
     softmaxOp.associateInput(0, prediction);
     softmaxOp.forward();
@@ -80,7 +89,7 @@ Aidge::Tensor Aidge::loss::CELoss(std::shared_ptr<Tensor>& prediction,
 
     std::shared_ptr<GraphView> lossGraphView = std::make_shared<GraphView>("CELoss");
     lossGraphView->add({targetNode, softmaxNode, logNode, mulNode, sumNode, meanNode});
-    lossGraphView->compile(backend, dataType);
+    lossGraphView->compile(backend, dataType, device);
 
     SequentialScheduler scheduler(lossGraphView);
     scheduler.forward(true);
@@ -89,7 +98,7 @@ Aidge::Tensor Aidge::loss::CELoss(std::shared_ptr<Tensor>& prediction,
     auto lossTensor = meanOp->getOutput(0);
 
     auto scalar = Tensor(-1.0f);
-    scalar.setBackend(backend);
+    scalar.setBackend(backend, device);
     scalar.setDataType(dataType);
 
     (*lossTensor) = (*lossTensor) * scalar;
@@ -100,7 +109,7 @@ Aidge::Tensor Aidge::loss::CELoss(std::shared_ptr<Tensor>& prediction,
 
     auto subOp = Sub_Op();
     subOp.setDataType(dataType);
-    subOp.setBackend(backend);
+    subOp.setBackend(backend, device);
 
     subOp.associateInput(0, softmax);
     subOp.associateInput(1, target);
@@ -110,13 +119,17 @@ Aidge::Tensor Aidge::loss::CELoss(std::shared_ptr<Tensor>& prediction,
 
     const float batchSize = static_cast<float>((target->dims())[0]);
 
+    // Compute the rescaled error
+
     scalar = Tensor(1.0f / batchSize);
-    scalar.setBackend(backend);
+    scalar.setBackend(backend, device);
     scalar.setDataType(dataType);
 
 
     (*err) = (*err) * scalar;
 
+    // Set the error signal
+
     prediction->setGrad(err);
 
     // Return the loss value
diff --git a/src/loss/distillation/KD.cpp b/src/loss/distillation/KD.cpp
index 4283a28fa05c91cdb58f3a6677210b414e18f44a..411be45803069b32b94a18a46b196f82ac91ce69 100644
--- a/src/loss/distillation/KD.cpp
+++ b/src/loss/distillation/KD.cpp
@@ -59,10 +59,17 @@ Aidge::Tensor Aidge::loss::KD(std::shared_ptr<Tensor>& student_prediction,
                  "'prediction' and 'target' Tensors must be on the "
                  "same backend. Found {} and {}.\n",
                  student_prediction->backend(), teacher_prediction->backend());
+
+    AIDGE_ASSERT(student_prediction->device() == teacher_prediction->device(),
+                 "'prediction' and 'target' Tensors must be on the "
+                 "same device. Found {} and {}.\n",
+                 student_prediction->device(), teacher_prediction->device());
+
     AIDGE_ASSERT(student_prediction->dims() == teacher_prediction->dims(),
                  "'prediction' (shape {}) and 'target' (shape {}) Tensors must "
                  "have the same dimensions.\n",
                  student_prediction->dims(), teacher_prediction->dims());
+
     AIDGE_ASSERT(student_prediction->dataType() == teacher_prediction->dataType(),
                  "'prediction' (data type {}) and 'target' (data type {}) "
                  "Tensors must have the same data type.\n",
@@ -134,7 +141,7 @@ Aidge::Tensor Aidge::loss::KD(std::shared_ptr<Tensor>& student_prediction,
                   soft_teacher_node, mul_node, 
                   mul2_node->getParent(1), mul2_node, 
                   rm_node, sub_node});
-    gv_loss->compile(student_prediction->getImpl()->backend(), student_prediction->dataType());
+    gv_loss->compile(student_prediction->backend(), student_prediction->dataType(), student_prediction->device());
 
     SequentialScheduler ss_loss{gv_loss};
     ss_loss.forward(false);
diff --git a/src/loss/regression/MSE.cpp b/src/loss/regression/MSE.cpp
index 8234aa9875163f9a6223327fcaa0780b74bd630c..67d37d3d11551a72fcc4089f7daeaee5510fb017 100644
--- a/src/loss/regression/MSE.cpp
+++ b/src/loss/regression/MSE.cpp
@@ -56,10 +56,17 @@ Aidge::Tensor Aidge::loss::MSE(std::shared_ptr<Tensor>& prediction,
                  "'prediction' and 'target' Tensors must be on the "
                  "same backend. Found {} and {}.\n",
                  prediction->backend(), target->backend());
+
+    AIDGE_ASSERT(prediction->device() == target->device(),
+                 "'prediction' and 'target' Tensors must be on the "
+                 "same device. Found {} and {}.\n",
+                 prediction->device(), target->device());
+
     AIDGE_ASSERT(prediction->dims() == target->dims(),
                  "'prediction' (shape {}) and 'target' (shape {}) Tensors must "
                  "have the same dimensions.\n",
                  prediction->dims(), target->dims());
+
     AIDGE_ASSERT(prediction->dataType() == target->dataType(),
                  "'prediction' (data type {}) and 'target' (data type {}) "
                  "Tensors must have the same data type.\n",
@@ -93,7 +100,7 @@ Aidge::Tensor Aidge::loss::MSE(std::shared_ptr<Tensor>& prediction,
         Sequential({sub_node, pow_node, rm_node});
     gv_local->add({sub_node->getParent(0), sub_node->getParent(1), pow_exp_node,
                    mul_node->getParent(1), mul_node});
-    gv_local->compile(prediction->getImpl()->backend(), prediction->dataType());
+    gv_local->compile(prediction->backend(), prediction->dataType(), prediction->device());
 
     SequentialScheduler ss_local{gv_local};
     ss_local.forward(false);
diff --git a/src/metrics/Accuracy.cpp b/src/metrics/Accuracy.cpp
index 5462505fd67786598f6ad09793ddcf66c4f15c69..7181febb65e948436746549bff38086b1b2196c5 100644
--- a/src/metrics/Accuracy.cpp
+++ b/src/metrics/Accuracy.cpp
@@ -29,8 +29,8 @@ Aidge::Tensor Aidge::metrics::Accuracy(std::shared_ptr<Tensor>& prediction,
                                        const std::shared_ptr<Tensor>& target,
                                        std::int32_t axis) {
     /*
-    Implementation note:
-    Accuracy is computed using a graph in order to not be backend dependant.
+        Implementation note:
+        Accuracy is computed using a graph in order to not be backend dependent.
+        The graph used is the following:
 
-    The graph used is the following:
 
@@ -42,23 +42,29 @@ Aidge::Tensor Aidge::metrics::Accuracy(std::shared_ptr<Tensor>& prediction,
     AIDGE_ASSERT(target->dims().size() == 2,
                  "Label must have two dims: [BatchSize, NbChannel]");
 
-    std::shared_ptr<Tensor> outputGrad = prediction->grad();
-
     AIDGE_ASSERT(prediction->backend() == target->backend(),
                  "'prediction' and 'target' Tensors must be on the "
                  "same backend. Found {} and {}.\n",
                  prediction->backend(), target->backend());
+
+    AIDGE_ASSERT(prediction->device() == target->device(),
+                 "'prediction' and 'target' Tensors must be on the "
+                 "same device. Found {} and {}.\n",
+                 prediction->device(), target->device());
+
     AIDGE_ASSERT(prediction->dims() == target->dims(),
                  "'prediction' (shape {}) and 'target' (shape {}) Tensors must "
                  "have the same dimensions.\n",
                  prediction->dims(), target->dims());
+
     AIDGE_ASSERT(prediction->dataType() == target->dataType(),
                  "'prediction' (data type {}) and 'target' (data type {}) "
                  "Tensors must have the same data type.\n",
                  prediction->dataType(), target->dataType());
 
     // Create graph nodes and connections
-    const std::shared_ptr<Node> argmax_perd_node = ArgMax(axis);
+
+    const std::shared_ptr<Node> argmax_pred_node = ArgMax(axis);
     const std::shared_ptr<Node> argmax_target_node = ArgMax(axis);
 
     const std::shared_ptr<Node> equal_node = Equal();
@@ -66,29 +73,30 @@ Aidge::Tensor Aidge::metrics::Accuracy(std::shared_ptr<Tensor>& prediction,
     const std::shared_ptr<Node> rs_node = ReduceSum();
 
     const std::shared_ptr<Node> pred_node = Producer(prediction, "pred");
-    pred_node->addChild(argmax_perd_node);
+    pred_node->addChild(argmax_pred_node);
+
     const std::shared_ptr<Node> label_node = Producer(target, "label");
     label_node->addChild(argmax_target_node);
 
-    argmax_perd_node->addChild(equal_node,0,0);
-    argmax_target_node->addChild(equal_node,0,1);
-
-    // equal_node->addChild(rs_node,0,0);
+    argmax_pred_node->addChild(equal_node, 0, 0);
+    argmax_target_node->addChild(equal_node, 0, 1);
 
     // Create the graph
-    std::shared_ptr<GraphView> gv_local =
-        Sequential({equal_node, cast_node, rs_node});
 
-    gv_local->add({pred_node,argmax_perd_node, label_node,argmax_target_node});
-    gv_local->compile(prediction->getImpl()->backend(), prediction->dataType());
+    std::shared_ptr<GraphView> gv_local = Sequential({equal_node, cast_node, rs_node});
+
+    gv_local->add({pred_node, argmax_pred_node, label_node, argmax_target_node});
+    gv_local->compile(prediction->backend(), prediction->dataType(), prediction->device());
+
+    // Execute the graph and retrieve the result
 
     SequentialScheduler ss_local{gv_local};
+
     ss_local.forward(false);
 
-    // TODO: way too complicated to access
-    const std::shared_ptr<OperatorTensor> res =
-        std::dynamic_pointer_cast<OperatorTensor>(rs_node->getOperator());
+    const std::shared_ptr<OperatorTensor> op = std::dynamic_pointer_cast<OperatorTensor>(rs_node->getOperator());
+
     std::shared_ptr<Tensor> fallback;
 
-    return res->getOutput(0)->refFrom(fallback, "cpu");
+    return op->getOutput(0)->refFrom(fallback, "cpu");
 }