diff --git a/src/loss/classification/BCE.cpp b/src/loss/classification/BCE.cpp
index d5156072e9aeff84470fc60a4efb7571de81483b..296d466cd3583600a3e02d13b7b5a19f5ad5ac76 100644
--- a/src/loss/classification/BCE.cpp
+++ b/src/loss/classification/BCE.cpp
@@ -122,7 +122,7 @@ Aidge::Tensor Aidge::loss::BCE(std::shared_ptr<Tensor>& prediction,
     // Define node: loss
     std::vector<int> axes_dims(prediction->nbDims());
     std::iota(std::begin(axes_dims), std::end(axes_dims), 0);
-    auto loss_node = ReduceMean(axes_dims, 1, "loss");
+    auto loss_node = ReduceMean(axes_dims, true, false, "loss");
     sub3_node->addChild(loss_node, 0, 0);
 
     // Define node: gradient
@@ -153,5 +153,8 @@ Aidge::Tensor Aidge::loss::BCE(std::shared_ptr<Tensor>& prediction,
     outputGrad->copyFrom(gradient_op->getOutput(0)->clone()); // Update gradient
 
     const std::shared_ptr<OperatorTensor> loss_op = std::dynamic_pointer_cast<OperatorTensor>(loss_node->getOperator());
-    return loss_op->getOutput(0)->clone(); // Return loss
+    // return loss_op->getOutput(0)->clone(); // Return loss
+    std::shared_ptr<Tensor> fallback;
+
+    return loss_op->getOutput(0)->refFrom(fallback, "cpu");
 }