diff --git a/src/loss/classification/BCE.cpp b/src/loss/classification/BCE.cpp
index 857bfe7161928e59ad90c17791b0d691ffe5365a..d80dd2749ae8e4128e9fc5b07740647854d7f5bb 100644
--- a/src/loss/classification/BCE.cpp
+++ b/src/loss/classification/BCE.cpp
@@ -31,10 +31,14 @@
 Aidge::Tensor Aidge::loss::BCE(std::shared_ptr<Tensor>& prediction,
                                const std::shared_ptr<Tensor>& target) {
     /*
-	Binay Cross Entropy (BCE) loss function
+	Binary Cross Entropy (BCE) loss function:
+
+	   loss =  - 1/N * SUM_i[ t_i * ln(p_i + eps) + (1 - t_i) * ln(1 - p_i + eps) ]
+
+	   d(loss) / d(p_i) = - 1 / N * (t_i - p_i) / [ (p_i + eps) * (1 - p_i + eps) ]
 
     Implementation note:
-    loss function is computed using a graph in order to not be backend dependant.
+        loss function is computed using a graph in order not to be backend dependent.
     */
 
     AIDGE_ASSERT(target->dims().size() == 2,
@@ -52,100 +56,95 @@ Aidge::Tensor Aidge::loss::BCE(std::shared_ptr<Tensor>& prediction,
                  "Tensors must have the same data type.\n",
                  prediction->dataType(), target->dataType());
 
-    const float eps1 = 1.e-10f;
-    const float eps2 = 1.e-10f;
-
     // Define nodes: inputs
     const std::shared_ptr<Node> prediction_node = Producer(prediction, "pred");
     const std::shared_ptr<Node> target_node = Producer(target, "label");
+    const std::shared_ptr<Node> one_node =  Producer(std::make_shared<Tensor>(Array1D<float, 1>{{1.0f}}), "one");
+    const std::shared_ptr<Node> eps_node =  Producer(std::make_shared<Tensor>(Array1D<float, 1>{{1.e-12f}}), "eps");
 
-    // Define nodes: add1 = prediction + eps1, add2 = target + eps1
+    // Define nodes: add1 = p + eps
     const std::shared_ptr<Node> add1_node = Add("add1");
-    const std::shared_ptr<Node> add2_node = Add("add2");
     prediction_node->addChild(add1_node, 0, 0);
-    Producer(std::make_shared<Tensor>(Array1D<float, 1>{{eps1}}))
-        ->addChild(add1_node, 0, 1);
-    target_node->addChild(add2_node, 0, 0);
-    Producer(std::make_shared<Tensor>(Array1D<float, 1>{{eps1}}))
-        ->addChild(add2_node, 0, 1);
+    eps_node->addChild(add1_node, 0, 1);
 
-    // Define nodes: sub1 = 1 - prediction + eps2 and sub2 = - (1 - target + eps2)
+    // Define nodes: sub1 = 1 - p and add2 = 1 - p + eps
     const std::shared_ptr<Node> sub1_node = Sub("sub1");
-    const std::shared_ptr<Node> sub2_node = Sub("sub2");
-    Producer(std::make_shared<Tensor>(Array1D<float, 1>{{1.0f + eps2}}))
-        ->addChild(sub1_node, 0, 0);
+    const std::shared_ptr<Node> add2_node = Add("add2");
+    one_node->addChild(sub1_node, 0, 0);
     prediction_node->addChild(sub1_node, 0, 1);
-    target_node->addChild(sub2_node, 0, 0);
-    Producer(std::make_shared<Tensor>(Array1D<float, 1>{{1.0f + eps2}}))
-        ->addChild(sub2_node, 0, 1);
+    sub1_node->addChild(add2_node, 0, 0);
+    eps_node->addChild(add2_node, 0, 1);
 
-    // Define nodes: ln1 = ln(prediction + eps1) and ln2 = ln(1 - prediction + eps2)
+    // Define nodes: ln1 = ln(p + eps) and ln2 = ln(1 - p + eps)
     const std::shared_ptr<Node> ln1_node = Ln("ln1");
     const std::shared_ptr<Node> ln2_node = Ln("ln2");
     add1_node-> addChild(ln1_node, 0, 0);
-    sub1_node-> addChild(ln2_node, 0, 0);
+    add2_node-> addChild(ln2_node, 0, 0);
 
-    // Define nodes: mul1 = (target + eps1) * ln(prediction + eps1) and mul2 = - (1 - target + eps2) * ln(1 - prediction + eps2)
+    // Define nodes: sub2 = - (1 - t)
+    const std::shared_ptr<Node> sub2_node = Sub("sub2");
+    target_node->addChild(sub2_node, 0, 0);
+    one_node->addChild(sub2_node, 0, 1);
+
+    // Define nodes: mul1 = t * ln(p + eps) and mul2 = - (1 - t) * ln(1 - p + eps)
     const std::shared_ptr<Node> mul1_node = Mul("mul1");
     const std::shared_ptr<Node> mul2_node = Mul("mul2");
-    add2_node->addChild(mul1_node, 0, 0);
+    target_node->addChild(mul1_node, 0, 0);
     ln1_node->addChild(mul1_node, 0, 1);
     sub2_node->addChild(mul2_node, 0, 0);
     ln2_node->addChild(mul2_node, 0, 1);
 
-    // Define node: sub3 = - [(target + eps1) * ln(prediction + eps1) + (1 - target + eps2) * ln(1 - prediction + eps2)]
+    // Define node: sub3 = - [t * ln(p + eps) + (1 - t) * ln(1 - p + eps)]
     const std::shared_ptr<Node> sub3_node = Sub("sub3");
     mul2_node->addChild(sub3_node, 0, 0);
     mul1_node->addChild(sub3_node, 0, 1);
 
-    // Define nodes: div1 = (target + eps1) / (prediction + eps1) and div2 = - (1 - target + eps2)/(1 - prediction + eps2)
-    const std::shared_ptr<Node> div1_node = Div("div1");
-    const std::shared_ptr<Node> div2_node = Div("div2");
-    add2_node->addChild(div1_node, 0, 0);
-    add1_node->addChild(div1_node, 0, 1);
-    sub2_node->addChild(div2_node, 0, 0);
-    sub1_node->addChild(div2_node, 0, 1);
+    // Define node: sub4 = t - p
+    const std::shared_ptr<Node> sub4_node = Sub("sub4");
+    target_node->addChild(sub4_node, 0, 0);
+    prediction_node->addChild(sub4_node, 0, 1);
+
+    // Define nodes: mul3 = (p + eps) * (1 - p + eps)
+    const std::shared_ptr<Node> mul3_node = Mul("mul3");
+    add1_node->addChild(mul3_node, 0, 0);
+    add2_node->addChild(mul3_node, 0, 1);
 
-    // Define node: add3 = (target + eps1) / (prediction + eps1) - (1 - target + eps2)/(1 - prediction + eps2)
-    const std::shared_ptr<Node> add3_node = Add("add3");
-    div1_node->addChild(add3_node, 0, 0);
-    div2_node->addChild(add3_node, 0, 1);
+    // Define nodes: div1 = (t - p) / [ (p + eps) * (1 - p + eps) ]
+    const std::shared_ptr<Node> div1_node = Div("div1");
+    sub4_node->addChild(div1_node, 0, 0);
+    mul3_node->addChild(div1_node, 0, 1);
 
     // Define node: loss
     std::vector<int> axes_dims(prediction->nbDims());
     std::iota(std::begin(axes_dims), std::end(axes_dims), 0);
-    auto loss_node = ReduceMean(axes_dims, true, false, "loss");
+    auto loss_node = ReduceMean(axes_dims, "loss");
     sub3_node->addChild(loss_node, 0, 0);
 
     // Define node: gradient
     const std::shared_ptr<Node> gradient_node = Mul("gradient");
-    add3_node->addChild(gradient_node, 0, 0);
+    div1_node->addChild(gradient_node, 0, 0);
     Producer(std::make_shared<Tensor>(Array1D<float, 1>{{-1.0f/float(target->dims()[0])}}))
         ->addChild(gradient_node, 0, 1);
 
     // Create GraphView
     std::shared_ptr<GraphView> gv_loss = std::make_shared<GraphView>("BCE");
     gv_loss->add({prediction_node, target_node,
-                  add1_node->getParent(1), add1_node,
-                  add2_node->getParent(1), add2_node,
-                  sub1_node->getParent(0), sub1_node,
-                  sub2_node->getParent(1), sub2_node,
-                  ln1_node, ln2_node, mul1_node, mul2_node, div1_node, div2_node,
-                  sub3_node, loss_node,
-                  add3_node, gradient_node->getParent(1), gradient_node});
+                  one_node, eps_node,
+                  add1_node, sub1_node, add2_node,
+                  ln1_node, ln2_node,
+                  sub2_node, mul1_node, mul2_node, sub3_node, loss_node,
+                  sub4_node, mul3_node, div1_node, gradient_node->getParent(1), gradient_node});
     gv_loss->compile(prediction->getImpl()->backend(), prediction->dataType());
 
     // Compute loss and gradient
     SequentialScheduler ss_loss{gv_loss};
     ss_loss.forward(false);
 
-    // prediction->initGrad(); // Enable gradient for output
     std::shared_ptr<Tensor> outputGrad = prediction->grad();
     const std::shared_ptr<OperatorTensor> gradient_op = std::dynamic_pointer_cast<OperatorTensor>(gradient_node->getOperator());
     outputGrad->copyFrom(gradient_op->getOutput(0)->clone()); // Update gradient
 
     const std::shared_ptr<OperatorTensor> loss_op = std::dynamic_pointer_cast<OperatorTensor>(loss_node->getOperator());
-    // return loss_op->getOutput(0)->clone(); // Return loss
     std::shared_ptr<Tensor> fallback;
 
     return loss_op->getOutput(0)->refFrom(fallback, "cpu");
diff --git a/unit_tests/loss/classification/Test_BCE.cpp b/unit_tests/loss/classification/Test_BCE.cpp
index 9190e81012d0a530e677822a973847a0bbeff72e..5a7fafd6816093134d384747d1ac315156797490 100644
--- a/unit_tests/loss/classification/Test_BCE.cpp
+++ b/unit_tests/loss/classification/Test_BCE.cpp
@@ -63,11 +63,10 @@ TEST_CASE("[loss/classification] BCE", "[loss][classification][BCE]") {
             }
 
             // compute the BCE manually
-            const float eps1 = 1.0e-10f;
-            const float eps2 = 1.0e-10f;
+            const float eps = 1.0e-12f;
             std::unique_ptr<float[]> tmp_res_manual = std::make_unique<float[]>(nb_elements);
             for (std::size_t i = 0; i < nb_elements; ++i) {
-                tmp_res_manual[i] = - ((targ[i] + eps1) * std::log(pred[i] + eps1) + (1.0f - targ[i] + eps2) * std::log(1.0f - pred[i] + eps2));
+                tmp_res_manual[i] = - (targ[i] * std::log(pred[i] + eps) + (1.0f - targ[i]) * std::log((1.0f - pred[i]) + eps));
             }
             fmt::println("Output manual:");
             std::shared_ptr<Tensor> tmp_tensor = std::make_shared<Tensor>(dims);
@@ -126,11 +125,10 @@ TEST_CASE("[loss/classification] BCE", "[loss][classification][BCE]") {
             cudaMemcpy(d_targ, targ.get(), nb_elements * sizeof(float), cudaMemcpyHostToDevice);
 
             // compute the BCE manually
-            const float eps1 = 1.0e-10f;
-            const float eps2 = 1.0e-10f;
+            const float eps = 1.0e-12f;
             std::unique_ptr<float[]> tmp_res_manual = std::make_unique<float[]>(nb_elements);
             for (std::size_t i = 0; i < nb_elements; ++i) {
-                tmp_res_manual[i] = - ((targ[i] + eps1) * std::log(pred[i] + eps1) + (1.0f - targ[i] + eps2) * std::log(1.0f - pred[i] + eps2));
+                tmp_res_manual[i] = - (targ[i] * std::log(pred[i] + eps) + (1.0f - targ[i]) * std::log((1.0f - pred[i]) + eps));
             }
             float * d_tmp_res_manual;
             cudaMalloc(reinterpret_cast<void **>(&d_tmp_res_manual), nb_elements * sizeof(float));