BCE loss: fix numerical issues

Merged Olivier Antoni requested to merge oantoni/aidge_learning:BCE_loss into dev
2 files changed: +49 -52
@@ -31,10 +31,14 @@
 Aidge::Tensor Aidge::loss::BCE(std::shared_ptr<Tensor>& prediction,
                                const std::shared_ptr<Tensor>& target) {
     /*
-    Binary Cross Entropy (BCE) loss function
+    Binary Cross Entropy (BCE) loss function:
+
+    loss = - 1/N * SUM_i[ t_i * ln(p_i + eps) + (1 - t_i) * ln(1 - p_i + eps) ]
+
+    d(loss) / d(p_i) = - 1 / N * (t_i - p_i) / [ (p_i + eps) * (1 - p_i + eps) ]
 
     Implementation note:
     the loss function is computed using a graph in order not to be backend dependent.
     */
     AIDGE_ASSERT(target->dims().size() == 2,
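For reference, the two formulas added to the docstring are consistent with each other: differentiating the stabilized loss term by term gives

\[
\frac{\partial\,\mathrm{loss}}{\partial p_i}
  = -\frac{1}{N}\left[\frac{t_i}{p_i+\varepsilon} - \frac{1-t_i}{1-p_i+\varepsilon}\right]
  = -\frac{1}{N}\,\frac{(t_i-p_i) + \varepsilon\,(2t_i-1)}{(p_i+\varepsilon)(1-p_i+\varepsilon)}
  \approx -\frac{1}{N}\,\frac{t_i-p_i}{(p_i+\varepsilon)(1-p_i+\varepsilon)},
\]

where the \varepsilon(2t_i - 1) term in the numerator is dropped; with eps = 1.e-12 (the value used in the hunk below) the difference is negligible. The shared eps also keeps both expressions finite for saturated predictions: at p_i = 0 with t_i = 1 the loss term is ln(1e-12) ≈ -27.6 rather than ln(0) = -inf, and the gradient denominator (p_i + eps)(1 - p_i + eps) never reaches zero.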
@@ -52,100 +56,95 @@ Aidge::Tensor Aidge::loss::BCE(std::shared_ptr<Tensor>& prediction,
                  "Tensors must have the same data type.\n",
                  prediction->dataType(), target->dataType());
-    const float eps1 = 1.e-10f;
-    const float eps2 = 1.e-10f;
     // Define nodes: inputs
     const std::shared_ptr<Node> prediction_node = Producer(prediction, "pred");
     const std::shared_ptr<Node> target_node = Producer(target, "label");
+    const std::shared_ptr<Node> one_node = Producer(std::make_shared<Tensor>(Array1D<float, 1>{{1.0f}}), "one");
+    const std::shared_ptr<Node> eps_node = Producer(std::make_shared<Tensor>(Array1D<float, 1>{{1.e-12f}}), "eps");
-    // Define nodes: add1 = prediction + eps1, add2 = target + eps1
+    // Define nodes: add1 = p + eps
     const std::shared_ptr<Node> add1_node = Add("add1");
-    const std::shared_ptr<Node> add2_node = Add("add2");
     prediction_node->addChild(add1_node, 0, 0);
-    Producer(std::make_shared<Tensor>(Array1D<float, 1>{{eps1}}))
-        ->addChild(add1_node, 0, 1);
-    target_node->addChild(add2_node, 0, 0);
-    Producer(std::make_shared<Tensor>(Array1D<float, 1>{{eps1}}))
-        ->addChild(add2_node, 0, 1);
+    eps_node->addChild(add1_node, 0, 1);
-    // Define nodes: sub1 = 1 - prediction + eps2 and sub2 = - (1 - target + eps2)
+    // Define nodes: sub1 = 1 - p and add2 = 1 - p + eps
     const std::shared_ptr<Node> sub1_node = Sub("sub1");
-    const std::shared_ptr<Node> sub2_node = Sub("sub2");
-    Producer(std::make_shared<Tensor>(Array1D<float, 1>{{1.0f + eps2}}))
-        ->addChild(sub1_node, 0, 0);
+    const std::shared_ptr<Node> add2_node = Add("add2");
+    one_node->addChild(sub1_node, 0, 0);
     prediction_node->addChild(sub1_node, 0, 1);
-    target_node->addChild(sub2_node, 0, 0);
-    Producer(std::make_shared<Tensor>(Array1D<float, 1>{{1.0f + eps2}}))
-        ->addChild(sub2_node, 0, 1);
+    sub1_node->addChild(add2_node, 0, 0);
+    eps_node->addChild(add2_node, 0, 1);
-    // Define nodes: ln1 = ln(prediction + eps1) and ln2 = ln(1 - prediction + eps2)
+    // Define nodes: ln1 = ln(p + eps) and ln2 = ln(1 - p + eps)
     const std::shared_ptr<Node> ln1_node = Ln("ln1");
     const std::shared_ptr<Node> ln2_node = Ln("ln2");
     add1_node->addChild(ln1_node, 0, 0);
-    sub1_node->addChild(ln2_node, 0, 0);
+    add2_node->addChild(ln2_node, 0, 0);
-    // Define nodes: mul1 = (target + eps1) * ln(prediction + eps1) and mul2 = - (1 - target + eps2) * ln(1 - prediction + eps2)
+    // Define nodes: sub2 = - (1 - t)
+    const std::shared_ptr<Node> sub2_node = Sub("sub2");
+    target_node->addChild(sub2_node, 0, 0);
+    one_node->addChild(sub2_node, 0, 1);
+    // Define nodes: mul1 = t * ln(p + eps) and mul2 = - (1 - t) * ln(1 - p + eps)
     const std::shared_ptr<Node> mul1_node = Mul("mul1");
     const std::shared_ptr<Node> mul2_node = Mul("mul2");
-    add2_node->addChild(mul1_node, 0, 0);
+    target_node->addChild(mul1_node, 0, 0);
     ln1_node->addChild(mul1_node, 0, 1);
     sub2_node->addChild(mul2_node, 0, 0);
     ln2_node->addChild(mul2_node, 0, 1);
-    // Define node: sub3 = - [(target + eps1) * ln(prediction + eps1) + (1 - target + eps2) * ln(1 - prediction + eps2)]
+    // Define node: sub3 = - [t * ln(p + eps) + (1 - t) * ln(1 - p + eps)]
     const std::shared_ptr<Node> sub3_node = Sub("sub3");
     mul2_node->addChild(sub3_node, 0, 0);
     mul1_node->addChild(sub3_node, 0, 1);
-    // Define nodes: div1 = (target + eps1) / (prediction + eps1) and div2 = - (1 - target + eps2)/(1 - prediction + eps2)
-    const std::shared_ptr<Node> div1_node = Div("div1");
-    const std::shared_ptr<Node> div2_node = Div("div2");
-    add2_node->addChild(div1_node, 0, 0);
-    add1_node->addChild(div1_node, 0, 1);
-    sub2_node->addChild(div2_node, 0, 0);
-    sub1_node->addChild(div2_node, 0, 1);
-    // Define node: add3 = (target + eps1) / (prediction + eps1) - (1 - target + eps2)/(1 - prediction + eps2)
-    const std::shared_ptr<Node> add3_node = Add("add3");
-    div1_node->addChild(add3_node, 0, 0);
-    div2_node->addChild(add3_node, 0, 1);
+    // Define node: sub4 = t - p
+    const std::shared_ptr<Node> sub4_node = Sub("sub4");
+    target_node->addChild(sub4_node, 0, 0);
+    prediction_node->addChild(sub4_node, 0, 1);
+    // Define node: mul3 = (p + eps) * (1 - p + eps)
+    const std::shared_ptr<Node> mul3_node = Mul("mul3");
+    add1_node->addChild(mul3_node, 0, 0);
+    add2_node->addChild(mul3_node, 0, 1);
+    // Define node: div1 = (t - p) / [ (p + eps) * (1 - p + eps) ]
+    const std::shared_ptr<Node> div1_node = Div("div1");
+    sub4_node->addChild(div1_node, 0, 0);
+    mul3_node->addChild(div1_node, 0, 1);
     // Define node: loss
     std::vector<int> axes_dims(prediction->nbDims());
     std::iota(std::begin(axes_dims), std::end(axes_dims), 0);
-    auto loss_node = ReduceMean(axes_dims, true, false, "loss");
+    auto loss_node = ReduceMean(axes_dims, "loss");
     sub3_node->addChild(loss_node, 0, 0);
     // Define node: gradient
     const std::shared_ptr<Node> gradient_node = Mul("gradient");
-    add3_node->addChild(gradient_node, 0, 0);
+    div1_node->addChild(gradient_node, 0, 0);
     Producer(std::make_shared<Tensor>(Array1D<float, 1>{{-1.0f/float(target->dims()[0])}}))
         ->addChild(gradient_node, 0, 1);
     // Create GraphView
     std::shared_ptr<GraphView> gv_loss = std::make_shared<GraphView>("BCE");
     gv_loss->add({prediction_node, target_node,
-                  add1_node->getParent(1), add1_node,
-                  add2_node->getParent(1), add2_node,
-                  sub1_node->getParent(0), sub1_node,
-                  sub2_node->getParent(1), sub2_node,
-                  ln1_node, ln2_node, mul1_node, mul2_node, div1_node, div2_node,
-                  sub3_node, loss_node,
-                  add3_node, gradient_node->getParent(1), gradient_node});
+                  one_node, eps_node,
+                  add1_node, sub1_node, add2_node,
+                  ln1_node, ln2_node,
+                  sub2_node, mul1_node, mul2_node, sub3_node, loss_node,
+                  sub4_node, mul3_node, div1_node, gradient_node->getParent(1), gradient_node});
     gv_loss->compile(prediction->getImpl()->backend(), prediction->dataType());
     // Compute loss and gradient
     SequentialScheduler ss_loss{gv_loss};
     ss_loss.forward(false);
-    // prediction->initGrad(); // Enable gradient for output
     std::shared_ptr<Tensor> outputGrad = prediction->grad();
     const std::shared_ptr<OperatorTensor> gradient_op = std::dynamic_pointer_cast<OperatorTensor>(gradient_node->getOperator());
     outputGrad->copyFrom(gradient_op->getOutput(0)->clone()); // Update gradient
     const std::shared_ptr<OperatorTensor> loss_op = std::dynamic_pointer_cast<OperatorTensor>(loss_node->getOperator());
-    // return loss_op->getOutput(0)->clone(); // Return loss
     std::shared_ptr<Tensor> fallback;
     return loss_op->getOutput(0)->refFrom(fallback, "cpu");
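As a sanity check outside the MR, here is a minimal standalone sketch (plain C++, no Aidge types; kEps, bceLoss and bceGrad are hypothetical names used only for illustration) of the arithmetic the graph above encodes: it evaluates the stabilized loss and the docstring's gradient formula on a small batch that includes the saturated predictions p = 0 and p = 1, and compares the analytic gradient against a central finite difference at the interior points.

// Standalone illustration, independent of Aidge: the stabilized BCE loss and
// gradient from the docstring, evaluated on plain float vectors.
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

namespace {

constexpr float kEps = 1.e-12f;  // same epsilon as the "eps" Producer in the graph

// loss = - 1/N * SUM_i[ t_i * ln(p_i + eps) + (1 - t_i) * ln(1 - p_i + eps) ]
float bceLoss(const std::vector<float>& p, const std::vector<float>& t) {
    double acc = 0.0;
    for (std::size_t i = 0; i < p.size(); ++i) {
        acc += t[i] * std::log(p[i] + kEps) + (1.0f - t[i]) * std::log(1.0f - p[i] + kEps);
    }
    return static_cast<float>(-acc / static_cast<double>(p.size()));
}

// d(loss) / d(p_i) = - 1/N * (t_i - p_i) / [ (p_i + eps) * (1 - p_i + eps) ]
float bceGrad(float p, float t, std::size_t n) {
    return -1.0f / static_cast<float>(n) * (t - p) / ((p + kEps) * (1.0f - p + kEps));
}

}  // namespace

int main() {
    // Batch of 4, including the saturated predictions p = 0 and p = 1 that an
    // unprotected ln() would turn into -inf.
    const std::vector<float> p = {0.0f, 1.0f, 0.3f, 0.9f};
    const std::vector<float> t = {0.0f, 1.0f, 1.0f, 0.0f};

    std::printf("loss = %g\n", bceLoss(p, t));  // finite, no NaN/inf

    // Central finite-difference check of the analytic gradient at the two
    // interior points (p +/- h must stay inside (0, 1)).
    const float h = 1.e-3f;
    for (std::size_t i = 2; i < p.size(); ++i) {
        std::vector<float> plus = p, minus = p;
        plus[i] += h;
        minus[i] -= h;
        const float numeric = (bceLoss(plus, t) - bceLoss(minus, t)) / (2.0f * h);
        std::printf("i=%zu analytic=%g numeric=%g\n", i, bceGrad(p[i], t[i], p.size()), numeric);
    }
    return 0;
}

At the saturated points the finite-difference probe would step outside [0, 1], so only the analytic value applies there; the eps in the denominator keeps it finite, and the numerator t - p makes it zero when the prediction is already correct.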