diff --git a/src/loss/classification/BCE.cpp b/src/loss/classification/BCE.cpp
index 857bfe7161928e59ad90c17791b0d691ffe5365a..d80dd2749ae8e4128e9fc5b07740647854d7f5bb 100644
--- a/src/loss/classification/BCE.cpp
+++ b/src/loss/classification/BCE.cpp
@@ -31,10 +31,14 @@
 Aidge::Tensor Aidge::loss::BCE(std::shared_ptr<Tensor>& prediction,
                                const std::shared_ptr<Tensor>& target) {
     /*
-    Binay Cross Entropy (BCE) loss function
+    Binary Cross Entropy (BCE) loss function:
+
+    loss = - 1/N * SUM_i[ t_i * ln(p_i + eps) + (1 - t_i) * ln(1 - p_i + eps) ]
+
+    d(loss) / d(p_i) = - 1/N * (t_i - p_i) / [ (p_i + eps) * (1 - p_i + eps) ]
 
     Implementation note:
-    loss function is computed using a graph in order to not be backend dependant.
+    loss function is computed using a graph in order to not be backend dependent.
     */
 
     AIDGE_ASSERT(target->dims().size() == 2,
@@ -52,100 +56,95 @@
                  "Tensors must have the same data type.\n",
                  prediction->dataType(), target->dataType());
 
-    const float eps1 = 1.e-10f;
-    const float eps2 = 1.e-10f;
-
     // Define nodes: inputs
     const std::shared_ptr<Node> prediction_node = Producer(prediction, "pred");
     const std::shared_ptr<Node> target_node = Producer(target, "label");
+    const std::shared_ptr<Node> one_node = Producer(std::make_shared<Tensor>(Array1D<float, 1>{{1.0f}}), "one");
+    const std::shared_ptr<Node> eps_node = Producer(std::make_shared<Tensor>(Array1D<float, 1>{{1.e-12f}}), "eps");
 
-    // Define nodes: add1 = prediction + eps1, add2 = target + eps1
+    // Define nodes: add1 = p + eps
     const std::shared_ptr<Node> add1_node = Add("add1");
-    const std::shared_ptr<Node> add2_node = Add("add2");
     prediction_node->addChild(add1_node, 0, 0);
-    Producer(std::make_shared<Tensor>(Array1D<float, 1>{{eps1}}))
-        ->addChild(add1_node, 0, 1);
-    target_node->addChild(add2_node, 0, 0);
-    Producer(std::make_shared<Tensor>(Array1D<float, 1>{{eps1}}))
-        ->addChild(add2_node, 0, 1);
+    eps_node->addChild(add1_node, 0, 1);
 
-    // Define nodes: sub1 = 1 - prediction + eps2 and sub2 = - (1 - target + eps2)
+    // Define nodes: sub1 = 1 - p and add2 = 1 - p + eps
     const std::shared_ptr<Node> sub1_node = Sub("sub1");
-    const std::shared_ptr<Node> sub2_node = Sub("sub2");
-    Producer(std::make_shared<Tensor>(Array1D<float, 1>{{1.0f + eps2}}))
-        ->addChild(sub1_node, 0, 0);
+    const std::shared_ptr<Node> add2_node = Add("add2");
+    one_node->addChild(sub1_node, 0, 0);
     prediction_node->addChild(sub1_node, 0, 1);
-    target_node->addChild(sub2_node, 0, 0);
-    Producer(std::make_shared<Tensor>(Array1D<float, 1>{{1.0f + eps2}}))
-        ->addChild(sub2_node, 0, 1);
+    sub1_node->addChild(add2_node, 0, 0);
+    eps_node->addChild(add2_node, 0, 1);
 
-    // Define nodes: ln1 = ln(prediction + eps1) and ln2 = ln(1 - prediction + eps2)
+    // Define nodes: ln1 = ln(p + eps) and ln2 = ln(1 - p + eps)
     const std::shared_ptr<Node> ln1_node = Ln("ln1");
     const std::shared_ptr<Node> ln2_node = Ln("ln2");
     add1_node-> addChild(ln1_node, 0, 0);
-    sub1_node-> addChild(ln2_node, 0, 0);
+    add2_node-> addChild(ln2_node, 0, 0);
 
-    // Define nodes: mul1 = (target + eps1) * ln(prediction + eps1) and mul2 = - (1 - target + eps2) * ln(1 - prediction + eps2)
+    // Define nodes: sub2 = - (1 - t)
+    const std::shared_ptr<Node> sub2_node = Sub("sub2");
+    target_node->addChild(sub2_node, 0, 0);
+    one_node->addChild(sub2_node, 0, 1);
+
+    // Define nodes: mul1 = t * ln(p + eps) and mul2 = - (1 - t) * ln(1 - p + eps)
     const std::shared_ptr<Node> mul1_node = Mul("mul1");
     const std::shared_ptr<Node> mul2_node = Mul("mul2");
-    add2_node->addChild(mul1_node, 0, 0);
+    target_node->addChild(mul1_node, 0, 0);
     ln1_node->addChild(mul1_node, 0, 1);
     sub2_node->addChild(mul2_node, 0, 0);
     ln2_node->addChild(mul2_node, 0, 1);
 
-    // Define node: sub3 = - [(target + eps1) * ln(prediction + eps1) + (1 - target + eps2) * ln(1 - prediction + eps2)]
+    // Define node: sub3 = - [t * ln(p + eps) + (1 - t) * ln(1 - p + eps)]
     const std::shared_ptr<Node> sub3_node = Sub("sub3");
     mul2_node->addChild(sub3_node, 0, 0);
     mul1_node->addChild(sub3_node, 0, 1);
 
-    // Define nodes: div1 = (target + eps1) / (prediction + eps1) and div2 = - (1 - target + eps2)/(1 - prediction + eps2)
-    const std::shared_ptr<Node> div1_node = Div("div1");
-    const std::shared_ptr<Node> div2_node = Div("div2");
-    add2_node->addChild(div1_node, 0, 0);
-    add1_node->addChild(div1_node, 0, 1);
-    sub2_node->addChild(div2_node, 0, 0);
-    sub1_node->addChild(div2_node, 0, 1);
+    // Define node: sub4 = t - p
+    const std::shared_ptr<Node> sub4_node = Sub("sub4");
+    target_node->addChild(sub4_node, 0, 0);
+    prediction_node->addChild(sub4_node, 0, 1);
+
+    // Define nodes: mul3 = (p + eps) * (1 - p + eps)
+    const std::shared_ptr<Node> mul3_node = Mul("mul3");
+    add1_node->addChild(mul3_node, 0, 0);
+    add2_node->addChild(mul3_node, 0, 1);
 
-    // Define node: add3 = (target + eps1) / (prediction + eps1) - (1 - target + eps2)/(1 - prediction + eps2)
-    const std::shared_ptr<Node> add3_node = Add("add3");
-    div1_node->addChild(add3_node, 0, 0);
-    div2_node->addChild(add3_node, 0, 1);
+    // Define nodes: div1 = (t - p) / [ (p + eps) * (1 - p + eps) ]
+    const std::shared_ptr<Node> div1_node = Div("div1");
+    sub4_node->addChild(div1_node, 0, 0);
+    mul3_node->addChild(div1_node, 0, 1);
 
     // Define node: loss
     std::vector<int> axes_dims(prediction->nbDims());
     std::iota(std::begin(axes_dims), std::end(axes_dims), 0);
-    auto loss_node = ReduceMean(axes_dims, true, false, "loss");
+    auto loss_node = ReduceMean(axes_dims, "loss");
     sub3_node->addChild(loss_node, 0, 0);
 
     // Define node: gradient
     const std::shared_ptr<Node> gradient_node = Mul("gradient");
-    add3_node->addChild(gradient_node, 0, 0);
+    div1_node->addChild(gradient_node, 0, 0);
     Producer(std::make_shared<Tensor>(Array1D<float, 1>{{-1.0f/float(target->dims()[0])}}))
         ->addChild(gradient_node, 0, 1);
 
     // Create GraphView
     std::shared_ptr<GraphView> gv_loss = std::make_shared<GraphView>("BCE");
     gv_loss->add({prediction_node, target_node,
-                  add1_node->getParent(1), add1_node,
-                  add2_node->getParent(1), add2_node,
-                  sub1_node->getParent(0), sub1_node,
-                  sub2_node->getParent(1), sub2_node,
-                  ln1_node, ln2_node, mul1_node, mul2_node, div1_node, div2_node,
-                  sub3_node, loss_node,
-                  add3_node, gradient_node->getParent(1), gradient_node});
+                  one_node, eps_node,
+                  add1_node, sub1_node, add2_node,
+                  ln1_node, ln2_node,
+                  sub2_node, mul1_node, mul2_node, sub3_node, loss_node,
+                  sub4_node, mul3_node, div1_node, gradient_node->getParent(1), gradient_node});
     gv_loss->compile(prediction->getImpl()->backend(), prediction->dataType());
 
     // Compute loss and gradient
     SequentialScheduler ss_loss{gv_loss};
     ss_loss.forward(false);
-    // prediction->initGrad(); // Enable gradient for output
     std::shared_ptr<Tensor> outputGrad = prediction->grad();
     const std::shared_ptr<OperatorTensor> gradient_op = std::dynamic_pointer_cast<OperatorTensor>(gradient_node->getOperator());
     outputGrad->copyFrom(gradient_op->getOutput(0)->clone()); // Update gradient
 
     const std::shared_ptr<OperatorTensor> loss_op =
         std::dynamic_pointer_cast<OperatorTensor>(loss_node->getOperator());
-    // return loss_op->getOutput(0)->clone();
 
     // Return loss
     std::shared_ptr<Tensor> fallback;
     return loss_op->getOutput(0)->refFrom(fallback, "cpu");
diff --git a/unit_tests/loss/classification/Test_BCE.cpp b/unit_tests/loss/classification/Test_BCE.cpp
index 9190e81012d0a530e677822a973847a0bbeff72e..5a7fafd6816093134d384747d1ac315156797490 100644
--- a/unit_tests/loss/classification/Test_BCE.cpp
+++ b/unit_tests/loss/classification/Test_BCE.cpp
@@ -63,11 +63,10 @@ TEST_CASE("[loss/classification] BCE", "[loss][classification][BCE]") {
         }
 
         // compute the BCE manually
-        const float eps1 = 1.0e-10f;
-        const float eps2 = 1.0e-10f;
+        const float eps = 1.0e-12f;
         std::unique_ptr<float[]> tmp_res_manual = std::make_unique<float[]>(nb_elements);
         for (std::size_t i = 0; i < nb_elements; ++i) {
-            tmp_res_manual[i] = - ((targ[i] + eps1) * std::log(pred[i] + eps1) + (1.0f - targ[i] + eps2) * std::log(1.0f - pred[i] + eps2));
+            tmp_res_manual[i] = - (targ[i] * std::log(pred[i] + eps) + (1.0f - targ[i]) * std::log((1.0f - pred[i]) + eps));
         }
         fmt::println("Output manual:");
         std::shared_ptr<Tensor> tmp_tensor = std::make_shared<Tensor>(dims);
@@ -126,11 +125,10 @@ TEST_CASE("[loss/classification] BCE", "[loss][classification][BCE]") {
         cudaMemcpy(d_targ, targ.get(), nb_elements * sizeof(float), cudaMemcpyHostToDevice);
 
         // compute the BCE manually
-        const float eps1 = 1.0e-10f;
-        const float eps2 = 1.0e-10f;
+        const float eps = 1.0e-12f;
        std::unique_ptr<float[]> tmp_res_manual = std::make_unique<float[]>(nb_elements);
         for (std::size_t i = 0; i < nb_elements; ++i) {
-            tmp_res_manual[i] = - ((targ[i] + eps1) * std::log(pred[i] + eps1) + (1.0f - targ[i] + eps2) * std::log(1.0f - pred[i] + eps2));
+            tmp_res_manual[i] = - (targ[i] * std::log(pred[i] + eps) + (1.0f - targ[i]) * std::log((1.0f - pred[i]) + eps));
         }
         float * d_tmp_res_manual;
         cudaMalloc(reinterpret_cast<void **>(&d_tmp_res_manual), nb_elements * sizeof(float));
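
Note (not part of the patch): the loss and gradient formulas documented in the new BCE.cpp header comment can be sanity-checked in isolation with the plain C++ sketch below. It is illustrative only; it uses no Aidge API, the sample values and the [N, 1] shape are assumptions (chosen so that the -1/target->dims()[0] factor applied by the gradient node coincides with -1/N over the elements), and the epsilon of 1e-12 mirrors the patch and the manual computation in Test_BCE.cpp.

// Standalone sanity check of the BCE loss/gradient formulas from the patched
// header comment. Plain C++17, no Aidge dependency; values are illustrative.
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
    const float eps = 1.e-12f;  // same epsilon as the patched BCE.cpp
    // Hypothetical predictions/targets, thought of as a [N, 1] tensor so the
    // batch size used by the gradient node equals the element count N.
    const std::vector<float> p = {0.9f, 0.2f, 0.6f, 0.4f};
    const std::vector<float> t = {1.0f, 0.0f, 1.0f, 0.0f};
    const std::size_t n = p.size();

    float loss = 0.0f;
    std::vector<float> grad(n);
    for (std::size_t i = 0; i < n; ++i) {
        // per-element term: -[ t_i * ln(p_i + eps) + (1 - t_i) * ln(1 - p_i + eps) ]
        loss -= t[i] * std::log(p[i] + eps) +
                (1.0f - t[i]) * std::log(1.0f - p[i] + eps);
        // d(loss)/d(p_i) = -1/N * (t_i - p_i) / [ (p_i + eps) * (1 - p_i + eps) ]
        grad[i] = -(t[i] - p[i]) /
                  (static_cast<float>(n) * (p[i] + eps) * (1.0f - p[i] + eps));
    }
    loss /= static_cast<float>(n);  // ReduceMean over all elements

    std::printf("loss = %f\n", loss);
    for (std::size_t i = 0; i < n; ++i) {
        std::printf("grad[%zu] = %f\n", i, grad[i]);
    }
    return 0;
}

Compiled with any C++17 compiler, the per-element terms accumulated into loss match the tmp_res_manual values computed in the updated test before averaging.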