diff --git a/.gitlab/ci/build.gitlab-ci.yml b/.gitlab/ci/build.gitlab-ci.yml
index c0b72d3e179b696b3776de7444adca263ab58c27..39b6ace150d146082045e820953236db559393d3 100644
--- a/.gitlab/ci/build.gitlab-ci.yml
+++ b/.gitlab/ci/build.gitlab-ci.yml
@@ -15,7 +15,6 @@ build:ubuntu_cpp:
     # aidge_backend_cpu
     - DEPENDENCY_NAME="aidge_backend_cpu"
     - !reference [.download_dependency, script]
-
     # Build current module
     - export CMAKE_PREFIX_PATH=../install_cpp
     - mkdir -p build_cpp
diff --git a/CHANGELOG b/CHANGELOG
index d6594bc686a7c8c0e244c77ef7c69496d0eb8643..40caa5a799c6904df84cf2e0b3fc38eb9caf6683 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,8 @@
+# Version 0.1.1 (May 14, 2024)
+
+* Fix the loss function to set the gradient of the prediction Tensor
+* Add Python binding for the loss function
+
 # Version 0.1.0 (April 4, 2024)
 
 Initial release
diff --git a/include/aidge/learning/optimizer/Optimizer.hpp b/include/aidge/learning/optimizer/Optimizer.hpp
index 9e621875beb1cfd58bf8474753c536b8c4e5183c..195d64965d3ba4eb89c9c4d0ca2155cb719f76f3 100644
--- a/include/aidge/learning/optimizer/Optimizer.hpp
+++ b/include/aidge/learning/optimizer/Optimizer.hpp
@@ -49,7 +49,7 @@ public:
     virtual void setParameters(const std::vector<std::shared_ptr<Tensor>>& parameters) {
         mParameters = parameters;
         for (const auto& param : parameters) {
-            param->initGradient(); // create gradient and set it to zeros
+            param->initGrad(); // create gradient and set it to zeros
         }
     }
 
diff --git a/include/aidge/loss/LossList.hpp b/include/aidge/loss/LossList.hpp
index e65123dde897610f82ca876f1260a165b785e33f..5a0241d9816becbaace75185e796c5ec7c787e89 100644
--- a/include/aidge/loss/LossList.hpp
+++ b/include/aidge/loss/LossList.hpp
@@ -20,10 +20,19 @@
 namespace Aidge {
 namespace loss {
 
-Tensor MSE(const std::shared_ptr<Tensor>& prediction,
+/**
+ * @brief Compute the Mean Square Error loss.
+ * This function returns the loss and sets the ``grad()`` of the prediction
+ * input.
+ * @param prediction Tensor returned by the Aidge Graph. It is important that
+ * this Tensor is not a copy, otherwise the backward pass will have no
+ * gradient to start from.
+ * @param target Tensor representing the ground truth; it must be one-hot encoded.
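+ *
+ * A minimal usage sketch (illustrative names, tensor creation elided):
+ * @code
+ * std::shared_ptr<Tensor> prediction = graphOutput; // the graph's own output Tensor, not a copy
+ * std::shared_ptr<Tensor> target = oneHotLabels;    // one-hot labels, dims [NbBatch, NbChannel]
+ * const Tensor lossValue = Aidge::loss::MSE(prediction, target);
+ * // prediction->grad() now holds the gradient that starts the backward pass
+ * @endcode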
+ */
+Tensor MSE(std::shared_ptr<Tensor>& prediction,
            const std::shared_ptr<Tensor>& target);
 
-} // loss
-} // namespace Aidge
+}  // namespace loss
+}  // namespace Aidge
 
 #endif /* AIDGE_CORE_LOSS_LOSSLIST_H_ */
diff --git a/python_binding/learning/loss/pybind_Loss.cpp b/python_binding/learning/loss/pybind_Loss.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5e3c3af23cb81effc87888f91ac108f8b1cfd61a
--- /dev/null
+++ b/python_binding/learning/loss/pybind_Loss.cpp
@@ -0,0 +1,27 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <pybind11/pybind11.h>
+
+#include "aidge/data/Tensor.hpp"
+#include "aidge/graph/GraphView.hpp"
+#include "aidge/loss/LossList.hpp"
+
+namespace py = pybind11;
+
+namespace Aidge {
+
+void init_Loss(py::module &m) {
+    auto m_loss =
+        m.def_submodule("loss", "Submodule dedicated to loss functions");
+    m_loss.def("MSE", &loss::MSE, py::arg("graph"), py::arg("target"));
+}
+}  // namespace Aidge
diff --git a/python_binding/pybind_learning.cpp b/python_binding/pybind_learning.cpp
index c93884e318847121d00504a6b5602f5a1eaea910..3b4a16ceffb0db7bd7e1d407bcef5d5df830cb2f 100644
--- a/python_binding/pybind_learning.cpp
+++ b/python_binding/pybind_learning.cpp
@@ -16,12 +16,13 @@ namespace py = pybind11;
 namespace Aidge {
 // namespace learning {
 
+void init_Loss(py::module&);
 void init_Optimizer(py::module&);
 void init_SGD(py::module&);
-
 void init_LRScheduler(py::module&);
 
 void init_Aidge(py::module& m) {
+    init_Loss(m);
     init_Optimizer(m);
     init_SGD(m);
 
diff --git a/src/loss/regression/MSE.cpp b/src/loss/regression/MSE.cpp
index 3245fcdbce33c5966c18bf56579eb18b2aa790bd..87f685a0f550a1cb60563503447407f70868ce9a 100644
--- a/src/loss/regression/MSE.cpp
+++ b/src/loss/regression/MSE.cpp
@@ -9,8 +9,6 @@
  *
  ********************************************************************************/
 
-#include "aidge/loss/LossList.hpp"
-
 #include <memory>
 #include <numeric>  // std::iota
 
@@ -20,54 +18,95 @@
 #include "aidge/data/Tensor.hpp"
 #include "aidge/graph/GraphView.hpp"
 #include "aidge/graph/OpArgs.hpp"
+#include "aidge/loss/LossList.hpp"
 #include "aidge/operator/OperatorTensor.hpp"
 #include "aidge/operator/Pow.hpp"
 #include "aidge/operator/ReduceMean.hpp"
 #include "aidge/operator/Sub.hpp"
+#include "aidge/recipes/GraphViewHelper.hpp"
 #include "aidge/scheduler/Scheduler.hpp"
 #include "aidge/scheduler/SequentialScheduler.hpp"
 
-Aidge::Tensor Aidge::loss::MSE(const std::shared_ptr<Tensor>& prediction, const std::shared_ptr<Tensor>& target) {
+Aidge::Tensor Aidge::loss::MSE(std::shared_ptr<Tensor>& prediction,
+                               const std::shared_ptr<Tensor>& target) {
+    /*
+    Implementation note:
+    MSE is computed using a graph so that it is not backend dependent.
+
+    The graph used is the following:
+
+    pred->Sub
+    label->Sub
+    Sub->Pow
+    (2)->Pow->ReduceMean->Loss
+    Sub->Mul
+    (2/NbBatch)->Mul->Gradient
+    */
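+    // In formula form, the graph above computes:
+    //   loss               = mean over (b, c) of (pred[b][c] - target[b][c])^2
+    //   prediction->grad() = 2 * (pred - target) / NbBatch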
+
+    prediction->initGrad(); // Enable gradient for output
+
+    // compile_gradient(graph);  // Warning: the gradient is compiled here;
+    //                           // without it, grad is nullptr. Maybe we can
+    //                           // find a better place to do so?
+
+    AIDGE_ASSERT(target->dims().size() == 2,
+                 "Label must have two dims: [BatchSize, NbChannel]");
+
+    std::shared_ptr<Tensor> outputGrad = prediction->grad();
+
     AIDGE_ASSERT(prediction->backend() == target->backend(),
-        "'prediction' and 'target' Tensors must be on the same backend. Found {} and {}.\n",
-        prediction->backend(),
-        target->backend());
+                 "'prediction' and 'target' Tensors must be on the "
+                 "same backend. Found {} and {}.\n",
+                 prediction->backend(), target->backend());
     AIDGE_ASSERT(prediction->dims() == target->dims(),
-        "'prediction' (shape {}) and 'target' (shape {}) Tensors must have the same dimensions.\n",
-        prediction->dims(),
-        target->dims());
+                 "'prediction' (shape {}) and 'target' (shape {}) Tensors must "
+                 "have the same dimensions.\n",
+                 prediction->dims(), target->dims());
     AIDGE_ASSERT(prediction->dataType() == target->dataType(),
-        "'prediction' (shape {}) and 'target' (shape {}) Tensors must have the same dimensions.\n",
-        prediction->dims(),
-        target->dims());
+                 "'prediction' (data type {}) and 'target' (data type {}) "
+                 "Tensors must have the same data type.\n",
+                 prediction->dataType(), target->dataType());
 
     // could be accelerated with constexpr constructors
     std::vector<int> axes_dims(prediction->nbDims());
     std::iota(std::begin(axes_dims), std::end(axes_dims), 0);
     auto rm_node = ReduceMean(axes_dims, 1, "mse_res");
 
-    const std::shared_ptr<Node> pow_node = Pow();
-    const std::shared_ptr<Node> pow_exp_node = Producer(std::make_shared<Tensor>(Array1D<int,1>{{2}}));
+    const std::shared_ptr<Node> pow_node = Pow("square");
+    const std::shared_ptr<Node> pow_exp_node =
+        Producer(std::make_shared<Tensor>(Array1D<int, 1>{{2}}), "exp_val");
     pow_exp_node->addChild(pow_node, 0, 1);
 
-    const std::shared_ptr<Node> sub_node = Sub();
-    Producer(prediction)->addChild(sub_node, 0, 0);
-    Producer(target)->addChild(sub_node, 0, 1);
+    const std::shared_ptr<Node> sub_node = Sub("err");
+    Producer(prediction, "pred")->addChild(sub_node, 0, 0);
+    Producer(target, "label")->addChild(sub_node, 0, 1);
+
+    const std::shared_ptr<Node> mul_node = Mul("gradient");
 
+    // Note: this assumes target is [nbBatch, nbChan]
+    Producer(std::make_shared<Tensor>(
+                 Array1D<float, 1>{{2 / float(target->dims()[0])}}))
+        ->addChild(mul_node, 0, 1);
+    sub_node->addChild(mul_node, 0, 0);  // Error computation branch
 
-    std::shared_ptr<GraphView> gv_local = Sequential({
-        sub_node,
-        pow_node,
-        rm_node
-    });
-    gv_local->add({sub_node->getParent(0), sub_node->getParent(1), pow_exp_node});
+    std::shared_ptr<GraphView> gv_local =
+        Sequential({sub_node, pow_node, rm_node});
+    gv_local->add({sub_node->getParent(0), sub_node->getParent(1), pow_exp_node,
+                   mul_node->getParent(1), mul_node});
     gv_local->compile(prediction->getImpl()->backend(), prediction->dataType());
-    gv_local->save("MSEgraph");
+
     SequentialScheduler ss_local{gv_local};
     ss_local.forward(false);
 
+    // Retrieve the gradient
+    // Can we avoid a copy?
+    outputGrad->copyFrom(
+        std::dynamic_pointer_cast<OperatorTensor>(mul_node->getOperator())
+            ->getOutput(0)
+            ->clone());
+
     // TODO: way too complicated to access
-    const std::shared_ptr<OperatorTensor> res = std::dynamic_pointer_cast<OperatorTensor>(rm_node->getOperator());
+    const std::shared_ptr<OperatorTensor> res =
+        std::dynamic_pointer_cast<OperatorTensor>(rm_node->getOperator());
     return res->getOutput(0)->clone();
-
 }
diff --git a/unit_tests/loss/regression/Test_MSE.cpp b/unit_tests/loss/regression/Test_MSE.cpp
index 3899470b5f0141fc747f6a2a52cc35b41a590d49..2b0e6d1edfaa1d452c714a08c4998725331df2c3 100644
--- a/unit_tests/loss/regression/Test_MSE.cpp
+++ b/unit_tests/loss/regression/Test_MSE.cpp
@@ -35,9 +35,8 @@ TEST_CASE("[loss/regression] MSE", "[loss][regression][MSE]") {
     std::uniform_real_distribution<float> valueDist(0.0f, 1.0f);
 
     for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
-        // Create a random number generator
-        const std::size_t nb_dims = nbDimsDist(gen);
-        std::vector<std::size_t> dims(nb_dims);
+        const std::size_t nb_dims = 2; // For the MSE test, nb_dims is fixed to 2: [NbBatch, NbChan]
+        std::vector<std::size_t> dims(nb_dims);
 
         for (std::size_t i = 0; i < nb_dims; ++i) { dims[i] = dimsDist(gen); }
         const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
@@ -78,11 +77,11 @@ TEST_CASE("[loss/regression] MSE", "[loss][regression][MSE]") {
         targ_tensor->setBackend("cpu");
         targ_tensor->getImpl()->setRawPtr(targ.get(), nb_elements);
         targ_tensor->print();
         const Tensor res_function = loss::MSE(pred_tensor, targ_tensor);
 
         // compare results
         Tensor res_manual_tensor = Tensor(res_manual);
         REQUIRE(approxEq<float>(res_manual, res_function));
     }
 }
-}  // namespace Aidge
\ No newline at end of file
+}  // namespace Aidge
diff --git a/unit_tests/optimizer/Test_SGD.cpp b/unit_tests/optimizer/Test_SGD.cpp
index 17f946ae1630c2423a37f703c7923a40e5fe66bf..df9924d557d89d0483d018ce08951cf573e233d7 100644
--- a/unit_tests/optimizer/Test_SGD.cpp
+++ b/unit_tests/optimizer/Test_SGD.cpp
@@ -77,7 +77,7 @@ TEST_CASE("[learning/SGD] update", "[Optimizer][SGD]") {
             optim_tensors[i] = std::make_shared<Tensor>(dims);
             optim_tensors[i]->setBackend("cpu");
             optim_tensors[i]->getImpl()->copy(val_tensors[i].get(), size_tensors[i]);
-            optim_tensors[i]->initGradient();
+            optim_tensors[i]->initGrad();
 
             grad_tensors[i] = std::make_shared<Tensor>(dims);
             grad_tensors[i]->setBackend("cpu");
diff --git a/version.txt b/version.txt
index 6e8bf73aa550d4c57f6f35830f1bcdc7a4a62f38..17e51c385ea382d4f2ef124b7032c1604845622d 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-0.1.0
+0.1.1