diff --git a/unit_tests/loss/regression/Test_MSE.cpp b/unit_tests/loss/regression/Test_MSE.cpp
index 2b0e6d1edfaa1d452c714a08c4998725331df2c3..4e30fd00e2d8136a9da8bcb38875e2a4eafa2c98 100644
--- a/unit_tests/loss/regression/Test_MSE.cpp
+++ b/unit_tests/loss/regression/Test_MSE.cpp
@@ -22,9 +22,16 @@
 #include "aidge/loss/LossList.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/utils/TensorUtils.hpp"
-
+#if USE_AIDGE_BACKEND_CUDA
+#include <cuda_runtime.h>  // cudaMalloc / cudaMemcpy / cudaFree are used directly in the CUDA section below
+#include "aidge/backend/cuda/operator/PowImpl.hpp"
+#include "aidge/backend/cuda/operator/ReduceMeanImpl.hpp"
+#include "aidge/backend/cuda/operator/SubImpl.hpp"
+#include "aidge/backend/cuda/operator/MulImpl.hpp"
+#endif
 namespace Aidge {
 TEST_CASE("[loss/regression] MSE", "[loss][regression][MSE]") {
+
     constexpr std::uint16_t NBTRIALS = 10;
 
     // set random variables
@@ -34,54 +41,123 @@ TEST_CASE("[loss/regression] MSE", "[loss][regression][MSE]") {
     std::uniform_int_distribution<std::size_t> nbDimsDist(1, 2);
     std::uniform_real_distribution<float> valueDist(0.0f, 1.0f);
 
-    for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
-        const std::size_t nb_dims = 2; // For MSE test, nb_dims is fixed as 2: NbBatch, NbChan
-        std::vector<std::size_t> dims(2);
+    SECTION("CPU") {
+        for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+            const std::size_t nb_dims = 2; // For the MSE test, nb_dims is fixed at 2: NbBatch, NbChan
+            std::vector<std::size_t> dims(2);
 
-        for (std::size_t i = 0; i < nb_dims; ++i) { dims[i] = dimsDist(gen); }
-        const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+            for (std::size_t i = 0; i < nb_dims; ++i) { dims[i] = dimsDist(gen); }
+            const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
 
-        // create random predictions
-        std::unique_ptr<float[]> pred = std::make_unique<float[]>(nb_elements);
-        for (std::size_t i = 0; i < nb_elements; ++i) {
-            pred[i] = valueDist(gen);
-        }
+            // create random predictions
+            std::unique_ptr<float[]> pred = std::make_unique<float[]>(nb_elements);
+            for (std::size_t i = 0; i < nb_elements; ++i) {
+                pred[i] = valueDist(gen);
+            }
 
-        // create random targets
-        std::unique_ptr<float[]> targ = std::make_unique<float[]>(nb_elements);
-        for (std::size_t i = 0; i < nb_elements; ++i) {
-            targ[i] = valueDist(gen);
-        }
+            // create random targets
+            std::unique_ptr<float[]> targ = std::make_unique<float[]>(nb_elements);
+            for (std::size_t i = 0; i < nb_elements; ++i) {
+                targ[i] = valueDist(gen);
+            }
+
+            // compute the MSE manually
+            std::unique_ptr<float[]> tmp_res_manual = std::make_unique<float[]>(nb_elements);
+            for (std::size_t i = 0; i < nb_elements; ++i) {
+                tmp_res_manual[i] = std::pow(pred[i] - targ[i],2);
+            }
+            std::cout << "Pow output manual:" << std::endl;
+            std::shared_ptr<Tensor> tmp_tensor = std::make_shared<Tensor>(dims);
+            tmp_tensor->setBackend("cpu");
+            tmp_tensor->getImpl()->setRawPtr(tmp_res_manual.get(), nb_elements);
+            tmp_tensor->print();
+            const float res_manual = std::accumulate(&tmp_res_manual[0], &tmp_res_manual[nb_elements], 0.0f, std::plus<float>()) / static_cast<float>(nb_elements);
 
-        // compute the MSE manually
-        std::unique_ptr<float[]> tmp_res_manual = std::make_unique<float[]>(nb_elements);
-        for (std::size_t i = 0; i < nb_elements; ++i) {
-            tmp_res_manual[i] = std::pow(pred[i] - targ[i],2);
+            // compute the MSE using Aidge::loss::MSE function
+            std::cout << "Sub input 0 manual:" << std::endl;
+            std::shared_ptr<Tensor> pred_tensor = std::make_shared<Tensor>(dims);
+            pred_tensor->setBackend("cpu");
+            pred_tensor->getImpl()->setRawPtr(pred.get(), nb_elements);
+            pred_tensor->print();
+
+            std::cout << "Sub input 1 manual:" << std::endl;
+            std::shared_ptr<Tensor> targ_tensor = std::make_shared<Tensor>(dims);
+            targ_tensor->setBackend("cpu");
+            targ_tensor->getImpl()->setRawPtr(targ.get(), nb_elements);
+            targ_tensor->print();
+            const Tensor res_function = loss::MSE(pred_tensor, targ_tensor);
+
+            // compare results
+            Tensor res_manual_tensor = Tensor(res_manual);
+            REQUIRE(approxEq<float>(res_manual_tensor, res_function));
         }
-        std::cout << "Pow output manual:" << std::endl;
-        std::shared_ptr<Tensor> tmp_tensor = std::make_shared<Tensor>(dims);
-        tmp_tensor->setBackend("cpu");
-        tmp_tensor->getImpl()->setRawPtr(tmp_res_manual.get(), nb_elements);
-        tmp_tensor->print();
-        const float res_manual = std::accumulate(&tmp_res_manual[0], &tmp_res_manual[nb_elements], 0.0f, std::plus<float>()) / static_cast<float>(nb_elements);
-
-        // compute the MSE using Aidge::loss::MSE function
-        std::cout << "Sub input 0 manual:" << std::endl;
-        std::shared_ptr<Tensor> pred_tensor = std::make_shared<Tensor>(dims);
-        pred_tensor->setBackend("cpu");
-        pred_tensor->getImpl()->setRawPtr(pred.get(), nb_elements);
-        pred_tensor->print();
-
-        std::cout << "Sub input 1 manual:" << std::endl;
-        std::shared_ptr<Tensor> targ_tensor = std::make_shared<Tensor>(dims);
-        targ_tensor->setBackend("cpu");
-        targ_tensor->getImpl()->setRawPtr(targ.get(), nb_elements);
-        targ_tensor->print();
+    }
+#if USE_AIDGE_BACKEND_CUDA
+    SECTION("CUDA") {
+        for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+            const std::size_t nb_dims = 2; // For the MSE test, nb_dims is fixed at 2: NbBatch, NbChan
+            std::vector<std::size_t> dims(2);
+
+            for (std::size_t i = 0; i < nb_dims; ++i) { dims[i] = dimsDist(gen); }
+            const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+
+            // create random predictions
+            std::unique_ptr<float[]> pred = std::make_unique<float[]>(nb_elements);
+            for (std::size_t i = 0; i < nb_elements; ++i) {
+                pred[i] = valueDist(gen);
+            }
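+            // mirror the predictions in device memory so the tensor below can use the "cuda" backend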
+            float * d_pred;
+            cudaMalloc(&d_pred, nb_elements * sizeof(float));
+            cudaMemcpy(d_pred, pred.get(), nb_elements * sizeof(float), cudaMemcpyHostToDevice);
+
+            // create random targets
+            std::unique_ptr<float[]> targ = std::make_unique<float[]>(nb_elements);
+            for (std::size_t i = 0; i < nb_elements; ++i) {
+                targ[i] = valueDist(gen);
+            }
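+            // same for the targets: allocate a device buffer and copy the host values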
+            float * d_targ;
+            cudaMalloc(&d_targ, nb_elements * sizeof(float));
+            cudaMemcpy(d_targ, targ.get(), nb_elements * sizeof(float), cudaMemcpyHostToDevice);
+
+            // compute the MSE manually
+            std::unique_ptr<float[]> tmp_res_manual = std::make_unique<float[]>(nb_elements);
+            for (std::size_t i = 0; i < nb_elements; ++i) {
+                tmp_res_manual[i] = std::pow(pred[i] - targ[i],2);
+            }
+            float * d_tmp_res_manual;
+            cudaMalloc(&d_tmp_res_manual, nb_elements * sizeof(float));
+            cudaMemcpy(d_tmp_res_manual, tmp_res_manual.get(), nb_elements * sizeof(float), cudaMemcpyHostToDevice);
+
+            std::shared_ptr<Tensor> tmp_tensor = std::make_shared<Tensor>(dims);
+            tmp_tensor->setBackend("cuda");
+            tmp_tensor->getImpl()->setRawPtr(d_tmp_res_manual, nb_elements);
+
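+            // reference value: mean of the squared differences, accumulated on the host buffer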
+            const float res_manual = std::accumulate(&tmp_res_manual[0], &tmp_res_manual[nb_elements], 0.0f, std::plus<float>()) / static_cast<float>(nb_elements);
+
+            // compute the MSE using Aidge::loss::MSE function
+            std::shared_ptr<Tensor> pred_tensor = std::make_shared<Tensor>(dims);
+            pred_tensor->setBackend("cuda");
+            pred_tensor->getImpl()->setRawPtr(d_pred, nb_elements);
+
+            std::shared_ptr<Tensor> targ_tensor = std::make_shared<Tensor>(dims);
+            targ_tensor->setBackend("cuda");
+            targ_tensor->getImpl()->setRawPtr(d_targ, nb_elements);
+
             const Tensor res_function = loss::MSE(pred_tensor, targ_tensor);
 
-        // compare results
-        Tensor res_manual_tensor = Tensor(res_manual);
-        REQUIRE(approxEq<float>(res_manual, res_function));
+            // compare results
+            Tensor res_manual_tensor = Tensor(res_manual);
+            REQUIRE(approxEq<float>(res_manual_tensor, res_function));
+
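+            // release the device buffers allocated for this trial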
+            cudaFree(d_pred);
+            cudaFree(d_targ);
+            cudaFree(d_tmp_res_manual);
+        }
     }
+#endif
 }
 }  // namespace Aidge