diff --git a/include/aidge/backend/cpu/operator/AddImpl.hpp b/include/aidge/backend/cpu/operator/AddImpl.hpp
index 0299148d086ae6e2be967232e8157c6a6229b0f7..57669c628b4fa650f137c2b28c8c0a4584bf6c35 100644
--- a/include/aidge/backend/cpu/operator/AddImpl.hpp
+++ b/include/aidge/backend/cpu/operator/AddImpl.hpp
@@ -25,10 +25,10 @@
 namespace Aidge {
 
 // compute kernel registry for forward and backward
 class AddImplForward_cpu
-    : public Registrable<AddImplForward_cpu, std::tuple<DataType, DataType>, void(const std::size_t, const std::vector<const void*>, void*)> {};
+    : public Registrable<AddImplForward_cpu, std::tuple<DataType, DataType>, void(const std::vector<const void*>, const std::vector<std::vector<std::size_t>>&, const std::size_t, const std::vector<std::size_t>&, void*)> {};
 
 class AddImplBackward_cpu
-    : public Registrable<AddImplBackward_cpu, std::tuple<DataType, DataType>, void(const std::size_t, const std::vector<const void*>, void*)> {};
+    : public Registrable<AddImplBackward_cpu, std::tuple<DataType, DataType>, void(const std::vector<const void*>, const std::vector<std::vector<std::size_t>>&, const std::size_t, const std::vector<std::size_t>&, void*)> {};
 
 class AddImpl_cpu : public OperatorImpl {
diff --git a/include/aidge/backend/cpu/operator/AddImpl_forward_kernels.hpp b/include/aidge/backend/cpu/operator/AddImpl_forward_kernels.hpp
index 631ad44a562c17d41ad019a1da112dbf8a69185c..a038c4487c4b4d864ea46a54437685a06e6f0998 100644
--- a/include/aidge/backend/cpu/operator/AddImpl_forward_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/AddImpl_forward_kernels.hpp
@@ -18,8 +18,34 @@
 
 namespace Aidge {
 
+// Function to get multi-dimensional indices from a flattened index
+inline std::vector<std::size_t> getMultiDimIndices(const std::vector<std::size_t>& dimensions, std::size_t idx) {
+    std::vector<std::size_t> indices(dimensions.size(), 0);
+
+    for (int i = dimensions.size() - 1; i >= 0; --i) {
+        indices[i] = idx % dimensions[i];
+        idx /= dimensions[i];
+    }
+
+    return indices;
+}
+
+// Function to get a flattened index from multi-dimensional indices
+inline std::size_t getFlattenedIndex(const std::vector<std::size_t>& dimensions, const std::vector<std::size_t>& indices) {
+    std::size_t flattenedIdx = 0;
+    std::size_t stride = 1;
+
+    for (int i = dimensions.size() - 1; i >= 0; --i) {
+        // A broadcast dimension (size 1) always maps to index 0
+        std::size_t idx = dimensions[i] > 1 ? indices[i] : 0;
+        flattenedIdx += idx * stride;
+        stride *= dimensions[i];
+    }
+    return flattenedIdx;
+}
+
 template <class I, class O>
-void AddImpl_cpu_forward_kernel(const std::size_t inputLength, const std::vector<const void*> inputs_, void* output_) {
+void AddImpl_cpu_forward_kernel(const std::vector<const void*> inputs_, const std::vector<std::vector<std::size_t>>& inputDims, const std::size_t outputLength, const std::vector<std::size_t>& outDims, void* output_) {
     // FIXME: missing Add attributes as arguments
     std::vector<const I*> inputs;
     for (const auto& input_ : inputs_) {
@@ -27,12 +53,16 @@
     }
     O* output = static_cast<O*>(output_);
 
-    for (std::size_t oIndex = 0; oIndex < inputLength; ++oIndex) {
-        output[oIndex] = 0;
-        for (std::size_t iIndex = 0; iIndex < inputs.size(); ++iIndex) {
-            output[oIndex] += inputs[iIndex][oIndex];
-        }
-    }
+    for (std::size_t oIndex = 0; oIndex < outputLength; ++oIndex) {
+        // Initialize the accumulator: the output buffer is not guaranteed to be zeroed
+        output[oIndex] = 0;
+        std::vector<std::size_t> indexes = getMultiDimIndices(outDims, oIndex);
+
+        for (std::size_t iIndex = 0; iIndex < inputs.size(); ++iIndex) {
+            std::size_t idx = getFlattenedIndex(inputDims[iIndex], indexes);
+            output[oIndex] += inputs[iIndex][idx];
+        }
+    }
 }
 
 namespace {
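Note: the two helpers above carry the index arithmetic behind broadcasting. getMultiDimIndices unflattens an output offset into per-dimension coordinates, and getFlattenedIndex folds those coordinates back into an offset within an input whose size-1 dimensions always read element 0. A minimal standalone sketch of the round trip follows; the helper bodies mirror the hunk above, while main() and its shapes are only an illustration:

    #include <cstddef>
    #include <iostream>
    #include <vector>

    std::vector<std::size_t> getMultiDimIndices(const std::vector<std::size_t>& dims, std::size_t idx) {
        std::vector<std::size_t> indices(dims.size(), 0);
        for (int i = dims.size() - 1; i >= 0; --i) {
            indices[i] = idx % dims[i];
            idx /= dims[i];
        }
        return indices;
    }

    std::size_t getFlattenedIndex(const std::vector<std::size_t>& dims, const std::vector<std::size_t>& indices) {
        std::size_t flat = 0, stride = 1;
        for (int i = dims.size() - 1; i >= 0; --i) {
            flat += (dims[i] > 1 ? indices[i] : 0) * stride;  // broadcast dims read index 0
            stride *= dims[i];
        }
        return flat;
    }

    int main() {
        const std::vector<std::size_t> outDims{3, 3, 2};  // output shape
        const std::vector<std::size_t> inDims{3, 1, 2};   // input broadcast along dim 1
        const auto coords = getMultiDimIndices(outDims, 11);    // -> {1, 2, 1}
        std::cout << getFlattenedIndex(inDims, coords) << '\n'; // prints 3, i.e. input element [1][0][1]
    }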
diff --git a/src/operator/AddImpl.cpp b/src/operator/AddImpl.cpp
index 3b53eaf3b88fb418746ab5a7a2297a15606974d3..92c9ee59d48ed8606192c83f346a76792dcd3eeb 100644
--- a/src/operator/AddImpl.cpp
+++ b/src/operator/AddImpl.cpp
@@ -55,15 +55,26 @@ void Aidge::AddImpl_cpu::forward() {
     // TODO: right now, if needed, memory will be allocated/deallocated at each
     // call to forward(). We might put the following shared_ptr as members of
     // this class to avoid that.
+    std::size_t nbDims = std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->nbDims();
+    std::vector<std::vector<std::size_t>> inputsDims;
     std::vector<const void*> opInputs;
     std::vector<std::shared_ptr<Tensor>> inputsFallback(mOp.nbInputs());
     for (IOIndex_t i = 0; i < mOp.nbInputs(); ++i) {
-        const auto& input = std::static_pointer_cast<Tensor>(mOp.getRawInput(i))->refCastFrom(inputsFallback[i], *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)));
-        opInputs.push_back(input.getImpl()->rawPtr());
+        // Right-align the input's dims against the output rank, padding leading dims with 1
+        std::vector<std::size_t> inputDims(nbDims, 1);
+        auto dims = std::static_pointer_cast<Tensor>(mOp.getRawInput(i))->dims();
+        // Unsigned-safe reverse loop: j + 1 > 0 stops after j == 0 without underflow
+        for (std::size_t j = dims.size() - 1; j + 1 > 0; --j) {
+            std::size_t idx = nbDims - (dims.size() - j);
+            inputDims[idx] = dims[j];
+        }
+        inputsDims.push_back(inputDims);
+        opInputs.push_back(getCPUPtr(mOp.getRawInput(i)));
     }
 
-    // Call kernel
-    kernelFunc(std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->size(),
-        opInputs,
+    kernelFunc(opInputs,
+        inputsDims,
+        std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->size(),
+        std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
         getCPUPtr(mOp.getRawOutput(0)));
 }
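Note: forward() above right-aligns every input shape against the output rank before the kernel runs, padding missing leading dimensions with 1 (NumPy-style broadcasting). A minimal sketch of that padding step in isolation; padDims is a hypothetical name introduced here, not a function in the patch:

    #include <cstddef>
    #include <vector>

    // Hypothetical helper isolating the dim-padding loop from forward() above.
    std::vector<std::size_t> padDims(const std::vector<std::size_t>& dims, std::size_t nbDims) {
        std::vector<std::size_t> padded(nbDims, 1);
        // j + 1 > 0 stops after j == 0 without underflowing the unsigned index
        for (std::size_t j = dims.size() - 1; j + 1 > 0; --j) {
            padded[nbDims - (dims.size() - j)] = dims[j];
        }
        return padded;
    }

    // padDims({2}, 4)          -> {1, 1, 1, 2}
    // padDims({3, 1, 3, 2}, 4) -> {3, 1, 3, 2}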
diff --git a/unit_tests/operator/Test_AddImpl.cpp b/unit_tests/operator/Test_AddImpl.cpp
index 740b1a5322b55e2347d93ed2e515358080a108a5..e2e7051afda5e7f72c3142987587179bc759f1e8 100644
--- a/unit_tests/operator/Test_AddImpl.cpp
+++ b/unit_tests/operator/Test_AddImpl.cpp
@@ -117,4 +117,63 @@ TEST_CASE("[cpu/operator] Add(forward)", "[Add][CPU]") {
         REQUIRE(*op->getOutput(0) == *expectedOutput);
     }
+
+    SECTION("Broadcasting") {
+        std::shared_ptr<Tensor> input_0 = std::make_shared<Tensor>(Array4D<int,3,1,3,2> {
+        {                                       //
+            {                                   //
+                {{0, 1},{2, 3},{4, 5}}          //
+            },                                  //
+            {                                   //
+                {{6, 7},{8, 9},{10, 11}}        //
+            },                                  //
+            {                                   //
+                {{12, 13},{14, 15},{16, 17}}    //
+            }                                   //
+        }                                       //
+        });                                     //
+        std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array4D<int,1,3,3,2> {
+        {                                       //
+            {                                   //
+                {{20, 21},{22, 23},{24, 25}},   //
+                {{26, 27},{28, 29},{30, 31}},   //
+                {{32, 33},{34, 35},{36, 37}}    //
+            }                                   //
+        }                                       //
+        });                                     //
+
+        std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<int,2> {{100,200}});
+        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
+        {                                               //
+            {                                           //
+                {{ 120, 222},{ 124, 226},{ 128, 230}},  //
+                {{ 126, 228},{ 130, 232},{ 134, 236}},  //
+                {{ 132, 234},{ 136, 238},{ 140, 242}}   //
+            },                                          //
+            {                                           //
+                {{ 126, 228},{ 130, 232},{ 134, 236}},  //
+                {{ 132, 234},{ 136, 238},{ 140, 242}},  //
+                {{ 138, 240},{ 142, 244},{ 146, 248}}   //
+            },                                          //
+            {                                           //
+                {{ 132, 234},{ 136, 238},{ 140, 242}},  //
+                {{ 138, 240},{ 142, 244},{ 146, 248}},  //
+                {{ 144, 246},{ 148, 250},{ 152, 254}}   //
+            }                                           //
+        }                                               //
+        });                                             //
+
+        std::shared_ptr<Node> myAdd = Add(3);
+        auto op = std::static_pointer_cast<OperatorTensor>(myAdd->getOperator());
+        op->associateInput(0, input_0);
+        op->associateInput(1, input_1);
+        op->associateInput(2, input_2);
+        op->setDataType(DataType::Int32);
+        op->setBackend("cpu");
+        op->computeOutputDims();
+        myAdd->forward();
+        op->getOutput(0)->print();
+        expectedOutput->print();
+        REQUIRE(*op->getOutput(0) == *expectedOutput);
+    }
 
 }
\ No newline at end of file
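Note: the new test exercises the full broadcasting rule: shapes 3x1x3x2, 1x3x3x2, and 2 combine into a 3x3x3x2 output, each output dimension being the maximum of the right-aligned input dimensions. A minimal sketch of that shape rule; broadcastShape is illustrative only (in the patch, computeOutputDims() is responsible for this) and assumes the shapes are already padded to equal rank and compatible:

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // Illustrative only: each output dim is the max over the padded input dims.
    // Assumes every dim is either 1 or equal to the max; no validation is done here.
    std::vector<std::size_t> broadcastShape(const std::vector<std::vector<std::size_t>>& paddedInputs) {
        std::vector<std::size_t> out(paddedInputs.front().size(), 1);
        for (const auto& dims : paddedInputs)
            for (std::size_t d = 0; d < out.size(); ++d)
                out[d] = std::max(out[d], dims[d]);  // dims of 1 never win
        return out;
    }

    // broadcastShape({{3,1,3,2}, {1,3,3,2}, {1,1,1,2}}) -> {3,3,3,2}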