diff --git a/include/aidge/backend/cpu.hpp b/include/aidge/backend/cpu.hpp
index f3af2bf3015fb45bda9736e9925c8527e2b0cad0..7cd492f4ea035a0624fe68fc35b0a4246fe084a8 100644
--- a/include/aidge/backend/cpu.hpp
+++ b/include/aidge/backend/cpu.hpp
@@ -28,7 +28,8 @@
 #include "aidge/backend/cpu/operator/PowImpl.hpp"
 #include "aidge/backend/cpu/operator/ProducerImpl.hpp"
 #include "aidge/backend/cpu/operator/ReLUImpl.hpp"
-#include "aidge/backend/cpu/operator/SoftmaxImpl.hpp"
 #include "aidge/backend/cpu/operator/ScalingImpl.hpp"
+#include "aidge/backend/cpu/operator/SoftmaxImpl.hpp"
+#include "aidge/backend/cpu/operator/SubImpl.hpp"
 
-#endif /* AIDGE_CPU_IMPORTS_H_ */
\ No newline at end of file
+#endif /* AIDGE_CPU_IMPORTS_H_ */
diff --git a/include/aidge/backend/cpu/operator/SubImpl.hpp b/include/aidge/backend/cpu/operator/SubImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..08ec69e509b2b6c02e30f613abd83208de254f75
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/SubImpl.hpp
@@ -0,0 +1,52 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_SUBIMPL_H_
+#define AIDGE_CPU_OPERATOR_SUBIMPL_H_
+
+#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/operator/Sub.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+#include <memory>
+#include <vector>
+
+namespace Aidge {
+
+// Compute kernel registries for the Sub operator forward and backward
+// passes, keyed on the (input #0, input #1, output) data types.
+class SubImplForward_cpu
+    : public Registrable<SubImplForward_cpu, std::tuple<DataType, DataType, DataType>, void(const std::size_t, const std::size_t, const void*, const void*,void*)> {
+};
+class SubImplBackward_cpu
+    : public Registrable<SubImplBackward_cpu, std::tuple<DataType, DataType, DataType>, void(const std::size_t, const std::size_t, const void*, const void*, void*)> {
+};
+
+// CPU backend implementation of the Sub operator.
+class SubImpl_cpu : public OperatorImpl {
+public:
+    SubImpl_cpu(const Sub_Op& op) : OperatorImpl(op) {}
+
+    static std::unique_ptr<SubImpl_cpu> create(const Sub_Op& op) {
+        return std::make_unique<SubImpl_cpu>(op);
+    }
+
+    // Sub can be computed in-place, so no protected memory is required.
+    NbElts_t getNbRequiredProtected(const IOIndex_t inputIdx) const override final;
+    void forward() override;
+};
+
+namespace {
+static Registrar<Sub_Op> registrarSubImpl_cpu("cpu", Aidge::SubImpl_cpu::create);
+}
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_SUBIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/SubImpl_forward_kernels.hpp b/include/aidge/backend/cpu/operator/SubImpl_forward_kernels.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..08f2e24fa38d2739943279666187a55d7076a89b
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/SubImpl_forward_kernels.hpp
@@ -0,0 +1,65 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_SUBIMPL_FORWARD_KERNEL_H_
+#define AIDGE_CPU_OPERATOR_SUBIMPL_FORWARD_KERNEL_H_
+
+#include "aidge/utils/Registrar.hpp"
+
+#include "aidge/backend/cpu/operator/SubImpl.hpp"
+
+namespace Aidge {
+template <class I1, class I2, class O>
+void SubImpl_cpu_forward_kernel(std::size_t input1Length,
+                                     std::size_t input2Length,
+                                     const void* input1_,
+                                     const void* input2_,
+                                     void* output_) {
+
+    const I1* input_1 = static_cast<const I1*>(input1_);
+    const I2* input_2 = static_cast<const I2*>(input2_);
+    O* output = static_cast<O*>(output_);
+
+    if (input2Length == input1Length)
+    {
+        for (std::size_t i = 0; i < input1Length; ++i) {
+            output[i] = input_1[i] - input_2[i];
+        }
+    }
+    else if (input2Length == 1)
+    {
+        for (std::size_t i = 0; i < input1Length; ++i) {
+            output[i] = input_1[i] - input_2[0];
+        }
+    }
+    else // input_2 is 1D and its size matches input_1's last (innermost) dimension
+    {
+        for (std::size_t i = 0; i < input1Length; ++i) {
+            const std::size_t broadcastIdx = i % input2Length;
+            output[i] = input_1[i] - input_2[broadcastIdx];
+        }
+    }
+}
+
+namespace {
+static Registrar<SubImplForward_cpu> registrarSubImplForward_cpu_Float32(
+        {DataType::Float32, DataType::Float32, DataType::Float32},
+        Aidge::SubImpl_cpu_forward_kernel<float, float, float>);
+static Registrar<SubImplForward_cpu> registrarSubImplForward_cpu_Int32(
+        {DataType::Int32, DataType::Int32, DataType::Int32},
+        Aidge::SubImpl_cpu_forward_kernel<int, int, int>);
+static Registrar<SubImplForward_cpu> registrarSubImplForward_cpu_Float64(
+        {DataType::Float64, DataType::Float64, DataType::Float64},
+        Aidge::SubImpl_cpu_forward_kernel<double, double, double>);
+}  // namespace
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_SUBIMPL_FORWARD_KERNEL_H_ */
diff --git a/src/operator/SubImpl.cpp b/src/operator/SubImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6d87821d89ff84aa1046a9ecf0fdd83dcc5dda53
--- /dev/null
+++ b/src/operator/SubImpl.cpp
@@ -0,0 +1,48 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <cassert>
+#include <vector>
+
+#include "aidge/operator/Sub.hpp"
+#include "aidge/utils/Types.h"
+
+#include "aidge/backend/cpu/operator/SubImpl.hpp"
+#include "aidge/backend/cpu/operator/SubImpl_forward_kernels.hpp"
+
+Aidge::NbElts_t Aidge::SubImpl_cpu::getNbRequiredProtected(const Aidge::IOIndex_t /*inputIdx*/) const {
+    // this implementation can be in-place
+    return 0;
+}
+
+void Aidge::SubImpl_cpu::forward() {
+    assert(mOp.getInput(0) && "missing input #0");
+    assert(mOp.getInput(1) && "missing input #1");
+
+    assert(((mOp.getInput(1)->size() == 1) ||
+            (mOp.getInput(1)->size() == mOp.getInput(0)->size()) ||
+            (mOp.getInput(1)->nbDims() == 1 && mOp.getInput(1)->size() == mOp.getInput(0)->dims()[mOp.getInput(0)->nbDims()-1])
+           ) &&
+           "input #1 must either be a tensor of size 1, the number of channels of input #0 or the same size of input #0");
+
+    // Find the correct kernel type
+    auto kernelFunc = Registrar<SubImplForward_cpu>::create({
+        mOp.getInput(0)->dataType(),
+        mOp.getInput(1)->dataType(),
+        mOp.getOutput(0)->dataType()});
+
+    // Call kernel
+    kernelFunc(mOp.getInput(0)->size(),
+        mOp.getInput(1)->size(),
+        mOp.getInput(0)->getImpl()->rawPtr(),
+        mOp.getInput(1)->getImpl()->rawPtr(),
+        mOp.getOutput(0)->getImpl()->rawPtr());
+}
diff --git a/unit_tests/operator/Test_DivImpl.cpp b/unit_tests/operator/Test_DivImpl.cpp
index a6d780e2cef3af1797d39d50e7331a0b3a4bb6f9..c33319c88b63ee834bbcb388bbbe0775699edbd7 100644
--- a/unit_tests/operator/Test_DivImpl.cpp
+++ b/unit_tests/operator/Test_DivImpl.cpp
@@ -93,6 +93,7 @@ TEST_CASE("[cpu/operator] Div(forward)") {
             {
                 {{0.24180168, 0.44319558, 0.06437260},
                  {0.21270001, 0.34570599, 0.44151264}},
+
                 {{0.62294692, 0.98043168, 0.18628585},
                  {0.33591706, 0.03432965, 0.32130069}}
             }
@@ -120,7 +121,7 @@ TEST_CASE("[cpu/operator] Div(forward)") {
 
         float* resPtr = static_cast<float*>(myDiv->getOperator()->getOutput(0)->getImpl()->rawPtr());
         float* expectedPtr = static_cast<float*>(expectedOutput->getImpl()->rawPtr());
-        for (std::size_t i = 0; i< 4; ++i) {
+        for (std::size_t i = 0; i< 12; ++i) {
             REQUIRE(std::abs(resPtr[i]-expectedPtr[i]) < 0.00001);
         }
 
diff --git a/unit_tests/operator/Test_MulImpl.cpp b/unit_tests/operator/Test_MulImpl.cpp
index bfb4ffc2d3aea74ada4ed3656041deb7cafb5a99..cea62f998cfc538d1d5800639e461eb4d15cb270 100644
--- a/unit_tests/operator/Test_MulImpl.cpp
+++ b/unit_tests/operator/Test_MulImpl.cpp
@@ -121,7 +121,7 @@ TEST_CASE("[cpu/operator] Mul(forward)") {
 
         float* resPtr = static_cast<float*>(myMul->getOperator()->getOutput(0)->getImpl()->rawPtr());
         float* expectedPtr = static_cast<float*>(expectedOutput->getImpl()->rawPtr());
-        for (std::size_t i = 0; i< 4; ++i) {
+        for (std::size_t i = 0; i< 12; ++i) {
             REQUIRE(std::abs(resPtr[i]-expectedPtr[i]) < 0.00001);
         }
 
diff --git a/unit_tests/operator/Test_PowImpl.cpp b/unit_tests/operator/Test_PowImpl.cpp
index 21ea1a526c2dbcaa18308ba0d21d6dff0f7d819b..7293198f411510904ee73aced47b69dfc37374af 100644
--- a/unit_tests/operator/Test_PowImpl.cpp
+++ b/unit_tests/operator/Test_PowImpl.cpp
@@ -85,7 +85,7 @@ TEST_CASE("[cpu/operator] Pow(forward)") {
 
         float* resPtr = static_cast<float*>(myPow->getOperator()->getOutput(0)->getImpl()->rawPtr());
         float* expectedPtr = static_cast<float*>(expectedOutput->getImpl()->rawPtr());
-        for (std::size_t i = 0; i< 4; ++i) {
+        for (std::size_t i = 0; i< 12; ++i) {
             REQUIRE(std::abs(resPtr[i]-expectedPtr[i]) < 0.00001);
         }
 
diff --git a/unit_tests/operator/Test_SubImpl.cpp b/unit_tests/operator/Test_SubImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d741602cf02958a88bb41bbd2927577027acb069
--- /dev/null
+++ b/unit_tests/operator/Test_SubImpl.cpp
@@ -0,0 +1,129 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <catch2/catch_test_macros.hpp>
+
+#include "aidge/data/Tensor.hpp"
+#include "aidge/operator/Sub.hpp"
+
+#include "aidge/backend/cpu.hpp"
+
+#include <memory>
+
+using namespace Aidge;
+
+TEST_CASE("[cpu/operator] Sub(forward)") {
+    SECTION("2D Tensor by Singleton") {
+        std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array2D<float,2,2> {
+            {
+                {0.34234560, 0.92812711},
+                {0.73706615, 0.69953883}
+            }
+        });
+        std::shared_ptr<Tensor> input_2 =  std::make_shared<Tensor>(Array2D<float,1,1>{{2.5}});
+        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array2D<float,2,2> {
+            {
+                {-2.15765429, -1.57187295},
+                {-1.76293385, -1.80046117}
+            }
+        });
+
+        std::shared_ptr<Node> mySub = Sub();
+        mySub->getOperator()->setDatatype(DataType::Float32);
+        mySub->getOperator()->setBackend("cpu");
+        mySub->getOperator()->associateInput(0, input_1);
+        mySub->getOperator()->associateInput(1, input_2);
+        mySub->getOperator()->computeOutputDims();
+        mySub->forward();
+
+        float* resPtr = static_cast<float*>(mySub->getOperator()->getOutput(0)->getImpl()->rawPtr());
+        float* expectedPtr = static_cast<float*>(expectedOutput->getImpl()->rawPtr());
+        for (std::size_t i = 0; i< 4; ++i) {
+            REQUIRE(std::abs(resPtr[i]-expectedPtr[i]) < 0.00001);
+        }
+
+    }
+
+    SECTION("2D Tensors") {
+        std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array2D<float,2,2> {
+            {
+                {0.34234560, 0.92812711},
+                {0.73706615, 0.69953883}
+            }
+        });
+        std::shared_ptr<Tensor> input_2 =  std::make_shared<Tensor>(Array2D<float,2,2>{
+            {
+                {0.61729127, 0.83004373},
+                {0.72002089, 0.52473849}
+            }
+        });
+        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array2D<float,2,2> {
+            {
+                {-0.27494568,  0.09808338},
+                {0.01704526,  0.17480034}
+            }
+        });
+
+        std::shared_ptr<Node> mySub = Sub();
+        mySub->getOperator()->setDatatype(DataType::Float32);
+        mySub->getOperator()->setBackend("cpu");
+        mySub->getOperator()->associateInput(0, input_1);
+        mySub->getOperator()->associateInput(1, input_2);
+        mySub->getOperator()->computeOutputDims();
+        mySub->forward();
+
+        float* resPtr = static_cast<float*>(mySub->getOperator()->getOutput(0)->getImpl()->rawPtr());
+        float* expectedPtr = static_cast<float*>(expectedOutput->getImpl()->rawPtr());
+        for (std::size_t i = 0; i< 4; ++i) {
+            REQUIRE(std::abs(resPtr[i]-expectedPtr[i]) < 0.00001);
+        }
+
+    }
+
+    SECTION("3D Tensor by 1D Tensor") {
+        std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array3D<float,2,2,3> {
+            {
+                {{0.84181279, 0.20655948, 0.09750116},
+                 {0.37723488, 0.73120135, 0.04666907}},
+
+                {{0.91483921, 0.93985939, 0.58823180},
+                 {0.39963132, 0.67879969, 0.33209187}}
+            }
+        });
+        std::shared_ptr<Tensor> input_2 =  std::make_shared<Tensor>(Array1D<float,3>{
+            {0.04784805, 0.91903114, 0.38606840}
+        });
+        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array3D<float,2,2,3> {
+            {
+                {{0.79396474, -0.71247166, -0.28856725},
+                 {0.32938683, -0.18782979, -0.33939934}},
+
+                {{0.86699116,  0.02082825,  0.20216340},
+                 {0.35178328, -0.24023145, -0.05397654}}
+            }
+        });
+
+        std::shared_ptr<Node> mySub = Sub();
+        mySub->getOperator()->setDatatype(DataType::Float32);
+        mySub->getOperator()->setBackend("cpu");
+        mySub->getOperator()->associateInput(0, input_1);
+        mySub->getOperator()->associateInput(1, input_2);
+        mySub->getOperator()->computeOutputDims();
+        mySub->forward();
+
+        float* resPtr = static_cast<float*>(mySub->getOperator()->getOutput(0)->getImpl()->rawPtr());
+        float* expectedPtr = static_cast<float*>(expectedOutput->getImpl()->rawPtr());
+        for (std::size_t i = 0; i< 12; ++i) {
+            REQUIRE(std::abs(resPtr[i]-expectedPtr[i]) < 0.00001);
+        }
+
+    }
+}