Compare revisions

Changes are shown as if the source revision was being merged into the target revision.
Commits on Source (35)
Showing with 808 additions and 175 deletions
......@@ -13,16 +13,17 @@ class test_scheduler(unittest.TestCase):
pass
def test_relu_forward(self):
values = np.arange(6) - 3
input_node = aidge_core.Producer(aidge_core.Tensor(values), "Input")
t = aidge_core.Tensor(np.arange(6, dtype=np.int32) - 3)
input_node = aidge_core.Producer(t)
relu = aidge_core.ReLU()
input_node.add_child(relu)
gv = aidge_core.GraphView()
gv.add(relu)
gv.add(input_node)
input_node.add_child(relu)
gv.set_datatype(aidge_core.dtype.int32)
gv.set_backend("cpu")
......@@ -34,7 +35,7 @@ class test_scheduler(unittest.TestCase):
out_tensor = relu.get_operator().get_output(0)
expected_out = [0,0,0,0,1,2]
for i in range(len(expected_out)):
self.assertEqual(expected_out[i], out_tensor[i])
self.assertEqual(expected_out[i], out_tensor[i], f"On idx {i}")
def test_sequential_scheduling(self):
input_data = np.array([0]).astype(np.float32)
......@@ -69,7 +70,7 @@ class test_scheduler(unittest.TestCase):
aidge_core.Producer(input_tensor, "X"),
aidge_core.FC(1, 50, name='0'),
aidge_core.parallel([aidge_core.FC(50, 50, name='1'), aidge_core.FC(50, 50, name='3')]),
aidge_core.Add(2, name='2'),
aidge_core.Add(name='2'),
])
EXPECTED_SCHEDULE = [['0', '1', '3', '2'], ['0', '3', '1', '2']] # Both schedules are valid!
......
......@@ -15,11 +15,14 @@
#include "aidge/backend/cpu/operator/AbsImpl.hpp"
#include "aidge/backend/cpu/operator/AddImpl.hpp"
#include "aidge/backend/cpu/operator/AndImpl.hpp"
#include "aidge/backend/cpu/operator/AtanImpl.hpp"
#include "aidge/backend/cpu/operator/ArgMaxImpl.hpp"
#include "aidge/backend/cpu/operator/AvgPoolingImpl.hpp"
#include "aidge/backend/cpu/operator/MaxPoolingImpl.hpp"
#include "aidge/backend/cpu/operator/BatchNormImpl.hpp"
#include "aidge/backend/cpu/operator/BitShiftImpl.hpp"
#include "aidge/backend/cpu/operator/ClipImpl.hpp"
#include "aidge/backend/cpu/operator/ConvDepthWiseImpl.hpp"
#include "aidge/backend/cpu/operator/ConvImpl.hpp"
#include "aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp"
......@@ -37,6 +40,7 @@
#include "aidge/backend/cpu/operator/ReduceMeanImpl.hpp"
#include "aidge/backend/cpu/operator/ReduceSumImpl.hpp"
#include "aidge/backend/cpu/operator/ReLUImpl.hpp"
#include "aidge/backend/cpu/operator/RoundImpl.hpp"
#include "aidge/backend/cpu/operator/ScalingImpl.hpp"
#include "aidge/backend/cpu/operator/SigmoidImpl.hpp"
#include "aidge/backend/cpu/operator/SqrtImpl.hpp"
......
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_ATAN_H_
#define AIDGE_CPU_OPERATOR_ATAN_H_
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/Atan.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include <memory>
#include <vector>
namespace Aidge {
// Operator implementation entry point for the backend
using AtanImpl_cpu = OperatorImpl_cpu<Atan_Op,
void(const std::size_t, const void*, void*),
void(const std::size_t, const void*, const void*, void*)>;
// Implementation entry point registration to Operator
REGISTRAR(Atan_Op, "cpu", Aidge::AtanImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_ATAN_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_ATANIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_ATANIMPL_KERNELS_H_
#include "aidge/utils/Registrar.hpp"
#include "aidge/backend/cpu/operator/AtanImpl.hpp"
#include <cmath> // for std::atan()
namespace Aidge {
template <class I, class O>
void AtanImpl_cpu_forward_kernel(std::size_t inputLength,
const void* input_,
void* output_) {
const I* input = static_cast<const I*>(input_);
O* output = static_cast<O*>(output_);
for (std::size_t i = 0; i < inputLength; ++i) {
output[i] = static_cast<O>(std::atan(input[i]));
}
}
template <class O, class GI, class GO>
void AtanImpl_cpu_backward_kernel(const std::size_t inputLength,
const void* output_, const void* grad_output_,
void* grad_input_) {
const O* output = static_cast<const O*>(output_);
const GO* grad_output = static_cast<const GO*>(grad_output_);
GI* grad_input = static_cast<GI*>(grad_input_);
// Apply the derivative of atan to each element: dx = dy * 1/(1 + x^2).
// Note: this kernel receives the forward *output* atan(x) and uses it in place of x below.
for (std::size_t i = 0; i < inputLength; ++i) {
grad_input[i] = grad_output[i] * static_cast<O>(1.0 / (1.0 + output[i] * output[i]));
}
}
// Kernels registration to implementation entry point
REGISTRAR(AtanImpl_cpu,
{DataType::Float32},
{ProdConso::inPlaceModel, Aidge::AtanImpl_cpu_forward_kernel<float, float>, Aidge::AtanImpl_cpu_backward_kernel<float, float, float>});
REGISTRAR(AtanImpl_cpu,
{DataType::Float64},
{ProdConso::inPlaceModel, Aidge::AtanImpl_cpu_forward_kernel<double, double>, Aidge::AtanImpl_cpu_backward_kernel<double, double, double>});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_ATANIMPL_KERNELS_H_ */
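
For reference, a minimal standalone sketch of the atan forward/backward math used by these kernels, in plain C++ with illustrative names (no Aidge types); note the analytic derivative is taken with respect to the forward input x:

#include <cmath>
#include <cstddef>
#include <cstdio>

// Illustrative reference for the atan kernels above; plain arrays, no Aidge API.
void atan_forward(std::size_t n, const float* x, float* y) {
    for (std::size_t i = 0; i < n; ++i)
        y[i] = std::atan(x[i]);                 // y = atan(x)
}

void atan_backward(std::size_t n, const float* x, const float* dy, float* dx) {
    for (std::size_t i = 0; i < n; ++i)
        dx[i] = dy[i] / (1.0f + x[i] * x[i]);   // d/dx atan(x) = 1/(1 + x^2)
}

int main() {
    const float x[3]  = {-1.0f, 0.0f, 1.0f};
    const float dy[3] = {1.0f, 1.0f, 1.0f};
    float y[3], dx[3];
    atan_forward(3, x, y);
    atan_backward(3, x, dy, dx);
    for (int i = 0; i < 3; ++i)
        std::printf("x=% .1f atan=% .6f datan=% .6f\n", x[i], y[i], dx[i]);
}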
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_CLIPIMPL_H_
#define AIDGE_CPU_OPERATOR_CLIPIMPL_H_
#include <cstddef> // std::size_t
#include <memory>
#include <tuple> // std::tuple
#include <vector>
#include <algorithm>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/Clip.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
namespace Aidge {
// Operator implementation entry point for the backend
using ClipImpl_cpu = OperatorImpl_cpu<Clip_Op,
void(float, //Forward Types
float,
const void*,
const std::size_t,
void*),
void(float,//Backward Types
float,
const std::size_t,
const void*,
const void*,
void*)>;
REGISTRAR(Clip_Op,"cpu",Aidge::ClipImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_CLIPIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_CLIPIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_CLIPIMPL_KERNELS_H_
#include "aidge/utils/Registrar.hpp"
#include "aidge/backend/cpu/operator/ClipImpl.hpp"
namespace Aidge {
template <class I, class O>
void ClipImpl_cpu_forward_kernel(
float min_,
float max_,
const void* input_,
const std::size_t length,
void* output_)
{
const I* input = static_cast<const I*>(input_);
O* output = static_cast<O*>(output_);
for (std::size_t i = 0; i < length; ++i) {
output[i] = std::min(std::max(static_cast<float>(input[i]), min_), max_);
}
}
template <class I, class GI, class GO>
void ClipImpl_cpu_backward_kernel(
float min_,
float max_,
const std::size_t length,
const void* input_,
const void* grad_output_,
void* grad_input_)
{
const I* input = static_cast<const I*>(input_);
const GO* grad_output = static_cast<const GO*>(grad_output_);
GI* grad_input = static_cast<GI*>(grad_input_);
for (std::size_t i = 0; i < length; ++i) {
grad_input[i] = ((input[i] > min_) && (input[i] < max_)) ? grad_output[i] : 0;
}
}
REGISTRAR(ClipImpl_cpu,
{DataType::Float32},
{ProdConso::inPlaceModel,
Aidge::ClipImpl_cpu_forward_kernel<float,float>,
Aidge::ClipImpl_cpu_backward_kernel<float,float,float>});
REGISTRAR(ClipImpl_cpu,
{DataType::Float64},
{ProdConso::inPlaceModel,
Aidge::ClipImpl_cpu_forward_kernel<double,double>,
Aidge::ClipImpl_cpu_backward_kernel<double,double,double>});
REGISTRAR(ClipImpl_cpu,
{DataType::Int32},
{ProdConso::inPlaceModel,
Aidge::ClipImpl_cpu_forward_kernel<std::int32_t,std::int32_t>,
Aidge::ClipImpl_cpu_backward_kernel<std::int32_t,std::int32_t,std::int32_t>});
REGISTRAR(ClipImpl_cpu,
{DataType::Int64},
{ProdConso::inPlaceModel,
Aidge::ClipImpl_cpu_forward_kernel<std::int64_t,std::int64_t>,
Aidge::ClipImpl_cpu_backward_kernel<std::int64_t,std::int64_t,std::int64_t>});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_CLIPIMPL_KERNELS_H_ */
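
For reference, the clip gradient rule implemented above in a minimal standalone form: the upstream gradient passes through strictly inside (min, max) and is zeroed elsewhere. Plain C++ sketch with illustrative names, independent of the Aidge kernel signatures:

#include <algorithm>
#include <cstddef>

// Illustrative reference for the clip kernels above; plain arrays, no Aidge API.
void clip_forward(float lo, float hi, std::size_t n, const float* x, float* y) {
    for (std::size_t i = 0; i < n; ++i)
        y[i] = std::min(std::max(x[i], lo), hi);
}

void clip_backward(float lo, float hi, std::size_t n,
                   const float* x, const float* dy, float* dx) {
    // Gradient passes through strictly inside (lo, hi), is zero elsewhere.
    for (std::size_t i = 0; i < n; ++i)
        dx[i] = (x[i] > lo && x[i] < hi) ? dy[i] : 0.0f;
}

int main() {
    const float x[4]  = {-2.0f, 0.5f, 1.0f, 3.0f};
    const float dy[4] = {1.0f, 1.0f, 1.0f, 1.0f};
    float y[4], dx[4];
    clip_forward(0.0f, 2.0f, 4, x, y);
    clip_backward(0.0f, 2.0f, 4, x, dy, dx);   // dx = {0, 1, 1, 0}
}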
......@@ -38,7 +38,7 @@ namespace Aidge {
*/
template <class I, class W, class B, class O>
void ConvDepthWiseImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 1>& strideDims,
const std::array<DimSize_t, 1>& /*dilationDims*/,
const std::array<DimSize_t, 1>& dilationDims,
const std::array<DimSize_t, 1>& kernelDims,
const std::array<DimSize_t, 3>& inputDims,
const void *input_,
......@@ -53,10 +53,12 @@ void ConvDepthWiseImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 1>& stri
// output H size
const DimSize_t dilated_kernel_x = dilationDims[0]*(kernelDims[0] - 1) + 1;
const std::size_t oxSize =
static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - kernelDims[0] + strideDims[0]) /
static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - dilated_kernel_x + strideDims[0]) /
static_cast<float>(strideDims[0])));
// TODO: kernel computation
// output (batch, outCh, Xout, Yout)
// input (batch, ch, Xin, Yin)
......@@ -71,15 +73,17 @@ void ConvDepthWiseImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 1>& stri
const std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2];
const std::size_t wIndex = ch * kernelDims[0];
for (std::size_t ox = 0; ox < oxSize; ++ox) {
const signedsize difx = static_cast<signedsize>(- ox * strideDims[0]);
const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0)));
const std::size_t sxMax = (static_cast<signedsize>(inputDims[2]) + difx) < 0 ? 0 : ((inputDims[2] + difx) > kernelDims[0] ? kernelDims[0] : inputDims[2] + difx);
// const signedsize difx = static_cast<signedsize>(- ox * strideDims[0]);
// const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0)));
// const std::size_t sxMax = (static_cast<signedsize>(inputDims[2]) + difx) < 0 ? 0 : ((inputDims[2] + difx) > kernelDims[0] ? kernelDims[0] : inputDims[2] + difx);
const std::size_t sxMin = 0;
const std::size_t sxMax = dilated_kernel_x;
const std::size_t oIndexFull = oIndex + ox;
const signedsize ix = static_cast<signedsize>(ox * strideDims[0]);
for (std::size_t sx = sxMin; sx < sxMax; ++sx) {
for (std::size_t sx = sxMin; sx*dilationDims[0] < sxMax; ++sx) {
output[oIndexFull] += weights[wIndex + sx] *
input[iIndex + static_cast<std::size_t>(ix+static_cast<signedsize>(sx))];
input[iIndex + static_cast<std::size_t>(ix+static_cast<signedsize>(sx*dilationDims[0]))];
}
}
}
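
The hunks above replace the plain kernel size with the dilated extent d*(k-1)+1 in the output-size formula and step through the input by s*d per kernel tap. A minimal standalone sketch of that arithmetic (illustrative names, single channel, no padding):

#include <cstddef>
#include <vector>

// Illustrative single-channel reference; assumes in.size() >= dilation*(k-1)+1.
std::vector<float> conv1d_dilated(const std::vector<float>& in,
                                  const std::vector<float>& w,
                                  std::size_t stride, std::size_t dilation) {
    const std::size_t k = w.size();
    const std::size_t dilated_k = dilation * (k - 1) + 1;     // effective kernel extent
    const std::size_t oSize = (in.size() - dilated_k) / stride + 1;
    std::vector<float> out(oSize, 0.0f);
    for (std::size_t ox = 0; ox < oSize; ++ox)
        for (std::size_t s = 0; s < k; ++s)
            out[ox] += w[s] * in[ox * stride + s * dilation]; // tap s reads at ox*stride + s*d
    return out;
}
// e.g. conv1d_dilated({1,2,3,4,5}, {1,1}, /*stride*/1, /*dilation*/2) == {4, 6, 8}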
......@@ -113,7 +117,7 @@ REGISTRAR(ConvDepthWiseImpl1D_cpu,
*/
template <class I, class W, class B, class O>
void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
const std::array<DimSize_t, 2>& /*dilationDims*/,
const std::array<DimSize_t, 2>& dilationDims,
const std::array<DimSize_t, 2>& kernelDims,
const std::array<DimSize_t, 4>& inputDims,
const void *input_,
......@@ -129,12 +133,15 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri
// output H size
const DimSize_t dilated_kernel_x = dilationDims[0]*(kernelDims[0] - 1) + 1;
const std::size_t oxSize =
static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - kernelDims[0] + strideDims[0]) /
static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - dilated_kernel_x + strideDims[0]) /
static_cast<float>(strideDims[0])));
// output W size
const DimSize_t dilated_kernel_y = dilationDims[1]*(kernelDims[1] - 1) + 1;
const std::size_t oySize =
static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[3] - kernelDims[1] + strideDims[1]) /
static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[3] - dilated_kernel_y + strideDims[1]) /
static_cast<float>(strideDims[1])));
// TODO: kernel computation
......@@ -143,49 +150,106 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri
// weight (outCh, ch, kernelX, kernelY)
// does not take Dilation attribute into account
using signedsize = std::make_signed<std::size_t>::type;
for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {
const std::size_t oIndex = (ch + batch*inputDims[1]) * oxSize * oySize;
B biasVal = (biases != nullptr) ? biases[ch] : B(0);
std::fill(output + oIndex, output+(oIndex+oxSize*oySize), biasVal);
const std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3];
const std::size_t wIndex = ch * kernelDims[0] * kernelDims[1];
for (std::size_t ox = 0; ox < oxSize; ++ox) {
const signedsize difx = static_cast<signedsize>(- ox * strideDims[0]);
const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0)));
const std::size_t sxMax = (static_cast<signedsize>(inputDims[2]) + difx) < 0 ? 0 : ((inputDims[2] + difx) > kernelDims[0] ? kernelDims[0] : inputDims[2] + difx);
for (std::size_t oy = 0; oy < oySize; ++oy) {
const signedsize dify = static_cast<signedsize>(- oy * strideDims[1]);
const std::size_t syMin = static_cast<std::size_t>(std::max(dify, signedsize(0)));
const std::size_t syMax = (static_cast<signedsize>(inputDims[3]) + dify) < 0 ? 0 : ((inputDims[3] + dify) > kernelDims[1] ? kernelDims[1] : inputDims[3] + dify);
const std::size_t oIndexFull = oIndex + ox*oySize + oy;
const signedsize ix = static_cast<signedsize>(ox * strideDims[0]);
const signedsize iy = static_cast<signedsize>(oy * strideDims[1]);
if (sxMin == 0 && syMin == 0 && sxMax == 3 && syMax == 3) {
output[oIndexFull] += (weights[wIndex + 0*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+0)] +
weights[wIndex + 0*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+1)] +
weights[wIndex + 0*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+2)] +
weights[wIndex + 1*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+0)] +
weights[wIndex + 1*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+1)] +
weights[wIndex + 1*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+2)] +
weights[wIndex + 2*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+0)] +
weights[wIndex + 2*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+1)] +
weights[wIndex + 2*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+2)]);
} else {
for (std::size_t sx = sxMin; sx < sxMax; ++sx) {
for (std::size_t sy = syMin; sy < syMax; ++sy) {
const std::size_t outChannels_s = oxSize * oySize;
if (dilated_kernel_x == 3 && dilated_kernel_y == 3) {
for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {
B biasVal = (biases != nullptr) ? biases[ch] : B(0);
std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3];
const std::size_t wIndex = ch * 9;
if (strideDims[0] == 1 && strideDims[1]==1) {
for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex-=inputDims[3]) {
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] = biasVal + weights[wIndex+0]*input[iIndex+oy]+weights[wIndex+1]*input[iIndex+oy+1]+weights[wIndex+2]*input[iIndex+oy+2];
}
iIndex+=inputDims[3];
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] += weights[wIndex+3]*input[iIndex+oy]+weights[wIndex+4]*input[iIndex+oy+1]+weights[wIndex+5]*input[iIndex+oy+2];
}
iIndex+=inputDims[3];
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] += weights[wIndex+6]*input[iIndex+oy]+weights[wIndex+7]*input[iIndex+oy+1]+weights[wIndex+8]*input[iIndex+oy+2];
}
}
} else {
for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex-=strideDims[0]*inputDims[3]) {
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] += weights[wIndex+0]*input[iIndex+oy]+weights[wIndex+1]*input[iIndex+oy+strideDims[0]]+weights[wIndex+2]*input[iIndex+oy+strideDims[0]*2];
}
iIndex+=strideDims[0]*inputDims[3];
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] += weights[wIndex+3]*input[iIndex+oy]+weights[wIndex+4]*input[iIndex+oy+strideDims[0]]+weights[wIndex+5]*input[iIndex+oy+strideDims[0]*2];
}
iIndex+=strideDims[0]*inputDims[3];
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] += weights[wIndex+6]*input[iIndex+oy]+weights[wIndex+7]*input[iIndex+oy+strideDims[0]]+weights[wIndex+8]*input[iIndex+oy+strideDims[0]*2];
}
}
}
output += outChannels_s;
}
}
} else if (dilated_kernel_x == 1 && dilated_kernel_y == 1) {
std::size_t index = 0;
for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {
B biasVal = (biases != nullptr) ? biases[ch] : B(0);
const std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3];
const std::size_t wIndex = ch;
if (strideDims[0] == 1 && strideDims[1] == 1) {
for (; index < iIndex + oxSize*oySize; ++index) {
output[index] = biasVal + weights[wIndex] * input[index];
}
} else {
std::size_t oIndex = (ch + batch*inputDims[1]) * oxSize * oySize;
for (std::size_t ox = 0; ox < oxSize; ++ox, oIndex+=oySize) {
index = iIndex + strideDims[0]*inputDims[3];
for (std::size_t oy = 0, iy = 0; oy < oySize; ++oy, iy+=strideDims[1]) {
output[oIndex + oy] += weights[wIndex]*input[index+iy];
}
}
}
}
}
} else {
for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {
B biasVal = (biases != nullptr) ? biases[ch] : B(0);
std::fill(output, output+outChannels_s, biasVal);
const std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3];
const std::size_t wIndex = ch * kernelDims[0] * kernelDims[1];
for (std::size_t ox = 0; ox < oxSize; ++ox) {
for (std::size_t oy = 0; oy < oySize; ++oy) {
const std::size_t oIndexFull = ox*oySize + oy;
const signedsize ix = static_cast<signedsize>(ox * strideDims[0]);
const signedsize iy = static_cast<signedsize>(oy * strideDims[1]);
for (std::size_t sx = 0; sx*dilationDims[0] < dilated_kernel_x; ++sx) {
for (std::size_t sy = 0; sy*dilationDims[1] < dilated_kernel_y; ++sy) {
output[oIndexFull] += weights[wIndex + sx*kernelDims[1] + sy] *
input[iIndex + static_cast<std::size_t>(ix+static_cast<signedsize>(sx))*inputDims[3] + static_cast<std::size_t>(iy+static_cast<signedsize>(sy))];
input[iIndex + static_cast<std::size_t>(ix+static_cast<signedsize>(sx*dilationDims[0]))*inputDims[3] + static_cast<std::size_t>(iy+static_cast<signedsize>(sy*dilationDims[1]))];
}
}
}
}
}
output += outChannels_s;
}
}
}
// Kernels registration to implementation entry point
REGISTRAR(ConvDepthWiseImpl2D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}},
......
......@@ -40,7 +40,7 @@ namespace Aidge {
*/
template <class I, class W, class B, class O>
void ConvImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 1>& strideDims,
const std::array<DimSize_t, 1>& /*dilationDims*/,
const std::array<DimSize_t, 1>& dilationDims,
const std::array<DimSize_t, 1>& kernelDims,
const std::array<DimSize_t, 3>& inputDims,
DimSize_t outChannels,
......@@ -57,8 +57,9 @@ void ConvImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 1>& strideDims,
// output H size
const std::size_t oxSize =
static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - kernelDims[0] + strideDims[0]) /
static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - dilationDims[0]*(kernelDims[0] - 1) - 1 + strideDims[0]) /
static_cast<float>(strideDims[0])));
const DimSize_t dilated_kernel_x = dilationDims[0]*(kernelDims[0] - 1) + 1;
// TODO: kernel computation
// output (batch, outCh, Xout, Yout)
......@@ -76,15 +77,17 @@ void ConvImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 1>& strideDims,
const std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2];
const std::size_t wIndex = (inCh + outCh*inputDims[1]) * kernelDims[0];
for (std::size_t ox = 0; ox < oxSize; ++ox) {
const signedsize difx = static_cast<signedsize>(- ox * strideDims[0]);
const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0)));
const std::size_t sxMax = (static_cast<signedsize>(inputDims[2]) + difx) < 0 ? 0 : ((inputDims[2] + difx) > kernelDims[0] ? kernelDims[0] : inputDims[2] + difx);
// const signedsize difx = static_cast<signedsize>(- ox * strideDims[0]);
// const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0)));
// const std::size_t sxMax = (static_cast<signedsize>(inputDims[2]) + difx) < 0 ? 0 : ((inputDims[2] + difx) > kernelDims[0] ? kernelDims[0] : inputDims[2] + difx);
const std::size_t sxMin = 0;
const std::size_t sxMax = dilated_kernel_x;
const std::size_t oIndexFull = oIndex + ox;
const signedsize ix = static_cast<signedsize>(ox * strideDims[0]);
for (std::size_t sx = sxMin; sx < sxMax; ++sx) {
for (std::size_t sx = sxMin; sx*dilationDims[0] < sxMax; ++sx) {
output[oIndexFull] += weights[wIndex + sx] *
input[iIndex + static_cast<std::size_t>(ix+static_cast<signedsize>(sx))];
input[iIndex + static_cast<std::size_t>(ix+static_cast<signedsize>(sx*dilationDims[0]))];
}
}
}
......@@ -122,7 +125,7 @@ REGISTRAR(ConvImpl1D_cpu,
*/
template <class I, class W, class B, class O>
void ConvImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
const std::array<DimSize_t, 2>& /*dilationDims*/,
const std::array<DimSize_t, 2>& dilationDims,
const std::array<DimSize_t, 2>& kernelDims,
const std::array<DimSize_t, 4> &inputDims,
DimSize_t outChannels,
......@@ -138,66 +141,124 @@ void ConvImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
O *output = static_cast<O *>(output_);
// output H size
const DimSize_t dilated_kernel_x = dilationDims[0]*(kernelDims[0] - 1) + 1;
const std::size_t oxSize =
static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - kernelDims[0] + strideDims[0]) /
static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - dilated_kernel_x + strideDims[0]) /
static_cast<float>(strideDims[0])));
// output W size
const DimSize_t dilated_kernel_y = dilationDims[1]*(kernelDims[1] - 1) + 1;
const std::size_t oySize =
static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[3] - kernelDims[1] + strideDims[1]) /
static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[3] - dilated_kernel_y + strideDims[1]) /
static_cast<float>(strideDims[1])));
// TODO: kernel computation
// output (batch, outCh, Xout, Yout)
// input (batch, inCh, Xin, Yin)
// weight (outCh, inCh, kernelX, kernelY)
// does not take Dilation attribute into account
const std::size_t outChannels_s = oxSize * oySize;
using signedsize = std::make_signed<std::size_t>::type;
for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
const std::size_t oIndex = (outCh + batch*outChannels) * oxSize * oySize;
// If bias = nullptr, set B(0)
B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
std::fill(output + oIndex, output+(oIndex+oxSize*oySize), biasVal);
for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
const std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3];
const std::size_t wIndex = (inCh + outCh*inputDims[1]) * kernelDims[0] * kernelDims[1];
for (std::size_t ox = 0; ox < oxSize; ++ox) {
const signedsize difx = static_cast<signedsize>(- ox * strideDims[0]);
const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0)));
const std::size_t sxMax = (static_cast<signedsize>(inputDims[2]) + difx) < 0 ? 0 : ((inputDims[2] + difx) > kernelDims[0] ? kernelDims[0] : inputDims[2] + difx);
for (std::size_t oy = 0; oy < oySize; ++oy) {
const signedsize dify = static_cast<signedsize>(- oy * strideDims[1]);
const std::size_t syMin = static_cast<std::size_t>(std::max(dify, signedsize(0)));
const std::size_t syMax = (static_cast<signedsize>(inputDims[3]) + dify) < 0 ? 0 : ((inputDims[3] + dify) > kernelDims[1] ? kernelDims[1] : inputDims[3] + dify);
const std::size_t oIndexFull = oIndex + ox*oySize + oy;
const signedsize ix = static_cast<signedsize>(ox * strideDims[0]);
const signedsize iy = static_cast<signedsize>(oy * strideDims[1]);
if (sxMin == 0 && syMin == 0 && sxMax == 3 && syMax == 3) {
output[oIndexFull] += (weights[wIndex + 0*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+0)] +
weights[wIndex + 0*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+1)] +
weights[wIndex + 0*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+2)] +
weights[wIndex + 1*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+0)] +
weights[wIndex + 1*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+1)] +
weights[wIndex + 1*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+2)] +
weights[wIndex + 2*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+0)] +
weights[wIndex + 2*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+1)] +
weights[wIndex + 2*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+2)]);
} else {
for (std::size_t sx = sxMin; sx < sxMax; ++sx) {
for (std::size_t sy = syMin; sy < syMax; ++sy) {
output[oIndexFull] += weights[wIndex + sx*kernelDims[1] + sy] *
input[iIndex + static_cast<std::size_t>(ix+static_cast<signedsize>(sx))*inputDims[3] + static_cast<std::size_t>(iy+static_cast<signedsize>(sy))];
if (dilated_kernel_x == 3 && dilated_kernel_y == 3) {
for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
// If bias = nullptr, set B(0)
B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
std::fill(output, output+outChannels_s, biasVal);
for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3];
const std::size_t wIndex = (inCh + outCh*inputDims[1]) * 9;
if (strideDims[0] == 1 && strideDims[1]==1) {
for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex-=inputDims[3]) {
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] += weights[wIndex+0]*input[iIndex+oy]+weights[wIndex+1]*input[iIndex+oy+1]+weights[wIndex+2]*input[iIndex+oy+2];
}
iIndex+=inputDims[3];
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] += weights[wIndex+3]*input[iIndex+oy]+weights[wIndex+4]*input[iIndex+oy+1]+weights[wIndex+5]*input[iIndex+oy+2];
}
iIndex+=inputDims[3];
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] += weights[wIndex+6]*input[iIndex+oy]+weights[wIndex+7]*input[iIndex+oy+1]+weights[wIndex+8]*input[iIndex+oy+2];
}
}
} else {
for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex-=strideDims[0]*inputDims[3]) {
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] += weights[wIndex+0]*input[iIndex+oy]+weights[wIndex+1]*input[iIndex+oy+strideDims[0]]+weights[wIndex+2]*input[iIndex+oy+strideDims[0]*2];
}
iIndex+=strideDims[0]*inputDims[3];
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] += weights[wIndex+3]*input[iIndex+oy]+weights[wIndex+4]*input[iIndex+oy+strideDims[0]]+weights[wIndex+5]*input[iIndex+oy+strideDims[0]*2];
}
iIndex+=strideDims[0]*inputDims[3];
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] += weights[wIndex+6]*input[iIndex+oy]+weights[wIndex+7]*input[iIndex+oy+strideDims[0]]+weights[wIndex+8]*input[iIndex+oy+strideDims[0]*2];
}
}
}
}
output += outChannels_s;
}
}
} else if (dilated_kernel_x == 1 && dilated_kernel_y == 1) {
for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
// If bias = nullptr, set B(0)
B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
std::fill(output, output+outChannels_s, biasVal);
for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3];
const std::size_t wIndex = (inCh + outCh*inputDims[1]);
if (strideDims[0] == 1 && strideDims[1] == 1) {
for (std::size_t oIndex = 0; oIndex < oxSize*oySize; ++oIndex, ++iIndex) {
output[oIndex] += weights[wIndex] * input[iIndex];
}
} else {
for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex+=inputDims[3]*strideDims[0]) {
for (std::size_t oy = 0, iy = 0; oy < oySize; ++oy, iy+=strideDims[1]) {
output[oIndex + oy] += weights[wIndex+0]*input[iIndex+iy];
}
}
}
}
output += outChannels_s;
}
}
} else {
for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
// If bias = nullptr, set B(0)
B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
std::fill(output, output+outChannels_s, biasVal);
for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
std::size_t iIndex_channel = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3];
const std::size_t wIndex = (inCh + outCh*inputDims[1]) * kernelDims[0] * kernelDims[1];
// loop over each output line
for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex_channel+=inputDims[3]*strideDims[0]) {
// loop over the associated input lines
for (std::size_t ky = 0, ix = 0; ky < kernelDims[0]; ++ky, ix += inputDims[3]*dilationDims[0]) {
// loop over the entire output line
for (std::size_t oy = 0, iy = 0; oy < oySize; ++oy, iy+=strideDims[1]) {
const std::size_t iIndex = iIndex_channel + ix + iy;
// loop over the elements associated with one output
for (std::size_t kx = 0; kx < kernelDims[0]; ++kx) {
output[oIndex + oy] += weights[wIndex+kernelDims[0]*ky+kx]*input[iIndex+kx*dilationDims[1]];
}
}
}
}
}
output += outChannels_s;
}
}
}
}
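
The rewritten 2D kernel dispatches to specialized 3x3 and 1x1 paths and falls back to a generic dilated loop. For reference, an unoptimized single-channel version of the generic case, as a standalone sketch with illustrative names (no Aidge types, no padding):

#include <cstddef>
#include <vector>

// Illustrative single-channel reference for the generic dilated path above.
// in: H*W row-major, w: kH*kW row-major; returns oH*oW row-major.
std::vector<float> conv2d_dilated(const std::vector<float>& in,
                                  std::size_t H, std::size_t W,
                                  const std::vector<float>& w,
                                  std::size_t kH, std::size_t kW,
                                  std::size_t sH, std::size_t sW,
                                  std::size_t dH, std::size_t dW) {
    const std::size_t dkH = dH * (kH - 1) + 1;                // dilated kernel extents
    const std::size_t dkW = dW * (kW - 1) + 1;
    const std::size_t oH = (H - dkH) / sH + 1;
    const std::size_t oW = (W - dkW) / sW + 1;
    std::vector<float> out(oH * oW, 0.0f);
    for (std::size_t oy = 0; oy < oH; ++oy)
        for (std::size_t ox = 0; ox < oW; ++ox)
            for (std::size_t ky = 0; ky < kH; ++ky)
                for (std::size_t kx = 0; kx < kW; ++kx)
                    out[oy * oW + ox] += w[ky * kW + kx] *
                        in[(oy * sH + ky * dH) * W + (ox * sW + kx * dW)];
    return out;
}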
// Kernels registration to implementation entry point
REGISTRAR(ConvImpl2D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}},
......
......@@ -96,11 +96,11 @@ void GridSampleImpl1D_cpu_forward_kernel(const GridSample_Op& op,
const std::shared_ptr<Tensor>& in1,
const std::shared_ptr<Tensor>& out)
{
const I* const input = static_cast<const I * const>(in0->getImpl()->rawPtr());
const I* const input = static_cast<const I *>(in0->getImpl()->rawPtr());
const I* input_ptr = input;
float* const grid = static_cast<float* const>(in1->getImpl()->rawPtr());
float* const grid = static_cast<float*>(in1->getImpl()->rawPtr());
float* grid_ptr = grid;
O* const output = static_cast<O* const>(out->getImpl()->rawPtr());
O* const output = static_cast<O*>(out->getImpl()->rawPtr());
O* output_ptr = output;
const std::size_t N = in0->dim(0);
......@@ -243,9 +243,9 @@ void GridSampleImpl2D_cpu_forward_kernel(const GridSample_Op& op,
{
const I* input = static_cast<const I *>(in0->getImpl()->rawPtr());
const I* input_ptr = input;
float* const grid = static_cast<float* const>(in0->getImpl()->rawPtr());
float* const grid = static_cast<float*>(in0->getImpl()->rawPtr());
float* grid_ptr = grid;
O* const output = static_cast<O* const>(out->getImpl()->rawPtr());
O* const output = static_cast<O*>(out->getImpl()->rawPtr());
const std::size_t N = in0->dim(0);
const std::size_t C = in0->dim(1);
......
......@@ -38,8 +38,10 @@ public:
return impl.prodConso(mOp);
}
virtual std::set<ImplSpec> getAvailableImplSpecs() const override {
return Registrar<OperatorImpl_cpu>::getKeys();
virtual std::vector<ImplSpec> getAvailableImplSpecs() const override {
// return Registrar<OperatorImpl_cpu>::getKeys(); // Note: cannot return set due to python binding
std::set<ImplSpec> implSpecsSet = Registrar<OperatorImpl_cpu>::getKeys();
return std::vector<ImplSpec>(implSpecsSet.begin(), implSpecsSet.end());
}
void forward() override;
......
......@@ -55,19 +55,19 @@ void PadImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 2>& beginEndBorder
O outputValue = static_cast<O>(borderValue);
if (borderType == PadBorderType::Constant) {
int ix = static_cast<int>(ox) - static_cast<int>(beginEndBorders[1]);
int ix = static_cast<int>(ox) - static_cast<int>(beginEndBorders[0]);
if (ix >= 0 && ix < static_cast<int>(dims[2])) {
outputValue = input[iIndex + static_cast<std::size_t>(ix)];
}
}
else if (borderType == PadBorderType::Edge) {
int ix = std::max(0, std::min(static_cast<int>(dims[2]) - 1, static_cast<int>(ox) - static_cast<int>(beginEndBorders[1])));
int ix = std::max(0, std::min(static_cast<int>(dims[2]) - 1, static_cast<int>(ox) - static_cast<int>(beginEndBorders[0])));
outputValue = input[iIndex + static_cast<std::size_t>(ix)];
}
else if (borderType == PadBorderType::Reflect) {
int ix = static_cast<int>(ox) - static_cast<int>(beginEndBorders[1]);
int ix = static_cast<int>(ox) - static_cast<int>(beginEndBorders[0]);
if (ix < 0)
ix = 0 - ix;
......@@ -77,7 +77,7 @@ void PadImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 2>& beginEndBorder
outputValue = input[iIndex + static_cast<std::size_t>(ix)];
}
else if (borderType == PadBorderType::Wrap) {
int ix = (static_cast<int>(dims[2]) + static_cast<int>(ox) - static_cast<int>(beginEndBorders[1])) % static_cast<int>(dims[2]);
int ix = (static_cast<int>(dims[2]) + static_cast<int>(ox) - static_cast<int>(beginEndBorders[0])) % static_cast<int>(dims[2]);
outputValue = input[iIndex + static_cast<std::size_t>(ix)];
}
......@@ -120,8 +120,8 @@ void PadImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 4>& beginEndBorder
const I *input = static_cast<const I *>(input_);
O *output = static_cast<O *>(output_);
const std::size_t oySize = dims[2] + beginEndBorders[0] + beginEndBorders[1];
const std::size_t oxSize = dims[3] + beginEndBorders[2] + beginEndBorders[3];
const std::size_t oySize = dims[2] + beginEndBorders[0] + beginEndBorders[2];
const std::size_t oxSize = dims[3] + beginEndBorders[1] + beginEndBorders[3];
for (std::size_t batch = 0; batch < dims[0]; ++batch) {
for (std::size_t ch = 0; ch < dims[1]; ++ch) {
......@@ -135,22 +135,22 @@ void PadImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 4>& beginEndBorder
O outputValue = static_cast<O>(borderValue);
if (borderType == PadBorderType::Constant) {
std::int32_t ix = static_cast<std::int32_t>(ox) - static_cast<std::int32_t>(beginEndBorders[3]);
std::int32_t iy = static_cast<std::int32_t>(oy) - static_cast<std::int32_t>(beginEndBorders[1]);
std::int32_t ix = static_cast<std::int32_t>(ox) - static_cast<std::int32_t>(beginEndBorders[1]);
std::int32_t iy = static_cast<std::int32_t>(oy) - static_cast<std::int32_t>(beginEndBorders[0]);
if (ix >= 0 && ix < static_cast<std::int32_t>(dims[3]) && iy >= 0 && iy < static_cast<std::int32_t>(dims[2])) {
outputValue = input[iIndex + static_cast<std::size_t>(iy)*dims[3] + static_cast<std::size_t>(ix)];
}
}
else if (borderType == PadBorderType::Edge) {
std::int32_t ix = std::max(0, std::min(static_cast<std::int32_t>(dims[3]) - 1, static_cast<std::int32_t>(ox) - static_cast<std::int32_t>(beginEndBorders[3])));
std::int32_t iy = std::max(0, std::min(static_cast<std::int32_t>(dims[2]) - 1, static_cast<std::int32_t>(oy) - static_cast<std::int32_t>(beginEndBorders[1])));
std::int32_t ix = std::max(0, std::min(static_cast<std::int32_t>(dims[3]) - 1, static_cast<std::int32_t>(ox) - static_cast<std::int32_t>(beginEndBorders[1])));
std::int32_t iy = std::max(0, std::min(static_cast<std::int32_t>(dims[2]) - 1, static_cast<std::int32_t>(oy) - static_cast<std::int32_t>(beginEndBorders[0])));
outputValue = input[iIndex + static_cast<std::size_t>(iy)*dims[3] + static_cast<std::size_t>(ix)];
}
else if (borderType == PadBorderType::Reflect) {
std::int32_t ix = static_cast<std::int32_t>(ox) - static_cast<std::int32_t>(beginEndBorders[3]);
std::int32_t iy = static_cast<std::int32_t>(oy) - static_cast<std::int32_t>(beginEndBorders[1]);
std::int32_t ix = static_cast<std::int32_t>(ox) - static_cast<std::int32_t>(beginEndBorders[1]);
std::int32_t iy = static_cast<std::int32_t>(oy) - static_cast<std::int32_t>(beginEndBorders[0]);
if (ix < 0)
ix = 0 - ix;
......@@ -164,8 +164,8 @@ void PadImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 4>& beginEndBorder
outputValue = input[iIndex + static_cast<std::size_t>(iy)*dims[3] + static_cast<std::size_t>(ix)];
}
else if (borderType == PadBorderType::Wrap) {
std::int32_t ix = (static_cast<std::int32_t>(dims[3]) + static_cast<std::int32_t>(ox) - static_cast<std::int32_t>(beginEndBorders[3])) % static_cast<std::int32_t>(dims[3]);
std::int32_t iy = (static_cast<std::int32_t>(dims[2]) + static_cast<std::int32_t>(oy) - static_cast<std::int32_t>(beginEndBorders[1])) % static_cast<std::int32_t>(dims[2]);
std::int32_t ix = (static_cast<std::int32_t>(dims[3]) + static_cast<std::int32_t>(ox) - static_cast<std::int32_t>(beginEndBorders[1])) % static_cast<std::int32_t>(dims[3]);
std::int32_t iy = (static_cast<std::int32_t>(dims[2]) + static_cast<std::int32_t>(oy) - static_cast<std::int32_t>(beginEndBorders[0])) % static_cast<std::int32_t>(dims[2]);
outputValue = input[iIndex + static_cast<std::size_t>(iy)*dims[3] + static_cast<std::size_t>(ix)];
}
......
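
The hunk above swaps which beginEndBorders entries are treated as the begin borders; as read from the fixed code, the ordering is {beginH, beginW, endH, endW} (ONNX-style begins-then-ends). A minimal standalone sketch of constant 2D padding with that ordering, using illustrative names:

#include <cstddef>
#include <cstdint>
#include <vector>

// Illustrative reference; b = {beginH, beginW, endH, endW}, ONNX-style.
std::vector<float> pad2d_constant(const std::vector<float>& in,
                                  std::size_t H, std::size_t W,
                                  const std::size_t b[4], float value) {
    const std::size_t oH = H + b[0] + b[2];
    const std::size_t oW = W + b[1] + b[3];
    std::vector<float> out(oH * oW, value);                   // pre-fill with border value
    for (std::size_t oy = 0; oy < oH; ++oy) {
        for (std::size_t ox = 0; ox < oW; ++ox) {
            // Map back to input coordinates; copy only in-range pixels.
            const std::int64_t iy = static_cast<std::int64_t>(oy) - static_cast<std::int64_t>(b[0]);
            const std::int64_t ix = static_cast<std::int64_t>(ox) - static_cast<std::int64_t>(b[1]);
            if (iy >= 0 && iy < static_cast<std::int64_t>(H) &&
                ix >= 0 && ix < static_cast<std::int64_t>(W))
                out[oy * oW + ox] =
                    in[static_cast<std::size_t>(iy) * W + static_cast<std::size_t>(ix)];
        }
    }
    return out;
}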
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_ROUNDIMPL_H_
#define AIDGE_CPU_OPERATOR_ROUNDIMPL_H_
#include <cstddef> // std::size_t
#include <memory>
#include <tuple>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/Round.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
// Operator implementation entry point for the backend
using RoundImpl_cpu = OperatorImpl_cpu<Round_Op,
void(const std::size_t, const void*, void*)>;
// Implementation entry point registration to Operator
REGISTRAR(Round_Op, "cpu", Aidge::RoundImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_ROUNDIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_ROUNDIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_ROUNDIMPL_KERNELS_H_
#include <cmath> // std::nearbyint
#include <cstddef> // std::size_t
#include "aidge/utils/Registrar.hpp"
#include "aidge/backend/cpu/operator/RoundImpl.hpp"
namespace Aidge {
template <class I, class O>
void RoundImpl_cpu_forward_kernel(const std::size_t inputLength,
const void* input_,
void* output_) {
const I* input = static_cast<const I*>(input_);
O* output = static_cast<O*>(output_);
for (std::size_t i = 0; i < inputLength; ++i) {
// std::round() rounds halves away from zero, whereas ONNX Round requires
// halves-to-even; std::nearbyint() follows the current rounding mode
// (round-to-nearest-even by default)
output[i] = static_cast<O>(std::nearbyint(static_cast<float>(input[i])));
}
}
REGISTRAR(RoundImpl_cpu,
{DataType::Float32},
{ProdConso::inPlaceModel, Aidge::RoundImpl_cpu_forward_kernel<float, float>,nullptr});
REGISTRAR(RoundImpl_cpu,
{DataType::Float64},
{ProdConso::inPlaceModel, Aidge::RoundImpl_cpu_forward_kernel<double, double>,nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_ROUNDIMPL_KERNELS_H_ */
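
A quick standalone check of the halves behaviour the comment above refers to: std::round rounds halves away from zero, while std::nearbyint under the default round-to-nearest-even mode matches ONNX Round:

#include <cmath>
#include <cstdio>

int main() {
    const float xs[] = {0.5f, 1.5f, 2.5f, -0.5f, -1.5f, -2.5f};
    for (float x : xs)
        std::printf("x=% .1f round=% .1f nearbyint=% .1f\n",
                    x, std::round(x), std::nearbyint(x));
    // round:     1  2  3  -1  -2  -3   (halves away from zero)
    // nearbyint: 0  2  2  -0  -2  -2   (halves to even, default FE_TONEAREST)
}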
......@@ -89,13 +89,13 @@ void SliceImpl_cpu_forward_kernel(const std::vector<std::int64_t>& starts,
}
REGISTRAR(SliceImpl_cpu,
{DataType::Float32},
{{DataType::Float32, DataType::Any}, {DataType::Float32}},
{ProdConso::inPlaceModel, Aidge::SliceImpl_cpu_forward_kernel<float, float>, nullptr});
REGISTRAR(SliceImpl_cpu,
{DataType::Float64},
{{DataType::Float64, DataType::Any}, {DataType::Float64}},
{ProdConso::inPlaceModel, Aidge::SliceImpl_cpu_forward_kernel<double, double>, nullptr});
REGISTRAR(SliceImpl_cpu,
{DataType::Int32},
{{DataType::Int32, DataType::Any}, {DataType::Int32}},
{ProdConso::inPlaceModel, Aidge::SliceImpl_cpu_forward_kernel<int32_t, int32_t>, nullptr});
} // namespace Aidge
......
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <cassert>
#include <chrono> // std::chrono::milliseconds
#include <numeric> // std::accumulate
#include <thread> // std::this_thread::sleep_for
#include <vector>
#include "aidge/operator/Atan.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include "aidge/backend/cpu/operator/AtanImpl.hpp"
#include "aidge/backend/cpu/operator/AtanImpl_kernels.hpp"
template <>
void Aidge::AtanImpl_cpu::forward() {
const Atan_Op& op_ = dynamic_cast<const Atan_Op&>(mOp);
std::shared_ptr<Tensor> in0 = op_.getInput(0);
std::shared_ptr<Tensor> out0 = op_.getOutput(0);
AIDGE_ASSERT(in0, "missing input #0");
// Find the correct kernel type
const auto impl = Registrar<AtanImpl_cpu>::create(getBestMatch(getRequiredSpec()));
// Call kernel
impl.forward(in0->size(),
getCPUPtr(mOp.getRawInput(0)),
getCPUPtr(mOp.getRawOutput(0)));
}
template <>
void Aidge::AtanImpl_cpu::backward() {
const Atan_Op& op_ = dynamic_cast<const Atan_Op&>(mOp);
std::shared_ptr<Tensor> out0 = op_.getOutput(0);
std::shared_ptr<Tensor> gra_int0 = op_.getInput(0)->grad();
std::shared_ptr<Tensor> gra_out0 = op_.getOutput(0)->grad();
AIDGE_ASSERT(out0, "missing output #0 for current {} operator", op_.type());
// Find the correct kernel type
const auto impl = Registrar<AtanImpl_cpu>::create(getBestMatch(getRequiredSpec()));
// Call kernel
impl.backward(gra_int0->size(), getCPUPtr(out0), getCPUPtr(gra_out0), getCPUPtr(gra_int0));
}
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <memory>
#include <vector>
#include "aidge/data/Tensor.hpp"
#include "aidge/operator/Clip.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include "aidge/utils/ErrorHandling.hpp"
#include "aidge/backend/cpu/operator/ClipImpl.hpp"
#include "aidge/backend/cpu/operator/ClipImpl_kernels.hpp"
template<>
void Aidge::ClipImpl_cpu::forward() {
const Clip_Op& op_ = dynamic_cast<const Clip_Op&>(mOp);
std::shared_ptr<Tensor> in0 = op_.getInput(0);
std::shared_ptr<Tensor> out0 = op_.getOutput(0);
AIDGE_ASSERT(in0, "missing input #0");
/*AIDGE_ASSERT(in1, "missing input #1 -> Min value empty shape Tensor");
AIDGE_ASSERT(in2, "missing input #2 -> Max value empty shape Tensor");*/
// Find the correct kernel type
const auto impl = Registrar<ClipImpl_cpu>::create(getBestMatch(getRequiredSpec()));
// Call kernel
impl.forward(
op_.min(),
op_.max(),
getCPUPtr(mOp.getRawInput(0)),
in0->size(),
getCPUPtr(mOp.getRawOutput(0))
);
}
template<>
void Aidge::ClipImpl_cpu::backward() {
const Clip_Op& op_ = dynamic_cast<const Clip_Op&>(mOp);
std::shared_ptr<Tensor> in0 = op_.getInput(0);
std::shared_ptr<Tensor> out0 = op_.getOutput(0);
std::shared_ptr<Tensor> gra_in0 = op_.getInput(0)->grad();
std::shared_ptr<Tensor> gra_out0 = op_.getOutput(0)->grad();
AIDGE_ASSERT(out0, "missing output #0 for current {} operator", op_.type());
// Find the correct kernel type
const auto impl = Registrar<ClipImpl_cpu>::create(getBestMatch(getRequiredSpec()));
// Call kernel
impl.backward(
op_.min(),
op_.max(),
gra_in0->size(),
getCPUPtr(in0),
getCPUPtr(gra_out0),
getCPUPtr(gra_in0)
);
}
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <memory>
#include <vector>
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include "aidge/data/Tensor.hpp"
#include "aidge/operator/Round.hpp"
#include "aidge/utils/ErrorHandling.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/operator/RoundImpl.hpp"
#include "aidge/backend/cpu/operator/RoundImpl_kernels.hpp"
template <>
void Aidge::RoundImpl_cpu::forward() {
std::shared_ptr<Tensor> in0 = std::static_pointer_cast<Tensor>(mOp.getRawInput(0));
std::shared_ptr<Tensor> out0 = std::static_pointer_cast<Tensor>(mOp.getRawOutput(0));
AIDGE_ASSERT(in0, "missing input #0");
// Find the correct kernel type
const auto impl = Registrar<RoundImpl_cpu>::create(getBestMatch(getRequiredSpec()));
// Call kernel
impl.forward(in0->size(),
getCPUPtr(mOp.getRawInput(0)),
getCPUPtr(mOp.getRawOutput(0)));
}
template <>
void Aidge::RoundImpl_cpu::backward() {
AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Round_Op on backend cpu");
}
\ No newline at end of file
......@@ -35,7 +35,7 @@ TEST_CASE("Test addition of Tensors","[TensorImpl][Add]") {
std::uniform_int_distribution<int> boolDist(0,1);
// Create MatMul Operator
std::shared_ptr<Node> mySub = Add(2);
std::shared_ptr<Node> mySub = Add();
auto op = std::static_pointer_cast<OperatorTensor>(mySub-> getOperator());
op->setDataType(DataType::Float32);
op->setBackend("cpu");
......@@ -154,6 +154,10 @@ TEST_CASE("Test addition of Tensors","[TensorImpl][Add]") {
Tensor T4(T1->dims());
T4.setDataType(DataType::Float64);
REQUIRE_THROWS(*T0 + T4);
delete[] array0;
delete[] array1;
delete[] result;
}
}
......
......@@ -39,17 +39,6 @@ TEST_CASE("[cpu/operator] Add(forward)", "[Add][CPU]") {
} //
}); //
SECTION("One input") {
std::shared_ptr<Node> myAdd = Add(1);
auto op = std::static_pointer_cast<OperatorTensor>(myAdd -> getOperator());
op->associateInput(0, input1);
op->setBackend("cpu");
op->setDataType(DataType::Int32);
myAdd->forward();
REQUIRE(*(op->getOutput(0)) == *input1);
}
SECTION("Two inputs") {
std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
{
......@@ -71,7 +60,7 @@ TEST_CASE("[cpu/operator] Add(forward)", "[Add][CPU]") {
}
});
std::shared_ptr<Node> myAdd = Add(2);
std::shared_ptr<Node> myAdd = Add();
auto op = std::static_pointer_cast<OperatorTensor>(myAdd -> getOperator());
op->associateInput(0, input1);
op->associateInput(1, input1);
......@@ -82,39 +71,6 @@ TEST_CASE("[cpu/operator] Add(forward)", "[Add][CPU]") {
REQUIRE(*(op->getOutput(0)) == *expectedOutput);
}
SECTION("Three inputs") {
std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
{
{
{{ 60, 141},{ 63, 144},{ 66, 147}},
{{ 69, 150},{ 72, 153},{ 75, 156}},
{{ 78, 159},{ 81, 162},{ 84, 165}}
},
{
{{ 87, 168},{ 90, 171},{ 93, 174}},
{{ 96, 177},{ 99, 180},{102, 183}},
{{105, 186},{108, 189},{111, 192}}
},
{
{{114, 195},{117, 198},{120, 201}},
{{123, 204},{126, 207},{129, 210}},
{{132, 213},{135, 216},{138, 219}}
}
}
});
std::shared_ptr<Node> myAdd = Add(3);
auto op = std::static_pointer_cast<OperatorTensor>(myAdd -> getOperator());
op->associateInput(0, input1);
op->associateInput(1, input1);
op->associateInput(2, input1);
op->setDataType(DataType::Int32);
op->setBackend("cpu");
myAdd->forward();
REQUIRE(*op->getOutput(0) == *expectedOutput);
}
SECTION("Broadcasting") {
std::shared_ptr<Tensor> input_0 = std::make_shared<Tensor>(Array4D<int,3,1,3,2> {
{ //
......@@ -139,7 +95,7 @@ TEST_CASE("[cpu/operator] Add(forward)", "[Add][CPU]") {
} //
}); //
std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<int,2> {{100,200}});
std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
{ //
{ //
......@@ -160,16 +116,23 @@ TEST_CASE("[cpu/operator] Add(forward)", "[Add][CPU]") {
} //
}); //
std::shared_ptr<Node> myAdd = Add(3);
auto op = std::static_pointer_cast<OperatorTensor>(myAdd -> getOperator());
op->associateInput(0, input_0);
op->associateInput(1, input_1);
op->associateInput(2, input_2);
op->setDataType(DataType::Int32);
op->setBackend("cpu");
myAdd->forward();
op->getOutput(0)->print();
std::shared_ptr<Node> myAdd_0 = Add();
std::shared_ptr<Node> myAdd_1 = Add();
auto op_0 = std::static_pointer_cast<OperatorTensor>(myAdd_0 -> getOperator());
auto op_1 = std::static_pointer_cast<OperatorTensor>(myAdd_1 -> getOperator());
op_0->associateInput(0, input_0);
op_0->associateInput(1, input_1);
op_1->associateInput(0, input_2);
op_1->associateInput(1, op_0->getOutput(0));
op_0->setDataType(DataType::Int32);
op_1->setDataType(DataType::Int32);
op_0->setBackend("cpu");
op_1->setBackend("cpu");
myAdd_0->forward();
myAdd_1->forward();
op_1->getOutput(0)->print();
expectedOutput->print();
REQUIRE(*op->getOutput(0) == *expectedOutput);
REQUIRE(*op_1->getOutput(0) == *expectedOutput);
}
}
\ No newline at end of file
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <catch2/catch_test_macros.hpp>
#include "aidge/data/Tensor.hpp"
#include "aidge/operator/Atan.hpp"
#include "aidge/backend/cpu.hpp"
#include <memory>
using namespace Aidge;
TEST_CASE("[cpu/operator] Atan(forward)") {
SECTION("1D Tensor") {
std::shared_ptr<Tensor> input0 =
std::make_shared<Tensor>(Array1D<float, 10>{
{0.41384590, 0.43120754, 0.93762982, 0.31049860, 0.77547199,
0.09514862, 0.16145366, 0.42776686, 0.43487436, 0.41170865}});
std::shared_ptr<Tensor> expectedOutput =
std::make_shared<Tensor>(Array1D<float, 10>{
{0.39238522, 0.40711672, 0.75322037, 0.30106049, 0.65960488,
0.09486303, 0.16007232, 0.40421187, 0.4102045, 0.39055911}});
std::shared_ptr<Node> myAtan = Atan();
auto op = std::static_pointer_cast<OperatorTensor>(myAtan->getOperator());
op->associateInput(0, input0);
op->setDataType(DataType::Float32);
op->setBackend("cpu");
myAtan->forward();
float* resPtr = static_cast<float*>(op->getOutput(0)->getImpl()->rawPtr());
float* expectedPtr =
static_cast<float*>(expectedOutput->getImpl()->rawPtr());
for (std::size_t i = 0; i < expectedOutput->size(); ++i) {
REQUIRE(std::abs(resPtr[i] - expectedPtr[i]) < 0.00001);
}
}
SECTION("3D Tensor") {
std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(
Array3D<float, 2, 2, 3>{{{
{0.97037154, 0.86208081, 0.77767169},
{0.38160080, 0.11422747, 0.77284443},
},
{{0.51592529, 0.72543722, 0.54641193},
{0.93866944, 0.97767913, 0.34172094}}}});
std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(
Array3D<float, 2, 2, 3>{{{{0.77036231, 0.71146592, 0.66097706},
{0.36454508, 0.11373451, 0.65796196}},
{{0.47630652, 0.62759472, 0.50008428},
{0.75377332, 0.77411225, 0.32928031}}}});
std::shared_ptr<Node> myAtan = Atan();
auto op = std::static_pointer_cast<OperatorTensor>(myAtan->getOperator());
op->associateInput(0, input0);
op->setDataType(DataType::Float32);
op->setBackend("cpu");
myAtan->forward();
float* resPtr = static_cast<float*>(op->getOutput(0)->getImpl()->rawPtr());
float* expectedPtr =
static_cast<float*>(expectedOutput->getImpl()->rawPtr());
for (std::size_t i = 0; i < expectedOutput->size(); ++i) {
REQUIRE(std::abs(resPtr[i] - expectedPtr[i]) < 0.00001);
}
}
}