Commit ab8dd8d8 authored by Olivier BICHLER

Merge branch 'fix_operators' into 'dev'

fix failed onnx tests

See merge request !130
parents 8a669993 f3de3e10
3 merge requests: !166 Update 0.5.0 -> 0.6.0, !136 Add selection mechanism in graph, !130 fix failed onnx tests
Pipeline #66369 waiting for manual action
Showing changed files with 816 additions and 289 deletions
@@ -29,6 +29,7 @@
 #include "aidge/backend/cpu/operator/ConvImpl.hpp"
 #include "aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp"
 #include "aidge/backend/cpu/operator/DivImpl.hpp"
+#include "aidge/backend/cpu/operator/EqualImpl.hpp"
 #include "aidge/backend/cpu/operator/ErfImpl.hpp"
 #include "aidge/backend/cpu/operator/ExpandImpl.hpp"
 #include "aidge/backend/cpu/operator/FCImpl.hpp"
...
@@ -20,7 +20,7 @@ namespace Aidge {
 namespace {
 // suppose values are contiguous in memory
 template <class I, class O>
-void equal_contiguous_arrays(const std::size_t input1size,
+void and_contiguous_arrays(const std::size_t input1size,
                            const std::size_t input2size,
                            const std::size_t output1size,
                            const I* input1,
@@ -31,14 +31,14 @@ void equal_contiguous_arrays(const std::size_t input1size,
     {
         const std::size_t in1_id = (input1size != 1) ? i : 0;
         const std::size_t in2_id = (input2size != 1) ? i : 0;
-        output[i] = static_cast<O>(input1[in1_id] == input2[in2_id]);
+        output[i] = static_cast<O>(input1[in1_id] && input2[in2_id]);
     }
 }
 }

 template <class I, class O>
-void EqualImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
+void AndImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
                                 std::vector<std::size_t> dims1,
                                 const std::vector<std::size_t>& outputDims,
                                 const void* input0_,
@@ -60,9 +60,8 @@ void EqualImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
     // special case for equal dimensions, the kernel is called with the entire arrays at once
     if (dims0 == dims1) {
         const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
-        for (std::size_t i = 0; i < input0_contiguous_size; ++i)
-        {
-            output[i] = static_cast<O>(input_0[i] == input_1[i]);
+        for (std::size_t i = 0; i < input0_contiguous_size; ++i) {
+            output[i] = static_cast<O>(input_0[i] && input_1[i]);
         }
         return;
     }
@@ -126,7 +125,7 @@ void EqualImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
     std::size_t dim = contiguousIdx - 1;
     const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>());
     for (std::size_t stack = 0; stack < nbStacks;) {
-        equal_contiguous_arrays<I,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
+        and_contiguous_arrays<I,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
                     input_0 + offsetIn0*input0_contiguous_size,
                     input_1 + offsetIn1*input1_contiguous_size,
                     output + offsetOut*output_contiguous_size);
@@ -146,17 +145,17 @@ void EqualImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
 // Kernels registration to implementation entry point
 REGISTRAR(AndImpl_cpu,
-    {DataType::Float32},
-    {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<float, float>, nullptr});
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float32}},
+    {ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<float, float>, nullptr});
 REGISTRAR(AndImpl_cpu,
-    {DataType::Float64},
-    {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<double, double>, nullptr});
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float64}},
+    {ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<double, double>, nullptr});
 REGISTRAR(AndImpl_cpu,
-    {DataType::Int32},
-    {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr});
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int32}},
+    {ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr});
 REGISTRAR(AndImpl_cpu,
-    {DataType::Int64},
-    {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, nullptr});
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int64}},
+    {ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, nullptr});
 } // namespace Aidge
...
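The rename above also changes the semantics: the And kernel previously reused the Equal comparison (==) and now computes a logical conjunction (&&). A minimal sketch of what the renamed kernel computes, assuming the header above is available as aidge/backend/cpu/operator/AndImpl_kernels.hpp in an Aidge source tree (the input values are made up for illustration):

#include "aidge/backend/cpu/operator/AndImpl_kernels.hpp"
#include <iostream>

int main() {
    const float a[4] = {1.f, 0.f, 1.f, 0.f};
    const float b[4] = {1.f, 1.f, 0.f, 0.f};
    float out[4] = {};
    // dims0 == dims1, so the kernel takes the fast elementwise path.
    Aidge::AndImpl_cpu_forward_kernel<float, float>({4}, {4}, {4}, a, b, out);
    for (float v : out) std::cout << v << ' ';  // prints: 1 0 0 0
    std::cout << '\n';
}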
@@ -28,8 +28,10 @@ namespace Aidge {
 using AvgPooling2D_Op = AvgPooling_Op<2>;
 using AvgPoolingImpl2D_cpu = OperatorImpl_cpu<AvgPooling_Op<2>,
     void(const std::array<DimSize_t, 2>&,
+         const std::array<DimSize_t, 2>&,
          const std::array<DimSize_t, 2>&,
          const std::array<DimSize_t, 4>&,
+         bool,
          const void *,
          void *)>;
...
@@ -35,66 +35,68 @@ namespace Aidge {
 template <class I, class O>
 void AvgPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
                                          const std::array<DimSize_t, 2>& kernelDims,
+                                         const std::array<DimSize_t, 2>& dilations,
                                          const std::array<DimSize_t, 4> &dims,
+                                         bool ceilMode,
                                          const void *input_,
                                          void *output_) {
-    // FIXME: missing convolution attributes as arguments
     const I *input = static_cast<const I *>(input_);
     O *output = static_cast<O *>(output_);

     // output H size
     const std::size_t oxSize =
-        static_cast<std::size_t>(std::floor(static_cast<float>(dims[2] - kernelDims[0] + strideDims[0]) /
-                    static_cast<float>(strideDims[0])));
+        ceilMode
+            ? static_cast<std::size_t>(std::ceil(static_cast<float>(dims[2] - (kernelDims[0] - 1) * dilations[0] - 1 + strideDims[0]) /
+                                                 static_cast<float>(strideDims[0])))
+            : static_cast<std::size_t>(std::floor(static_cast<float>(dims[2] - (kernelDims[0] - 1) * dilations[0] - 1 + strideDims[0]) /
+                                                  static_cast<float>(strideDims[0])));
     // output W size
     const std::size_t oySize =
-        static_cast<std::size_t>(std::floor(static_cast<float>(dims[3] - kernelDims[1] + strideDims[1]) /
-                    static_cast<float>(strideDims[1])));
-
-    // TODO: kernel computation
-    // output (batch, outCh, Xout, Yout)
-    // input (batch, ch, Xin, Yin)
-    // weight (outCh, ch, kernelX, kernelY)
-    // does not take Dilation attribute into account
+        ceilMode
+            ? static_cast<std::size_t>(std::ceil(static_cast<float>(dims[3] - (kernelDims[1] - 1) * dilations[1] - 1 + strideDims[1]) /
+                                                 static_cast<float>(strideDims[1])))
+            : static_cast<std::size_t>(std::floor(static_cast<float>(dims[3] - (kernelDims[1] - 1) * dilations[1] - 1 + strideDims[1]) /
+                                                  static_cast<float>(strideDims[1])));

     using signedsize = std::make_signed<std::size_t>::type;

     for (std::size_t batch = 0; batch < dims[0]; ++batch) {
         for (std::size_t ch = 0; ch < dims[1]; ++ch) {
-            const std::size_t oIndex = (ch + batch*dims[1]) * oxSize * oySize;
-            const std::size_t iIndex = (ch + batch*dims[1]) * dims[2] * dims[3];
-            std::fill(output + oIndex, output+(oIndex+oxSize*oySize), 0);
+            const std::size_t oIndex = (ch + batch * dims[1]) * oxSize * oySize;
+            const std::size_t iIndex = (ch + batch * dims[1]) * dims[2] * dims[3];
             for (std::size_t ox = 0; ox < oxSize; ++ox) {
-                const signedsize difx = static_cast<signedsize>(- ox * strideDims[0]);
+                const signedsize difx = static_cast<signedsize>(-ox * strideDims[0]);
                 const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0)));
                 const std::size_t sxMax = (static_cast<signedsize>(dims[2]) + difx) < 0 ? 0 : ((dims[2] + difx) > kernelDims[0] ? kernelDims[0] : dims[2] + difx);
                 for (std::size_t oy = 0; oy < oySize; ++oy) {
-                    const signedsize dify = static_cast<signedsize>(- oy * strideDims[1]);
+                    const signedsize dify = static_cast<signedsize>(-oy * strideDims[1]);
                     const std::size_t syMin = static_cast<std::size_t>(std::max(dify, signedsize(0)));
                     const std::size_t syMax = (static_cast<signedsize>(dims[3]) + dify) < 0 ? 0 : ((dims[3] + dify) > kernelDims[1] ? kernelDims[1] : dims[3] + dify);
-                    const std::size_t oIndexFull = oIndex + ox*oySize + oy;
+                    const std::size_t oIndexFull = oIndex + ox * oySize + oy;
                     const std::size_t ix = ox * strideDims[0];
                     const std::size_t iy = oy * strideDims[1];

-                    if (sxMin == 0 && syMin == 0 && sxMax == 3 && syMax == 3) {
-                        output[oIndexFull] += static_cast<O>(
-                            input[iIndex + (ix+0)*dims[3] + (iy+0)] +
-                            input[iIndex + (ix+0)*dims[3] + (iy+1)] +
-                            input[iIndex + (ix+0)*dims[3] + (iy+2)] +
-                            input[iIndex + (ix+1)*dims[3] + (iy+0)] +
-                            input[iIndex + (ix+1)*dims[3] + (iy+1)] +
-                            input[iIndex + (ix+1)*dims[3] + (iy+2)] +
-                            input[iIndex + (ix+2)*dims[3] + (iy+0)] +
-                            input[iIndex + (ix+2)*dims[3] + (iy+1)] +
-                            input[iIndex + (ix+2)*dims[3] + (iy+2)]) / O(9);
-                    } else {
-                        for (std::size_t sx = sxMin; sx < sxMax; ++sx) {
-                            for (std::size_t sy = syMin; sy < syMax; ++sy) {
-                                output[oIndexFull] += input[iIndex + (ix+sx)*dims[3] + (iy+sy)];
-                            }
-                        }
-                        // padding not used
-                        output[oIndexFull] /= (sxMax - sxMin) * (syMax - syMin);
-                    }
+                    O sum = static_cast<O>(0);
+                    std::size_t count = 0;
+
+                    for (unsigned int sy = syMin; sy < syMax; ++sy) {
+                        for (unsigned int sx = sxMin; sx < sxMax; ++sx) {
+                            // Apply dilation factor
+                            const std::size_t dilated_sx = sx * dilations[0];
+                            const std::size_t dilated_sy = sy * dilations[1];
+
+                            // Ensure within bounds
+                            if ((ix + dilated_sx) < dims[2] && (iy + dilated_sy) < dims[3]) {
+                                sum += static_cast<O>(input[iIndex + (ix + dilated_sx) * dims[3] + (iy + dilated_sy)]);
+                                ++count;
+                            }
+                        }
+                    }
+
+                    output[oIndexFull] = count > 0 ? sum / static_cast<O>(count) : 0;
                 }
             }
         }
...
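Both pooling kernels now share the same output-size rule: out = floor-or-ceil((in - (kernel - 1) * dilation - 1 + stride) / stride). A standalone sketch reproducing that rule (hypothetical helper, not part of this commit):

#include <cmath>
#include <cstddef>
#include <iostream>

// out = floor_or_ceil((in - (k - 1) * d - 1 + s) / s)
std::size_t poolOutputSize(std::size_t in, std::size_t k, std::size_t s,
                           std::size_t d, bool ceilMode) {
    const float q = static_cast<float>(in - (k - 1) * d - 1 + s) /
                    static_cast<float>(s);
    return static_cast<std::size_t>(ceilMode ? std::ceil(q) : std::floor(q));
}

int main() {
    // 5x5 input, 2x2 kernel, stride 2, dilation 1: ceil mode keeps the
    // partial right/bottom windows (3), floor drops them (2) -- the 3x3 vs
    // 2x2 outputs checked in the tests below.
    std::cout << poolOutputSize(5, 2, 2, 1, true)  << '\n';  // 3
    std::cout << poolOutputSize(5, 2, 2, 1, false) << '\n';  // 2
}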
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_EQUALIMPL_H_
#define AIDGE_CPU_OPERATOR_EQUALIMPL_H_
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/Equal.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include <memory>
#include <vector>
namespace Aidge {
// Operator implementation entry point for the backend
using EqualImpl_cpu = OperatorImpl_cpu<Equal_Op,
void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*, void*)>;
// Implementation entry point registration to Operator
REGISTRAR(Equal_Op, "cpu", Aidge::EqualImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_EQUALIMPL_H_ */
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_EQUALIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_EQUALIMPL_KERNELS_H_
#include "aidge/backend/cpu/operator/EqualImpl.hpp"
#include "aidge/utils/Registrar.hpp"
namespace Aidge {
namespace {
// suppose values are contiguous in memory
template <class I, class O>
void equal_contiguous_arrays(const std::size_t input1size,
const std::size_t input2size,
const std::size_t output1size,
const I* input1,
const I* input2,
O* output)
{
for (std::size_t i = 0; i < output1size; ++i)
{
const std::size_t in1_id = (input1size != 1) ? i : 0;
const std::size_t in2_id = (input2size != 1) ? i : 0;
output[i] = static_cast<O>(input1[in1_id] == input2[in2_id]);
}
}
}
template <class I, class O>
void EqualImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
std::vector<std::size_t> dims1,
const std::vector<std::size_t>& outputDims,
const void* input0_,
const void* input1_,
void* output_) {
const I* input_0 = static_cast<const I*>(input0_);
const I* input_1 = static_cast<const I*>(input1_);
O* output = static_cast<O*>(output_);
// [5,2,1,7] & [2,6,7]
// 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7]
// 2. Find the highest equal dimension -> 3
// Exception: if the first diverging dimension is the last one, then -> 4 (dims.size())
// 3. Compute the highest number of contiguous data -> 7
// 4. Compute stride and offset step for the broadcast mechanism
// 5. Call a simple kernel
// special case for equal dimensions, the kernel is called with the entire arrays at once
if (dims0 == dims1) {
const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
for (std::size_t i = 0; i < input0_contiguous_size; ++i)
{
output[i] = static_cast<O>(input_0[i] == input_1[i]);
}
return;
}
// set dimensions to be of equal size by filling the smallest one with ones.
if (dims0.size() > dims1.size()) {
dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1));
}
else if (dims1.size() > dims0.size()) {
dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1));
}
const std::size_t nbDims = dims0.size();
// Find the highest equal dimension
// std::size_t contiguousIdx = nbDims - 1;
std::size_t contiguousIdx = nbDims;
while (contiguousIdx-- > 0) {
// for (; contiguousIdx+1 > 0; --contiguousIdx) {
if (dims0[contiguousIdx] != dims1[contiguousIdx]) {
if (contiguousIdx == (nbDims -1)) { // last dimensions of one of the input Tensor are of size 1
const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1;
while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) {
--contiguousIdx;
}
}
break;
}
}
++contiguousIdx;
// Compute the highest number of contiguous data for each Tensor
const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
// initialize strides to iterate through data because of broadcasting
std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx);
if (contiguousIdx > 0) {
stride_post0[contiguousIdx - 1] = 1;
stride_post1[contiguousIdx - 1] = 1;
for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) {
stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]);
stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]);
}
for (std::size_t i = 0; i != contiguousIdx; ++i) {
stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1;
stride_step1[i] = (dims1[i] == 1) ? 1 - stride_post1[i] : 1;
}
}
// variables for arrays offsets
std::size_t offsetIn0 = 0;
std::size_t offsetIn1 = 0;
std::size_t offsetOut = 0;
std::size_t dim = contiguousIdx - 1;
const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>());
for (std::size_t stack = 0; stack < nbStacks;) {
equal_contiguous_arrays<I,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
input_0 + offsetIn0*input0_contiguous_size,
input_1 + offsetIn1*input1_contiguous_size,
output + offsetOut*output_contiguous_size);
if (++stack < nbStacks) {
std::size_t tmp_stack = stack;
while(tmp_stack % outputDims[dim] == 0) {
tmp_stack /= outputDims[dim];
dim--;
}
offsetIn0 += stride_step0[dim];
offsetIn1 += stride_step1[dim];
++offsetOut;
dim = contiguousIdx - 1;
}
}
}
// Kernels registration to implementation entry point
REGISTRAR(EqualImpl_cpu,
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float32}},
{ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<float, float>, nullptr});
REGISTRAR(EqualImpl_cpu,
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float64}},
{ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<double, double>, nullptr});
REGISTRAR(EqualImpl_cpu,
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int32}},
{ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr});
REGISTRAR(EqualImpl_cpu,
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int64}},
{ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_EQUALIMPL_KERNELS_H_ */
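A minimal sketch of the broadcasting path of this kernel, assuming the header builds inside an Aidge source tree (the input values are made up): a {2,3} input compared against a {3} input, so the second operand is re-read for every row, as in the [5,2,1,7] & [2,6,7] walk-through in the comments above.

#include "aidge/backend/cpu/operator/EqualImpl_kernels.hpp"
#include <iostream>

int main() {
    const float in0[6] = {1.f, 2.f, 3.f, 4.f, 2.f, 6.f};  // dims {2, 3}
    const float in1[3] = {1.f, 2.f, 6.f};                 // dims {3}
    float out[6] = {};
    Aidge::EqualImpl_cpu_forward_kernel<float, float>(
        {2, 3}, {3}, {2, 3}, in0, in1, out);
    for (float v : out) std::cout << v << ' ';  // prints: 1 1 0 0 1 1
    std::cout << '\n';
}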
@@ -28,6 +28,7 @@ namespace Aidge {
 using MaxPooling2D_Op = MaxPooling_Op<2>;
 using MaxPoolingImpl2D_cpu = OperatorImpl_cpu<MaxPooling_Op<2>,
     void(const std::array<DimSize_t, 2>&,
+         const std::array<DimSize_t, 2>&,
          const std::array<DimSize_t, 2>&,
          const bool,
          const std::array<DimSize_t, 4> &,
...
@@ -16,6 +16,7 @@
 #include <cmath>
 #include <tuple>

 #include "aidge/backend/cpu/operator/MaxPoolingImpl.hpp"
 #include "aidge/backend/cpu/data/GetCPUPtr.h"
 #include "aidge/data/Data.hpp"
@@ -35,28 +36,29 @@ namespace Aidge {
 template <class I, class O>
 void MaxPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
                                          const std::array<DimSize_t, 2>& kernelDims,
-                                         const bool /*ceilMode*/,
+                                         const std::array<DimSize_t, 2>& dilations,
+                                         const bool ceilMode,
                                          const std::array<DimSize_t, 4> &dims,
                                          const void *input_,
                                          void *output_) {
-    // FIXME: missing convolution parameters as arguments
     const I *input = static_cast<const I *>(input_);
     O *output = static_cast<O *>(output_);

     // output H size
     const std::size_t oxSize =
-        static_cast<std::size_t>(std::floor(static_cast<float>(dims[2] - kernelDims[0] + strideDims[0]) /
-                    static_cast<float>(strideDims[0])));
+        ceilMode
+            ? static_cast<std::size_t>(std::ceil(static_cast<float>(dims[2] - (kernelDims[0] - 1) * dilations[0] - 1 + strideDims[0]) /
+                                                 static_cast<float>(strideDims[0])))
+            : static_cast<std::size_t>(std::floor(static_cast<float>(dims[2] - (kernelDims[0] - 1) * dilations[0] - 1 + strideDims[0]) /
+                                                  static_cast<float>(strideDims[0])));
     // output W size
     const std::size_t oySize =
-        static_cast<std::size_t>(std::floor(static_cast<float>(dims[3] - kernelDims[1] + strideDims[1]) /
-                    static_cast<float>(strideDims[1])));
-
-    // TODO: kernel computation
-    // output (batch, outCh, Xout, Yout)
-    // input (batch, ch, Xin, Yin)
-    // weight (outCh, ch, kernelX, kernelY)
-    // does not take Dilation parameter into account
+        ceilMode
+            ? static_cast<std::size_t>(std::ceil(static_cast<float>(dims[3] - (kernelDims[1] - 1) * dilations[1] - 1 + strideDims[1]) /
+                                                 static_cast<float>(strideDims[1])))
+            : static_cast<std::size_t>(std::floor(static_cast<float>(dims[3] - (kernelDims[1] - 1) * dilations[1] - 1 + strideDims[1]) /
+                                                  static_cast<float>(strideDims[1])));

     using signedsize = std::make_signed<std::size_t>::type;

     for (std::size_t batch = 0; batch < dims[0]; ++batch) {
         for (std::size_t ch = 0; ch < dims[1]; ++ch) {
@@ -77,12 +79,15 @@ void MaxPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
                     I poolValue(0.0);
                     bool valid = false;

-                    for (unsigned int channel = 0; channel < dims[1];
-                         ++channel){
-                        for (unsigned int sy = syMin; sy < syMax; ++sy) {
-                            for (unsigned int sx = sxMin; sx < sxMax; ++sx)
-                            {
-                                const I value = input[iIndex + (ix+sx)*dims[3] + (iy+sy)];
+                    for (unsigned int sy = syMin; sy < syMax; ++sy) {
+                        for (unsigned int sx = sxMin; sx < sxMax; ++sx) {
+                            // Apply dilation factor to kernel indices
+                            const std::size_t dilated_sx = sx * dilations[0];
+                            const std::size_t dilated_sy = sy * dilations[1];
+
+                            // Ensure indices are within bounds
+                            if ((ix + dilated_sx) < dims[2] && (iy + dilated_sy) < dims[3]) {
+                                const I value = input[iIndex + (ix + dilated_sx) * dims[3] + (iy + dilated_sy)];
                                 if (!valid || value > poolValue) {
                                     poolValue = value;
@@ -98,106 +103,6 @@ void MaxPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
         }
     }
//N2D2 version
/*
template <class T>
void N2D2::PoolCell_Frame_Kernels::forwardMax(const T* alpha,
const Tensor<T>&
inputs,
const Descriptor& desc,
const T* beta,
Tensor<T>& outputs,
Tensor<ArgMax>& argMax,
bool useArgMax,
const Tensor<bool>& maps)
{
const unsigned int size = inputs.dimB() * outputs.dimZ();
#if defined(_OPENMP) && _OPENMP >= 200805
#pragma omp parallel for collapse(2) if (size > 16)
#else
#pragma omp parallel for if (inputs.dimB() > 4 && size > 16)
#endif
for (int batchPos = 0; batchPos < (int)inputs.dimB(); ++batchPos) {
for (unsigned int output = 0; output < outputs.dimZ(); ++output) {
for (unsigned int oy = 0; oy < outputs.dimY(); ++oy) {
for (unsigned int ox = 0; ox < outputs.dimX(); ++ox) {
const unsigned int sxMin = (unsigned int)std::max(
desc.padding[0] - (int)(ox * desc.stride[0]), 0);
const unsigned int syMin = (unsigned int)std::max(
desc.padding[1] - (int)(oy * desc.stride[1]), 0);
const unsigned int sxMax = Utils::clamp
<int>(inputs.dimX() + desc.padding[0] - ox * desc.stride[0],
0,
desc.pool[0]);
const unsigned int syMax = Utils::clamp
<int>(inputs.dimY() + desc.padding[1] - oy * desc.stride[1],
0,
desc.pool[1]);
const int ix = (int)(ox * desc.stride[0]) - desc.padding[0];
const int iy = (int)(oy * desc.stride[1]) - desc.padding[1];
T poolValue(0.0);
// For each output, compute the pool value
if (useArgMax) {
const ArgMax inputMax
= argMax(ox, oy, output, batchPos);
if (inputMax.valid) {
poolValue = inputs(inputMax.ix,
inputMax.iy,
inputMax.channel,
batchPos);
}
}
else {
unsigned int ixMax = 0;
unsigned int iyMax = 0;
unsigned int channelMax = 0;
bool valid = false;
for (unsigned int channel = 0; channel < inputs.dimZ();
++channel)
{
if (!maps.empty() && !maps(output, channel))
continue;
for (unsigned int sy = syMin; sy < syMax; ++sy) {
for (unsigned int sx = sxMin; sx < sxMax; ++sx)
{
const T value = inputs(ix + sx,
iy + sy,
channel,
batchPos);
if (!valid || value > poolValue) {
poolValue = value;
valid = true;
ixMax = ix + sx;
iyMax = iy + sy;
channelMax = channel;
}
}
}
}
argMax(ox, oy, output, batchPos)
= ArgMax(ixMax, iyMax, channelMax, valid);
}
outputs(ox, oy, output, batchPos)
= (*alpha) * poolValue
+ (*beta) * outputs(ox, oy, output, batchPos);
}
}
}
}
}
*/
 // Kernels registration to implementation entry point
 REGISTRAR(MaxPoolingImpl2D_cpu,
...
@@ -32,7 +32,9 @@ void Aidge::AvgPoolingImpl2D_cpu::forward() {
     // Call kernel
     impl.forward(op_.strideDims(),
                 op_.kernelDims(),
+                op_.dilations(),
                 op_.getInput(0)->template dims<4>(),
+                op_.ceilMode(),
                 getCPUPtr(op_.getInput(0)),
                 getCPUPtr(op_.getOutput(0)));
 }
...
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <cassert>
#include <chrono> // std::chrono::milliseconds
#include <numeric> // std::accumulate
#include <thread> // std::this_thread::sleep_for
#include <vector>
#include "aidge/operator/Equal.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/Broadcasting.hpp"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include "aidge/backend/cpu/operator/EqualImpl.hpp"
#include "aidge/backend/cpu/operator/EqualImpl_kernels.hpp"
template <>
void Aidge::EqualImpl_cpu::forward() {
const Equal_Op& op = static_cast<const Equal_Op&>(mOp);
// Check inputs
AIDGE_ASSERT(op.getInput(0), "missing input in Equal operator");
AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run Equal forward because the 0-th input has no implementation.");
AIDGE_ASSERT(op.getInput(1), "missing input in Equal operator");
AIDGE_ASSERT(op.getInput(1)->hasImpl(), "cannot run Equal forward because the 1st input has no implementation.");
AIDGE_ASSERT(op.getInput(1)->dataType() == op.getInput(0)->dataType(), "Cannot Equal inputs with two different data types.");
// Find the correct kernel type
const auto impl = Registrar<EqualImpl_cpu>::create(getBestMatch(getRequiredSpec()));
// Convert input data (no overhead if not needed!)
// TODO: right now, if needed, memory will be allocated/deallocated at each
// call to forward(). We might put the following shared_ptr as members of
// this class to avoid that.
std::shared_ptr<Tensor> input0Fallback, input1Fallback, input2Fallback;
const auto& input0 = op.getInput(0)->refCastFrom(input0Fallback, *op.getInput(0));
const auto& input1 = op.getInput(1)->refCastFrom(input1Fallback, *op.getInput(1));
impl.forward(op.getInput(0)->dims(),
op.getInput(1)->dims(),
op.getOutput(0)->dims(),
input0.getImpl()->rawPtr(),
input1.getImpl()->rawPtr(),
getCPUPtr(op.getRawOutput(0)));
}
template <>
void Aidge::EqualImpl_cpu::backward() {
AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Equal_Op on backend cpu");
}
@@ -30,6 +30,7 @@ void Aidge::MaxPoolingImpl2D_cpu::forward() {
     // Call kernel
     impl.forward(op_.strideDims(),
                 op_.kernelDims(),
+                op_.dilations(),
                 op_.ceilMode(),
                 op_.getInput(0)->template dims<4>(),
                 getCPUPtr(mOp.getRawInput(0)),
...
@@ -26,75 +26,92 @@
 using namespace Aidge;

 TEST_CASE("[cpu/operator] And(forward)", "[And][CPU]") {
-    SECTION("ForwardDims")
-    {
+    SECTION("ForwardDims") {
         constexpr std::uint16_t NBTRIALS = 10;
         // Create a random number generator
         std::random_device rd;
         std::mt19937 gen(rd());
-        std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // Random float distribution between 0 and 1
-        std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10));
-        std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5));
-        std::uniform_int_distribution<int> boolDist(0,1);
+        std::uniform_int_distribution<int> boolDist(0, 1); // Use 0 for false, 1 for true
+        std::uniform_int_distribution<std::size_t> dimSizeDist(2, 10);
+        std::uniform_int_distribution<std::size_t> nbDimsDist(1, 5);

         SECTION("Same dimensions") {
             for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
                 DimSize_t nbDims = nbDimsDist(gen);
                 std::vector<DimSize_t> dims(nbDims);
-                for (std::size_t i = 0; i < nbDims; i++) {
+                for (std::size_t i = 0; i < nbDims; ++i) {
                     dims[i] = dimSizeDist(gen);
                 }
+                const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+                float* array0 = new float[nb_elements];
+                float* array1 = new float[nb_elements];
+                for (std::size_t i = 0; i < nb_elements; ++i) {
+                    array0[i] = boolDist(gen);
+                    array1[i] = boolDist(gen);
+                }

                 std::shared_ptr<Tensor> myInput1 = std::make_shared<Tensor>(dims);
-                myInput1->setBackend("cpu");
-                myInput1->setDataType(DataType::Float32);
-                myInput1->zeros();
                 std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(dims);
-                myInput2->setBackend("cpu");
-                myInput2->setDataType(DataType::Float32);
-                myInput2->zeros();
+                myInput1->setDataType(DataType::Float32);
+                myInput2->setDataType(DataType::Float32);
+                myInput1->setBackend("cpu");
+                myInput2->setBackend("cpu");
+                myInput1->getImpl()->setRawPtr(array0, nb_elements);
+                myInput2->getImpl()->setRawPtr(array1, nb_elements);

                 std::shared_ptr<Node> myAnd = And();
-                auto op = std::static_pointer_cast<OperatorTensor>(myAnd -> getOperator());
-                op->associateInput(0,myInput1);
-                op->associateInput(1,myInput2);
+                auto op = std::static_pointer_cast<OperatorTensor>(myAnd->getOperator());
+                op->associateInput(0, myInput1);
+                op->associateInput(1, myInput2);
                 op->setDataType(DataType::Float32);
                 op->setBackend("cpu");
                 op->forwardDims();

                 const auto outputDims = op->getOutput(0)->dims();
                 REQUIRE(outputDims == dims);
+
+                delete[] array0;
+                delete[] array1;
             }
         }

         SECTION("Broadcasting") {
             for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
                 DimSize_t nbDims = nbDimsDist(gen);
                 std::vector<DimSize_t> dims1(nbDims, 1);
                 std::vector<DimSize_t> dims2(nbDims, 1);
                 std::vector<DimSize_t> expectedOutDims;
-                for (std::size_t i = 0; i < nbDims; i++) {
+                for (std::size_t i = 0; i < nbDims; ++i) {
                     DimSize_t dim = dimSizeDist(gen);
-                    if (boolDist(gen)) {
-                        dims1[i] = dim;
-                    }
-                    if (boolDist(gen)) {
-                        dims2[i] = dim;
-                    }
-                    expectedOutDims.push_back(std::max(dims1[i],dims2[i]));
+                    if (boolDist(gen)) dims1[i] = dim;
+                    if (boolDist(gen)) dims2[i] = dim;
+                    expectedOutDims.push_back(std::max(dims1[i], dims2[i]));
                 }
+                const std::size_t nb_elements0 = std::accumulate(dims1.cbegin(), dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
+                const std::size_t nb_elements1 = std::accumulate(dims2.cbegin(), dims2.cend(), std::size_t(1), std::multiplies<std::size_t>());
+                float* array0 = new float[nb_elements0];
+                float* array1 = new float[nb_elements1];
+                for (std::size_t i = 0; i < nb_elements0; ++i) {
+                    array0[i] = boolDist(gen);
+                }
+                for (std::size_t i = 0; i < nb_elements1; ++i) {
+                    array1[i] = boolDist(gen);
+                }

                 std::shared_ptr<Tensor> myInput1 = std::make_shared<Tensor>(dims1);
-                myInput1->setBackend("cpu");
-                myInput1->setDataType(DataType::Float32);
-                myInput1->zeros();
                 std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(dims2);
-                myInput2->setBackend("cpu");
-                myInput2->setDataType(DataType::Float32);
-                myInput2->zeros();
+                myInput1->setDataType(DataType::Float32);
+                myInput2->setDataType(DataType::Float32);
+                myInput1->setBackend("cpu");
+                myInput2->setBackend("cpu");
+                myInput1->getImpl()->setRawPtr(array0, nb_elements0);
+                myInput2->getImpl()->setRawPtr(array1, nb_elements1);

                 std::shared_ptr<Node> myAnd = And();
-                auto op = std::static_pointer_cast<OperatorTensor>(myAnd -> getOperator());
-                op->associateInput(0,myInput1);
-                op->associateInput(1,myInput2);
+                auto op = std::static_pointer_cast<OperatorTensor>(myAnd->getOperator());
+                op->associateInput(0, myInput1);
+                op->associateInput(1, myInput2);
                 op->setDataType(DataType::Float32);
                 op->setBackend("cpu");
@@ -102,110 +119,67 @@ TEST_CASE("[cpu/operator] And(forward)", "[And][CPU]") {
                 const auto outputDims = op->getOutput(0)->dims();
                 REQUIRE(outputDims == expectedOutDims);
+
+                delete[] array0;
+                delete[] array1;
             }
         }
     }

     SECTION("Same size inputs") {
-        std::shared_ptr<Tensor> input1 = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
-            {
-                {
-                    {{20, 15},{31, 11},{22, 49}},
-                    {{41, 10},{24, 51},{27, 52}},
-                    {{26, 53},{27, 54},{28, 55}}
-                },
-                {
-                    {{29, 56},{30, 57},{31, 58}},
-                    {{32, 59},{33, 60},{34, 61}},
-                    {{35, 62},{36, 63},{37, 64}}
-                },
-                {
-                    {{38, 65},{39, 66},{40, 67}},
-                    {{41, 68},{42, 69},{43, 70}},
-                    {{44, 71},{45, 72},{46, 73}}
-                }
-            }
-        });
-        std::shared_ptr<Tensor> input2 = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
-            {
-                {
-                    {{20, 47},{21, 48},{22, 49}},
-                    {{23, 50},{24, 51},{25, 52}},
-                    {{17, 53},{27, 26},{14, 33}}
-                },
-                {
-                    {{29, 56},{30, 57},{31, 58}},
-                    {{72, 44},{33, 20},{27, 55}},
-                    {{35, 24},{25, 63},{28, 64}}
-                },
-                {
-                    {{32, 65},{39, 66},{40, 70}},
-                    {{41, 53},{42, 60},{34, 70}},
-                    {{44, 71},{30, 12},{46, 73}}
-                }
-            }
-        });
-        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
-            {
-                {
-                    {{1, 0},{0, 0},{1, 1}},
-                    {{0, 0},{1, 1},{0, 1}},
-                    {{0, 1},{1, 0},{0, 0}}
-                },
-                {
-                    {{1, 1},{1, 1},{1, 1}},
-                    {{0, 0},{1, 0},{0, 0}},
-                    {{1, 0},{0, 1},{0, 1}}
-                },
-                {
-                    {{0, 1},{1, 1},{1, 0}},
-                    {{1, 0},{1, 0},{0, 1}},
-                    {{1, 1},{0, 0},{1, 1}}
-                }
-            }
-        });
+        std::shared_ptr<Tensor> input1 = std::make_shared<Tensor>(Array4D<float, 2, 2, 2, 2>{
+            {
+                {{{1, 0}, {0, 1}},
+                 {{1, 1}, {0, 0}}},
+                {{{0, 1}, {1, 0}},
+                 {{1, 0}, {0, 1}}}}
+        });
+        std::shared_ptr<Tensor> input2 = std::make_shared<Tensor>(Array4D<float, 2, 2, 2, 2>{
+            {
+                {{{1, 1}, {0, 0}},
+                 {{0, 1}, {1, 1}}},
+                {{{1, 1}, {0, 0}},
+                 {{0, 1}, {1, 0}}}}
+        });
+        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<float, 2, 2, 2, 2>{
+            {
+                {{{1, 0}, {0, 0}},
+                 {{0, 1}, {0, 0}}},
+                {{{0, 1}, {0, 0}},
+                 {{0, 0}, {0, 0}}}}
+        });

         std::shared_ptr<Node> myAnd = And();
-        auto op = std::static_pointer_cast<OperatorTensor>(myAnd -> getOperator());
+        auto op = std::static_pointer_cast<OperatorTensor>(myAnd->getOperator());
         op->associateInput(0, input1);
         op->associateInput(1, input2);
         op->setBackend("cpu");
-        op->setDataType(DataType::Int32);
+        op->setDataType(DataType::Float32);
         myAnd->forward();
-        op->getOutput(0)->print();
         REQUIRE(*(op->getOutput(0)) == *expectedOutput);
     }

     SECTION("Broadcasting") {
-        std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array4D<int,1,3,3,2> {
-            {
-                {
-                    {{10, 20},{22, 23},{20, 20}},
-                    {{10, 15},{10, 29},{20, 20}},
-                    {{26, 25},{33, 20},{10, 20}}
-                }
-            }
-        });
-        std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<int,2> {{10, 20}});
-        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,1,3,3,2> {
-            {
-                {
-                    {{ 1, 1},{ 0, 0},{ 0, 1}},
-                    {{ 1, 0},{ 1, 0},{ 0, 1}},
-                    {{ 0, 0},{ 0, 1},{ 1, 1}}
-                }
-            }
-        });
+        std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array4D<float, 1, 2, 2, 2>{
+            {
+                {{{1, 0}, {1, 0}},
+                 {{1, 1}, {0, 0}}}}
+        });
+        std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<float, 2>{{1, 0}});
+        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<float, 1, 2, 2, 2>{
+            {
+                {{{1, 0}, {1, 0}},
+                 {{1, 0}, {0, 0}}}}
+        });

         std::shared_ptr<Node> myAnd = And();
-        auto op = std::static_pointer_cast<OperatorTensor>(myAnd -> getOperator());
+        auto op = std::static_pointer_cast<OperatorTensor>(myAnd->getOperator());
         op->associateInput(0, input_1);
         op->associateInput(1, input_2);
-        op->setDataType(DataType::Int32);
+        op->setDataType(DataType::Float32);
         op->setBackend("cpu");
         myAnd->forward();
-        op->getOutput(0)->print();
-        expectedOutput->print();
-        REQUIRE(*op->getOutput(0) == *expectedOutput);
+        REQUIRE(*(op->getOutput(0)) == *expectedOutput);
     }
 }
\ No newline at end of file
@@ -110,5 +110,95 @@ TEST_CASE("[cpu/operator] AvgPooling(forward)", "[AvgPooling][CPU]") {
            REQUIRE(std::abs(outPtr[i] - expectedOutPtr[i]) < 0.00001);
        }
    }
-   // std::cout << static_cast<Tensor>((*op)["weight"])[0][0][0][0] << std::endl;
    SECTION("Dilations") {
std::shared_ptr<Tensor> myInput3 = std::make_shared<Tensor>(Array4D<float,1,1,5,5> { // NCHW
{
{
{{ 1, 2, 3, 4, 5},
{ 6, 7, 8, 9, 10},
{11, 12, 13, 14, 15},
{16, 17, 18, 19, 20},
{21, 22, 23, 24, 25}}
}
}
});
// Dilation of 2 means we take every second element in the window
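// With a 5x5 input, 2x2 kernel, stride 1 and dilation 2, the top-left window
// samples rows/cols {0, 2}: (1 + 3 + 11 + 13) / 4 = 7, the first value of
// myOutput3 below.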
std::shared_ptr<Node> myAvgPool = AvgPooling({2,2}, "mycdw", {1,1}, {2,2});
auto op = std::static_pointer_cast<AvgPooling_Op<2>>(myAvgPool -> getOperator());
std::shared_ptr<Tensor> myOutput3 = std::make_shared<Tensor>(Array4D<float,1,1,3,3> {
{
{
{{ 7, 8, 9},
{ 12, 13, 14},
{ 17, 18, 19}}
}
}
});
op->associateInput(0, myInput3);
op->setDataType(DataType::Float32);
op->setBackend("cpu");
myAvgPool->forward();
op->getOutput(0)->print();
REQUIRE(*(op->getOutput(0)) == *myOutput3);
}
SECTION("Ceil Mode") {
std::shared_ptr<Tensor> myInput4 = std::make_shared<Tensor>(Array4D<float,1,1,5,5> { // NCHW
{
{
{
{ 1, 2, 3, 4, 5},
{ 6, 7, 8, 9, 10},
{11, 12, 13, 14, 15},
{16, 17, 18, 19, 20},
{21, 22, 23, 24, 25}
}
}
}
});
// AvgPool with ceil_mode = true
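// Partial windows kept by ceil mode are averaged over the elements actually
// inside the input, e.g. the top-right window only covers column 4:
// (5 + 10) / 2 = 7.5.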
std::shared_ptr<Node> myAvgPool1 = AvgPooling({2,2}, "mycdw", {2,2}, {1,1}, true);
auto op1 = std::static_pointer_cast<AvgPooling_Op<2>>(myAvgPool1 -> getOperator());
std::shared_ptr<Tensor> myOutput4 = std::make_shared<Tensor>(Array4D<float,1,1,3,3> {
{
{
{
{ 4.0, 6.0, 7.5 },
{ 14.0, 16.0, 17.5 },
{ 21.5, 23.5, 25.0 }
}
}
}
});
op1->associateInput(0, myInput4);
op1->setDataType(DataType::Float32);
op1->setBackend("cpu");
myAvgPool1->forward();
op1->getOutput(0)->print();
REQUIRE(*(op1->getOutput(0)) == *myOutput4);
// AvgPool with ceil_mode = false
std::shared_ptr<Node> myAvgPool2 = AvgPooling({2,2}, "mycdw", {2,2}, {1,1}, false);
auto op2 = std::static_pointer_cast<AvgPooling_Op<2>>(myAvgPool2 -> getOperator());
std::shared_ptr<Tensor> myOutput5 = std::make_shared<Tensor>(Array4D<float,1,1,2,2> {
{
{
{
{ 4.0, 6.0 },
{ 14.0, 16.0 }
}
}
}
});
op2->associateInput(0, myInput4);
op2->setDataType(DataType::Float32);
op2->setBackend("cpu");
myAvgPool2->forward();
op2->getOutput(0)->print();
REQUIRE(*(op2->getOutput(0)) == *myOutput5);
}
} }
\ No newline at end of file
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <catch2/catch_test_macros.hpp>
#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution
#include "aidge/data/Tensor.hpp"
#include "aidge/operator/Equal.hpp"
using namespace Aidge;
TEST_CASE("[cpu/operator] Equal(forwardDims)", "[Equal][CPU]") {
constexpr std::uint16_t NBTRIALS = 10;
// Create a random number generator
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // Random float distribution between 0.1 and 1.1
std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10));
std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5));
std::uniform_int_distribution<int> boolDist(0,1);
SECTION("Same dimensions") {
for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
DimSize_t nbDims = nbDimsDist(gen);
std::vector<DimSize_t> dims(nbDims);
for (std::size_t i = 0; i < nbDims; i++) {
dims[i] = dimSizeDist(gen);
}
std::shared_ptr<Tensor> myInput1 = std::make_shared<Tensor>(dims);
myInput1->setBackend("cpu");
myInput1->setDataType(DataType::Float32);
myInput1->zeros();
std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(dims);
myInput2->setBackend("cpu");
myInput2->setDataType(DataType::Float32);
myInput2->zeros();
std::shared_ptr<Node> myEqual = Equal();
auto op = std::static_pointer_cast<OperatorTensor>(myEqual -> getOperator());
op->associateInput(0,myInput1);
op->associateInput(1,myInput2);
op->setDataType(DataType::Float32);
op->setBackend("cpu");
op->forwardDims();
const auto outputDims = op->getOutput(0)->dims();
REQUIRE(outputDims == dims);
}
}
SECTION("Broadcasting") {
for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
DimSize_t nbDims = nbDimsDist(gen);
std::vector<DimSize_t> dims1(nbDims, 1);
std::vector<DimSize_t> dims2(nbDims, 1);
std::vector<DimSize_t> expectedOutDims;
for (std::size_t i = 0; i < nbDims; i++) {
DimSize_t dim = dimSizeDist(gen);
if (boolDist(gen)) {
dims1[i] = dim;
}
if (boolDist(gen)) {
dims2[i] = dim;
}
expectedOutDims.push_back(std::max(dims1[i],dims2[i]));
}
std::shared_ptr<Tensor> myInput1 = std::make_shared<Tensor>(dims1);
myInput1->setBackend("cpu");
myInput1->setDataType(DataType::Float32);
myInput1->zeros();
std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(dims2);
myInput2->setBackend("cpu");
myInput2->setDataType(DataType::Float32);
myInput2->zeros();
std::shared_ptr<Node> myEqual = Equal();
auto op = std::static_pointer_cast<OperatorTensor>(myEqual -> getOperator());
op->associateInput(0,myInput1);
op->associateInput(1,myInput2);
op->setDataType(DataType::Float32);
op->setBackend("cpu");
op->forwardDims();
const auto outputDims = op->getOutput(0)->dims();
REQUIRE(outputDims == expectedOutDims);
}
}
}
TEST_CASE("[cpu/operator] Equal(forward)", "[Equal][CPU]") {
SECTION("Same size inputs") {
std::shared_ptr<Tensor> input1 = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
{ //
{ //
{{20, 15},{31, 11},{22, 49}}, //
{{41, 10},{24, 51},{27, 52}}, //
{{26, 53},{27, 54},{28, 55}} //
}, //
{ //
{{29, 56},{30, 57},{31, 58}}, //
{{32, 59},{33, 60},{34, 61}}, //
{{35, 62},{36, 63},{37, 64}} //
}, //
{ //
{{38, 65},{39, 66},{40, 67}}, //
{{41, 68},{42, 69},{43, 70}}, //
{{44, 71},{45, 72},{46, 73}} //
} //
} //
}); //
std::shared_ptr<Tensor> input2 = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
{ //
{ //
{{20, 47},{21, 48},{22, 49}}, //
{{23, 50},{24, 51},{25, 52}}, //
{{17, 53},{27, 26},{14, 33}} //
}, //
{ //
{{29, 56},{30, 57},{31, 58}}, //
{{72, 44},{33, 20},{27, 55}}, //
{{35, 24},{25, 63},{28, 64}} //
}, //
{ //
{{32, 65},{39, 66},{40, 70}}, //
{{41, 53},{42, 60},{34, 70}}, //
{{44, 71},{30, 12},{46, 73}} //
} //
} //
}); //
Tensor expectedOutput =Tensor(Array4D<int,3,3,3,2> {
{
{
{{1, 0},{0, 0},{1, 1}},
{{0, 0},{1, 1},{0, 1}},
{{0, 1},{1, 0},{0, 0}}
},
{
{{1, 1},{1, 1},{1, 1}},
{{0, 0},{1, 0},{0, 0}},
{{1, 0},{0, 1},{0, 1}}
},
{
{{0, 1},{1, 1},{1, 0}},
{{1, 0},{1, 0},{0, 1}},
{{1, 1},{0, 0},{1, 1}}
}
}
});
std::shared_ptr<Node> myEqual = Equal();
auto op = std::static_pointer_cast<OperatorTensor>(myEqual -> getOperator());
op->associateInput(0, input1);
op->associateInput(1, input2);
op->setBackend("cpu");
op->setDataType(DataType::Int32);
myEqual->forward();
REQUIRE(*(op->getOutput(0)) == expectedOutput);
}
SECTION("Broadcasting") {
std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array4D<int,1,3,3,2> {
{ //
{ //
{{10, 20},{22, 23},{20, 20}}, //
{{10, 15},{10, 29},{20, 20}}, //
{{26, 25},{33, 20},{10, 20}} //
} //
} //
}); //
std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<int,2> {{10, 20}});
Tensor expectedOutput = Tensor(Array4D<int,1,3,3,2> {
{ //
{ //
{{ 1, 1},{ 0, 0},{ 0, 1}}, //
{{ 1, 0},{ 1, 0},{ 0, 1}}, //
{{ 0, 0},{ 0, 1},{ 1, 1}} //
} //
} //
}); //
std::shared_ptr<Node> myEqual = Equal();
auto op = std::static_pointer_cast<OperatorTensor>(myEqual -> getOperator());
op->associateInput(0, input_1);
op->associateInput(1, input_2);
op->setDataType(DataType::Int32);
op->setBackend("cpu");
myEqual->forward();
op->getOutput(0)->print();
REQUIRE(*op->getOutput(0) == expectedOutput);
}
}
\ No newline at end of file
@@ -80,4 +80,96 @@ TEST_CASE("[cpu/operator] MaxPooling(forward)", "[MaxPooling][CPU]") {
        op->getOutput(0)->print();
        REQUIRE(*(op->getOutput(0)) == myOutput);
    }
SECTION("Dilation") {
std::shared_ptr<Node> myMaxPool = MaxPooling({2,2}, "mycdw", {2,2}, {2,2}); // Dilation 2x2
auto op = std::static_pointer_cast<OperatorTensor>(myMaxPool -> getOperator());
std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,2,2,2> {
{
{
{
{0.71470, 0.52770},
{0.71470, 0.48740}
},
{
{2.23290, 0.48590},
{2.23290, 0.07000}
}
},
{
{
{1.76530, 1.20710},
{1.76530, 1.20710}
},
{
{1.04290, 0.67760},
{1.72170, 0.67760}
}
}
}
});
myMaxPool->getOperator()->associateInput(0,myInput);
myMaxPool->getOperator()->setDataType(DataType::Float32);
myMaxPool->getOperator()->setBackend("cpu");
myMaxPool->forward();
op->getOutput(0)->print();
REQUIRE(*(op->getOutput(0)) == *myOutput);
}
SECTION("Ceil Mode") {
std::shared_ptr<Tensor> myInput4 = std::make_shared<Tensor>(Array4D<float,1,1,5,5> { // NCHW
{
{
{
{ 1, 2, 3, 4, 5},
{ 6, 7, 8, 9, 10},
{11, 12, 13, 14, 15},
{16, 17, 18, 19, 20},
{21, 22, 23, 24, 25}
}
}
}
});
// MaxPool with ceil_mode = true
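// With a 5x5 input, 2x2 kernel and stride 2, ceil mode keeps the partial
// right/bottom windows, giving a 3x3 output; e.g. the top-right window only
// covers column 4: max(5, 10) = 10.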
std::shared_ptr<Node> myMaxPool1 = MaxPooling({2,2}, "mycdw", {2,2}, {1,1}, true);
auto op1 = std::static_pointer_cast<OperatorTensor>(myMaxPool1 -> getOperator());
std::shared_ptr<Tensor> myOutput4 = std::make_shared<Tensor>(Array4D<float,1,1,3,3> {
{
{
{
{ 7.0, 9.0, 10.0 },
{ 17.0, 19.0, 20.0 },
{ 22.0, 24.0, 25.0 }
}
}
}
});
op1->associateInput(0, myInput4);
op1->setDataType(DataType::Float32);
op1->setBackend("cpu");
myMaxPool1->forward();
op1->getOutput(0)->print();
REQUIRE(*(op1->getOutput(0)) == *myOutput4);
// MaxPool with ceil_mode = false
std::shared_ptr<Node> myMaxPool2 = MaxPooling({2,2}, "mycdw", {2,2}, {1,1}, false);
auto op2 = std::static_pointer_cast<OperatorTensor>(myMaxPool2 -> getOperator());
std::shared_ptr<Tensor> myOutput5 = std::make_shared<Tensor>(Array4D<float,1,1,2,2> {
{
{
{
{ 7.0, 9.0 },
{ 17.0, 19.0 }
}
}
}
});
op2->associateInput(0, myInput4);
op2->setDataType(DataType::Float32);
op2->setBackend("cpu");
myMaxPool2->forward();
op2->getOutput(0)->print();
REQUIRE(*(op2->getOutput(0)) == *myOutput5);
}
} }
\ No newline at end of file