Commit 740a27ba authored by Maxence Naud

Merge branch 'dev' into 'main'

v0.4.0

See merge request !118
parents a2477ff1 ed718fc0
Tags v0.4.0
Pipeline #61247 passed
Showing with 924 additions and 166 deletions
@@ -12,19 +12,14 @@ stages:
  - deploy

include:
  - project: 'eclipse/aidge/gitlab_shared_files'
    ref: 'main'
    file: # choose which jobs to run by including the corresponding files.
      - '.gitlab/ci/ubuntu_cpp.gitlab-ci.yml'
      - '.gitlab/ci/ubuntu_python.gitlab-ci.yml'
      - '.gitlab/ci/release/cibuildwheel_ubuntu.gitlab-ci.yml'
      - '.gitlab/ci/windows_cpp.gitlab-ci.yml'
      - '.gitlab/ci/windows_python.gitlab-ci.yml'
      - '.gitlab/ci/release/cibuildwheel_windows.gitlab-ci.yml'
# Version 0.4.0 (December 6, 2024)

# Version 0.2.2 (May 14, 2024)
* Remove implementation for Operators solely handling memory and format
......
@@ -22,6 +22,9 @@ execute_process(
message(STATUS "Latest git commit: ${GIT_COMMIT_HASH}")
add_definitions(-DGIT_COMMIT_HASH="${GIT_COMMIT_HASH}")

# helper for LSP users
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

# Note : project name is ${CMAKE_PROJECT_NAME} and python module name is also ${CMAKE_PROJECT_NAME}
set(module_name _${CMAKE_PROJECT_NAME}) # target name
set(pybind_module_name ${CMAKE_PROJECT_NAME}) # name of submodule for python bindings
......
@@ -13,9 +13,10 @@ class test_scheduler(unittest.TestCase):
        pass

    def test_relu_forward(self):
        t = aidge_core.Tensor(np.arange(6, dtype=np.int32) - 3)

        input_node = aidge_core.Producer(t)
        relu = aidge_core.ReLU()
        input_node.add_child(relu)
@@ -34,7 +35,7 @@ class test_scheduler(unittest.TestCase):
        out_tensor = relu.get_operator().get_output(0)
        expected_out = [0,0,0,0,1,2]
        for i in range(len(expected_out)):
            self.assertEqual(expected_out[i], out_tensor[i], f"On idx {i}")

    def test_sequential_scheduling(self):
        input_data = np.array([0]).astype(np.float32)
@@ -69,7 +70,7 @@ class test_scheduler(unittest.TestCase):
            aidge_core.Producer(input_tensor, "X"),
            aidge_core.FC(1, 50, name='0'),
            aidge_core.parallel([aidge_core.FC(50, 50, name='1'), aidge_core.FC(50, 50, name='3')]),
            aidge_core.Add(name='2'),
        ])

        EXPECTED_SCHEDULE = [['0', '1', '3', '2'], ['0', '3', '1', '2']] # Both scheduling are valid !
......
@@ -15,11 +15,14 @@
#include "aidge/backend/cpu/operator/AbsImpl.hpp"
#include "aidge/backend/cpu/operator/AddImpl.hpp"
#include "aidge/backend/cpu/operator/AndImpl.hpp"
#include "aidge/backend/cpu/operator/AtanImpl.hpp"
#include "aidge/backend/cpu/operator/ArgMaxImpl.hpp"
#include "aidge/backend/cpu/operator/AvgPoolingImpl.hpp"
#include "aidge/backend/cpu/operator/MaxPoolingImpl.hpp"
#include "aidge/backend/cpu/operator/BatchNormImpl.hpp"
#include "aidge/backend/cpu/operator/BitShiftImpl.hpp"
#include "aidge/backend/cpu/operator/ClipImpl.hpp"
#include "aidge/backend/cpu/operator/ConvDepthWiseImpl.hpp"
#include "aidge/backend/cpu/operator/ConvImpl.hpp"
#include "aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp"
@@ -28,15 +31,19 @@
#include "aidge/backend/cpu/operator/FCImpl.hpp"
#include "aidge/backend/cpu/operator/FoldImpl.hpp"
#include "aidge/backend/cpu/operator/GlobalAveragePoolingImpl.hpp"
#include "aidge/backend/cpu/operator/LRNImpl.hpp"
#include "aidge/backend/cpu/operator/LeakyReLUImpl.hpp"
#include "aidge/backend/cpu/operator/LnImpl.hpp"
#include "aidge/backend/cpu/operator/MatMulImpl.hpp"
#include "aidge/backend/cpu/operator/MulImpl.hpp"
#include "aidge/backend/cpu/operator/PadImpl.hpp"
#include "aidge/backend/cpu/operator/PaddedConvImpl.hpp"
#include "aidge/backend/cpu/operator/PowImpl.hpp"
#include "aidge/backend/cpu/operator/ReduceMeanImpl.hpp"
#include "aidge/backend/cpu/operator/ReduceSumImpl.hpp"
#include "aidge/backend/cpu/operator/ResizeImpl.hpp"
#include "aidge/backend/cpu/operator/ReLUImpl.hpp"
#include "aidge/backend/cpu/operator/RoundImpl.hpp"
#include "aidge/backend/cpu/operator/ScalingImpl.hpp"
#include "aidge/backend/cpu/operator/SigmoidImpl.hpp"
#include "aidge/backend/cpu/operator/SqrtImpl.hpp"
......
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_DATA_INTERPOLATION_H_
#define AIDGE_CPU_DATA_INTERPOLATION_H_
#include <vector>
#include <aidge/data/Interpolation.hpp>
#include <aidge/utils/Types.h>
namespace Aidge {
class InterpolationCPU : public Interpolation {
public:
/**
 * @brief Interpolates values given via input in the given mode.
 *
 * Values are contiguously arranged in a "square" shape around the point to
 * interpolate, depending on the interpolation mode.
 * The point to interpolate is located right in the middle of all points.
* Immediate neighbours :
* 1D interp :             2D interp :
*                           .  .  .  .  .  .
*   .  .  1  2  .  .        .  .  .  .  .  .
*                           .  .  1  2  .  .
*                           .  .  3  4  .  .
*                           .  .  .  .  .  .
*                           .  .  .  .  .  .
*
* 2 neighbours :
* 1D interp :                     2D interp :
*                                  .  .  .  .  .  .  .  .
*                                  .  .  .  .  .  .  .  .
*   .  .  1  2  3  4  .  .         .  .  1  2  3  4  .  .
*                                  .  .  5  6  7  8  .  .
*                                  .  .  9 10 11 12  .  .
*                                  .  . 13 14 15 16  .  .
*                                  .  .  .  .  .  .  .  .
*                                  .  .  .  .  .  .  .  .
*
* @param[in] originalCoords: coord of the point to interpolate in the
* original picture. These coords are generated with
* Interpolation::untransformCoords(coordsInInterpolatedTensor)
* @param[in] points : points to interpolate, arranged in a set of
* pairs ((point_coord), value) :
* [[[X1, X2, ..., XN], Xval], ...., [[A1, A2, ..., AN],Aval]].
* With :
* - N: the number of dimensions.
* - A: the number of points of the grid to interpolate.
* - All coordinates expressed in originalTensor frame.
* @param[in] interpMode: interpolation mode
* @return interpolated value
*/
template <typename T>
static T interpolate(const std::vector<float> &coordsToInterpolate,
const std::set<Point<T>> &points,
const Mode interpMode = Interpolation::Mode::Linear);
/**
* @brief performs linear interpolation on given points.
* @param[in] values: values to interpolate; since we only do an average of
* all values, their indices are not used.
* @return interpolated value
*/
template <typename T>
static T linear(const std::vector<float> &originalCoords,
const std::set<Point<T>> &points);
/**
* @brief performs nearest interpolation on given points.
* @note it is a wrapper around the linearRecurse() private method
* @param[in] coordsToInterpolate: coordinates to interpolate
* @param[in] points: points to interpolate
* @param[in] interpMode: interpolation method; must be one of the Nearest...
* modes, otherwise the function will throw an error.
* @return interpolated value
*/
template <typename T>
static T nearest(const std::vector<float> &coordsToInterpolate,
const std::set<Point<T>> &points,
const Interpolation::Mode nearestMode);
private:
/**
* @brief Actual linear interpolation function. It will:
* - Split the points along each dimension, depending on whether their coords
*   at idx alongDim are above or under coordsToInterpolate, until they are
*   1-to-1.
* - Perform interpolation on the 2 leftover points and return the
*   interpolated point to the parent call as a set of size 1.
* - Repeat until all dimensions have been interpolated.
* @param[in] coordsToInterpolate: coordinates to interpolate
* @param[in] points: points to interpolate
* @param[in] alongDim: dimension along which points are currently being
* segregated.
* @return set containing the interpolated point(s)
*/
template <typename T>
static std::set<Interpolation::Point<T>>
linearRecurse(const std::vector<float> &coordsToInterpolate,
const std::set<Point<T>> &points,
const DimIdx_t alongDim = 0);
};
} // namespace Aidge
#endif // AIDGE_CPU_DATA_INTERPOLATION_H_
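The recursive splitting described in the linearRecurse() documentation above can be sketched outside of Aidge with plain standard containers. The snippet below is only an illustration of the idea (a unit-spaced grid, points pre-sorted so that consecutive pairs differ along the dimension being reduced); it is not the backend's implementation, and the GridPoint/linearSketch names are made up for the example.

#include <cstddef>
#include <iostream>
#include <utility>
#include <vector>

// One point of a regular unit grid: integer coordinates plus its value.
using GridPoint = std::pair<std::vector<std::size_t>, float>;

// Reduce the point set one dimension at a time, starting from the last one,
// mirroring the splitting strategy described for linearRecurse() above.
float linearSketch(const std::vector<float>& coords,
                   std::vector<GridPoint> points,
                   std::size_t dim) {
    if (points.size() == 1) { return points.front().second; }
    std::vector<GridPoint> reduced;
    for (std::size_t i = 0; i + 1 < points.size(); i += 2) {
        const GridPoint& low  = points[i];
        const GridPoint& high = points[i + 1];
        // fractional position between the two neighbours along `dim`
        const float t = coords[dim] - static_cast<float>(low.first[dim]);
        reduced.push_back({low.first, low.second + t * (high.second - low.second)});
    }
    return linearSketch(coords, std::move(reduced), dim - 1);
}

int main() {
    // 2D example: the 4 immediate neighbours of the point (0.25, 0.75)
    const std::vector<GridPoint> pts = {
        {{0, 0}, 1.f}, {{0, 1}, 2.f},
        {{1, 0}, 3.f}, {{1, 1}, 4.f}};
    std::cout << linearSketch({0.25f, 0.75f}, pts, 1) << '\n';  // bilinear result: 2.25
}

The real InterpolationCPU::linear() works on a std::set<Point<T>> and handles arbitrary dimension counts, but the pairing-and-reducing step is the same idea.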
@@ -25,7 +25,7 @@
namespace Aidge {
// Operator implementation entry point for the backend
using AddImpl_cpu = OperatorImpl_cpu<Add_Op,
    void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*, void*)>;

// Implementation entry point registration to Operator
REGISTRAR(Add_Op, "cpu", Aidge::AddImpl_cpu::create);
......
@@ -14,31 +14,137 @@
#include "aidge/utils/Registrar.hpp"

#include <cstddef>   // std::size_t

#include "aidge/backend/cpu/data/Broadcasting.hpp"
#include "aidge/backend/cpu/operator/AddImpl.hpp"

namespace Aidge {

namespace {
// suppose values are contiguous in memory
template <class I, class O>
void add_contiguous_arrays(const std::size_t input1size,
                           const std::size_t input2size,
                           const std::size_t output1size,
                           const I* input1,
                           const I* input2,
                           O* output)
{
    for (std::size_t i = 0; i < output1size; ++i)
    {
        const std::size_t in1_id = (input1size != 1) ? i : 0;
        const std::size_t in2_id = (input2size != 1) ? i : 0;
        output[i] = static_cast<O>(input1[in1_id] + input2[in2_id]);
    }
}
}

template <class I, class O>
void AddImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
                                std::vector<std::size_t> dims1,
                                const std::vector<std::size_t>& outputDims,
                                const void* input0_,
                                const void* input1_,
                                void* output_) {

    const I* input_0 = static_cast<const I*>(input0_);
    const I* input_1 = static_cast<const I*>(input1_);
    O* output = static_cast<O*>(output_);

    // [5,2,1,7] & [2,6,7]
    // 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7]
    // 2. Find the highest equal dimension -> 3
    //    Exception: if the first diverging dimension is the last one, then -> 4 (dims.size())
    // 3. Compute the highest number of contiguous data -> 7
    // 4. Compute stride and offset step for the broadcast mechanism
    // 5. Call a simple kernel

    // special case for equal dimensions, the kernel is called with the entire arrays at once
if (dims0 == dims1) {
const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
for (std::size_t i = 0; i < input0_contiguous_size; ++i)
{
output[i] = static_cast<O>(input_0[i] + input_1[i]);
}
return;
}
// set dimensions to be of equal size by filling the smallest one with ones.
if (dims0.size() > dims1.size()) {
dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1));
}
else if (dims1.size() > dims0.size()) {
dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1));
}
const std::size_t nbDims = dims0.size();
// Find the highest equal dimension
// std::size_t contiguousIdx = nbDims - 1;
std::size_t contiguousIdx = nbDims;
while (contiguousIdx-- > 0) {
// for (; contiguousIdx+1 > 0; --contiguousIdx) {
if (dims0[contiguousIdx] != dims1[contiguousIdx]) {
if (contiguousIdx == (nbDims -1)) { // last dimensions of one of the input Tensor are of size 1
const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1;
while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) {
--contiguousIdx;
}
}
break;
}
}
++contiguousIdx;
// Compute the highest number of contiguous data for each Tensor
const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
// initialize strides to iterate through data because of broadcasting
std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx);
if (contiguousIdx > 0) {
stride_post0[contiguousIdx - 1] = 1;
stride_post1[contiguousIdx - 1] = 1;
for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) {
stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]);
stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]);
}
for (std::size_t i = 0; i != contiguousIdx; ++i) {
stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1;
stride_step1[i] = (dims1[i] == 1) ? 1 - stride_post1[i] : 1;
}
}
// variables for arrays offsets
std::size_t offsetIn0 = 0;
std::size_t offsetIn1 = 0;
std::size_t offsetOut = 0;
std::size_t dim = contiguousIdx - 1;
const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>());
for (std::size_t stack = 0; stack < nbStacks;) {
add_contiguous_arrays<I,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
input_0 + offsetIn0*input0_contiguous_size,
input_1 + offsetIn1*input1_contiguous_size,
output + offsetOut*output_contiguous_size);
if (++stack < nbStacks) {
std::size_t tmp_stack = stack;
while(tmp_stack % outputDims[dim] == 0) {
tmp_stack /= outputDims[dim];
dim--;
}
offsetIn0 += stride_step0[dim];
offsetIn1 += stride_step1[dim];
++offsetOut;
dim = contiguousIdx - 1;
}
}
}

// Kernels registration to implementation entry point
@@ -48,6 +154,12 @@ REGISTRAR(AddImpl_cpu,
REGISTRAR(AddImpl_cpu,
    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float64}},
    {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<double, double>, nullptr});
REGISTRAR(AddImpl_cpu,
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int8}},
{ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::int8_t, std::int8_t>, nullptr});
REGISTRAR(AddImpl_cpu,
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::UInt8}},
{ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::uint8_t, std::uint8_t>, nullptr});
REGISTRAR(AddImpl_cpu,
    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int32}},
    {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr});
......
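The numbered comment inside the Add kernel (steps 1 to 5) can be checked by hand on the shapes it quotes, [5,2,1,7] against [2,6,7]. The standalone snippet below is only a walk-through of steps 1 to 3 for that example (it skips the "first diverging dimension is the last one" exception and the stride computation); the names are local to the example, not part of the commit.

#include <cstddef>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
    std::vector<std::size_t> dims0{5, 2, 1, 7};
    std::vector<std::size_t> dims1{2, 6, 7};
    const std::vector<std::size_t> outputDims{5, 2, 6, 7};  // broadcast result of the two shapes

    // 1. Same number of dimensions: pad the shorter shape with leading 1s -> [1,2,6,7]
    dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1));

    // 2. Highest index from which all trailing dimensions are pairwise equal -> 3
    std::size_t contiguousIdx = dims0.size();
    while (contiguousIdx-- > 0 && dims0[contiguousIdx] == dims1[contiguousIdx]) {}
    ++contiguousIdx;

    // 3. Number of values handled contiguously per call to the simple kernel -> 7,
    //    and the number of such "stacks" the broadcast loop has to visit -> 5*2*6 = 60
    const std::size_t contiguous = std::accumulate(outputDims.cbegin() + contiguousIdx, outputDims.cend(),
                                                   std::size_t(1), std::multiplies<std::size_t>());
    const std::size_t nbStacks   = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx,
                                                   std::size_t(1), std::multiplies<std::size_t>());

    std::cout << contiguousIdx << ' ' << contiguous << ' ' << nbStacks << '\n';  // prints: 3 7 60
}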
@@ -23,7 +23,7 @@
namespace Aidge {
// Operator implementation entry point for the backend
using AndImpl_cpu = OperatorImpl_cpu<And_Op,
    void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*, void*)>;

// Implementation entry point registration to Operator
REGISTRAR(And_Op, "cpu", Aidge::AndImpl_cpu::create);
......
@@ -12,52 +12,152 @@
#ifndef AIDGE_CPU_OPERATOR_ANDIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_ANDIMPL_KERNELS_H_

#include "aidge/backend/cpu/data/Broadcasting.hpp"
#include "aidge/backend/cpu/operator/AndImpl.hpp"
#include "aidge/utils/Registrar.hpp"

namespace Aidge {

namespace {
// suppose values are contiguous in memory
template <class I, class O>
void equal_contiguous_arrays(const std::size_t input1size,
                             const std::size_t input2size,
                             const std::size_t output1size,
                             const I* input1,
                             const I* input2,
                             O* output)
{
    for (std::size_t i = 0; i < output1size; ++i)
    {
        const std::size_t in1_id = (input1size != 1) ? i : 0;
        const std::size_t in2_id = (input2size != 1) ? i : 0;
        output[i] = static_cast<O>(input1[in1_id] == input2[in2_id]);
    }
}
}

template <class I, class O>
void EqualImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
                                  std::vector<std::size_t> dims1,
                                  const std::vector<std::size_t>& outputDims,
                                  const void* input0_,
                                  const void* input1_,
                                  void* output_) {

    const I* input_0 = static_cast<const I*>(input0_);
    const I* input_1 = static_cast<const I*>(input1_);
    O* output = static_cast<O*>(output_);

    // [5,2,1,7] & [2,6,7]
    // 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7]
    // 2. Find the highest equal dimension -> 3
    //    Exception: if the first diverging dimension is the last one, then -> 4 (dims.size())
    // 3. Compute the highest number of contiguous data -> 7
    // 4. Compute stride and offset step for the broadcast mechanism
    // 5. Call a simple kernel

    // special case for equal dimensions, the kernel is called with the entire arrays at once
    if (dims0 == dims1) {
        const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
        for (std::size_t i = 0; i < input0_contiguous_size; ++i)
        {
            output[i] = static_cast<O>(input_0[i] == input_1[i]);
        }
        return;
    }

    // set dimensions to be of equal size by filling the smallest one with ones.
    if (dims0.size() > dims1.size()) {
        dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1));
    }
    else if (dims1.size() > dims0.size()) {
        dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1));
    }

    const std::size_t nbDims = dims0.size();

    // Find the highest equal dimension
    // std::size_t contiguousIdx = nbDims - 1;
    std::size_t contiguousIdx = nbDims;
while (contiguousIdx-- > 0) {
// for (; contiguousIdx+1 > 0; --contiguousIdx) {
if (dims0[contiguousIdx] != dims1[contiguousIdx]) {
if (contiguousIdx == (nbDims -1)) { // last dimensions of one of the input Tensor are of size 1
const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1;
while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) {
--contiguousIdx;
}
}
break;
}
}
++contiguousIdx;
// Compute the highest number of contiguous data for each Tensor
const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
// initialize strides to iterate through data because of broadcasting
std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx);
if (contiguousIdx > 0) {
stride_post0[contiguousIdx - 1] = 1;
stride_post1[contiguousIdx - 1] = 1;
for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) {
stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]);
stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]);
}
for (std::size_t i = 0; i != contiguousIdx; ++i) {
stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1;
stride_step1[i] = (dims1[i] == 1) ? 1 - stride_post1[i] : 1;
}
}
// variables for arrays offsets
std::size_t offsetIn0 = 0;
std::size_t offsetIn1 = 0;
std::size_t offsetOut = 0;
std::size_t dim = contiguousIdx - 1;
const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>());
for (std::size_t stack = 0; stack < nbStacks;) {
equal_contiguous_arrays<I,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
input_0 + offsetIn0*input0_contiguous_size,
input_1 + offsetIn1*input1_contiguous_size,
output + offsetOut*output_contiguous_size);
if (++stack < nbStacks) {
std::size_t tmp_stack = stack;
while(tmp_stack % outputDims[dim] == 0) {
tmp_stack /= outputDims[dim];
dim--;
}
offsetIn0 += stride_step0[dim];
offsetIn1 += stride_step1[dim];
++offsetOut;
dim = contiguousIdx - 1;
}
    }
}

// Kernels registration to implementation entry point
REGISTRAR(AndImpl_cpu,
    {DataType::Float32},
    {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<float, float>, nullptr});
REGISTRAR(AndImpl_cpu,
    {DataType::Float64},
    {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<double, double>, nullptr});
REGISTRAR(AndImpl_cpu,
    {DataType::Int32},
    {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr});
REGISTRAR(AndImpl_cpu,
    {DataType::Int64},
    {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, nullptr});
} // namespace Aidge

#endif /* AIDGE_CPU_OPERATOR_ANDIMPL_KERNELS_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_ATAN_H_
#define AIDGE_CPU_OPERATOR_ATAN_H_
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/Atan.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include <memory>
#include <vector>
namespace Aidge {
// Operator implementation entry point for the backend
using AtanImpl_cpu = OperatorImpl_cpu<Atan_Op,
void(const std::size_t, const void*, void*),
void(const std::size_t, const void*, const void*, void*)>;
// Implementation entry point registration to Operator
REGISTRAR(Atan_Op, "cpu", Aidge::AtanImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_ATAN_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_ATANIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_ATANIMPL_KERNELS_H_
#include "aidge/utils/Registrar.hpp"
#include "aidge/backend/cpu/operator/AtanImpl.hpp"
#include <cmath> // For atan()
namespace Aidge {
template <class I, class O>
void AtanImpl_cpu_forward_kernel(std::size_t inputLenght,
const void* input_,
void* output_) {
const I* input = static_cast<const I*>(input_);
O* output = static_cast<O*>(output_);
for (size_t i = 0; i < inputLenght; ++i) {
output[i] = static_cast<O>(atan(input[i]));
}
}
template <class O, class GI, class GO>
void AtanImpl_cpu_backward_kernel(const std::size_t inputLenght,
const void* output_, const void* grad_output_,
void* grad_input_) {
const O* output = static_cast<const O*>(output_);
const GO* grad_output = static_cast<const GO*>(grad_output_);
GI* grad_input = static_cast<GI*>(grad_input_);
// Apply the derivative of atan for each element in the input array
for (size_t i = 0; i < inputLenght; ++i) {
// dx = dy * (1 / (1 + x^2))
grad_input[i] = grad_output[i] * static_cast<O>(1.0 / (1.0 + output[i] * output[i]));
}
}
// Kernels registration to implementation entry point
REGISTRAR(AtanImpl_cpu,
{DataType::Float32},
{ProdConso::inPlaceModel, Aidge::AtanImpl_cpu_forward_kernel<float, float>, Aidge::AtanImpl_cpu_backward_kernel<float, float, float>});
REGISTRAR(AtanImpl_cpu,
{DataType::Float64},
{ProdConso::inPlaceModel, Aidge::AtanImpl_cpu_forward_kernel<double, double>, Aidge::AtanImpl_cpu_backward_kernel<double, double, double>});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_ATANIMPL_KERNELS_H_ */
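In math form, the forward and backward kernels above implement (as the in-code comment states, with L the loss, x the input and y the forward output):

y_i = \arctan(x_i), \qquad
\frac{\partial L}{\partial x_i} = \frac{\partial L}{\partial y_i} \cdot \frac{1}{1 + x_i^{2}}

Note that the backward kernel is handed the stored forward output (output_) rather than the original input, so the 1/(1 + (.)^2) factor is evaluated from those stored values.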
@@ -29,7 +29,7 @@ using BatchNorm2D_Op = BatchNorm_Op<2>;
using BatchNormImpl2D_cpu = OperatorImpl_cpu<BatchNorm_Op<2>,
    void(float,
        float,
        const std::vector<DimSize_t> &,
        const void *,
        const void *,
        const void *,
......
@@ -38,7 +38,7 @@ namespace Aidge {
 * @param output_ Output Tensor.
 */
template <class I, class P, class O>
void BatchNormImpl2D_cpu_forward_kernel(float epsilon, float momentum, const std::vector<DimSize_t> &dims,
                                       const void *input_, const void *scale_, const void *shift_, void *batchMean_, void *batchVar_, void *output_, const bool freeze) {
    // FIXME: missing convolution attributes as arguments
    const I *input = static_cast<const I *>(input_);
@@ -49,9 +49,8 @@ void BatchNormImpl2D_cpu_forward_kernel(float epsilon, float momentum, const std
    O *output = static_cast<O *>(output_);

    const DimSize_t nbBatch = dims[0];
    const DimSize_t nbChannels = (dims.size() > 1) ? dims[1] : 1;
    const DimSize_t featureMapSize = (dims.size() > 2) ? std::accumulate(dims.begin() + 2, dims.end(), 1, std::multiplies<DimSize_t>()) : 1;

    if ((freeze == true) || (momentum == 0.0f)) {
        for (std::size_t batch = 0; batch < nbBatch; ++batch) {
......
@@ -24,13 +24,13 @@ namespace Aidge {
// Operator implementation entry point for the backend
using BitShiftImpl_cpu = OperatorImpl_cpu<BitShift_Op,
    void(const BitShift_Op::BitShiftDirection,
        std::vector<std::size_t>,
        std::vector<std::size_t>,
        const std::vector<std::size_t>&,
        const void*,
        const void*,
        void*)>;

// Implementation entry point registration to Operator
REGISTRAR(BitShift_Op,"cpu",Aidge::BitShiftImpl_cpu::create);
} // namespace Aidge
......
@@ -12,47 +12,150 @@
#ifndef AIDGE_CPU_OPERATOR_BITSHIFTIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_BITSHIFTIMPL_KERNELS_H_

#include <cstdint>  // std::int32_t, std::int64_t
#include <cstddef>  // std::size_t

#include "aidge/backend/cpu/data/Broadcasting.hpp"
#include "aidge/backend/cpu/operator/BitShiftImpl.hpp"
#include "aidge/operator/BitShift.hpp"
#include "aidge/utils/Registrar.hpp"
namespace {
// suppose values are contiguous in memory
template <class I1, class I2, class O>
void bitshift_contiguous_arrays(
const Aidge::BitShift_Op::BitShiftDirection direction,
const std::size_t input1size,
const std::size_t input2size,
const std::size_t output1size,
const I1* input_1,
const I2* input_2,
O* output)
{
if(direction == Aidge::BitShift_Op::BitShiftDirection::right) {
for (std::size_t i = 0; i < output1size; ++i) {
const std::size_t idx1 = (input1size != 1) ? i : 0;
const std::size_t idx2 = (input2size != 1) ? i : 0;
output[i]= input_1[idx1] >> input_2[idx2];
}
} else {
for (std::size_t i = 0; i < output1size; ++i) {
const std::size_t idx1 = (input1size != 1) ? i : 0;
const std::size_t idx2 = (input2size != 1) ? i : 0;
output[i] = input_1[idx1] << input_2[idx2];
}
}
}
}
namespace Aidge {

template <class I1, class I2, class O>
void BitShiftImpl_cpu_forward_kernel(
    const BitShift_Op::BitShiftDirection direction,
    std::vector<std::size_t> dims0,
    std::vector<std::size_t> dims1,
    const std::vector<std::size_t>& outputDims,
    const void* input0_,
    const void* input1_,
    void* output_
    ) {

    const I1* input_0 = static_cast<const I1*>(input0_);
    const I2* input_1 = static_cast<const I2*>(input1_);
    O* output = static_cast<O*>(output_);

    // [5,2,1,7] & [2,6,7]
    // 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7]
    // 2. Find the highest equal dimension -> 3
    //    Exception: if the first diverging dimension is the last one, then -> 4 (dims.size())
    // 3. Compute the highest number of contiguous data -> 7
    // 4. Compute stride and offset step for the broadcast mechanism
    // 5. Call a simple kernel

    // ## Compute compatible input dimensions
    // special case for equal dimensions, the kernel is called with the entire arrays at once
    if (dims0 == dims1) {
const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
bitshift_contiguous_arrays(direction, input0_contiguous_size, input0_contiguous_size, input0_contiguous_size, input_0, input_1, output);
return;
}
// set dimensions to be of equal size by filling the smallest one with ones.
if (dims0.size() > dims1.size()) {
dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1));
}
else if (dims1.size() > dims0.size()) {
dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1));
}
const std::size_t nbDims = dims0.size();
// Find the highest equal dimension
// std::size_t contiguousIdx = nbDims - 1;
std::size_t contiguousIdx = nbDims;
while (contiguousIdx-- > 0) {
// for (; contiguousIdx+1 > 0; --contiguousIdx) {
if (dims0[contiguousIdx] != dims1[contiguousIdx]) {
if (contiguousIdx == (nbDims -1)) { // last dimensions of one of the input Tensor are of size 1
const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1;
while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) {
--contiguousIdx;
}
}
break;
        }
    }
    ++contiguousIdx;

// Compute the highest number of contiguous data for each Tensor
const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
// initialize strides to iterate through data because of broadcasting
std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx);
if (contiguousIdx > 0) {
stride_post0[contiguousIdx - 1] = 1;
stride_post1[contiguousIdx - 1] = 1;
for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) {
stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]);
stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]);
}
for (std::size_t i = 0; i != contiguousIdx; ++i) {
stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1;
stride_step1[i] = (dims1[i] == 1) ? 1 - stride_post1[i] : 1;
}
}
// variables for arrays offsets
std::size_t offsetIn0 = 0;
std::size_t offsetIn1 = 0;
std::size_t offsetOut = 0;
std::size_t dim = contiguousIdx - 1;
const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>());
for (std::size_t stack = 0; stack < nbStacks;) {
bitshift_contiguous_arrays<I1,I2,O>(direction, input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
input_0 + offsetIn0*input0_contiguous_size,
input_1 + offsetIn1*input1_contiguous_size,
output + offsetOut*output_contiguous_size);
if (++stack < nbStacks) {
std::size_t tmp_stack = stack;
while(tmp_stack % outputDims[dim] == 0) {
tmp_stack /= outputDims[dim];
dim--;
}
offsetIn0 += stride_step0[dim];
offsetIn1 += stride_step1[dim];
++offsetOut;
dim = contiguousIdx - 1;
        }
    }
}
......
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_CLIPIMPL_H_
#define AIDGE_CPU_OPERATOR_CLIPIMPL_H_
#include <cstddef> // std::size_t
#include <memory>
#include <tuple> // std::tuple
#include <vector>
#include <algorithm>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/Clip.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
namespace Aidge {
// Operator implementation entry point for the backend
using ClipImpl_cpu = OperatorImpl_cpu<Clip_Op,
void(float, //Forward Types
float,
const void*,
const std::size_t,
void*),
void(float,//Backward Types
float,
const std::size_t,
const void*,
const void*,
void*)>;
REGISTRAR(Clip_Op,"cpu",Aidge::ClipImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_CLIPIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_CLIPIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_CLIPIMPL_KERNELS_H_
#include "aidge/utils/Registrar.hpp"
#include "aidge/backend/cpu/operator/ClipImpl.hpp"
namespace Aidge {
template <class I, class O>
void ClipImpl_cpu_forward_kernel(
float min_,
float max_,
const void* input_,
const std::size_t length,
void* output_)
{
const I* input = static_cast<const I*>(input_);
O* output = static_cast<O*>(output_);
for (std::size_t i = 0; i < length; ++i) {
output[i] = std::min(std::max(static_cast<float>(input[i]), min_), max_);
}
}
template <class I, class GI, class GO>
void ClipImpl_cpu_backward_kernel(
float min_,
float max_,
const std::size_t length,
const void* input_,
const void* grad_output_,
void* grad_input_)
{
const I* input = static_cast<const I*>(input_);
const GO* grad_output = static_cast<const GO*>(grad_output_);
GI* grad_input = static_cast<GI*>(grad_input_);
for (std::size_t i = 0; i < length; ++i) {
grad_input[i] = ((input[i] > min_) && (input[i] < max_)) ? grad_output[i] : 0;
}
}
REGISTRAR(ClipImpl_cpu,
{DataType::Float32},
{ProdConso::inPlaceModel,
Aidge::ClipImpl_cpu_forward_kernel<float,float>,
Aidge::ClipImpl_cpu_backward_kernel<float,float,float>});
REGISTRAR(ClipImpl_cpu,
{DataType::Float64},
{ProdConso::inPlaceModel,
Aidge::ClipImpl_cpu_forward_kernel<double,double>,
Aidge::ClipImpl_cpu_backward_kernel<double,double,double>});
REGISTRAR(ClipImpl_cpu,
{DataType::Int32},
{ProdConso::inPlaceModel,
Aidge::ClipImpl_cpu_forward_kernel<std::int32_t,std::int32_t>,
Aidge::ClipImpl_cpu_backward_kernel<std::int32_t,std::int32_t,std::int32_t>});
REGISTRAR(ClipImpl_cpu,
{DataType::Int64},
{ProdConso::inPlaceModel,
Aidge::ClipImpl_cpu_forward_kernel<std::int64_t,std::int64_t>,
Aidge::ClipImpl_cpu_backward_kernel<std::int64_t,std::int64_t,std::int64_t>});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_CLIPIMPL_KERNELS_H_ */
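A minimal sketch of calling the clip forward kernel above directly, assuming the header added by this commit is available on the include path as aidge/backend/cpu/operator/ClipImpl_kernels.hpp (in normal use the registrar dispatches it through the Clip_Op implementation instead):

#include <iostream>
#include <vector>
#include "aidge/backend/cpu/operator/ClipImpl_kernels.hpp"  // assumed include path

int main() {
    const std::vector<float> in{-2.0f, 0.5f, 3.0f};
    std::vector<float> out(in.size());
    // clamp every element to the [-1, 1] range
    Aidge::ClipImpl_cpu_forward_kernel<float, float>(-1.0f, 1.0f, in.data(), in.size(), out.data());
    for (float v : out) { std::cout << v << ' '; }  // -1 0.5 1
}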
@@ -137,6 +137,7 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
    const std::size_t oxSize =
            static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - dilated_kernel_x + strideDims[0]) /
                            static_cast<float>(strideDims[0])));

    // output W size
    const DimSize_t dilated_kernel_y = dilationDims[1]*(kernelDims[1] - 1) + 1;
    const std::size_t oySize =
@@ -148,54 +149,106 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
    // input  (batch, ch, Xin, Yin)
    // weight (outCh, ch, kernelX, kernelY)
    // does not take Dilation attribute into account
    const std::size_t outChannels_s = oxSize * oySize;

    if (dilated_kernel_x ==3 && dilated_kernel_y == 3) {
        for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
            for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {

                B biasVal = (biases != nullptr) ? biases[ch] : B(0);

                std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3];
                const std::size_t wIndex = ch * 9;

                if (strideDims[0] == 1 && strideDims[1]==1) {
                    for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex-=inputDims[3]) {
                        for (std::size_t oy = 0; oy < oySize; ++oy) {
                            output[oIndex + oy] = biasVal + weights[wIndex+0]*input[iIndex+oy]+weights[wIndex+1]*input[iIndex+oy+1]+weights[wIndex+2]*input[iIndex+oy+2];
                        }
                        iIndex+=inputDims[3];
                        for (std::size_t oy = 0; oy < oySize; ++oy) {
                            output[oIndex + oy] += weights[wIndex+3]*input[iIndex+oy]+weights[wIndex+4]*input[iIndex+oy+1]+weights[wIndex+5]*input[iIndex+oy+2];
                        }
                        iIndex+=inputDims[3];
                        for (std::size_t oy = 0; oy < oySize; ++oy) {
                            output[oIndex + oy] += weights[wIndex+6]*input[iIndex+oy]+weights[wIndex+7]*input[iIndex+oy+1]+weights[wIndex+8]*input[iIndex+oy+2];
                        }
                    }
                } else {
                    for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex-=strideDims[0]*inputDims[3]) {
                        for (std::size_t oy = 0; oy < oySize; ++oy) {
                            output[oIndex + oy] += weights[wIndex+0]*input[iIndex+oy]+weights[wIndex+1]*input[iIndex+oy+strideDims[0]]+weights[wIndex+2]*input[iIndex+oy+strideDims[0]*2];
                        }
                        iIndex+=strideDims[0]*inputDims[3];
                        for (std::size_t oy = 0; oy < oySize; ++oy) {
                            output[oIndex + oy] += weights[wIndex+3]*input[iIndex+oy]+weights[wIndex+4]*input[iIndex+oy+strideDims[0]]+weights[wIndex+5]*input[iIndex+oy+strideDims[0]*2];
                        }
                        iIndex+=strideDims[0]*inputDims[3];
                        for (std::size_t oy = 0; oy < oySize; ++oy) {
                            output[oIndex + oy] += weights[wIndex+6]*input[iIndex+oy]+weights[wIndex+7]*input[iIndex+oy+strideDims[0]]+weights[wIndex+8]*input[iIndex+oy+strideDims[0]*2];
}
}
}
output += outChannels_s;
}
}
} else if (dilated_kernel_x == 1 && dilated_kernel_y == 1) {
std::size_t index = 0;
for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {
B biasVal = (biases != nullptr) ? biases[ch] : B(0);
const std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3];
const std::size_t wIndex = ch;
if (strideDims[0] == 1 && strideDims[1] == 1) {
for (; index < iIndex + oxSize*oySize; ++index) {
output[index] = biasVal + weights[wIndex] * input[index];
}
} else {
std::size_t oIndex = (ch + batch*inputDims[1]) * oxSize * oySize;
for (std::size_t ox = 0; ox < oxSize; ++ox, oIndex+=oySize) {
index = iIndex + strideDims[0]*inputDims[3];
for (std::size_t oy = 0, iy = 0; oy < oySize; ++oy, iy+=strideDims[1]) {
output[oIndex + oy] += weights[wIndex]*input[index+iy];
}
}
}
}
}
} else {
for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {
B biasVal = (biases != nullptr) ? biases[ch] : B(0);
std::fill(output, output+outChannels_s, biasVal);
const std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3];
const std::size_t wIndex = ch * kernelDims[0] * kernelDims[1];
for (std::size_t ox = 0; ox < oxSize; ++ox) {
for (std::size_t oy = 0; oy < oySize; ++oy) {
const std::size_t oIndexFull = ox*oySize + oy;
const std::size_t ix = ox * strideDims[0];
const std::size_t iy = oy * strideDims[1];
for (std::size_t sx = 0; sx*dilationDims[0] < dilated_kernel_x; ++sx) {
for (std::size_t sy = 0; sy*dilationDims[1] < dilated_kernel_y; ++sy) {
                                output[oIndexFull] += weights[wIndex + sx*kernelDims[1] + sy] *
                                                        input[iIndex + static_cast<std::size_t>(ix + sx*dilationDims[0])*inputDims[3] + static_cast<std::size_t>(iy + sy*dilationDims[1])];
                            }
                        }
                    }
                }
                output += outChannels_s;
            }
        }
    }
}

// Kernels registration to implementation entry point
REGISTRAR(ConvDepthWiseImpl2D_cpu,
    {{DataType::Any, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}},
......
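Both convolution hunks (the depthwise one above and the standard one below) derive the output spatial size for the no-padding case from the same expression, written here for the x axis (the y axis is analogous):

o_x = \left\lfloor \frac{H_{\mathrm{in}} - \bigl(d_x (k_x - 1) + 1\bigr) + s_x}{s_x} \right\rfloor

For example, with H_in = 32, k_x = 3, d_x = 1 and s_x = 2 (illustrative numbers, not taken from the diff) this gives floor((32 - 3 + 2) / 2) = 15 output rows.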
@@ -141,15 +141,15 @@ void ConvImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
    O *output = static_cast<O *>(output_);

    // output H size
    const DimSize_t dilated_kernel_x = dilationDims[0]*(kernelDims[0] - 1) + 1;
    const std::size_t oxSize =
            static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - dilated_kernel_x + strideDims[0]) /
                            static_cast<float>(strideDims[0])));
    // output W size
    const DimSize_t dilated_kernel_y = dilationDims[1]*(kernelDims[1] - 1) + 1;
    const std::size_t oySize =
            static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[3] - dilated_kernel_y + strideDims[1]) /
                            static_cast<float>(strideDims[1])));

    // TODO: kernel computation
@@ -157,57 +157,107 @@ void ConvImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
    // input  (batch, inCh, Xin, Yin)
    // weight (outCh, inCh, kernelX, kernelY)
    // does not take Dilation attribute into account
    const std::size_t outChannels_s = oxSize * oySize;

    if (dilated_kernel_x == 3 && dilated_kernel_y == 3) {
        for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
            for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
                // If bias = nullptr, set B(0)
                B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
                std::fill(output, output+outChannels_s, biasVal);
                for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
                    std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3];
                    const std::size_t wIndex = (inCh + outCh*inputDims[1]) * 9;
                    if (strideDims[0] == 1 && strideDims[1]==1) {
                        for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex-=inputDims[3]) {
                            for (std::size_t oy = 0; oy < oySize; ++oy) {
                                output[oIndex + oy] += weights[wIndex+0]*input[iIndex+oy]+weights[wIndex+1]*input[iIndex+oy+1]+weights[wIndex+2]*input[iIndex+oy+2];
                            }
                            iIndex+=inputDims[3];
                            for (std::size_t oy = 0; oy < oySize; ++oy) {
                                output[oIndex + oy] += weights[wIndex+3]*input[iIndex+oy]+weights[wIndex+4]*input[iIndex+oy+1]+weights[wIndex+5]*input[iIndex+oy+2];
                            }
                            iIndex+=inputDims[3];
                            for (std::size_t oy = 0; oy < oySize; ++oy) {
                                output[oIndex + oy] += weights[wIndex+6]*input[iIndex+oy]+weights[wIndex+7]*input[iIndex+oy+1]+weights[wIndex+8]*input[iIndex+oy+2];
                            }
                        }
                    } else {
                        for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex-=strideDims[0]*inputDims[3]) {
                            for (std::size_t oy = 0; oy < oySize; ++oy) {
                                output[oIndex + oy] += weights[wIndex+0]*input[iIndex+oy]+weights[wIndex+1]*input[iIndex+oy+strideDims[0]]+weights[wIndex+2]*input[iIndex+oy+strideDims[0]*2];
                            }
                            iIndex+=strideDims[0]*inputDims[3];
                            for (std::size_t oy = 0; oy < oySize; ++oy) {
                                output[oIndex + oy] += weights[wIndex+3]*input[iIndex+oy]+weights[wIndex+4]*input[iIndex+oy+strideDims[0]]+weights[wIndex+5]*input[iIndex+oy+strideDims[0]*2];
                            }
                            iIndex+=strideDims[0]*inputDims[3];
                            for (std::size_t oy = 0; oy < oySize; ++oy) {
                                output[oIndex + oy] += weights[wIndex+6]*input[iIndex+oy]+weights[wIndex+7]*input[iIndex+oy+strideDims[0]]+weights[wIndex+8]*input[iIndex+oy+strideDims[0]*2];
                            }
                        }
                    }
                }
                output += outChannels_s;
            }
        }
} else if (dilated_kernel_x == 1 && dilated_kernel_y == 1) {
for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
// If bias = nullptr, set B(0)
B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
std::fill(output, output+outChannels_s, biasVal);
for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3];
const std::size_t wIndex = (inCh + outCh*inputDims[1]);
if (strideDims[0] == 1 && strideDims[1] == 1) {
for (std::size_t oIndex = 0; oIndex < oxSize*oySize; ++oIndex, ++iIndex) {
output[oIndex] += weights[wIndex] * input[iIndex];
}
} else {
for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex+=inputDims[3]*strideDims[0]) {
for (std::size_t oy = 0, iy = 0; oy < oySize; ++oy, iy+=strideDims[1]) {
output[oIndex + oy] += weights[wIndex+0]*input[iIndex+iy];
}
}
}
}
output += outChannels_s;
}
}
} else {
for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
// If bias = nullptr, set B(0)
B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
std::fill(output, output+outChannels_s, biasVal);
for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
std::size_t iIndex_channel = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3];
const std::size_t wIndex = (inCh + outCh*inputDims[1]) * kernelDims[0] * kernelDims[1];
// loop over each output line
for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex_channel+=inputDims[3]*strideDims[0]) {
// loop over associated input line
for (std::size_t ky = 0, ix = 0; ky < kernelDims[0]; ++ky, ix += inputDims[3]*dilationDims[0]) {
// loop over the entire line
for (std::size_t oy = 0, iy = 0; oy < oySize; ++oy, iy+=strideDims[1]) {
const std::size_t iIndex = iIndex_channel + ix + iy;
// loop over elements associated with one output
for (std::size_t kx = 0; kx < kernelDims[0]; ++kx) {
output[oIndex + oy] += weights[wIndex+kernelDims[0]*ky+kx]*input[iIndex+kx*dilationDims[1]];
                            }
                        }
                    }
                }
            }
            output += outChannels_s;
        }
    }
}
}

// Kernels registration to implementation entry point
REGISTRAR(ConvImpl2D_cpu,
    {{DataType::Any, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}},
......
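For reference, the general (non-specialised) branch of the convolution kernel above accumulates the usual no-padding convolution sum; the 3x3 and 1x1 branches are unrolled special cases of the same expression:

\mathrm{out}[n, o, x, y] = b[o] + \sum_{c=0}^{C_{\mathrm{in}}-1} \sum_{k_x=0}^{K_x-1} \sum_{k_y=0}^{K_y-1}
    w[o, c, k_x, k_y] \cdot \mathrm{in}\bigl[n, c,\; x\, s_x + k_x d_x,\; y\, s_y + k_y d_y\bigr]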