Commit 6d62d437 authored by Thibault Allenet

Merge branch 'learning_backend_cuda' into 'dev'

Learning backend cuda

See merge request eclipse/aidge/aidge_backend_cpu!75
parents c3027ed5 078b5e0b
Showing 1495 additions and 5 deletions
@@ -14,6 +14,8 @@
#include "aidge/backend/cpu/operator/AbsImpl.hpp"
#include "aidge/backend/cpu/operator/AddImpl.hpp"
#include "aidge/backend/cpu/operator/AndImpl.hpp"
#include "aidge/backend/cpu/operator/ArgMaxImpl.hpp"
#include "aidge/backend/cpu/operator/AvgPoolingImpl.hpp"
#include "aidge/backend/cpu/operator/MaxPoolingImpl.hpp"
#include "aidge/backend/cpu/operator/BatchNormImpl.hpp"
@@ -25,11 +27,13 @@
#include "aidge/backend/cpu/operator/FoldImpl.hpp"
#include "aidge/backend/cpu/operator/GlobalAveragePoolingImpl.hpp"
#include "aidge/backend/cpu/operator/LeakyReLUImpl.hpp"
#include "aidge/backend/cpu/operator/LnImpl.hpp"
#include "aidge/backend/cpu/operator/MatMulImpl.hpp"
#include "aidge/backend/cpu/operator/MulImpl.hpp"
#include "aidge/backend/cpu/operator/PadImpl.hpp"
#include "aidge/backend/cpu/operator/PowImpl.hpp"
#include "aidge/backend/cpu/operator/ReduceMeanImpl.hpp"
#include "aidge/backend/cpu/operator/ReduceSumImpl.hpp"
#include "aidge/backend/cpu/operator/ReLUImpl.hpp"
#include "aidge/backend/cpu/operator/ScalingImpl.hpp"
#include "aidge/backend/cpu/operator/SigmoidImpl.hpp"
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_ANDIMPL_H_
#define AIDGE_CPU_OPERATOR_ANDIMPL_H_
#include "aidge/backend/OperatorImpl.hpp"
#include "aidge/operator/And.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include <memory>
#include <vector>
namespace Aidge {
// compute kernel registry for forward and backward
class AndImplForward_cpu
: public Registrable<AndImplForward_cpu, std::tuple<DataType, DataType, DataType>, void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*,void*)> {
};
class AndImplBackward_cpu
: public Registrable<AndImplBackward_cpu, std::tuple<DataType, DataType, DataType>, void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*, void*)> {
};
class AndImpl_cpu : public OperatorImpl {
public:
AndImpl_cpu(const And_Op& op) : OperatorImpl(op, "cpu") {}
static std::unique_ptr<AndImpl_cpu> create(const And_Op& op) {
return std::make_unique<AndImpl_cpu>(op);
}
Elts_t getNbRequiredProtected(const IOIndex_t inputIdx) const override final;
void forward() override;
};
namespace {
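// Static registration: ties AndImpl_cpu to the "cpu" backend key when the
// library is loaded, so the operator can look up its implementation by name.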
static Registrar<And_Op> registrarAndImpl_cpu("cpu", Aidge::AndImpl_cpu::create);
}
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_ANDIMPL_H_ */
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_ANDIMPL_FORWARD_KERNEL_H_
#define AIDGE_CPU_OPERATOR_ANDIMPL_FORWARD_KERNEL_H_
#include "aidge/backend/cpu/data/Broadcasting.hpp"
#include "aidge/backend/cpu/operator/AndImpl.hpp"
#include "aidge/utils/Registrar.hpp"
namespace Aidge {
template <class I1, class I2, class O>
void AndImpl_cpu_forward_kernel(const std::vector<std::size_t>& input1Dims,
const std::vector<std::size_t>& input2Dims,
const std::vector<std::size_t>& outputDims,
const void* input1_,
const void* input2_,
void* output_) {
const I1* input_1 = static_cast<const I1*>(input1_);
const I2* input_2 = static_cast<const I2*>(input2_);
O* output = static_cast<O*>(output_);
size_t totalElements = 1;
for (size_t dimSize : outputDims) {
totalElements *= dimSize;
}
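    // Walk the flattened output tensor: recover the N-D coordinates of each
    // element, then map them back through the (possibly broadcast) input shapes.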
for (std::size_t oIndex = 0; oIndex < totalElements; ++oIndex)
{
std::vector<size_t> indexes = getMultiDimIndices(outputDims, oIndex);
std::size_t idx1 = getFlattenedIndex(input1Dims, indexes);
std::size_t idx2 = getFlattenedIndex(input2Dims, indexes);
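        // The kernel flags positions where both inputs hold the same value;
        // the unit tests below rely on this equality-comparison semantic.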
output[oIndex] = static_cast<O>(input_1[idx1] == input_2[idx2]);
}
}
namespace {
static Registrar<AndImplForward_cpu> registrarAndImplForward_cpu_Float32(
{DataType::Float32, DataType::Float32, DataType::Float32},
Aidge::AndImpl_cpu_forward_kernel<float, float, float>);
static Registrar<AndImplForward_cpu> registrarAndImplForward_cpu_Float64(
{DataType::Float64, DataType::Float64, DataType::Float64},
Aidge::AndImpl_cpu_forward_kernel<double, double, double>);
static Registrar<AndImplForward_cpu> registrarAndImplForward_cpu_Int32(
{DataType::Int32, DataType::Int32, DataType::Int32},
Aidge::AndImpl_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t>);
static Registrar<AndImplForward_cpu> registrarAndImplForward_cpu_Int64(
{DataType::Int64, DataType::Int64, DataType::Int64},
Aidge::AndImpl_cpu_forward_kernel<std::int64_t, std::int64_t, std::int64_t>);
} // namespace
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_ANDIMPL_FORWARD_KERNEL_H_ */
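For reference, the broadcast index mapping the kernel above relies on can be sketched as a standalone re-implementation of the getMultiDimIndices / getFlattenedIndex helpers from Broadcasting.hpp (row-major layout assumed; this is an illustrative sketch, not the library code itself):

#include <cstddef>
#include <vector>

// Recover N-D coordinates from a flat row-major index.
std::vector<std::size_t> getMultiDimIndices(const std::vector<std::size_t>& dims,
                                            std::size_t flatIdx) {
    std::vector<std::size_t> indices(dims.size());
    for (std::size_t i = dims.size(); i-- > 0;) {
        indices[i] = flatIdx % dims[i];
        flatIdx /= dims[i];
    }
    return indices;
}

// Flatten coordinates against a (possibly broadcast) shape: a dimension of
// size 1 pins its coordinate to 0, which is exactly the broadcasting rule.
std::size_t getFlattenedIndex(const std::vector<std::size_t>& dims,
                              const std::vector<std::size_t>& indices) {
    std::size_t flatIdx = 0;
    for (std::size_t i = 0; i < dims.size(); ++i) {
        flatIdx = flatIdx * dims[i] + (dims[i] == 1 ? 0 : indices[i]);
    }
    return flatIdx;
}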
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_ARGMAXIMPL_H_
#define AIDGE_CPU_OPERATOR_ARGMAXIMPL_H_
#include <array>
#include <memory>
#include <tuple>
#include <vector>
#include "aidge/backend/OperatorImpl.hpp"
#include "aidge/operator/ArgMax.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
class ArgMaxImplForward_cpu
: public Registrable<ArgMaxImplForward_cpu,
std::tuple<DataType, DataType>,
void(std::int32_t,
DimSize_t,
const std::vector<DimSize_t>&,
const void *,
void *)> {};
class ArgMaxImplBackward_cpu
: public Registrable<ArgMaxImplBackward_cpu,
std::tuple<DataType, DataType>,
void(std::int32_t,
DimSize_t,
const std::vector<DimSize_t>&,
const void *,
void *)> {};
class ArgMaxImpl_cpu : public OperatorImpl {
public:
ArgMaxImpl_cpu(const ArgMax_Op& op) : OperatorImpl(op, "cpu") {}
static std::unique_ptr<ArgMaxImpl_cpu> create(const ArgMax_Op &op) {
return std::make_unique<ArgMaxImpl_cpu>(op);
}
public:
void forward() override;
};
namespace {
static Registrar<ArgMax_Op> registrarArgMaxImpl_cpu("cpu", Aidge::ArgMaxImpl_cpu::create);
} // namespace
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_ARGMAXIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_ARGMAXIMPL_FORWARD_KERNEL_H_
#define AIDGE_CPU_OPERATOR_ARGMAXIMPL_FORWARD_KERNEL_H_
#include <algorithm> // std::for_each
#include <cstddef> // std::size_t
#include <cstdint> // std::int32_t
#include <functional> //std::multiplies
#include <numeric> //std::accumulate
#include <vector>
#include <limits>
#include "aidge/backend/cpu/operator/ArgMaxImpl.hpp"
#include "aidge/data/Data.hpp"
#include "aidge/operator/ArgMax.hpp"
#include "aidge/utils/Registrar.hpp"
namespace Aidge {
template <class I, class O>
void ArgMaxImpl_cpu_forward_kernel(std::int32_t axis_,
DimSize_t select_last_index,
const std::vector<DimSize_t>& inputDims,
const void* input_,
void* output_) {
const I* input = static_cast<const I*>(input_);
O* output = static_cast<O*>(output_);
const std::size_t axis = static_cast<std::size_t>(axis_);
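    // Split the shape around `axis`: stride_pre counts the slices before the
    // axis and stride_post the contiguous elements after it, so input element
    // (pre, i, post) lives at flat index pre*dim_i*stride_post + i*stride_post + post.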
std::size_t stride_post = 1;
for (std::size_t i = axis + 1; i < inputDims.size(); ++i) {
stride_post *= inputDims[i];
}
std::size_t stride_pre = 1;
for (std::size_t i = 0; i < axis; ++i) {
stride_pre *= inputDims[i];
}
const std::size_t dim_i = inputDims[axis];
for (std::size_t pre = 0; pre < stride_pre; ++pre) {
for (std::size_t post = 0; post < stride_post; ++post) {
const std::size_t idx_i = pre * dim_i * stride_post + post;
const std::size_t idx_o = pre * stride_post + post;
            output[idx_o] = 0;
            I max = std::numeric_limits<I>::lowest(); // lowest(), not min(): min() is the smallest positive value for floating-point types
for (std::size_t i = 0; i < dim_i; ++i) {
I curr_value = input[idx_i + i*stride_post];
if (select_last_index) {
if (curr_value>=max) {
output[idx_o] = i;
max = curr_value;
}
}
else {
if (curr_value > max) {
output[idx_o] = i;
max = curr_value;
}
}
}
}
}
}
namespace {
static Registrar<ArgMaxImplForward_cpu> registrarArgMaxImplForward_cpu_Float32(
{DataType::Float32, DataType::Float32}, Aidge::ArgMaxImpl_cpu_forward_kernel<float, float>);
static Registrar<ArgMaxImplForward_cpu> registrarArgMaxImplForward_cpu_Int32(
{DataType::Int32, DataType::Int32}, Aidge::ArgMaxImpl_cpu_forward_kernel<int, int>);
static Registrar<ArgMaxImplForward_cpu> registrarArgMaxImplForward_cpu_Float64(
{DataType::Float64, DataType::Float64}, Aidge::ArgMaxImpl_cpu_forward_kernel<double, double>);
} // namespace
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_ARGMAXIMPL_FORWARD_KERNEL_H_ */
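As a quick sanity check, here is a minimal driver that calls the kernel directly on the 2x3x4 tensor used by the tests further below, taking the argmax along axis 1 (illustrative only, assuming the Aidge headers are on the include path; real code goes through ArgMaxImpl_cpu::forward and the registrar):

#include <iostream>
#include <vector>
#include "aidge/backend/cpu/operator/ArgMaxImpl_forward_kernels.hpp"

int main() {
    const std::vector<Aidge::DimSize_t> dims{2, 3, 4};
    const std::vector<float> input{
        1.f, 2.f,  3.f, 4.f,   8.f, 0.f, 17.f, 1.f,   5.f, 10.f, 6.f, 0.f,
        7.f, 1.f,  9.f, 4.f,   0.f, 8.f,  4.f, 2.f,   9.f,  2.f, 0.f, 5.f};
    std::vector<float> output(2 * 4);  // axis 1 collapses to a single entry
    Aidge::ArgMaxImpl_cpu_forward_kernel<float, float>(
        /*axis=*/1, /*select_last_index=*/0, dims, input.data(), output.data());
    for (float v : output) std::cout << v << ' ';  // expected: 1 2 1 0 2 1 0 2
    std::cout << '\n';
    return 0;
}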
@@ -38,7 +38,10 @@ void ReduceMeanImpl_cpu_forward_kernel(const std::vector<std::int32_t>& axes,
const std::size_t nb_dims = inputDims.size();
const std::size_t totalElements = std::accumulate(inputDims.cbegin(), inputDims.cend(), 1, std::multiplies<std::size_t>());
if (axes.empty()){
std::copy_n(input,totalElements, output);
}
else if (axes.size() == 1) {
const std::size_t stride_pre = std::accumulate(inputDims.cbegin(), inputDims.cbegin() + axes[0], 1, std::multiplies<std::size_t>());
const std::size_t stride_post = std::accumulate(inputDims.crbegin(), inputDims.crbegin() + nb_dims -1 - axes[0], 1, std::multiplies<std::size_t>());
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_REDUCESUMIMPL_H_
#define AIDGE_CPU_OPERATOR_REDUCESUMIMPL_H_
#include <array>
#include <memory>
#include <tuple>
#include <vector>
#include "aidge/backend/OperatorImpl.hpp"
#include "aidge/operator/ReduceSum.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
class ReduceSumImplForward_cpu
: public Registrable<ReduceSumImplForward_cpu,
std::tuple<DataType, DataType>,
void(const std::vector<std::int32_t>&,
DimSize_t,
const std::vector<DimSize_t>&,
const void *,
void *)> {};
class ReduceSumImpl1DBackward_cpu
: public Registrable<ReduceSumImpl1DBackward_cpu,
std::tuple<DataType, DataType>,
void(const std::vector<std::int32_t>&,
DimSize_t,
const std::vector<DimSize_t>&,
const void *,
void *)> {};
class ReduceSumImpl_cpu : public OperatorImpl {
public:
ReduceSumImpl_cpu(const ReduceSum_Op& op) : OperatorImpl(op, "cpu") {}
static std::unique_ptr<ReduceSumImpl_cpu> create(const ReduceSum_Op &op) {
return std::make_unique<ReduceSumImpl_cpu>(op);
}
public:
void forward() override;
};
namespace {
static Registrar<ReduceSum_Op> registrarReduceSumImpl_cpu("cpu", Aidge::ReduceSumImpl_cpu::create);
} // namespace
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_REDUCESUMIMPL_H_ */
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_REDUCESUMIMPL_FORWARD_KERNEL_H_
#define AIDGE_CPU_OPERATOR_REDUCESUMIMPL_FORWARD_KERNEL_H_
#include <algorithm> // std::for_each
#include <cstddef> // std::size_t
#include <cstdint> // std::int32_t
#include <functional> //std::multiplies
#include <numeric> //std::accumulate
#include <vector>
#include "aidge/backend/cpu/operator/ReduceSumImpl.hpp"
#include "aidge/data/Data.hpp"
#include "aidge/operator/ReduceSum.hpp"
#include "aidge/utils/Registrar.hpp"
namespace Aidge {
template <class I, class O>
void ReduceSumImpl_cpu_forward_kernel(const std::vector<std::int32_t>& axes,
DimSize_t /*keepDims*/,
const std::vector<DimSize_t>& inputDims,
const void* input_,
void* output_) {
const I* input = static_cast<const I*>(input_);
O* output = static_cast<O*>(output_);
const std::size_t nb_dims = inputDims.size();
const std::size_t totalElements = std::accumulate(inputDims.cbegin(), inputDims.cend(), 1, std::multiplies<std::size_t>());
if (axes.empty()){
std::copy_n(input,totalElements, output);
}
else if (axes.size() == 1) {
const std::size_t stride_pre = std::accumulate(inputDims.cbegin(), inputDims.cbegin() + axes[0], 1, std::multiplies<std::size_t>());
const std::size_t stride_post = std::accumulate(inputDims.crbegin(), inputDims.crbegin() + nb_dims -1 - axes[0], 1, std::multiplies<std::size_t>());
const std::size_t dim_i = inputDims[axes[0]];
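        // Same pre/axis/post stride decomposition as in the ArgMax kernel above.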
for (std::size_t pre = 0; pre < stride_pre; ++pre) {
for (std::size_t post = 0; post < stride_post; ++post) {
const std::size_t idx_i = pre * dim_i * stride_post + post;
const std::size_t idx_o = pre * stride_post + post;
O sum = 0;
for (std::size_t i = 0; i < dim_i; ++i) {
sum +=input[idx_i + i*stride_post];
}
output[idx_o] = sum;
}
}
} else {
std::size_t outputElements = totalElements;
auto stride_post = std::unique_ptr<std::size_t[]>(new std::size_t[nb_dims]);
stride_post[nb_dims - 1] = 1;
for (std::size_t i = nb_dims-2; i != static_cast<std::size_t>(-1); --i) {
stride_post[i] = stride_post[i+1]*inputDims[i+1];
}
auto stride_pre = std::unique_ptr<std::size_t[]>(new std::size_t[nb_dims]);
stride_pre[0] = 1;
for (std::size_t i = 1; i < nb_dims; ++i) {
stride_pre[i] = stride_pre[i-1]*inputDims[i-1];
}
const I* inputAccumulation = input;
I* outputAccumulation = nullptr;
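        // Reduce one axis per pass: each pass shrinks the intermediate buffer
        // by a factor of inputDims[a] and rescales the pre-strides of the
        // remaining axes (the std::for_each below).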
for (const auto& axisInt : axes) {
const std::size_t a = static_cast<std::size_t>(axisInt);
outputElements /= inputDims[a];
outputAccumulation = new I[outputElements];
const std::size_t dim_i = inputDims[a];
for (std::size_t pre = 0; pre < stride_pre[a]; ++pre) {
for (std::size_t post = 0; post < stride_post[a]; ++post) {
const std::size_t idx_i = pre * dim_i * stride_post[a] + post;
const std::size_t idx_o = pre * stride_post[a] + post;
I sum = 0;
for (std::size_t i = 0; i < dim_i; ++i) {
sum += inputAccumulation[idx_i + i*stride_post[a]];
}
outputAccumulation[idx_o] = sum;
}
}
std::for_each(stride_pre.get()+a+1, stride_pre.get()+nb_dims, [dim_i] (std::size_t& val) { val /= dim_i; });
if (inputAccumulation != input) {
delete[] inputAccumulation;
}
inputAccumulation = outputAccumulation;
}
        // Copy the accumulated elements into the output (unlike ReduceMean, there is no divisor here)
std::copy(inputAccumulation, inputAccumulation + outputElements, output);
if (outputAccumulation) {
delete[] outputAccumulation;
}
}
}
namespace {
static Registrar<ReduceSumImplForward_cpu> registrarReduceSumImplForward_cpu_Float32(
{DataType::Float32, DataType::Float32}, Aidge::ReduceSumImpl_cpu_forward_kernel<float, float>);
static Registrar<ReduceSumImplForward_cpu> registrarReduceSumImplForward_cpu_Int32(
{DataType::Int32, DataType::Int32}, Aidge::ReduceSumImpl_cpu_forward_kernel<int, int>);
static Registrar<ReduceSumImplForward_cpu> registrarReduceSumImplForward_cpu_Float64(
{DataType::Float64, DataType::Float64}, Aidge::ReduceSumImpl_cpu_forward_kernel<double, double>);
} // namespace
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_REDUCESUMIMPL_FORWARD_KERNEL_H_ */
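Similarly, the single-axis path can be exercised directly with the 3x2x2 tensor from the tests below, summing over axis 1 (an illustrative sketch assuming the Aidge headers are available; production use goes through ReduceSumImpl_cpu::forward):

#include <cstdint>
#include <iostream>
#include <vector>
#include "aidge/backend/cpu/operator/ReduceSumImpl_forward_kernels.hpp"

int main() {
    const std::vector<Aidge::DimSize_t> dims{3, 2, 2};
    const std::vector<float> input{ 5.f, 1.f, 20.f, 2.f,
                                   30.f, 1.f, 40.f, 2.f,
                                   55.f, 1.f, 60.f, 2.f};
    std::vector<float> output(3 * 2);  // axis 1 is summed away
    Aidge::ReduceSumImpl_cpu_forward_kernel<float, float>(
        std::vector<std::int32_t>{1}, /*keepDims=*/1, dims,
        input.data(), output.data());
    for (float v : output) std::cout << v << ' ';  // expected: 25 3 70 3 115 3
    std::cout << '\n';
    return 0;
}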
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <cassert>
#include <chrono> // std::chrono::milliseconds
#include <numeric> // std::accumulate
#include <thread> // std::this_thread::sleep_for
#include <vector>
#include "aidge/operator/And.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/Broadcasting.hpp"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include "aidge/backend/cpu/operator/AndImpl.hpp"
#include "aidge/backend/cpu/operator/AndImpl_forward_kernels.hpp"
Aidge::Elts_t Aidge::AndImpl_cpu::getNbRequiredProtected(const Aidge::IOIndex_t /*inputIdx*/) const {
// this implementation can be in-place
return Elts_t::DataElts(0);
}
void Aidge::AndImpl_cpu::forward() {
// Find the correct kernel type
auto kernelFunc = Registrar<AndImplForward_cpu>::create({
std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dataType(),
std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dataType(),
std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()});
const std::vector<std::size_t> inputDims0 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims());
const std::vector<std::size_t> inputDims1 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dims());
// Call kernel
kernelFunc(inputDims0,
inputDims1,
std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
getCPUPtr(mOp.getRawInput(0)),
getCPUPtr(mOp.getRawInput(1)),
getCPUPtr(mOp.getRawOutput(0)));
}
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include "aidge/backend/cpu/operator/ArgMaxImpl.hpp"
#include <memory>
#include <vector>
#include "aidge/utils/Types.h"
#include "aidge/operator/ArgMax.hpp"
#include "aidge/backend/cpu/operator/ArgMaxImpl_forward_kernels.hpp"
void Aidge::ArgMaxImpl_cpu::forward() {
const ArgMax_Op& op_ = dynamic_cast<const ArgMax_Op&>(mOp);
// Find the correct kernel type
auto kernelFunc = Registrar<ArgMaxImplForward_cpu>::create({
op_.getInput(0)->dataType(),
op_.getOutput(0)->dataType()});
// Call kernel
kernelFunc(op_.axis(),
op_.selectLastIndex(),
op_.getInput(0)->dims(),
op_.getInput(0)->getImpl()->rawPtr(),
op_.getOutput(0)->getImpl()->rawPtr());
}
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include "aidge/backend/cpu/operator/ReduceSumImpl.hpp"
#include <memory>
#include <vector>
#include "aidge/utils/Types.h"
#include "aidge/operator/ReduceSum.hpp"
#include "aidge/backend/cpu/operator/ReduceSumImpl_forward_kernels.hpp"
void Aidge::ReduceSumImpl_cpu::forward() {
const ReduceSum_Op& op_ = dynamic_cast<const ReduceSum_Op&>(mOp);
// Find the correct kernel type
auto kernelFunc = Registrar<ReduceSumImplForward_cpu>::create({
op_.getInput(0)->dataType(),
op_.getOutput(0)->dataType()});
// Call kernel
kernelFunc(op_.axes(),
op_.keepDims(),
op_.getInput(0)->dims(),
op_.getInput(0)->getImpl()->rawPtr(),
op_.getOutput(0)->getImpl()->rawPtr());
}
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <catch2/catch_test_macros.hpp>
#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution
#include "aidge/data/Tensor.hpp"
#include "aidge/operator/And.hpp"
#include "aidge/backend/cpu.hpp"
using namespace Aidge;
TEST_CASE("[cpu/operator] And(forward)", "[And][CPU]") {
SECTION("ForwardDims")
{
constexpr std::uint16_t NBTRIALS = 10;
// Create a random number generator
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // random floats in [0.1, 1.1)
std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10));
std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5));
std::uniform_int_distribution<int> boolDist(0,1);
SECTION("Same dimensions") {
for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
DimSize_t nbDims = nbDimsDist(gen);
std::vector<DimSize_t> dims(nbDims);
for (std::size_t i = 0; i < nbDims; i++) {
dims[i] = dimSizeDist(gen);
}
std::shared_ptr<Tensor> myInput1 = std::make_shared<Tensor>(dims);
myInput1->setBackend("cpu");
myInput1->setDataType(DataType::Float32);
myInput1->zeros();
std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(dims);
myInput2->setBackend("cpu");
myInput2->setDataType(DataType::Float32);
myInput2->zeros();
std::shared_ptr<Node> myAnd = And();
auto op = std::static_pointer_cast<OperatorTensor>(myAnd -> getOperator());
op->associateInput(0,myInput1);
op->associateInput(1,myInput2);
op->setDataType(DataType::Float32);
op->setBackend("cpu");
op->forwardDims();
const auto outputDims = op->getOutput(0)->dims();
REQUIRE(outputDims == dims);
}
}
SECTION("Broadcasting") {
for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
DimSize_t nbDims = nbDimsDist(gen);
std::vector<DimSize_t> dims1(nbDims, 1);
std::vector<DimSize_t> dims2(nbDims, 1);
std::vector<DimSize_t> expectedOutDims;
for (std::size_t i = 0; i < nbDims; i++) {
DimSize_t dim = dimSizeDist(gen);
if (boolDist(gen)) {
dims1[i] = dim;
}
if (boolDist(gen)) {
dims2[i] = dim;
}
expectedOutDims.push_back(std::max(dims1[i],dims2[i]));
}
std::shared_ptr<Tensor> myInput1 = std::make_shared<Tensor>(dims1);
myInput1->setBackend("cpu");
myInput1->setDataType(DataType::Float32);
myInput1->zeros();
std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(dims2);
myInput2->setBackend("cpu");
myInput2->setDataType(DataType::Float32);
myInput2->zeros();
std::shared_ptr<Node> myAnd = And();
auto op = std::static_pointer_cast<OperatorTensor>(myAnd -> getOperator());
op->associateInput(0,myInput1);
op->associateInput(1,myInput2);
op->setDataType(DataType::Float32);
op->setBackend("cpu");
op->forwardDims();
const auto outputDims = op->getOutput(0)->dims();
REQUIRE(outputDims == expectedOutDims);
}
}
}
SECTION("Same size inputs") {
std::shared_ptr<Tensor> input1 = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
{ //
{ //
{{20, 15},{31, 11},{22, 49}}, //
{{41, 10},{24, 51},{27, 52}}, //
{{26, 53},{27, 54},{28, 55}} //
}, //
{ //
{{29, 56},{30, 57},{31, 58}}, //
{{32, 59},{33, 60},{34, 61}}, //
{{35, 62},{36, 63},{37, 64}} //
}, //
{ //
{{38, 65},{39, 66},{40, 67}}, //
{{41, 68},{42, 69},{43, 70}}, //
{{44, 71},{45, 72},{46, 73}} //
} //
} //
}); //
std::shared_ptr<Tensor> input2 = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
{ //
{ //
{{20, 47},{21, 48},{22, 49}}, //
{{23, 50},{24, 51},{25, 52}}, //
{{17, 53},{27, 26},{14, 33}} //
}, //
{ //
{{29, 56},{30, 57},{31, 58}}, //
{{72, 44},{33, 20},{27, 55}}, //
{{35, 24},{25, 63},{28, 64}} //
}, //
{ //
{{32, 65},{39, 66},{40, 70}}, //
{{41, 53},{42, 60},{34, 70}}, //
{{44, 71},{30, 12},{46, 73}} //
} //
} //
}); //
std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
{
{
{{1, 0},{0, 0},{1, 1}},
{{0, 0},{1, 1},{0, 1}},
{{0, 1},{1, 0},{0, 0}}
},
{
{{1, 1},{1, 1},{1, 1}},
{{0, 0},{1, 0},{0, 0}},
{{1, 0},{0, 1},{0, 1}}
},
{
{{0, 1},{1, 1},{1, 0}},
{{1, 0},{1, 0},{0, 1}},
{{1, 1},{0, 0},{1, 1}}
}
}
});
std::shared_ptr<Node> myAnd = And();
auto op = std::static_pointer_cast<OperatorTensor>(myAnd -> getOperator());
op->associateInput(0, input1);
op->associateInput(1, input2);
op->setBackend("cpu");
op->setDataType(DataType::Int32);
myAnd->forward();
REQUIRE(*(op->getOutput(0)) == *expectedOutput);
}
SECTION("Broadcasting") {
std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array4D<int,1,3,3,2> {
{ //
{ //
{{10, 20},{22, 23},{20, 20}}, //
{{10, 15},{10, 29},{20, 20}}, //
{{26, 25},{33, 20},{10, 20}} //
} //
} //
}); //
std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<int,2> {{10, 20}});
std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,1,3,3,2> {
{ //
{ //
{{ 1, 1},{ 0, 0},{ 0, 1}}, //
{{ 1, 0},{ 1, 0},{ 0, 1}}, //
{{ 0, 0},{ 0, 1},{ 1, 1}} //
} //
} //
}); //
std::shared_ptr<Node> myAnd = And();
auto op = std::static_pointer_cast<OperatorTensor>(myAnd -> getOperator());
op->associateInput(0, input_1);
op->associateInput(1, input_2);
op->setDataType(DataType::Int32);
op->setBackend("cpu");
myAnd->forward();
op->getOutput(0)->print();
expectedOutput->print();
REQUIRE(*op->getOutput(0) == *expectedOutput);
}
}
\ No newline at end of file
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <catch2/catch_test_macros.hpp>
#include <memory>
#include <numeric> // std::accumulate
#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution
#include "aidge/data/Tensor.hpp"
#include "aidge/operator/ArgMax.hpp"
#include "aidge/operator/Conv.hpp"
#include "aidge/backend/cpu.hpp"
#include "aidge/utils/TensorUtils.hpp"
using namespace Aidge;
TEST_CASE("[cpu/operator] ArgMax(forward)", "[ArgMax][CPU]") {
SECTION("ForwardDims")
{
constexpr std::uint16_t NBTRIALS = 10;
// Create a random number generator
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // random floats in [0.1, 1.1)
std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10));
std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5));
std::uniform_int_distribution<int> boolDist(0,1);
SECTION("KeepDims") {
for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
DimSize_t nbDims = nbDimsDist(gen);
std::vector<DimSize_t> dims(nbDims);
std::vector<DimSize_t> expectedOutDims(nbDims);
std::uniform_int_distribution<std::int32_t> axisDist(std::int32_t(0), std::int32_t(nbDims-1));
std::int32_t axis = axisDist(gen);
for (std::size_t i = 0; i < nbDims; i++) {
dims[i] = dimSizeDist(gen);
if (i == axis) {
expectedOutDims[i] = 1;
}
else {
expectedOutDims[i] = dims[i];
}
}
std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(dims);
myInput->setBackend("cpu");
myInput->setDataType(DataType::Float32);
myInput->zeros();
std::shared_ptr<Node> myArgMax = ArgMax(axis);
auto op = std::static_pointer_cast<OperatorTensor>(myArgMax -> getOperator());
op->associateInput(0,myInput);
op->setDataType(DataType::Float32);
op->setBackend("cpu");
op->forwardDims();
const auto outputDims = op->getOutput(0)->dims();
REQUIRE(outputDims == expectedOutDims);
}
}
SECTION("Not KeepDims") {
for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
DimSize_t nbDims = nbDimsDist(gen);
std::vector<DimSize_t> dims(nbDims);
std::vector<DimSize_t> expectedOutDims;
std::uniform_int_distribution<std::int32_t> axisDist(std::int32_t(0), std::int32_t(nbDims-1));
std::int32_t axis = axisDist(gen);
for (std::size_t i = 0; i < nbDims; i++) {
dims[i] = dimSizeDist(gen);
if(i != axis) {
expectedOutDims.push_back(dims[i]);
}
}
if(expectedOutDims.empty()) {
expectedOutDims.push_back(1);
}
std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(dims);
myInput->setBackend("cpu");
myInput->setDataType(DataType::Float32);
std::shared_ptr<Node> myArgMax = ArgMax(axis, false);
auto op = std::static_pointer_cast<OperatorTensor>(myArgMax -> getOperator());
op->associateInput(0,myInput);
op->setDataType(DataType::Float32);
op->setBackend("cpu");
op->forwardDims();
const auto outputDims = op->getOutput(0)->dims();
REQUIRE(outputDims == expectedOutDims);
}
}
}
SECTION("3D Tensor") {
std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,2,3,4> {
{
{
{ 1.0, 2.0, 3.0, 4.0},
{ 8.0, 0.0, 17.0, 1.0},
{ 5.0, 10.0, 6.0, 0.0}
},
{
{ 7.0, 1.0, 9.0, 4.0},
{ 0.0, 8.0, 4.0, 2.0},
{ 9.0, 2.0, 0.0, 5.0}
}
}
});
SECTION("Axis 2") {
Tensor myOutput = Tensor(Array3D<float,2,3, 1> {
{
{
{3.0},
{2.0},
{1.0}
},
{
{2.0},
{1.0},
{0.0}
}
}
});
std::shared_ptr<Node> myArgMax = ArgMax(2);
auto op = std::static_pointer_cast<OperatorTensor>(myArgMax -> getOperator());
op->associateInput(0,myInput);
op->setDataType(DataType::Float32);
op->setBackend("cpu");
myArgMax->forward();
REQUIRE(*(op->getOutput(0)) == myOutput);
}
SECTION("Axis 2 with keep_dims false") {
Tensor myOutput = Tensor(Array2D<float,2,3> {
{
{ 3.0, 2.0, 1.0 },
{ 2.0, 1.0, 0.0 }
}
});
std::shared_ptr<Node> myArgMax = ArgMax(2,0);
auto op = std::static_pointer_cast<OperatorTensor>(myArgMax -> getOperator());
op->associateInput(0,myInput);
op->setDataType(DataType::Float32);
op->setBackend("cpu");
myArgMax->forward();
REQUIRE(*(op->getOutput(0)) == myOutput);
}
SECTION("Axis 1") {
Tensor myOutput = Tensor(Array3D<float,2,1,4> {
{
{
{ 1.0, 2.0, 1.0, 0.0 }
},
{
{ 2.0, 1.0, 0.0, 2.0 }
}
}
});
std::shared_ptr<Node> myArgMax = ArgMax(1);
auto op = std::static_pointer_cast<OperatorTensor>(myArgMax -> getOperator());
op->associateInput(0,myInput);
op->setDataType(DataType::Float32);
op->setBackend("cpu");
myArgMax->forward();
REQUIRE(*(op->getOutput(0)) == myOutput);
}
SECTION("Axis 0") {
Tensor myOutput = Tensor(Array3D<float,1,3,4> {
{
{
{ 1.0, 0.0, 1.0, 0.0 },
{ 0.0, 1.0, 0.0, 1.0 },
{ 1.0, 0.0, 0.0, 1.0 }
}
}
});
std::shared_ptr<Node> myArgMax = ArgMax(0);
auto op = std::static_pointer_cast<OperatorTensor>(myArgMax -> getOperator());
op->associateInput(0,myInput);
op->setDataType(DataType::Float32);
op->setBackend("cpu");
std::cout << " ............... "<< std::endl;
myArgMax->forward();
op->getOutput(0)->print();
std::cout <<"------"<<std::endl;
myOutput.print();
REQUIRE(*(op->getOutput(0)) == myOutput);
}
}
SECTION("Select_Last_Index") {
std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array1D<float,10> {
{
1.0, 5.0, 9.0, 0.0, 6.0, 2.0, 9.0, 4.0, 3.0, 9.0
}
});
std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array1D<float,1> {{9}});
std::shared_ptr<Node> myArgMax = ArgMax(0, 1, 1);
auto op = std::static_pointer_cast<OperatorTensor>(myArgMax -> getOperator());
op->associateInput(0,myInput);
op->setDataType(DataType::Float32);
op->setBackend("cpu");
myArgMax->forward();
op->getOutput(0)->print();
REQUIRE(*(op->getOutput(0)) == *myOutput);
}
}
\ No newline at end of file
@@ -11,6 +11,8 @@
#include <catch2/catch_test_macros.hpp>
#include <memory>
#include <numeric> // std::accumulate
#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution
#include "aidge/data/Tensor.hpp"
#include "aidge/operator/ReduceMean.hpp"
@@ -22,6 +24,129 @@
using namespace Aidge;
TEST_CASE("[cpu/operator] ReduceMean(forward)", "[ReduceMean][CPU]") {
SECTION("ForwardDims")
{
constexpr std::uint16_t NBTRIALS = 10;
// Create a random number generator
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // random floats in [0.1, 1.1)
std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10));
std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5));
std::uniform_int_distribution<int> boolDist(0,1);
SECTION("KeepDims") {
for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
DimSize_t nbDims = nbDimsDist(gen);
std::vector<DimSize_t> dims(nbDims);
std::vector<DimSize_t> expectedOutDims(nbDims);
std::vector<std::int32_t> axes;
for (std::size_t i = 0; i < nbDims; i++) {
dims[i] = dimSizeDist(gen);
expectedOutDims[i] = dims[i];
if(boolDist(gen)) {
axes.push_back(i);
expectedOutDims[i] = 1;
}
}
if (axes.empty()) { // Default behaviour if no axes are provided is to reduce all dimensions
std::fill(expectedOutDims.begin(), expectedOutDims.end(), 1);
}
std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(dims);
myInput->setBackend("cpu");
myInput->setDataType(DataType::Float32);
myInput->zeros();
std::shared_ptr<Node> myReduceMean = ReduceMean(axes, true);
auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator());
op->associateInput(0,myInput);
op->setDataType(DataType::Float32);
op->setBackend("cpu");
op->forwardDims();
const auto outputDims = op->getOutput(0)->dims();
REQUIRE(outputDims == expectedOutDims);
}
}
SECTION("Not KeepDims") {
for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
DimSize_t nbDims = nbDimsDist(gen);
std::vector<DimSize_t> dims(nbDims);
std::vector<DimSize_t> expectedOutDims;
std::vector<std::int32_t> axes;
for (std::size_t i = 0; i < nbDims; i++) {
dims[i] = dimSizeDist(gen);
if(boolDist(gen)) {
axes.push_back(i);
}
else {
expectedOutDims.push_back(dims[i]);
}
}
if (axes.empty() || expectedOutDims.empty()) { // Default behaviour if no axes are provided is to reduce all dimensions
expectedOutDims = std::vector<DimSize_t>{1};
}
std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(dims);
myInput->setBackend("cpu");
myInput->setDataType(DataType::Float32);
std::shared_ptr<Node> myReduceMean = ReduceMean(axes, false);
auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator());
op->associateInput(0,myInput);
op->setDataType(DataType::Float32);
op->setBackend("cpu");
op->forwardDims();
const auto outputDims = op->getOutput(0)->dims();
REQUIRE(outputDims == expectedOutDims);
}
}
SECTION("NoopWithEmptyAxes") {
for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
DimSize_t nbDims = nbDimsDist(gen);
std::vector<DimSize_t> dims(nbDims);
for (std::size_t i = 0; i < nbDims; i++) {
dims[i] = dimSizeDist(gen);
}
std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(dims);
myInput->setBackend("cpu");
myInput->setDataType(DataType::Float32);
std::shared_ptr<Node> myReduceMean = ReduceMean(std::vector<int32_t>{}, false, true);
auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator());
op->associateInput(0,myInput);
op->setDataType(DataType::Float32);
op->setBackend("cpu");
op->forwardDims();
const auto outputDims = op->getOutput(0)->dims();
REQUIRE(outputDims == dims);
}
}
SECTION("Not NoopWithEmptyAxes") {
for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
DimSize_t nbDims = nbDimsDist(gen);
std::vector<DimSize_t> dims(nbDims);
for (std::size_t i = 0; i < nbDims; i++) {
dims[i] = dimSizeDist(gen);
}
std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(dims);
myInput->setBackend("cpu");
myInput->setDataType(DataType::Float32);
std::shared_ptr<Node> myReduceMean = ReduceMean({}, false, false);
auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator());
op->associateInput(0,myInput);
op->setDataType(DataType::Float32);
op->setBackend("cpu");
op->forwardDims();
REQUIRE(op->getOutput(0)->nbDims() == 1);
REQUIRE(op->getOutput(0)->size() == 1);
}
}
}
SECTION("KeepDims") {
SECTION("test 1") {
std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> {
@@ -157,7 +282,7 @@ TEST_CASE("[cpu/operator] ReduceMean(forward)", "[ReduceMean][CPU]") {
{18.25}
});
std::shared_ptr<Node> myReduceMean = ReduceMean({}, 0);
auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator());
op->associateInput(0,myInput);
op->setDataType(DataType::Float32);
@@ -179,15 +304,42 @@ TEST_CASE("[cpu/operator] ReduceMean(forward)", "[ReduceMean][CPU]") {
{0.1293547f}
});
std::shared_ptr<Node> myReduceMean = ReduceMean({}, 0);
auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator());
op->associateInput(0,myInput);
op->setDataType(DataType::Float32);
op->setBackend("cpu");
myReduceMean->forward();
op->getOutput(0)->print();
REQUIRE(approxEq<float>(*(op->getOutput(0)), *myOutput));
}
SECTION("noop_with_empty_axes") {
std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> {
{
{
{ 5.0, 1.0 },
{ 20.0, 2.0 }
},
{
{ 30.0, 1.0 },
{ 40.0, 2.0 }
},
{
{ 55.0, 1.0 },
{ 60.0, 2.0 }
}
}
});
std::shared_ptr<Node> myReduceMean = ReduceMean({}, 0, 1);
auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator());
op->associateInput(0,myInput);
op->setDataType(DataType::Float32);
op->setBackend("cpu");
myReduceMean->forward();
op->getOutput(0)->print();
REQUIRE(*(op->getOutput(0)) == *myInput);
}
}
}
\ No newline at end of file
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <catch2/catch_test_macros.hpp>
#include <memory>
#include <numeric> // std::accumulate
#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution
#include "aidge/data/Tensor.hpp"
#include "aidge/operator/ReduceSum.hpp"
#include "aidge/operator/Conv.hpp"
#include "aidge/backend/cpu.hpp"
#include "aidge/utils/TensorUtils.hpp"
using namespace Aidge;
TEST_CASE("[cpu/operator] ReduceSum(forward)", "[ReduceSum][CPU]") {
SECTION("ForwardDims")
{
constexpr std::uint16_t NBTRIALS = 10;
// Create a random number generator
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // random floats in [0.1, 1.1)
std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10));
std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5));
std::uniform_int_distribution<int> boolDist(0,1);
SECTION("KeepDims") {
for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
DimSize_t nbDims = nbDimsDist(gen);
std::vector<DimSize_t> dims(nbDims);
std::vector<DimSize_t> expectedOutDims(nbDims);
std::vector<std::int32_t> axes;
for (std::size_t i = 0; i < nbDims; i++) {
dims[i] = dimSizeDist(gen);
expectedOutDims[i] = dims[i];
if(boolDist(gen)) {
axes.push_back(i);
expectedOutDims[i] = 1;
}
}
if (axes.empty()) { // Default behaviour if no axes are provided is to reduce all dimensions
std::fill(expectedOutDims.begin(), expectedOutDims.end(), 1);
}
std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(dims);
myInput->setBackend("cpu");
myInput->setDataType(DataType::Float32);
myInput->zeros();
std::shared_ptr<Node> myReduceSum = ReduceSum(axes, true);
auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator());
op->associateInput(0,myInput);
op->setDataType(DataType::Float32);
op->setBackend("cpu");
op->forwardDims();
const auto outputDims = op->getOutput(0)->dims();
REQUIRE(outputDims == expectedOutDims);
}
}
SECTION("Not KeepDims") {
for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
DimSize_t nbDims = nbDimsDist(gen);
std::vector<DimSize_t> dims(nbDims);
std::vector<DimSize_t> expectedOutDims;
std::vector<std::int32_t> axes;
for (std::size_t i = 0; i < nbDims; i++) {
dims[i] = dimSizeDist(gen);
if(boolDist(gen)) {
axes.push_back(i);
}
else {
expectedOutDims.push_back(dims[i]);
}
}
if (axes.empty() || expectedOutDims.empty()) { // Default behaviour if no axes are provided is to reduce all dimensions
expectedOutDims = std::vector<DimSize_t>{1};
}
std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(dims);
myInput->setBackend("cpu");
myInput->setDataType(DataType::Float32);
std::shared_ptr<Node> myReduceSum = ReduceSum(axes, false);
auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator());
op->associateInput(0,myInput);
op->setDataType(DataType::Float32);
op->setBackend("cpu");
op->forwardDims();
const auto outputDims = op->getOutput(0)->dims();
REQUIRE(outputDims == expectedOutDims);
}
}
SECTION("NoopWithEmptyAxes") {
for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
DimSize_t nbDims = nbDimsDist(gen);
std::vector<DimSize_t> dims(nbDims);
for (std::size_t i = 0; i < nbDims; i++) {
dims[i] = dimSizeDist(gen);
}
std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(dims);
myInput->setBackend("cpu");
myInput->setDataType(DataType::Float32);
std::shared_ptr<Node> myReduceSum = ReduceSum(std::vector<int32_t>{}, false, true);
auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator());
op->associateInput(0,myInput);
op->setDataType(DataType::Float32);
op->setBackend("cpu");
op->forwardDims();
const auto outputDims = op->getOutput(0)->dims();
REQUIRE(outputDims == dims);
}
}
SECTION("Not NoopWithEmptyAxes") {
for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
DimSize_t nbDims = nbDimsDist(gen);
std::vector<DimSize_t> dims(nbDims);
for (std::size_t i = 0; i < nbDims; i++) {
dims[i] = dimSizeDist(gen);
}
std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(dims);
myInput->setBackend("cpu");
myInput->setDataType(DataType::Float32);
std::shared_ptr<Node> myReduceSum = ReduceSum({}, false, false);
auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator());
op->associateInput(0,myInput);
op->setDataType(DataType::Float32);
op->setBackend("cpu");
op->forwardDims();
REQUIRE(op->getOutput(0)->nbDims() == 1);
REQUIRE(op->getOutput(0)->size() == 1);
}
}
}
SECTION("KeepDims") {
SECTION("test 1") {
std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> {
{
{
{ 5.0, 1.0 },
{ 20.0, 2.0 }
},
{
{ 30.0, 1.0 },
{ 40.0, 2.0 }
},
{
{ 55.0, 1.0 },
{ 60.0, 2.0 }
}
}
});
Tensor myOutput = Tensor(Array3D<float,3,1,2> {
{
{{ 25.0, 3.0 }},
{{ 70.0, 3.0 }},
{{ 115.0, 3.0 }}
}
});
std::shared_ptr<Node> myReduceSum = ReduceSum({1}, 1);
auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator());
op->associateInput(0,myInput);
op->setDataType(DataType::Float32);
op->setBackend("cpu");
myReduceSum->forward();
op->getOutput(0)->print();
REQUIRE(*(op->getOutput(0)) == myOutput);
}
SECTION("test 2") {
std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,3,2> {
{
{
{ 0.0, 0.0 },
{ 1.0, 1.0 },
{ 2.0, 2.0 }
},
{
{ 3.0, 3.0 },
{ 4.0, 4.0 },
{ 5.0, 5.0 }
},
{
{ 6.0, 6.0 },
{ 7.0, 7.0 },
{ 8.0, 8.0 }
}
}
});
Tensor myOutput = Tensor(Array3D<float,3,1,1> {
{
{{ 6.0 }},
{{ 24.0 }},
{{ 42.0 }}
}
});
std::shared_ptr<Node> myReduceSum = ReduceSum({1, 2}, 1);
auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator());
op->associateInput(0,myInput);
op->setDataType(DataType::Float32);
op->setBackend("cpu");
myReduceSum->forward();
myOutput.print();
op->getOutput(0)->print();
REQUIRE(*(op->getOutput(0)) == myOutput);
}
}
SECTION("not_KeepDims") {
std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> {
{
{
{ 5.0, 1.0 },
{ 20.0, 2.0 }
},
{
{ 30.0, 1.0 },
{ 40.0, 2.0 }
},
{
{ 55.0, 1.0 },
{ 60.0, 2.0 }
}
}
});
std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float,3,2> {
{
{ 25.0, 3.0 },
{ 70.0, 3.0 },
{ 115.0, 3.0 }
}
});
std::shared_ptr<Node> myReduceSum = ReduceSum({1}, 0);
auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator());
op->associateInput(0,myInput);
op->setDataType(DataType::Float32);
op->setBackend("cpu");
myReduceSum->forward();
op->getOutput(0)->print();
REQUIRE(*(op->getOutput(0)) == *myOutput);
}
SECTION("all_axes") {
SECTION("1") {
std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> {
{
{
{ 5.0, 1.0 },
{ 20.0, 2.0 }
},
{
{ 30.0, 1.0 },
{ 40.0, 2.0 }
},
{
{ 55.0, 1.0 },
{ 60.0, 2.0 }
}
}
});
std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array1D<float,1> {
{219.0}
});
std::shared_ptr<Node> myReduceSum = ReduceSum({}, 0);
auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator());
op->associateInput(0,myInput);
op->setDataType(DataType::Float32);
op->setBackend("cpu");
myReduceSum->forward();
op->getOutput(0)->print();
REQUIRE(*(op->getOutput(0)) == *myOutput);
}
SECTION("2") {
std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array2D<float,5,4> {
{{ 0.004232f, 0.105120f, 0.045124f, 0.009205f},
{ 0.000766f, 0.272162f, 0.503560f, 0.044163f},
{ 0.049755f, 0.000305f, 0.143634f, 0.013253f},
{ 0.096258f, 0.311231f, 0.358143f, 0.000452f},
{ 0.468617f, 0.015693f, 0.145316f, 0.000105f}}
});
std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array1D<float,1> {
{2.587094f}
});
std::shared_ptr<Node> myReduceSum = ReduceSum({0, 1}, 0);
auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator());
op->associateInput(0,myInput);
op->setDataType(DataType::Float32);
op->setBackend("cpu");
myReduceSum->forward();
op->getOutput(0)->print();
REQUIRE(approxEq<float>(*(op->getOutput(0)), *myOutput));
}
SECTION("noop_with_empty_axes") {
std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> {
{
{
{ 5.0, 1.0 },
{ 20.0, 2.0 }
},
{
{ 30.0, 1.0 },
{ 40.0, 2.0 }
},
{
{ 55.0, 1.0 },
{ 60.0, 2.0 }
}
}
});
std::shared_ptr<Node> myReduceSum = ReduceSum({}, 0, 1);
auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator());
op->associateInput(0,myInput);
op->setDataType(DataType::Float32);
op->setBackend("cpu");
myReduceSum->forward();
op->getOutput(0)->print();
REQUIRE(*(op->getOutput(0)) == *myInput);
}
}
}
\ No newline at end of file