Compare revisions
Commits on Source (19)
Showing with 527 additions and 22 deletions
@@ -31,6 +31,7 @@
#include "aidge/backend/cpu/operator/FCImpl.hpp"
#include "aidge/backend/cpu/operator/FoldImpl.hpp"
#include "aidge/backend/cpu/operator/GlobalAveragePoolingImpl.hpp"
#include "aidge/backend/cpu/operator/LRNImpl.hpp"
#include "aidge/backend/cpu/operator/LeakyReLUImpl.hpp"
#include "aidge/backend/cpu/operator/LnImpl.hpp"
#include "aidge/backend/cpu/operator/MatMulImpl.hpp"
@@ -40,6 +41,7 @@
#include "aidge/backend/cpu/operator/ReduceMeanImpl.hpp"
#include "aidge/backend/cpu/operator/ReduceSumImpl.hpp"
#include "aidge/backend/cpu/operator/ReLUImpl.hpp"
#include "aidge/backend/cpu/operator/RoundImpl.hpp"
#include "aidge/backend/cpu/operator/ScalingImpl.hpp"
#include "aidge/backend/cpu/operator/SigmoidImpl.hpp"
#include "aidge/backend/cpu/operator/SqrtImpl.hpp"
......
@@ -48,6 +48,12 @@ REGISTRAR(AddImpl_cpu,
REGISTRAR(AddImpl_cpu,
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float64}},
{ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<double, double>, nullptr});
+REGISTRAR(AddImpl_cpu,
+{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int8}},
+{ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::int8_t, std::int8_t>, nullptr});
+REGISTRAR(AddImpl_cpu,
+{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::UInt8}},
+{ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::uint8_t, std::uint8_t>, nullptr});
REGISTRAR(AddImpl_cpu,
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int32}},
{ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr});
......
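Note: with the two entries registered above, an Add node running on the CPU backend with Int8 or UInt8 tensors should dispatch to the matching kernel. A minimal usage sketch, modeled on the test files later in this diff (the exact Add factory signature and header path are assumptions):

#include <memory>
#include "aidge/data/Tensor.hpp"
#include "aidge/operator/Add.hpp"  // assumed header for the Add factory

void runInt8Add() {
    using namespace Aidge;
    // Request Int8 on the CPU backend so that kernel dispatch resolves to
    // the newly registered <std::int8_t, std::int8_t> entry.
    std::shared_ptr<Node> myAdd = Add("add");
    auto op = std::static_pointer_cast<OperatorTensor>(myAdd->getOperator());
    op->setDataType(DataType::Int8);
    op->setBackend("cpu");
    // associate Int8 input tensors with op->associateInput(...), then:
    myAdd->forward();
}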
@@ -29,7 +29,7 @@ using BatchNorm2D_Op = BatchNorm_Op<2>;
using BatchNormImpl2D_cpu = OperatorImpl_cpu<BatchNorm_Op<2>,
void(float,
float,
-const std::array<DimSize_t, 4> &,
+const std::vector<DimSize_t> &,
const void *,
const void *,
const void *,
......
@@ -38,7 +38,7 @@ namespace Aidge {
* @param output_ Output Tensor.
*/
template <class I, class P, class O>
-void BatchNormImpl2D_cpu_forward_kernel(float epsilon, float momentum, const std::array<DimSize_t, 4> &dims,
+void BatchNormImpl2D_cpu_forward_kernel(float epsilon, float momentum, const std::vector<DimSize_t> &dims,
const void *input_, const void *scale_, const void *shift_, void *batchMean_, void *batchVar_, void *output_, const bool freeze) {
// FIXME: missing convolution attributes as arguments
const I *input = static_cast<const I *>(input_);
@@ -49,9 +49,8 @@ void BatchNormImpl2D_cpu_forward_kernel(float epsilon, float momentum, const std
O *output = static_cast<O *>(output_);
const DimSize_t nbBatch = dims[0];
-const DimSize_t nbChannels = dims[1];
-const DimSize_t featureMapSize = dims[2]*dims[3];
+const DimSize_t nbChannels = (dims.size() > 1) ? dims[1] : 1;
+const DimSize_t featureMapSize = (dims.size() > 2) ? std::accumulate(dims.begin() + 2, dims.end(), 1, std::multiplies<DimSize_t>()) : 1;
if ((freeze == true) || (momentum == 0.0f)) {
for (std::size_t batch = 0; batch < nbBatch; ++batch) {
......
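Note: the change above generalizes the kernel from fixed 4-D inputs to arbitrary rank by folding every dimension after (batch, channel) into a single feature-map size. A self-contained illustration of the folding, standard C++ only:

#include <cstddef>
#include <functional> // std::multiplies
#include <numeric>    // std::accumulate
#include <vector>

// For {N, C, H, W} this returns H*W; for a 2-D {N, C} input it returns 1.
std::size_t featureMapSize(const std::vector<std::size_t>& dims) {
    return (dims.size() > 2)
        ? std::accumulate(dims.begin() + 2, dims.end(), std::size_t(1),
                          std::multiplies<std::size_t>())
        : 1;
}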
@@ -149,7 +149,6 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri
// input (batch, ch, Xin, Yin)
// weight (outCh, ch, kernelX, kernelY)
// does not take Dilation attribute into account
-using signedsize = std::make_signed<std::size_t>::type;
const std::size_t outChannels_s = oxSize * oySize;
if (dilated_kernel_x ==3 && dilated_kernel_y == 3) {
@@ -232,13 +231,13 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri
for (std::size_t oy = 0; oy < oySize; ++oy) {
const std::size_t oIndexFull = ox*oySize + oy;
-const signedsize ix = static_cast<signedsize>(ox * strideDims[0]);
-const signedsize iy = static_cast<signedsize>(oy * strideDims[1]);
+const std::size_t ix = ox * strideDims[0];
+const std::size_t iy = oy * strideDims[1];
for (std::size_t sx = 0; sx*dilationDims[0] < dilated_kernel_x; ++sx) {
for (std::size_t sy = 0; sy*dilationDims[1] < dilated_kernel_y; ++sy) {
output[oIndexFull] += weights[wIndex + sx*kernelDims[1] + sy] *
-input[iIndex + static_cast<std::size_t>(ix+static_cast<signedsize>(sx*dilationDims[0]))*inputDims[3] + static_cast<std::size_t>(iy+static_cast<signedsize>(sy*dilationDims[1]))];
+input[iIndex + static_cast<std::size_t>(ix + sx*dilationDims[0])*inputDims[3] + static_cast<std::size_t>(iy + sy*dilationDims[1])];
}
}
}
......
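Note: the signed intermediate type removed above was only useful when an offset could be negative; here ix/iy and the dilation terms are all products of unsigned values, so the index arithmetic can stay entirely in std::size_t. A restatement of the simplified inner-loop index, with names mirroring the hunk above:

#include <cstddef>

// Input element addressed by the depthwise convolution inner loop. Every
// term is unsigned and non-negative, so no signed round-trip is required.
std::size_t inputIndex(std::size_t iIndex, std::size_t ix, std::size_t iy,
                       std::size_t sx, std::size_t sy,
                       std::size_t dilationX, std::size_t dilationY,
                       std::size_t inputWidth) {
    return iIndex + (ix + sx * dilationX) * inputWidth + (iy + sy * dilationY);
}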
@@ -158,7 +158,6 @@ void ConvImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
// weight (outCh, inCh, kernelX, kernelY)
// does not take Dilation attribute into account
const std::size_t outChannels_s = oxSize * oySize;
-using signedsize = std::make_signed<std::size_t>::type;
if (dilated_kernel_x == 3 && dilated_kernel_y == 3) {
for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
......
@@ -65,7 +65,7 @@ static float update_normalized_coord_with_padding(float coord, Aidge::GridSample
return coord;
}
-static inline std::int64_t update_unnormalized_coord_with_padding(std::int64_t coord, std::int64_t size, Aidge::GridSample_Op::PaddingMode padding_mode) {
+static std::int64_t update_unnormalized_coord_with_padding(std::int64_t coord, std::int64_t size, Aidge::GridSample_Op::PaddingMode padding_mode) {
if (!in_bound(coord, 0, size)) {
// out of bound. switch padding mode
if (padding_mode == Aidge::GridSample_Op::PaddingMode::Border) {
......
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_LRNIMPL_H_
#define AIDGE_CPU_OPERATOR_LRNIMPL_H_
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/LRN.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include <memory>
#include <vector>
namespace Aidge {
// Operator implementation entry point for the backend
using LRNImpl_cpu = OperatorImpl_cpu<LRN_Op,
void(float, float, float, std::size_t, const std::vector<DimSize_t>&, const void*, void*)>;
// Implementation entry point registration to Operator
REGISTRAR(LRN_Op, "cpu", Aidge::LRNImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_LRNIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_LRNIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_LRNIMPL_KERNELS_H_
#include "aidge/utils/Registrar.hpp"
#include <cstddef>
#include <cmath>
#include "aidge/data/Data.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include "aidge/backend/cpu/operator/LRNImpl.hpp"
namespace Aidge {
template <class I, class O>
void LRNImpl_cpu_forward_kernel(float alpha, float beta, float bias, std::size_t size, const std::vector<DimSize_t>& inputDims, const void* input_, void* output_)
{
const I* input = static_cast<const I*>(input_);
O* output = static_cast<O*>(output_);
const DimSize_t nbBatch = inputDims[0];
const DimSize_t nbChannels = (inputDims.size() > 1) ? inputDims[1] : 1;
const DimSize_t featureMapSize = (inputDims.size() > 2) ? std::accumulate(inputDims.begin() + 2, inputDims.end(), 1, std::multiplies<DimSize_t>()) : 1;
for (std::size_t batch = 0; batch < nbBatch; ++batch) {
for (std::size_t ch = 0; ch < nbChannels; ++ch) {
const std::size_t ioIndex = (ch + batch*nbChannels) * featureMapSize;
// window bounds, clamped to valid channel indices; the casts avoid
// unsigned underflow when ch < size/2
const unsigned int channelMin = static_cast<unsigned int>(
    std::max<int>(0, static_cast<int>(ch) - static_cast<int>(size / 2)));
const unsigned int channelMax = static_cast<unsigned int>(
    std::min<std::size_t>(nbChannels - 1, ch + size / 2));
for (std::size_t feature = 0; feature<featureMapSize; ++feature) {
// Accumulate the activation of every channel in the window
O accAcrossChannels(0.0);
for (unsigned int accChannel = channelMin;
     accChannel <= channelMax; ++accChannel)
{
    // index the neighbouring channel, not the current one
    accAcrossChannels += input[(accChannel + batch*nbChannels) * featureMapSize + feature];
}
// Compute the output signal
output[ioIndex + feature] = input[ioIndex + feature]
    / std::pow((bias + (accAcrossChannels * accAcrossChannels) * alpha), beta);
}
}
}
}
REGISTRAR(LRNImpl_cpu,
{DataType::Float32},
{ProdConso::inPlaceModel, Aidge::LRNImpl_cpu_forward_kernel<float, float>, nullptr});
REGISTRAR(LRNImpl_cpu,
{DataType::Float64},
{ProdConso::inPlaceModel, Aidge::LRNImpl_cpu_forward_kernel<double, double>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_LRNIMPL_KERNELS_H_ */
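Note: for reference, the ONNX LRN definition accumulates the square of each neighbouring channel and scales alpha by the window size, whereas the kernel above squares the accumulated sum. A scalar sketch of the ONNX formula (standard C++ only; the window clamp is simplified to the symmetric size/2 used above, where ONNX uses floor/ceil of (size-1)/2):

#include <algorithm> // std::min
#include <cmath>     // std::pow
#include <cstddef>
#include <vector>

// ONNX-style LRN across channels at one spatial position:
// y[c] = x[c] / (bias + (alpha / size) * sum_{j in window(c)} x[j]^2)^beta
std::vector<float> lrnAtPosition(const std::vector<float>& x, std::size_t size,
                                 float alpha, float beta, float bias) {
    std::vector<float> y(x.size());
    for (std::size_t c = 0; c < x.size(); ++c) {
        const std::size_t lo = (c >= size / 2) ? c - size / 2 : 0;
        const std::size_t hi = std::min(x.size() - 1, c + size / 2);
        float sqSum = 0.0f;
        for (std::size_t j = lo; j <= hi; ++j) {
            sqSum += x[j] * x[j];
        }
        y[c] = x[c] / std::pow(bias + (alpha / static_cast<float>(size)) * sqSum, beta);
    }
    return y;
}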
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_ROUNDIMPL_H_
#define AIDGE_CPU_OPERATOR_ROUNDIMPL_H_
#include <cstddef> // std::size_t
#include <memory>
#include <tuple>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/Round.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
// Operator implementation entry point for the backend
using RoundImpl_cpu = OperatorImpl_cpu<Round_Op,
void(const std::size_t, const void*, void*)>;
// Implementation entry point registration to Operator
REGISTRAR(Round_Op, "cpu", Aidge::RoundImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_ROUNDIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_ROUNDIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_ROUNDIMPL_KERNELS_H_
#include <cmath> // std::nearbyint
#include <cstddef> // std::size_t
#include "aidge/utils/Registrar.hpp"
#include "aidge/backend/cpu/operator/RoundImpl.hpp"
namespace Aidge {
template <class I, class O>
void RoundImpl_cpu_forward_kernel(const std::size_t inputLength,
const void* input_,
void* output_) {
const I* input = static_cast<const I*>(input_);
O* output = static_cast<O*>(output_);
for (std::size_t i = 0; i < inputLength; ++i) {
    // std::round rounds halves away from zero; ONNX Round requires
    // round-half-to-even, which std::nearbyint provides under the default
    // rounding mode. No intermediate float cast, so double inputs keep
    // their full precision.
    output[i] = static_cast<O>(std::nearbyint(input[i]));
}
}
REGISTRAR(RoundImpl_cpu,
{DataType::Float32},
{ProdConso::inPlaceModel, Aidge::RoundImpl_cpu_forward_kernel<float, float>,nullptr});
REGISTRAR(RoundImpl_cpu,
{DataType::Float64},
{ProdConso::inPlaceModel, Aidge::RoundImpl_cpu_forward_kernel<double, double>,nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_ROUNDIMPL_KERNELS_H_ */
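Note: the half-to-even behaviour the kernel relies on is easy to verify: under the default FE_TONEAREST rounding mode, std::nearbyint sends 0.5 to 0 and 2.5 to 2, while std::round sends halves away from zero. A standalone check:

#include <cassert>
#include <cfenv>  // FE_TONEAREST, std::fesetround
#include <cmath>  // std::nearbyint, std::round

int main() {
    std::fesetround(FE_TONEAREST);         // the default rounding mode
    assert(std::nearbyint(0.5f) == 0.0f);  // half-to-even (ONNX Round)
    assert(std::nearbyint(1.5f) == 2.0f);
    assert(std::nearbyint(2.5f) == 2.0f);
    assert(std::round(2.5f) == 3.0f);      // half-away-from-zero
    return 0;
}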
@@ -89,13 +89,13 @@ void SliceImpl_cpu_forward_kernel(const std::vector<std::int64_t>& starts,
}
REGISTRAR(SliceImpl_cpu,
-{DataType::Float32},
+{{DataType::Float32, DataType::Any}, {DataType::Float32}},
{ProdConso::inPlaceModel, Aidge::SliceImpl_cpu_forward_kernel<float, float>, nullptr});
REGISTRAR(SliceImpl_cpu,
-{DataType::Float64},
+{{DataType::Float64, DataType::Any}, {DataType::Float64}},
{ProdConso::inPlaceModel, Aidge::SliceImpl_cpu_forward_kernel<double, double>, nullptr});
REGISTRAR(SliceImpl_cpu,
-{DataType::Int32},
+{{DataType::Int32, DataType::Any}, {DataType::Int32}},
{ProdConso::inPlaceModel, Aidge::SliceImpl_cpu_forward_kernel<int32_t, int32_t>, nullptr});
} // namespace Aidge
......
@@ -56,6 +56,12 @@ REGISTRAR(SubImpl_cpu,
REGISTRAR(SubImpl_cpu,
{DataType::Float64},
{ProdConso::inPlaceModel, Aidge::SubImpl_cpu_forward_kernel<double, double, double>, nullptr});
+REGISTRAR(SubImpl_cpu,
+{DataType::Int8},
+{ProdConso::inPlaceModel, Aidge::SubImpl_cpu_forward_kernel<std::int8_t, std::int8_t, std::int8_t>, nullptr});
+REGISTRAR(SubImpl_cpu,
+{DataType::UInt8},
+{ProdConso::inPlaceModel, Aidge::SubImpl_cpu_forward_kernel<std::uint8_t, std::uint8_t, std::uint8_t>, nullptr});
REGISTRAR(SubImpl_cpu,
{DataType::Int32},
{ProdConso::inPlaceModel, Aidge::SubImpl_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t>, nullptr});
......
@@ -30,15 +30,13 @@ void Aidge::BatchNormImpl2D_cpu::forward() {
AIDGE_ASSERT(op_.getInput(3), "missing input #3 for BatchNorm Operator");
AIDGE_ASSERT(op_.getInput(4), "missing input #4 for BatchNorm Operator");
-AIDGE_ASSERT(op_.getOutput(0)->nbDims() == 4, "");
// Find the correct kernel type
const auto impl = Registrar<BatchNormImpl2D_cpu>::create(getBestMatch(getRequiredSpec()));
// Call kernel
impl.forward(op_.epsilon(),
op_.momentum(),
-op_.getInput(0)->template dims<4>(),
+op_.getInput(0)->dims(),
getCPUPtr(op_.getRawInput(0)),
getCPUPtr(op_.getRawInput(1)),
getCPUPtr(op_.getRawInput(2)),
......
@@ -65,7 +65,6 @@ void Aidge::ConvDepthWiseImpl2D_cpu::forward() {
AIDGE_ASSERT(op_.getInput(0), "missing input #0 in ConvDepthWise Operator");
AIDGE_ASSERT(op_.getInput(1), "missing input #1 in ConvDepthWise Operator");
AIDGE_ASSERT(op_.getInput(2), "missing input #2 in ConvDepthWise Operator");
-AIDGE_ASSERT((op_.getInput(0)->nbDims() == 4), "support for 4-dimensions tensors only");
......
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <cassert>
#include <chrono> // std::chrono::milliseconds
#include <numeric> // std::accumulate
#include <thread> // std::this_thread::sleep_for
#include <vector>
#include "aidge/operator/LRN.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include "aidge/backend/cpu/operator/LRNImpl.hpp"
#include "aidge/backend/cpu/operator/LRNImpl_kernels.hpp"
template <>
void Aidge::LRNImpl_cpu::forward() {
const auto& op_ = dynamic_cast<const LRN_Op&>(mOp);
AIDGE_ASSERT(!op_.getInput(0)->empty(), "LRN input empty");
// Find the correct kernel type
const auto impl = Registrar<LRNImpl_cpu>::create(getBestMatch(getRequiredSpec()));
// Call kernel
impl.forward(op_.alpha(),
op_.beta(),
op_.bias(),
op_.size(),
std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims(),
std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->getImpl()->rawPtr(),
std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->rawPtr());
}
template <>
void Aidge::LRNImpl_cpu::backward() {
AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for LRN_Op on backend cpu");
}
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <memory>
#include <vector>
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include "aidge/data/Tensor.hpp"
#include "aidge/operator/Round.hpp"
#include "aidge/utils/ErrorHandling.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/operator/RoundImpl.hpp"
#include "aidge/backend/cpu/operator/RoundImpl_kernels.hpp"
template <>
void Aidge::RoundImpl_cpu::forward() {
std::shared_ptr<Tensor> in0 = std::static_pointer_cast<Tensor>(mOp.getRawInput(0));
std::shared_ptr<Tensor> out0 = std::static_pointer_cast<Tensor>(mOp.getRawOutput(0));
AIDGE_ASSERT(in0, "missing input #0");
// Find the correct kernel type
const auto impl = Registrar<RoundImpl_cpu>::create(getBestMatch(getRequiredSpec()));
// Call kernel
impl.forward(in0->size(),
getCPUPtr(mOp.getRawInput(0)),
getCPUPtr(mOp.getRawOutput(0)));
}
template <>
void Aidge::RoundImpl_cpu::backward() {
AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Round_Op on backend cpu");
}
\ No newline at end of file
@@ -124,7 +124,9 @@ TEST_CASE("[cpu/operator] GlobalAveragePooling",
dims_in[1]; // averaging per channel : 1 addition per element in
// the channel + 1 division this for every batch
// create out nb_elems
-std::vector<std::size_t> dims_out{dims_in[0], dims_in[1]};
+std::vector<std::size_t> dims_out(dims_in.size(), 1);
+dims_out[0] = dims_in[0];
+dims_out[1] = dims_in[1];
const std::size_t out_nb_elems =
std::accumulate(dims_out.cbegin(), dims_out.cend(), std::size_t(1),
std::multiplies<std::size_t>());
@@ -192,7 +194,9 @@ TEST_CASE("[cpu/operator] GlobalAveragePooling",
// the channel + 1 division this for every batch
// create out nb_elems
-std::vector<std::size_t> dims_out{dims_in[0], dims_in[1]};
+std::vector<std::size_t> dims_out(dims_in.size(), 1);
+dims_out[0] = dims_in[0];
+dims_out[1] = dims_in[1];
const std::size_t out_nb_elems =
std::accumulate(dims_out.cbegin(), dims_out.cend(),
std::size_t(1), std::multiplies<std::size_t>());
@@ -253,7 +257,9 @@ TEST_CASE("[cpu/operator] GlobalAveragePooling",
SECTION("2D_img") {
const std::vector<DimSize_t> in_dims{batch_size, channels, height,
width};
-const std::vector<DimSize_t> out_dims{batch_size, channels};
+std::vector<std::size_t> out_dims(in_dims.size(), 1);
+out_dims[0] = in_dims[0];
+out_dims[1] = in_dims[1];
DimSize_t in_nb_elems = batch_size * channels * height * width;
DimSize_t out_nb_elems = batch_size * channels;
number_of_operation +=
@@ -368,7 +374,9 @@ TEST_CASE("[cpu/operator] GlobalAveragePooling",
SECTION("3D_img") {
const std::vector<DimSize_t> in_dims{batch_size, channels, height,
width, depth};
-const std::vector<DimSize_t> out_dims{batch_size, channels};
+std::vector<std::size_t> out_dims(in_dims.size(), 1);
+out_dims[0] = in_dims[0];
+out_dims[1] = in_dims[1];
DimSize_t in_nb_elems =
batch_size * channels * height * width * depth;
number_of_operation +=
......
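Note: the test updates above reflect an output that keeps the input rank, with every reduced axis collapsed to 1 ({N, C, H, W} becomes {N, C, 1, 1} instead of {N, C}). The construction repeated in the four hunks, factored out:

#include <cstddef>
#include <vector>

// Output dims of a rank-preserving global average pooling: batch and
// channel are kept, every remaining (spatial) axis collapses to 1.
std::vector<std::size_t> globalAvgPoolOutDims(const std::vector<std::size_t>& in) {
    std::vector<std::size_t> out(in.size(), 1);
    out[0] = in[0]; // batch
    out[1] = in[1]; // channels
    return out;
}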
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <catch2/catch_test_macros.hpp>
#include <cstddef> // std::size_t
#include <cstdint> // std::uint16_t
#include <chrono>
#include <iostream>
#include <memory>
#include <numeric>
#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution
#include <iomanip>
#include "aidge/data/Tensor.hpp"
#include "aidge/operator/Round.hpp"
#include "aidge/utils/TensorUtils.hpp"
namespace Aidge {
TEST_CASE("[cpu/operator] Round_Test", "[Round][CPU]") {
constexpr std::uint16_t NBTRIALS = 15;
// Create a random number generator
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_real_distribution<float> valueDist(-15, 15);
std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(5));
std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(3));
// Create Round Operator
std::shared_ptr<Node> myRound = Round();
auto op = std::static_pointer_cast<OperatorTensor>(myRound-> getOperator());
op->setDataType(DataType::Float32);
op->setBackend("cpu");
// Create the input Tensor
std::shared_ptr<Tensor> T0 = std::make_shared<Tensor>();
op->associateInput(0,T0);
T0->setDataType(DataType::Float32);
T0->setBackend("cpu");
// Create results Tensor
std::shared_ptr<Tensor> Tres = std::make_shared<Tensor>();
Tres->setDataType(DataType::Float32);
Tres->setBackend("cpu");
// To measure execution time of 'Round_Op::forward()' member function call
std::chrono::time_point<std::chrono::system_clock> start;
std::chrono::time_point<std::chrono::system_clock> end;
std::chrono::duration<double, std::micro> duration{};
SECTION("Round [Forward]") {
SECTION("Test Forward Kernel") {
std::size_t number_of_operation = 0;
for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
// generate random tensor dimensions
const std::size_t nbDims = nbDimsDist(gen);
std::vector<std::size_t> dims;
for (std::size_t i = 0; i < nbDims; ++i) {
dims.push_back(dimSizeDist(gen));
}
const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
number_of_operation += nb_elements;
// input values and expected rounded results
float* array0 = new float[nb_elements];
float* result = new float[nb_elements];
for (std::size_t i = 0; i < nb_elements; ++i) {
array0[i] = valueDist(gen);
result[i] = std::nearbyint(array0[i]);
}
// input0
T0->resize(dims);
T0 -> getImpl() -> setRawPtr(array0, nb_elements);
// results
Tres->resize(dims);
Tres -> getImpl() -> setRawPtr(result, nb_elements);
op->forwardDims();
start = std::chrono::system_clock::now();
myRound->forward();
end = std::chrono::system_clock::now();
duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
bool is_eq = approxEq<float>(*(op->getOutput(0)), *Tres);
REQUIRE(is_eq);
delete[] array0;
delete[] result;
}
std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
std::cout << "total time: " << duration.count() << "μs" << std::endl;
}
}
}
} // namespace Aidge
\ No newline at end of file
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <cstddef>
#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution
#include <catch2/catch_test_macros.hpp>
#include "aidge/recipes/Recipes.hpp"
#include "aidge/operator/MatMul.hpp"
#include "aidge/operator/AvgPooling.hpp"
#include "aidge/operator/MaxPooling.hpp"
#include "aidge/operator/GenericOperator.hpp"
#include "aidge/operator/Producer.hpp"
#include "aidge/graph/OpArgs.hpp"
#include "aidge/scheduler/SequentialScheduler.hpp"
#include "aidge/graph/Matching.hpp"
#include "aidge/utils/TensorUtils.hpp"
using namespace Aidge;
TEST_CASE("[MatMulTiling]") {
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_real_distribution<float> valueDist(-1.0f, 1.0f);
auto dataProvider = Producer({2, 3, 80, 80}, "dataProvider");
auto w1 = Producer({2, 3, 80, 80}, "w1");
auto matmul1 = MatMul("matmul1");
auto w2 = Producer({2, 3, 80, 80}, "w2");
auto matmul2 = MatMul("matmul2");
auto w3 = Producer({2, 3, 80, 80}, "w3");
auto matmul3 = MatMul("matmul3");
dataProvider->addChild(matmul1, 0, 0);
w1->addChild(matmul1, 0, 1);
matmul1->addChild(matmul2, 0, 0);
w2->addChild(matmul2, 0, 1);
matmul2->addChild(matmul3, 0, 0);
w3->addChild(matmul3, 0, 1);
auto g1 = getConnectedGraphView(matmul1);
g1->setBackend("cpu");
g1->forwardDims();
g1->save("MatMulSplitting_graph");
// Fill random values
fmt::println("Fill random values");
auto tData = std::static_pointer_cast<OperatorTensor>(dataProvider->getOperator())->getOutput(0);
for (size_t i = 0; i < tData->size(); ++i) {
tData->set<float>(i, valueDist(gen));
}
auto tw1 = std::static_pointer_cast<OperatorTensor>(w1->getOperator())->getOutput(0);
for (size_t i = 0; i < tw1->size(); ++i) {
tw1->set<float>(i, valueDist(gen));
}
auto tw2 = std::static_pointer_cast<OperatorTensor>(w2->getOperator())->getOutput(0);
for (size_t i = 0; i < tw2->size(); ++i) {
tw2->set<float>(i, valueDist(gen));
}
auto tw3 = std::static_pointer_cast<OperatorTensor>(w3->getOperator())->getOutput(0);
for (size_t i = 0; i < tw3->size(); ++i) {
tw3->set<float>(i, valueDist(gen));
}
fmt::println("Schedule forward graph");
auto s1 = SequentialScheduler(g1);
s1.forward();
const auto tOut = std::static_pointer_cast<OperatorTensor>(g1->getOrderedOutputs()[0].first->getOperator())->getOutput(0)->clone();
// Tiling
fmt::println("Tiling");
matMulTiling(matmul1, {16, 16});
removeIdentity(g1);
g1->setBackend("cpu");
g1->save("MatMulSplitting_graph_split");
auto gm = SinglePassGraphMatching(g1);
gm.addNodeLambda("16x16", [](const NodePtr& node) {
const auto op =
std::static_pointer_cast<OperatorTensor>(node->getOperator());
const auto dims = op->getOutput(0)->dims();
return (dims.end()[-2] == 16 && dims.end()[-1] == 16);
});
const auto results = gm.match("MatMul[16x16]");
REQUIRE(results.size() == 25);
// Check result
fmt::println("Schedule forward tiled graph");
s1 = SequentialScheduler(g1);
s1.resetScheduling();
s1.forward();
const auto tOutTiled = std::static_pointer_cast<OperatorTensor>(g1->getOrderedOutputs()[0].first->getOperator())->getOutput(0)->clone();
REQUIRE(approxEq<float>(tOut, tOutTiled));
}
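Note: the expected match count follows from the tile arithmetic: only matmul1 is tiled, and its 80x80 output split into 16x16 blocks yields (80/16) * (80/16) = 25 MatMul nodes, matching REQUIRE(results.size() == 25). As a compile-time sanity check:

#include <cstddef>

int main() {
    constexpr std::size_t side = 80, tile = 16;
    // one MatMul node per 16x16 block of the tiled 80x80 output
    constexpr std::size_t nbTiles = (side / tile) * (side / tile);
    static_assert(nbTiles == 25, "5 tiles per axis, 5 x 5 = 25 nodes");
    return 0;
}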