Compare revisions

99af25ca · 99af25ca · 99af25ca · 99af25ca · 99af25ca · 99af25ca
--- a/include/aidge/backend/cpu/operator/RoundImpl.hpp
+++ b/include/aidge/backend/cpu/operator/RoundImpl.hpp
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_ROUNDIMPL_H_
+#define AIDGE_CPU_OPERATOR_ROUNDIMPL_H_
+
+#include <cstddef>  // std::size_t
+#include <memory>
+#include <tuple>
+#include <vector>
+
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
+#include "aidge/operator/Round.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+namespace Aidge {
+/* CPU-backend implementation type for Round_Op.
+ * The function type given to OperatorImpl_cpu has the same parameter list as
+ * RoundImpl_cpu_forward_kernel: element count, input buffer, output buffer. */
+using RoundImpl_cpu = OperatorImpl_cpu<Round_Op,
+    void(const std::size_t, const void*, void*)>;
+
+/* Implementation entry point registration to Operator: hands
+ * RoundImpl_cpu::create to REGISTRAR for Round_Op under the key "cpu". */
+REGISTRAR(Round_Op, "cpu", Aidge::RoundImpl_cpu::create);
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_ROUNDIMPL_H_ */
--- a/include/aidge/backend/cpu/operator/RoundImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/RoundImpl_kernels.hpp
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_ROUNDIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_ROUNDIMPL_KERNELS_H_
+
+#include <cmath>   //std::round 
+#include <cstddef>  // std::size_t
+
+#include "aidge/utils/Registrar.hpp"
+
+#include "aidge/backend/cpu/operator/RoundImpl.hpp"
+
+namespace Aidge {
+/**
+ * @brief Element-wise rounding to the nearest integral value.
+ *
+ * std::nearbyint is used because it honours the current floating-point
+ * rounding mode (round-half-to-even by default), which is what the ONNX Round
+ * operator specifies; std::round rounds halfway cases away from zero.
+ * The value is rounded in its own precision: narrowing a double to float
+ * first would collapse values such as 2.5000001 onto the tie 2.5 and corrupt
+ * integers above 2^24.
+ *
+ * @tparam I Element type of the input buffer.
+ * @tparam O Element type of the output buffer.
+ * @param inputLength Number of elements to process.
+ * @param input_ Buffer holding @p inputLength elements of type I.
+ * @param output_ Buffer receiving @p inputLength elements of type O.
+ */
+template <class I, class O>
+void RoundImpl_cpu_forward_kernel(const std::size_t inputLength,
+                                     const void* input_,
+                                     void* output_) {
+
+    const I* input = static_cast<const I*>(input_);
+    O* output = static_cast<O*>(output_);
+
+    for (std::size_t i = 0; i < inputLength; ++i) {
+        // No intermediate cast: the std::nearbyint overload matching I keeps full precision.
+        output[i] = static_cast<O>(std::nearbyint(input[i]));
+    }
+}
+
+
+REGISTRAR(RoundImpl_cpu,  // Float32 entry: forward kernel instantiated as <float, float>
+    {DataType::Float32},
+    {ProdConso::inPlaceModel, Aidge::RoundImpl_cpu_forward_kernel<float, float>,nullptr});  // last entry left as nullptr
+REGISTRAR(RoundImpl_cpu,  // Float64 entry: forward kernel instantiated as <double, double>
+    {DataType::Float64},
+    {ProdConso::inPlaceModel, Aidge::RoundImpl_cpu_forward_kernel<double, double>,nullptr});  // last entry left as nullptr
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_ROUNDIMPL_KERNELS_H_ */
--- a/include/aidge/backend/cpu/operator/ScalingImpl.hpp
+++ b/include/aidge/backend/cpu/operator/ScalingImpl.hpp
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef __AIDGE_CPU_OPERATOR_ScalingIMPL_H__
+#define __AIDGE_CPU_OPERATOR_ScalingIMPL_H__
+
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
+#include "aidge/operator/Scaling.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+#include <memory>
+#include <vector>
+#include <array>
+
+namespace Aidge {
+/* CPU-backend implementation type for Scaling_Op.
+ * The function type has the same parameter list as
+ * ScalingImpl_cpu_forward_kernel: scaling factor, quantized bit count,
+ * unsigned-output flag, element count, input buffer, output buffer. */
+using ScalingImpl_cpu = OperatorImpl_cpu<Scaling_Op,
+    void(const float,
+        const std::size_t,
+        const bool,
+        std::size_t,
+        const void*,
+        void*)>;
+
+/* Implementation entry point registration to Operator: hands
+ * ScalingImpl_cpu::create to REGISTRAR for Scaling_Op under the key "cpu". */
+REGISTRAR(Scaling_Op, "cpu", Aidge::ScalingImpl_cpu::create);
+}  // namespace Aidge
+
+#endif /* __AIDGE_CPU_OPERATOR_ScalingIMPL_H__ */
\ No newline at end of file
--- a/include/aidge/backend/cpu/operator/ScalingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ScalingImpl_kernels.hpp
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_SCALINGIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_SCALINGIMPL_KERNELS_H_
+
+#include <cmath>
+#include <cstddef>
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/backend/cpu/operator/ScalingImpl.hpp"
+
+//TODO : improve propagate, n2d2 :
+/*
+template<typename T>
+void N2D2::floatingPointScaling_propagate(const Tensor<T>& input, Tensor<T>& output,
+                                          std::size_t batchSize, std::size_t nbChannels,
+                                          std::size_t height, std::size_t width,
+                                          bool isClipped,
+                                          const std::vector<Float_T>& clippingFactorPerChannel,
+                                          const std::vector<Float_T>& scalingFactorPerChannel,
+                                          std::size_t quantizedNbBits, bool isOutputUnsigned)
+{
+    std::size_t index = 0;
+    for (std::size_t batch = 0; batch < batchSize; batch++) {
+        for(std::size_t ch = 0; ch < nbChannels; ch++) {
+            for(std::size_t y = 0; y < height; y++) {
+                for(std::size_t x = 0; x < width; x++) {
+
+                    T res = isClipped ? Clip(input(index), clippingFactorPerChannel[ch])
+                                    : input(index);
+                    res = Scale(res, scalingFactorPerChannel[ch]);
+
+                    if(quantizedNbBits > 0) {
+                        res = saturate(std::round(res), quantizedNbBits, isOutputUnsigned);
+                    }
+                    output(index) = (T) res;
+                    index++;
+                }
+            }
+        }
+    }
+}
+*/
+
+
+namespace Aidge {
+
+/// Restricts @p x to the closed interval [@p min, @p max] and returns a
+/// reference to whichever of the three arguments was selected.
+template <class O>
+const O& clamp(const O& x, const O& min, const O& max)
+{
+    if (x < min) {
+        return min;
+    }
+    if (x > max) {
+        return max;
+    }
+    return x;
+}
+
+/**
+ * @brief Clamps @p value to the integer range coded on @p quantizedNbBits bits.
+ *
+ * Unsigned range: [0, 2^bits - 1]. Signed range: [-2^(bits-1), 2^(bits-1) - 1].
+ * The bounds are computed as long long and then converted to O.
+ *
+ * @param value Value to saturate.
+ * @param quantizedNbBits Bit width of the target range; asserted to be > 0.
+ * @param isOutputUnsigned Selects the unsigned range when true.
+ * @return @p value limited to the selected range.
+ */
+template<class O>
+O saturate(const O value, const std::size_t quantizedNbBits, const bool isOutputUnsigned) {
+    // TODO: no assertions in kernel
+    assert(quantizedNbBits > 0);
+
+    if (isOutputUnsigned) {
+        const O lowest = 0;
+        const O highest = (1ll << quantizedNbBits) - 1ll;
+        return clamp(value, lowest, highest);
+    }
+
+    const O lowest = -(1ll << (quantizedNbBits - 1ll));
+    const O highest = (1ll << (quantizedNbBits - 1ll)) - 1ll;
+    return clamp(value, lowest, highest);
+}
+
+/**
+ * @brief Multiplies every element by @p scalingFactor and, when quantization
+ *        is requested, rounds and saturates the result.
+ *
+ * The product is formed in double precision so that the factor is never
+ * converted to the element type: casting it to an integral I (e.g. 0.5f -> 0)
+ * would zero every output of the Int32 kernel. For float and double inputs the
+ * unquantized result is unchanged, since the double product of two floats is
+ * exact and is rounded once when stored. When quantizing, rounding is applied
+ * to the double-precision product before the conversion to O.
+ *
+ * @tparam I Element type of the input buffer.
+ * @tparam O Element type of the output buffer.
+ * @param scalingFactor Multiplicative factor applied to each element.
+ * @param quantizedNbBits Bit width of the saturation range; 0 disables
+ *                        rounding and saturation.
+ * @param isOutputUnsigned Selects the unsigned saturation range when true.
+ * @param inputLength Number of elements to process.
+ * @param input_ Buffer holding @p inputLength elements of type I.
+ * @param output_ Buffer receiving @p inputLength elements of type O.
+ */
+template <class I, class O>
+void ScalingImpl_cpu_forward_kernel(const float scalingFactor,
+                                    const std::size_t quantizedNbBits,
+                                    const bool isOutputUnsigned,
+                                    std::size_t inputLength,
+                                    const void* input_,
+                                    void* output_) {
+
+    const I* input = static_cast<const I*>(input_);
+    O* output = static_cast<O*>(output_);
+
+    const double factor = static_cast<double>(scalingFactor);
+    const bool quantize = quantizedNbBits > 0;
+
+    for (std::size_t i = 0; i < inputLength; ++i) {
+        double scaled = static_cast<double>(input[i]) * factor;
+
+        if (quantize) {
+            // Round first, then saturate, while still in floating point.
+            scaled = saturate(std::round(scaled), quantizedNbBits, isOutputUnsigned);
+        }
+        output[i] = static_cast<O>(scaled);
+    }
+}
+
+/* Kernels registration to implementation entry point: one REGISTRAR call per
+ * DataType (Float32, Float64, Int32), each passing ProdConso::inPlaceModel,
+ * the forward kernel instantiated for that element type, and nullptr as the
+ * last entry. */
+REGISTRAR(ScalingImpl_cpu,
+    {DataType::Float32},
+    {ProdConso::inPlaceModel, Aidge::ScalingImpl_cpu_forward_kernel<float, float>, nullptr});
+REGISTRAR(ScalingImpl_cpu,
+    {DataType::Float64},
+    {ProdConso::inPlaceModel, Aidge::ScalingImpl_cpu_forward_kernel<double, double>, nullptr});
+REGISTRAR(ScalingImpl_cpu,
+    {DataType::Int32},
+    {ProdConso::inPlaceModel, Aidge::ScalingImpl_cpu_forward_kernel<int32_t, int32_t>, nullptr});
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_SCALINGIMPL_KERNELS_H_ */
\ No newline at end of file
--- a/include/aidge/backend/cpu/operator/SigmoidImpl.hpp
+++ b/include/aidge/backend/cpu/operator/SigmoidImpl.hpp
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_SIGMOIDIMPL_H_
+#define AIDGE_CPU_OPERATOR_SIGMOIDIMPL_H_
+
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
+#include "aidge/operator/Sigmoid.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+#include <memory>
+#include <vector>
+
+namespace Aidge {
+/* CPU-backend implementation type for Sigmoid_Op.
+ * The first function type has the parameter list of
+ * SigmoidImpl_cpu_forward_kernel (element count, input, output); the second
+ * has that of SigmoidImpl_cpu_backward_kernel (element count, forward output,
+ * output gradient, input gradient). */
+using SigmoidImpl_cpu = OperatorImpl_cpu<Sigmoid_Op,
+    void(const std::size_t, const void*, void*),
+    void(const std::size_t, const void*, const void*, void*)>;
+
+/* Implementation entry point registration to Operator: hands
+ * SigmoidImpl_cpu::create to REGISTRAR for Sigmoid_Op under the key "cpu". */
+REGISTRAR(Sigmoid_Op, "cpu", Aidge::SigmoidImpl_cpu::create);
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_SIGMOIDIMPL_H_ */
--- a/include/aidge/backend/cpu/operator/SigmoidImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/SigmoidImpl_kernels.hpp
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_SIGMOIDIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_SIGMOIDIMPL_KERNELS_H_
+
+#include "aidge/utils/Registrar.hpp"
+
+#include "aidge/backend/cpu/operator/SigmoidImpl.hpp"
+
+namespace Aidge {
+/**
+ * @brief Element-wise logistic sigmoid: output[i] = 1 / (1 + exp(-input[i])).
+ *
+ * The sign of the input selects between two algebraically equal forms so that
+ * std::exp is only ever evaluated on a non-positive argument.
+ *
+ * @param inputLength Number of elements to process.
+ * @param input_ Buffer holding @p inputLength elements of type I.
+ * @param output_ Buffer receiving @p inputLength elements of type O.
+ */
+template <class I, class O>
+void SigmoidImpl_cpu_forward_kernel(std::size_t inputLength,
+                                    const void* input_,
+                                    void* output_) {
+
+    const I* src = static_cast<const I*>(input_);
+    O* dst = static_cast<O*>(output_);
+
+//#pragma omp parallel for if (inputLength > 1024)
+    for (std::size_t idx = 0; idx < inputLength; ++idx) {
+        const I x = src[idx];
+        if (x > I(0)) {
+            dst[idx] = O(1) / (O(1) + std::exp(-x));
+        } else {
+            // exp(x) appears in both numerator and denominator: evaluate it once.
+            const auto expX = std::exp(x);
+            dst[idx] = expX / (O(1) + expX);
+        }
+    }
+}
+
+/**
+ * @brief Accumulates the sigmoid gradient into @p grad_input_.
+ *
+ * For each element: grad_input += output * (1 - output) * grad_output, where
+ * output is the value produced by the forward pass. The result is added to
+ * the existing content of grad_input, not assigned.
+ *
+ * @param inputLength Number of elements to process.
+ * @param output_ Forward-pass results (type O).
+ * @param grad_output_ Gradient with respect to the output (type GO).
+ * @param grad_input_ Gradient accumulator for the input (type GI).
+ */
+template <class O, class GI, class GO>
+void SigmoidImpl_cpu_backward_kernel(const std::size_t inputLength,
+                                     const void* output_, const void* grad_output_,
+                                     void* grad_input_) {
+    const O* y = static_cast<const O*>(output_);
+    const GO* dy = static_cast<const GO*>(grad_output_);
+    GI* dx = static_cast<GI*>(grad_input_);
+
+    for (std::size_t idx = 0; idx != inputLength; ++idx) {
+        const auto localSlope = y[idx] * (O(1) - y[idx]);
+        dx[idx] += localSlope * dy[idx];
+    }
+}
+
+/* Kernels registration to implementation entry point: one REGISTRAR call per
+ * DataType (Float32, Float64), each passing ProdConso::inPlaceModel followed
+ * by the forward and backward kernels instantiated for that element type. */
+REGISTRAR(SigmoidImpl_cpu,
+    {DataType::Float32},
+    {ProdConso::inPlaceModel, Aidge::SigmoidImpl_cpu_forward_kernel<float, float>, Aidge::SigmoidImpl_cpu_backward_kernel<float, float, float>});
+REGISTRAR(SigmoidImpl_cpu,
+    {DataType::Float64},
+    {ProdConso::inPlaceModel, Aidge::SigmoidImpl_cpu_forward_kernel<double, double>, Aidge::SigmoidImpl_cpu_backward_kernel<double, double, double>});
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_SIGMOIDIMPL_KERNELS_H_ */
--- a/include/aidge/backend/cpu/operator/SliceImpl.hpp
+++ b/include/aidge/backend/cpu/operator/SliceImpl.hpp
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_SLICEIMPL_H__
+#define AIDGE_CPU_OPERATOR_SLICEIMPL_H__
+
+#include <memory>
+#include <vector>
+#include <array>
+
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
+#include "aidge/operator/Slice.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+
+namespace Aidge {
+/* CPU-backend implementation type for Slice_Op.
+ * The function type has the same parameter list as
+ * SliceImpl_cpu_forward_kernel: starts, ends, axes, steps, input dimensions,
+ * input buffer, output buffer. */
+using SliceImpl_cpu = OperatorImpl_cpu<Slice_Op,
+    void(const std::vector<std::int64_t>&,
+                            const std::vector<std::int64_t>&,
+                            const std::vector<std::int8_t>&,
+                            const std::vector<std::int64_t>&,
+                            const std::vector<DimSize_t>&,
+                            const void*,
+                            void*)>;
+
+/* Implementation entry point registration to Operator: hands
+ * SliceImpl_cpu::create to REGISTRAR for Slice_Op under the key "cpu". */
+REGISTRAR(Slice_Op, "cpu", Aidge::SliceImpl_cpu::create);
+}  // namespace Aidge
+
+#endif /* __AIDGE_CPU_OPERATOR_SLICEIMPL_H__ */
--- a/include/aidge/backend/cpu/operator/SliceImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/SliceImpl_kernels.hpp
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_SLICEIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_SLICEIMPL_KERNELS_H_
+
+#include <algorithm>
+#include <cmath>
+#include <cstddef>
+#include <iterator>
+
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/backend/cpu/operator/SliceImpl.hpp"
+
+namespace Aidge {
+
+/**
+ * @brief Extracts a strided sub-tensor, slicing one axis per pass.
+ *
+ * Entry i of @p starts / @p ends / @p axes / @p steps describes one slice:
+ * along axis axes[i], elements are taken from start towards end with stride
+ * steps[i]. Negative axes are offset by the tensor rank; negative starts/ends
+ * are offset by the axis size; start is capped at size-1 and end at size.
+ * Each pass copies the kept elements into a scratch buffer that becomes the
+ * source of the next pass; the final buffer is copied to @p output_.
+ *
+ * Scratch buffers are owned by std::vector, so they are released on every
+ * exit path. The slice length is computed with integer arithmetic: a range
+ * that is empty for the given step direction, or a zero step, yields zero
+ * elements.
+ *
+ * @tparam I Element type of the input buffer (also used for scratch buffers).
+ * @tparam O Element type of the output buffer.
+ * @param starts First index kept, per sliced axis.
+ * @param ends Exclusive end index, per sliced axis.
+ * @param axes Axis addressed by each slice; the number of slices processed is
+ *             starts.size().
+ * @param steps Stride between kept elements, per sliced axis.
+ * @param inputDims Dimensions of the input tensor.
+ * @param input_ Input buffer of type I.
+ * @param output_ Output buffer of type O, sized for the sliced tensor.
+ */
+template<class I, class O>
+void SliceImpl_cpu_forward_kernel(const std::vector<std::int64_t>& starts,
+                                const std::vector<std::int64_t>& ends,
+                                const std::vector<std::int8_t>& axes,
+                                const std::vector<std::int64_t>& steps,
+                                const std::vector<DimSize_t>& inputDims,
+                                const void* input_,
+                                void* output_)
+{
+    const I* input = static_cast<const I*>(input_);
+    O* output = static_cast<O*>(output_);
+
+    const std::size_t nbDims = inputDims.size();
+    std::vector<DimSize_t> dims = inputDims;
+    DimSize_t totalSize = std::accumulate(inputDims.cbegin(), inputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+
+    // Result of the previous pass; `source` points into it once a pass has run.
+    std::vector<I> sliced;
+    const I* source = input;
+
+    const std::size_t nbAxes = starts.size();
+    for (std::size_t i = 0; i < nbAxes; ++i) {
+        const DimIdx_t axis = axes[i] >= 0 ?
+                                    static_cast<DimIdx_t>(axes[i]) :
+                                    static_cast<DimIdx_t>(axes[i] + static_cast<DimIdx_t>(inputDims.size()));
+        const DimSize_t start = std::min(starts[i] >= 0 ?
+                                                static_cast<DimSize_t>(starts[i]) :
+                                                static_cast<DimSize_t>(starts[i] + static_cast<std::int64_t>(inputDims[axis])),
+                                         dims[axis]-1);
+        const DimSize_t end = std::min(ends[i] >= 0 ?
+                                        static_cast<DimSize_t>(ends[i]) :
+                                        static_cast<DimSize_t>(ends[i] + static_cast<std::int64_t>(inputDims[axis])),
+                                         dims[axis]);
+        const std::int64_t step = steps[i];
+
+        // ceil(|end - start| / |step|) without going through floating point.
+        std::size_t sliceSize = 0;
+        if ((step > 0) && (end > start)) {
+            const std::size_t span = static_cast<std::size_t>(end - start);
+            sliceSize = 1 + (span - 1) / static_cast<std::size_t>(step);
+        } else if ((step < 0) && (start > end)) {
+            const std::size_t span = static_cast<std::size_t>(start - end);
+            // Unsigned negation gives |step| without overflowing on INT64_MIN.
+            const std::size_t absStep = std::size_t(0) - static_cast<std::size_t>(step);
+            sliceSize = 1 + (span - 1) / absStep;
+        }
+
+        totalSize /= dims[axis];
+        totalSize *= sliceSize;
+        std::vector<I> pass(totalSize);
+
+        // Seeds are std::size_t so the products are not accumulated as int.
+        const std::size_t stride_pre = std::accumulate(dims.cbegin(), dims.cbegin() + axis, std::size_t(1), std::multiplies<std::size_t>());
+        const std::size_t stride_post = std::accumulate(dims.crbegin(), dims.crbegin() + nbDims -1 - axis, std::size_t(1), std::multiplies<std::size_t>());
+        for (std::size_t outer = 0; outer < stride_pre; ++outer)
+        {
+            const std::size_t idx_in = outer * stride_post * dims[axis] + start * stride_post;
+            const std::size_t idx_out = outer * stride_post * sliceSize;
+            for (std::size_t inner = 0; inner < sliceSize; ++inner)
+            {
+                // Each kept index along the axis carries a contiguous run of stride_post elements.
+                std::copy_n(std::next(source, idx_in + inner * step * stride_post),
+                            stride_post,
+                            std::next(pass.data(), idx_out + inner * stride_post));
+            }
+        }
+        dims[axis] = sliceSize;
+
+        // The previous scratch buffer ends up in `pass` and is freed at the end of this iteration.
+        sliced.swap(pass);
+        source = sliced.data();
+    }
+
+    // Copy the fully sliced data (or the untouched input when no axis is sliced) to the output.
+    std::copy_n(source, totalSize, output);
+}
+
+REGISTRAR(SliceImpl_cpu,  // Float32 entry; key lists {Float32, Any} then {Float32} (presumably input / output specs -- TODO confirm)
+    {{DataType::Float32, DataType::Any}, {DataType::Float32}},
+    {ProdConso::inPlaceModel, Aidge::SliceImpl_cpu_forward_kernel<float, float>, nullptr});  // last entry left as nullptr
+REGISTRAR(SliceImpl_cpu,  // Float64 entry, same layout with the <double, double> kernel
+    {{DataType::Float64, DataType::Any}, {DataType::Float64}},
+    {ProdConso::inPlaceModel, Aidge::SliceImpl_cpu_forward_kernel<double, double>, nullptr});
+REGISTRAR(SliceImpl_cpu,  // Int32 entry, same layout with the <int32_t, int32_t> kernel
+    {{DataType::Int32, DataType::Any}, {DataType::Int32}},
+    {ProdConso::inPlaceModel, Aidge::SliceImpl_cpu_forward_kernel<int32_t, int32_t>, nullptr});
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_SLICEIMPL_KERNELS_H_ */
--- a/include/aidge/operator/SoftmaxImpl.hpp
+++ b/include/aidge/operator/SoftmaxImpl.hpp
@@ -12,52 +12,21 @@
 #ifndef AIDGE_CPU_OPERATOR_SOFTMAXIMPL_H_
 #define AIDGE_CPU_OPERATOR_SOFTMAXIMPL_H_

-#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
 #include "aidge/operator/Softmax.hpp"
 #include "aidge/utils/Registrar.hpp"
 #include "aidge/utils/Types.h"
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
 #include <memory>
 #include <vector>

 namespace Aidge {
-// class Softmax_Op;
+/* CPU-backend implementation type for Softmax_Op.
+ * The function type has the same parameter list as
+ * SoftmaxImpl_cpu_forward_kernel: axis index, input dimensions, input buffer,
+ * output buffer. */
+using SoftmaxImpl_cpu = OperatorImpl_cpu<Softmax_Op,
+    void(std::size_t, const std::vector<DimSize_t>&, const void*, void*)>;

-// compute kernel registry for forward and backward
-class SoftmaxImplForward_cpu
-    : public Registrable<SoftmaxImplForward_cpu, std::tuple<DataType, DataType>, void(const DimSize_t, const DimSize_t, const DimSize_t, const void*, void*)> {
-};
-class SoftmaxImplBackward_cpu
-    : public Registrable<SoftmaxImplBackward_cpu, std::tuple<DataType, DataType>, void(const std::size_t, const void*, void*)> {
-};
-
-class SoftmaxImpl_cpu : public OperatorImpl {
-   private:
-    const Softmax_Op& mOp;
-    std::array<NbElts_t, 1> mNbConsumedData;
-    std::array<NbElts_t, 1> mNbProducedData;
-
-   public:
-    SoftmaxImpl_cpu(const Softmax_Op& op) : mOp(op), mNbConsumedData({0}), mNbProducedData({0}) {}
-
-    static std::unique_ptr<SoftmaxImpl_cpu> create(const Softmax_Op& op) {
-        return std::make_unique<SoftmaxImpl_cpu>(op);
-    }
-
-   public:
-    NbElts_t getNbRequiredData(const IOIndex_t inputIdx) const override final;
-    NbElts_t getNbRequiredProtected(const IOIndex_t inputIdx) const override final;
-    NbElts_t getRequiredMemory(__attribute__((unused)) const IOIndex_t outputIdx, __attribute__((unused)) const std::vector<DimSize_t>& inputsSize) const override final;
-    NbElts_t getNbConsumedData(const IOIndex_t inputIdx) const override final;
-    NbElts_t getNbProducedData(const IOIndex_t outputIdx) const override final;
-
-    void forward();
-
-    void backward();
-};
-
-namespace {
-static Registrar<Softmax_Op> registrarSoftmaxImpl_cpu("cpu", Aidge::SoftmaxImpl_cpu::create);
-}
+/* Implementation entry point registration to Operator: hands
+ * SoftmaxImpl_cpu::create to REGISTRAR for Softmax_Op under the key "cpu". */
+REGISTRAR(Softmax_Op, "cpu", Aidge::SoftmaxImpl_cpu::create);
 }  // namespace Aidge

-#endif /* AIDGE_CPU_OPERATOR_SOFTMAXIMPL_H_ */
\ No newline at end of file
+#endif /* AIDGE_CPU_OPERATOR_SOFTMAXIMPL_H_ */
--- a/include/aidge/backend/cpu/operator/SoftmaxImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/SoftmaxImpl_kernels.hpp
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_SOFTMAXIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_SOFTMAXIMPL_KERNELS_H_
+
+#include "aidge/utils/Registrar.hpp"
+#include <cstddef>
+#include <cmath>
+#include "aidge/data/Data.hpp"
+#include "aidge/utils/Types.h"
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+
+#include "aidge/backend/cpu/operator/SoftmaxImpl.hpp"
+
+namespace Aidge {
+/**
+ * @brief Softmax along dimension @p axisIdx of a tensor of shape @p inputDims.
+ *
+ * Every 1-D slice along the axis is processed independently: the slice
+ * maximum is subtracted before exponentiation, then each exponential is
+ * divided by the sum of the exponentials of the slice.
+ *
+ * @param axisIdx Index of the dimension the softmax is computed over.
+ * @param inputDims Dimensions of the input tensor.
+ * @param input_ Input buffer of type I.
+ * @param output_ Output buffer of type O, same layout as the input.
+ */
+template <class I, class O>
+void SoftmaxImpl_cpu_forward_kernel(std::size_t axisIdx, const std::vector<DimSize_t>& inputDims, const void* input_, void* output_)
+{
+    const I* input = static_cast<const I*>(input_);
+    O* output = static_cast<O*>(output_);
+
+    // Element counts of the dimensions located before / after the softmax axis.
+    std::size_t preAxisElems = 1;
+    for (std::size_t d = 0; d < axisIdx; ++d) {
+        preAxisElems *= inputDims[d];
+    }
+    std::size_t postAxisElems = 1;
+    for (std::size_t d = axisIdx + 1; d < inputDims.size(); ++d) {
+        postAxisElems *= inputDims[d];
+    }
+
+#ifdef _OPENMP
+    #pragma omp parallel for collapse(2) if (preAxisElems * postAxisElems >= 16)
+#endif
+    for (int i = 0; i < static_cast<int>(preAxisElems); ++i) {
+        for (int j = 0; j < static_cast<int>(postAxisElems); ++j) {
+            const std::size_t axisSize = inputDims[axisIdx];
+            // Offset of the first element of slice (i, j); consecutive elements
+            // of the slice are postAxisElems apart.
+            const std::size_t first = i * axisSize * postAxisElems + j;
+
+            // Largest value of the slice, subtracted below before exponentiation.
+            I maxVal = input[first];
+            for (std::size_t k = 1; k < axisSize; ++k) {
+                const I candidate = input[first + k * postAxisElems];
+                if (maxVal < candidate) {
+                    maxVal = candidate;
+                }
+            }
+
+            // Sum of the shifted exponentials over the slice.
+            I sumExp = 0;
+            for (std::size_t k = 0; k < axisSize; ++k) {
+                sumExp += std::exp(input[first + k * postAxisElems] - maxVal);
+            }
+
+            // Normalized exponentials written at the same offsets as the inputs.
+            for (std::size_t k = 0; k < axisSize; ++k) {
+                const std::size_t offset = first + k * postAxisElems;
+                output[offset] = std::exp(input[offset] - maxVal) / sumExp;
+            }
+        }
+    }
+}
+
+REGISTRAR(SoftmaxImpl_cpu,  // Float32 entry: forward kernel instantiated as <float, float>
+    {DataType::Float32},
+    {ProdConso::inPlaceModel, Aidge::SoftmaxImpl_cpu_forward_kernel<float, float>, nullptr});  // last entry left as nullptr
+REGISTRAR(SoftmaxImpl_cpu,  // Float64 entry: forward kernel instantiated as <double, double>
+    {DataType::Float64},
+    {ProdConso::inPlaceModel, Aidge::SoftmaxImpl_cpu_forward_kernel<double, double>, nullptr});
+REGISTRAR(SoftmaxImpl_cpu,  // Int32 entry: forward kernel instantiated as <int32_t, int32_t>
+    {DataType::Int32},
+    {ProdConso::inPlaceModel, Aidge::SoftmaxImpl_cpu_forward_kernel<int32_t, int32_t>, nullptr});
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_SOFTMAXIMPL_KERNELS_H_ */
--- a/include/aidge/backend/cpu/operator/SqrtImpl.hpp
+++ b/include/aidge/backend/cpu/operator/SqrtImpl.hpp
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_SQRTIMPL_H_
+#define AIDGE_CPU_OPERATOR_SQRTIMPL_H_
+
+#include <cstddef>  // std::size_t
+#include <memory>
+#include <tuple>
+#include <vector>
+
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
+#include "aidge/operator/Sqrt.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+namespace Aidge {
+/* CPU-backend implementation type for Sqrt_Op.
+ * The first function type has the parameter list of
+ * SqrtImpl_cpu_forward_kernel (element count, input, output); the second has
+ * that of SqrtImpl_cpu_backward_kernel (element count, forward output, output
+ * gradient, input gradient). */
+using SqrtImpl_cpu = OperatorImpl_cpu<Sqrt_Op,
+    void(const std::size_t, const void*, void*),
+    void(const std::size_t, const void*, const void*, void*)>;
+
+/* Implementation entry point registration to Operator: hands
+ * SqrtImpl_cpu::create to REGISTRAR for Sqrt_Op under the key "cpu". */
+REGISTRAR(Sqrt_Op, "cpu", Aidge::SqrtImpl_cpu::create);
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_SQRTIMPL_H_ */
--- a/include/aidge/backend/cpu/operator/SqrtImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/SqrtImpl_kernels.hpp
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_SQRTIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_SQRTIMPL_KERNELS_H_
+
+#include <cmath>    // std::sqrt
+#include <cstddef>  // std::size_t
+
+#include "aidge/utils/Registrar.hpp"
+
+#include "aidge/backend/cpu/operator/SqrtImpl.hpp"
+
+namespace Aidge {
+/**
+ * @brief Element-wise square root.
+ *
+ * The root is computed with the std::sqrt overload matching I (integral
+ * inputs are promoted to double). Narrowing to float first would discard
+ * half of a double's mantissa and turn values above FLT_MAX into infinity.
+ *
+ * @tparam I Element type of the input buffer.
+ * @tparam O Element type of the output buffer.
+ * @param inputLength Number of elements to process.
+ * @param input_ Buffer holding @p inputLength elements of type I.
+ * @param output_ Buffer receiving @p inputLength elements of type O.
+ */
+template <class I, class O>
+void SqrtImpl_cpu_forward_kernel(const std::size_t inputLength,
+                                     const void* input_,
+                                     void* output_) {
+
+    const I* input = static_cast<const I*>(input_);
+    O* output = static_cast<O*>(output_);
+
+    for (std::size_t i = 0; i < inputLength; ++i) {
+        output[i] = static_cast<O>(std::sqrt(input[i]));
+    }
+}
+
+/**
+ * @brief Accumulates the square-root gradient into @p grad_input_.
+ *
+ * For each element: grad_input += (0.5 / output) * grad_output, where output
+ * is the value produced by the forward pass. The factor 0.5 / output is
+ * converted to O before the multiplication, and the product is added to the
+ * existing content of grad_input.
+ *
+ * @tparam I Element type of @p output_ and @p grad_output_.
+ * @tparam O Element type of @p grad_input_.
+ * @param inputLength Number of elements to process.
+ * @param output_ Forward-pass results.
+ * @param grad_output_ Gradient with respect to the output.
+ * @param grad_input_ Gradient accumulator for the input.
+ */
+template <class I, class O>
+void SqrtImpl_cpu_backward_kernel(const std::size_t inputLength,
+                                     const void* output_,
+                                     const void* grad_output_,
+                                     void* grad_input_) {
+
+    const I* y = static_cast<const I*>(output_);
+    const I* dy = static_cast<const I*>(grad_output_);
+    O* dx = static_cast<O*>(grad_input_);
+
+    for (std::size_t idx = 0; idx != inputLength; ++idx) {
+        const O halfInverse = static_cast<O>(0.5/y[idx]);
+        dx[idx] += halfInverse * dy[idx];
+    }
+}
+
+REGISTRAR(SqrtImpl_cpu,  // Float32 entry: forward and backward kernels instantiated as <float, float>
+    {DataType::Float32},
+    {ProdConso::inPlaceModel, Aidge::SqrtImpl_cpu_forward_kernel<float, float>, Aidge::SqrtImpl_cpu_backward_kernel<float, float>});
+REGISTRAR(SqrtImpl_cpu,  // Float64 entry: forward and backward kernels instantiated as <double, double>
+    {DataType::Float64},
+    {ProdConso::inPlaceModel, Aidge::SqrtImpl_cpu_forward_kernel<double, double>, Aidge::SqrtImpl_cpu_backward_kernel<double, double>});
+REGISTRAR(SqrtImpl_cpu,  // Int32 entry: forward and backward kernels instantiated as <int32_t, int32_t>
+    {DataType::Int32},
+    {ProdConso::inPlaceModel, Aidge::SqrtImpl_cpu_forward_kernel<int32_t, int32_t>, Aidge::SqrtImpl_cpu_backward_kernel<int32_t, int32_t>});
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_SQRTIMPL_KERNELS_H_ */
--- a/include/aidge/backend/cpu/operator/SubImpl.hpp
+++ b/include/aidge/backend/cpu/operator/SubImpl.hpp
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_SUBIMPL_H_
+#define AIDGE_CPU_OPERATOR_SUBIMPL_H_
+
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
+#include "aidge/operator/Sub.hpp"
+#include "aidge/utils/Registrar.hpp"
+
+#include <vector>
+
+namespace Aidge {
+/* CPU-backend implementation type for Sub_Op.
+ * The first function type has the same parameter list as
+ * SubImpl_cpu_forward_kernel: dimensions of input 0, dimensions of input 1,
+ * output dimensions, the two input buffers, the output buffer.
+ * The second function type presumably describes the backward kernel, as in
+ * the Sigmoid and Sqrt aliases; that kernel is not visible here (TODO confirm). */
+using SubImpl_cpu = OperatorImpl_cpu<Sub_Op,
+    void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*,void*),  
+    void(const std::size_t,
+         const std::size_t,
+         const std::size_t,
+         const std::vector<std::size_t>&,
+         const std::vector<std::size_t>&,
+         const std::vector<std::size_t>&,
+         const void*,
+         void*,
+         void*)
+>;
+
+/* Implementation entry point registration to Operator: hands
+ * SubImpl_cpu::create to REGISTRAR for Sub_Op under the key "cpu". */
+REGISTRAR(Sub_Op, "cpu", Aidge::SubImpl_cpu::create);
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_SUBIMPL_H_ */
--- a/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_SUBIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_SUBIMPL_KERNELS_H_
+
+#include "aidge/utils/Registrar.hpp"
+
+#include <cstddef>     // std::size_t
+#include <cstdint>     // std::int32_t, std::int64_t
+#include <vector>
+
+#include "aidge/backend/cpu/data/Broadcasting.hpp"
+#include "aidge/backend/cpu/operator/SubImpl.hpp"
+
+namespace {
+/// Element-wise subtraction of two arrays stored contiguously in memory,
+/// producing @p output1size results. An operand whose size is 1 is read at
+/// index 0 for every output element; otherwise it is read at the output index.
+template <class I1, class I2, class O>
+void sub_contiguous_arrays(const std::size_t input1size,
+                            const std::size_t input2size,
+                            const std::size_t output1size,
+                            const I1* input1,
+                            const I2* input2,
+                            O* output)
+{
+    const bool singleValue1 = (input1size == 1);
+    const bool singleValue2 = (input2size == 1);
+
+    for (std::size_t idx = 0; idx != output1size; ++idx)
+    {
+        const I1 lhs = singleValue1 ? input1[0] : input1[idx];
+        const I2 rhs = singleValue2 ? input2[0] : input2[idx];
+        output[idx] = static_cast<O>(lhs - rhs);
+    }
+}
+}
+
+
+namespace Aidge {
+
+/**
+ * @brief Forward kernel of Sub: output = input0 - input1, with broadcasting.
+ *
+ * The trailing dimensions on which both inputs can be processed as one flat
+ * block are handed to sub_contiguous_arrays(); the leading dimensions are
+ * iterated as "stacks", with per-dimension offset steps that keep a
+ * broadcast (size-1) dimension of an input on the same data.
+ *
+ * @tparam I1 element type of the first input buffer
+ * @tparam I2 element type of the second input buffer
+ * @tparam O  element type of the output buffer
+ * @param dims0      dimensions of the first input; taken by value because it is
+ *                   left-padded with 1s below when it has fewer dimensions than dims1
+ * @param dims1      dimensions of the second input; taken by value for the same reason
+ * @param outputDims dimensions of the output; indexed with the padded rank
+ *                   (NOTE(review): presumably the broadcast shape of dims0 and dims1 -- confirm with caller)
+ * @param input0_    type-erased pointer to the first input data (read as const I1*)
+ * @param input1_    type-erased pointer to the second input data (read as const I2*)
+ * @param output_    type-erased pointer to the output data (written as O*)
+ */
+template <class I1, class I2, class O>
+void SubImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
+                                std::vector<std::size_t> dims1,
+                                const std::vector<std::size_t>& outputDims,
+                                const void* input0_,
+                                const void* input1_,
+                                void* output_) {
+
+    const I1* input_0 = static_cast<const I1*>(input0_);
+    const I2* input_1 = static_cast<const I2*>(input1_);
+    O* output = static_cast<O*>(output_);
+
+    // [5,2,1,7] & [2,6,7]
+    // 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7]
+    // 2. Find the highest equal dimension -> 3
+    //    Exception: if the first diverging dimension is the last one, then -> 4 (dims.size())
+    // 3. Compute the highest number of contiguous data -> 7
+    // 4. Compute stride and offset step for the broadcast mechanism
+    // 5. Call a simple kernel
+
+    // special case for equal dimensions, the kernel is called with the entire arrays at once
+    if (dims0 == dims1) {
+        const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
+        for (std::size_t i = 0; i < input0_contiguous_size; ++i)
+        {
+            output[i] = static_cast<O>(input_0[i] - input_1[i]);
+        }
+        return;
+    }
+
+    // set dimensions to be of equal size by filling the smallest one with ones.
+    if (dims0.size() > dims1.size()) {
+        dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1));
+    }
+    else if (dims1.size() > dims0.size()) {
+        dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1));
+    }
+
+    const std::size_t nbDims = dims0.size();
+
+    // Find the highest equal dimension
+    // The scan goes from the last dimension towards the first and stops at the
+    // first dimension where the two (padded) shapes differ.
+    // std::size_t contiguousIdx = nbDims - 1;
+    std::size_t contiguousIdx = nbDims;
+    while (contiguousIdx-- > 0) {
+    // for (; contiguousIdx+1 > 0; --contiguousIdx) {
+        if (dims0[contiguousIdx] != dims1[contiguousIdx]) {
+            if (contiguousIdx == (nbDims -1)) { // last dimensions of one of the input Tensor are of size 1
+                // Skip every trailing size-1 dimension of that input: over them it
+                // behaves as a single repeated element.
+                // `contiguousIdx+1 > 0` is the unsigned way of testing "not wrapped below 0".
+                const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1;
+                while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) {
+                    --contiguousIdx;
+                }
+            }
+            break;
+        }
+    }
+    // contiguousIdx now designates the first dimension of the trailing block given
+    // to the contiguous kernel. If the scan ran off the front, the unsigned value
+    // wrapped around and this increment brings it back to 0.
+    ++contiguousIdx;
+
+    // Compute the highest number of contiguous data for each Tensor
+    const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
+    const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
+    const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+
+    // initialize strides to iterate through data because of broadcasting
+    // stride_postX[i]: product of dimsX[i+1 .. contiguousIdx-1] (1 for the last stacked dimension).
+    // stride_stepX[i]: value added to the block offset of input X when dimension i is the
+    //                  one selected by the stack counter below: 1 for a regular dimension,
+    //                  1 - stride_postX[i] (<= 0, hence the signed type) for a size-1 dimension.
+    std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    if (contiguousIdx > 0) {
+        stride_post0[contiguousIdx - 1] = 1;
+        stride_post1[contiguousIdx - 1] = 1;
+        // Counts down to 0 inclusive; the loop ends when i wraps to size_t(-1).
+        // With contiguousIdx == 1 the start value is already size_t(-1): no iteration.
+        for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) {
+            stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]);
+            stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]);
+        }
+        for (std::size_t i = 0; i != contiguousIdx; ++i) {
+            stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1;
+            stride_step1[i] = (dims1[i] == 1) ? 1 - stride_post1[i] : 1;
+        }
+    }
+
+    // variables for arrays offsets
+    // Offsets are counted in contiguous blocks, not in elements.
+    std::size_t offsetIn0 = 0;
+    std::size_t offsetIn1 = 0;
+    std::size_t offsetOut = 0;
+
+
+    // When contiguousIdx == 0 this wraps around, but nbStacks is then 1 (empty
+    // product) and `dim` is never read.
+    std::size_t dim = contiguousIdx - 1;
+    // Number of contiguous output blocks = product of the stacked output dimensions.
+    const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>());
+    for (std::size_t stack = 0; stack < nbStacks;) {
+        sub_contiguous_arrays<I1,I2,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
+                    input_0 + offsetIn0*input0_contiguous_size,
+                    input_1 + offsetIn1*input1_contiguous_size,
+                    output + offsetOut*output_contiguous_size);
+        if (++stack < nbStacks) {
+            // Starting from the innermost stacked dimension, move outward while the
+            // stack counter is a multiple of the dimension size; `dim` ends on the
+            // dimension whose step is applied to the input offsets.
+            std::size_t tmp_stack = stack;
+            while(tmp_stack % outputDims[dim] == 0) {
+                tmp_stack /= outputDims[dim];
+                dim--;
+            }
+            offsetIn0 += stride_step0[dim];
+            offsetIn1 += stride_step1[dim];
+            ++offsetOut;
+            dim = contiguousIdx - 1;
+        }
+    }
+}
+
+/**
+ * @brief Backward kernel of Sub: accumulates the output gradient into both input gradients.
+ *
+ * For every element of the output gradient, the matching element of each
+ * input gradient is located (index 0 is used along every dimension whose
+ * broadcasted size is 1) and updated with `+=`:
+ *  - first input:  += grad_output
+ *  - second input: += -grad_output
+ * The input gradient buffers are accumulated into, never cleared, by this kernel.
+ *
+ * @tparam I1 element type of the first input gradient
+ * @tparam I2 element type of the second input gradient
+ * @tparam O  element type of the output gradient
+ * @param input0Length     (unnamed) unused, kept for signature compatibility
+ * @param input1Length     (unnamed) unused, kept for signature compatibility
+ * @param gradOutputLength number of elements of the output gradient
+ * @param dims0            dimensions of the first input
+ * @param dims1            dimensions of the second input
+ * @param outputDims       dimensions of the output
+ * @param grad_output_     type-erased output gradient (read as const O*)
+ * @param gradientInput0_  type-erased gradient of the first input (updated as I1*)
+ * @param gradientInput1_  type-erased gradient of the second input (updated as I2*)
+ */
+template <class I1, class I2, class O>
+void SubImpl_cpu_backward_kernel(const std::size_t /*input0Length*/,
+                               const std::size_t /*input1Length*/,
+                               const std::size_t gradOutputLength,
+                               const std::vector<std::size_t>& dims0,
+                               const std::vector<std::size_t>& dims1,
+                               const std::vector<std::size_t>& outputDims,
+                               const void* grad_output_,
+                               void* gradientInput0_,
+                               void* gradientInput1_)
+{
+    const O* grad_output = static_cast<const O*>(grad_output_);
+    auto* grad_input_0 = static_cast<I1*>(gradientInput0_);
+    auto* grad_input_1 = static_cast<I2*>(gradientInput1_);
+
+    auto broadcastedDims0 = getBroadcastedDims(outputDims, dims0);
+    auto broadcastedDims1 = getBroadcastedDims(outputDims, dims1);
+
+    // Scratch index vectors are allocated once: every entry is overwritten on
+    // each iteration, so reusing them avoids two heap allocations per element.
+    std::vector<std::size_t> idxInput0(broadcastedDims0.size());
+    std::vector<std::size_t> idxInput1(broadcastedDims1.size());
+
+    for (std::size_t i = 0; i < gradOutputLength; ++i) {
+        auto idxOutputGrad = getMultiDimIndices(outputDims, i);
+
+        // A broadcast dimension (size 1) always maps to index 0 of the input.
+        for (std::size_t dimension = 0; dimension < broadcastedDims0.size(); ++dimension) {
+            idxInput0[dimension] = (broadcastedDims0[dimension] == 1) ? 0 : idxOutputGrad[dimension];
+        }
+
+        for (std::size_t dimension = 0; dimension < broadcastedDims1.size(); ++dimension) {
+            idxInput1[dimension] = (broadcastedDims1[dimension] == 1) ? 0 : idxOutputGrad[dimension];
+        }
+
+        auto idx0 = getFlattenedIndex(broadcastedDims0, idxInput0);
+        auto idx1 = getFlattenedIndex(broadcastedDims1, idxInput1);
+
+        // For subtraction: gradient of first input is 1 * grad_output
+        grad_input_0[idx0] += static_cast<I1>(grad_output[i]);
+        // For subtraction: gradient of second input is -1 * grad_output
+        grad_input_1[idx1] += static_cast<I2>(-grad_output[i]);
+    }
+}
+
+
+// Kernels registration to implementation entry point
+// One registration per supported data type; inputs and output share the same
+// element type in every entry. Only the Float32 entry supplies a backward
+// kernel: the other entries pass nullptr in that position.
+REGISTRAR(SubImpl_cpu,
+    {DataType::Float32},
+    {ProdConso::inPlaceModel, Aidge::SubImpl_cpu_forward_kernel<float, float, float>, Aidge::SubImpl_cpu_backward_kernel<float,float,float>});
+REGISTRAR(SubImpl_cpu,
+    {DataType::Float64},
+    {ProdConso::inPlaceModel, Aidge::SubImpl_cpu_forward_kernel<double, double, double>, nullptr});
+REGISTRAR(SubImpl_cpu,
+    {DataType::Int8},
+    {ProdConso::inPlaceModel, Aidge::SubImpl_cpu_forward_kernel<std::int8_t, std::int8_t, std::int8_t>, nullptr});
+REGISTRAR(SubImpl_cpu,
+    {DataType::UInt8},
+    {ProdConso::inPlaceModel, Aidge::SubImpl_cpu_forward_kernel<std::uint8_t, std::uint8_t, std::uint8_t>, nullptr});
+REGISTRAR(SubImpl_cpu,
+    {DataType::Int32},
+    {ProdConso::inPlaceModel, Aidge::SubImpl_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t>, nullptr});
+REGISTRAR(SubImpl_cpu,
+    {DataType::Int64},
+    {ProdConso::inPlaceModel, Aidge::SubImpl_cpu_forward_kernel<std::int64_t, std::int64_t, std::int64_t>, nullptr});
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_SUBIMPL_KERNELS_H_ */
--- a/include/aidge/backend/cpu/operator/TanhImpl.hpp
+++ b/include/aidge/backend/cpu/operator/TanhImpl.hpp
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_TANHIMPL_H_
+#define AIDGE_CPU_OPERATOR_TANHIMPL_H_
+
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
+#include "aidge/operator/Tanh.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+#include <memory>
+#include <vector>
+
+namespace Aidge {
+// Operator implementation entry point for the backend
+// The two function signatures below are the forward kernel signature
+// (length, input, output) and the backward kernel signature
+// (length, output, grad_output, grad_input); buffers are passed type-erased.
+using TanhImpl_cpu = OperatorImpl_cpu<Tanh_Op,
+    void(const std::size_t, const void*, void*),
+    void(const std::size_t, const void*, const void*, void*)>;
+
+// Implementation entry point registration to Operator
+// Associates TanhImpl_cpu::create with Tanh_Op under the "cpu" key.
+REGISTRAR(Tanh_Op, "cpu", Aidge::TanhImpl_cpu::create);
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_TANHIMPL_H_ */
--- a/include/aidge/backend/cpu/operator/TanhImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/TanhImpl_kernels.hpp
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_TANHIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_TANHIMPL_KERNELS_H_
+
+#include "aidge/utils/Registrar.hpp"
+
+#include "aidge/backend/cpu/operator/TanhImpl.hpp"
+
+namespace Aidge {
+/**
+ * @brief Forward kernel of Tanh: output[i] = tanh(input[i]).
+ *
+ * @tparam I element type of the input buffer
+ * @tparam O element type of the output buffer
+ * @param inputLength number of elements to process
+ * @param input_      type-erased input data (read as const I*)
+ * @param output_     type-erased output data (written as O*)
+ */
+template <class I, class O>
+void TanhImpl_cpu_forward_kernel(std::size_t inputLength,
+                                     const void* input_,
+                                     void* output_) {
+    const I* src = static_cast<const I*>(input_);
+    const I* const srcEnd = src + inputLength;
+    O* dst = static_cast<O*>(output_);
+
+    // Sequential walk over both buffers (no OpenMP parallelisation is enabled here).
+    while (src != srcEnd) {
+        *dst = std::tanh(*src);
+        ++src;
+        ++dst;
+    }
+}
+
+/**
+ * @brief Backward kernel of Tanh.
+ *
+ * Adds (1 - output[i]^2) * grad_output[i] to grad_input[i] for each element.
+ * The input gradient is accumulated into, not overwritten.
+ *
+ * @tparam O  element type of the forward output
+ * @tparam GI element type of the input gradient
+ * @tparam GO element type of the output gradient
+ * @param inputLength  number of elements to process
+ * @param output_      type-erased forward output (read as const O*)
+ * @param grad_output_ type-erased output gradient (read as const GO*)
+ * @param grad_input_  type-erased input gradient (updated as GI*)
+ */
+template <class O, class GI, class GO>
+void TanhImpl_cpu_backward_kernel(const std::size_t inputLength,
+                                  const void* output_, const void* grad_output_,
+                                  void* grad_input_) {
+    const O* y = static_cast<const O*>(output_);
+    const GO* dy = static_cast<const GO*>(grad_output_);
+    GI* dx = static_cast<GI*>(grad_input_);
+
+    for (std::size_t n = 0; n != inputLength; ++n) {
+        const O tanhValue = y[n];
+        dx[n] += (O(1) - tanhValue * tanhValue) * dy[n];
+    }
+}
+
+// Kernels registration to implementation entry point
+// Registered for Float32 and Float64; both entries provide a forward and a
+// backward kernel instantiated with a single element type.
+REGISTRAR(TanhImpl_cpu,
+    {DataType::Float32},
+    {ProdConso::inPlaceModel, Aidge::TanhImpl_cpu_forward_kernel<float, float>, Aidge::TanhImpl_cpu_backward_kernel<float, float, float>});
+REGISTRAR(TanhImpl_cpu,
+    {DataType::Float64},
+    {ProdConso::inPlaceModel, Aidge::TanhImpl_cpu_forward_kernel<double, double>, Aidge::TanhImpl_cpu_backward_kernel<double, double, double>});
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_TANHIMPL_KERNELS_H_ */
--- a/include/aidge/backend/cpu/operator/TopKImpl.hpp
+++ b/include/aidge/backend/cpu/operator/TopKImpl.hpp
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_TOPKIMPL_H_
+#define AIDGE_CPU_OPERATOR_TOPKIMPL_H_
+
+#include <array>
+#include <memory>
+#include <tuple>
+#include <vector>
+
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
+#include "aidge/operator/TopK.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+namespace Aidge {
+// Operator implementation entry point for the backend
+// Forward kernel signature, in order: axis, largest, sorted, k, input
+// dimensions, input data, output values, output indices. Data buffers are
+// passed type-erased. No backward signature is declared.
+using TopKImpl_cpu = OperatorImpl_cpu<TopK_Op,
+    void(int64_t,
+        bool,
+        bool,
+        IOIndex_t,
+        const std::vector<DimSize_t>&,
+        const void*,
+        void*,
+        void*)>;
+
+// Implementation entry point registration to Operator
+// Associates TopKImpl_cpu::create with TopK_Op under the "cpu" key.
+REGISTRAR(TopK_Op, "cpu", Aidge::TopKImpl_cpu::create);
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_TOPKIMPL_H_ */
--- a/include/aidge/backend/cpu/operator/TopKImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/TopKImpl_kernels.hpp
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_TOPKIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_TOPKIMPL_KERNELS_H_
+
+#include <algorithm>   // std::for_each
+#include <cstddef>     // std::size_t
+#include <cstdint>     // std::int32_t
+#include <functional>  //std::multiplies
+#include <numeric>     //std::accumulate
+#include <vector>
+
+#include "aidge/backend/cpu/operator/TopKImpl.hpp"
+#include "aidge/data/Data.hpp"
+#include "aidge/operator/TopK.hpp"
+#include "aidge/utils/Registrar.hpp"
+
+namespace Aidge {
+
+/**
+ * @brief Forward kernel of TopK: keeps the k largest (or smallest) values along one axis.
+ *
+ * The input is viewed as [stride_pre, dim_i, stride_post], where dim_i is the
+ * size of the selected axis. Output values and indices are laid out as
+ * [stride_pre, k, stride_post]; an index is the position of the value along
+ * the selected axis of the input.
+ *
+ * @tparam I element type of the input buffer
+ * @tparam O element type of the output values buffer
+ * @param axis      axis along which to select; a negative value counts from the last dimension
+ * @param largest   true to keep the k largest values, false to keep the k smallest
+ * @param sorted    (unnamed) ignored: selected values are always written in sorted order
+ * @param k         number of elements kept along the axis
+ *                  (NOTE(review): assumed <= inputDims[axis]; not checked here)
+ * @param inputDims dimensions of the input
+ * @param input_    type-erased input data (read as const I*)
+ * @param output_   type-erased output values (written as O*)
+ * @param indices_  type-erased output indices (written as int64_t*)
+ */
+template <class I, class O>
+void TopKImpl_cpu_forward_kernel(int64_t axis,
+                                 bool largest,
+                                 bool /*sorted*/,
+                                 IOIndex_t k,
+                                 const std::vector<DimSize_t>& inputDims,
+                                 const void* input_,
+                                 void* output_,
+                                 void* indices_)
+{
+    const I* input = static_cast<const I*>(input_);
+    O* output = static_cast<O*>(output_);
+    int64_t* indices = static_cast<int64_t*>(indices_);
+
+    const std::size_t nb_dims = inputDims.size();
+    // Accept negative axes (-1 is the last dimension).
+    if (axis < 0) {
+        axis += static_cast<int64_t>(nb_dims);
+    }
+    const auto axis_it = inputDims.cbegin() + static_cast<std::ptrdiff_t>(axis);
+
+    // The initial value is a std::size_t so that the product is not accumulated in int.
+    const std::size_t stride_pre = std::accumulate(inputDims.cbegin(), axis_it, std::size_t(1), std::multiplies<std::size_t>());
+    const std::size_t stride_post = std::accumulate(axis_it + 1, inputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+
+    const std::size_t dim_i = inputDims[static_cast<std::size_t>(axis)];
+    const std::size_t nb_kept = static_cast<std::size_t>(k);
+
+#ifdef _OPENMP
+    #pragma omp parallel if (stride_pre * stride_post >= 16)
+#endif
+    {
+        // Scratch (value, position) pairs. Declared inside the parallel region so
+        // that each thread works on its own copy instead of racing on a shared one.
+        std::vector<std::pair<I, int64_t>> buffer(dim_i);
+
+#ifdef _OPENMP
+        #pragma omp for collapse(2)
+#endif
+        for (int pre = 0; pre < static_cast<int>(stride_pre); ++pre) {
+            for (int post = 0; post < static_cast<int>(stride_post); ++post) {
+                const std::size_t idx_i = pre * dim_i * stride_post + post;
+                const std::size_t idx_o = pre * nb_kept * stride_post + post;
+
+                // Gather the 1-D slice along the axis together with each position.
+                for (std::size_t i = 0; i < dim_i; ++i) {
+                    buffer[i] = std::make_pair(input[idx_i + i * stride_post], static_cast<int64_t>(i));
+                }
+
+                const auto kept_end = buffer.begin() + static_cast<std::ptrdiff_t>(nb_kept);
+                if (largest) {
+                    std::partial_sort(buffer.begin(), kept_end, buffer.end(),
+                        [](const auto& lhs, const auto& rhs) { return lhs.first > rhs.first; });
+                }
+                else {
+                    std::partial_sort(buffer.begin(), kept_end, buffer.end(),
+                        [](const auto& lhs, const auto& rhs) { return lhs.first < rhs.first; });
+                }
+
+                // Consecutive elements along the axis are stride_post apart in the
+                // output, exactly as they are in the input.
+                for (std::size_t i = 0; i < nb_kept; ++i) {
+                    const std::size_t idx = idx_o + i * stride_post;
+                    output[idx] = buffer[i].first;
+                    indices[idx] = buffer[i].second;
+                }
+            }
+        }
+    }
+}
+
+// Kernels registration to implementation entry point
+// Registered for Float32, Float64 and Int32 data; no backward kernel is provided (nullptr).
+// NOTE(review): each spec presumably lists the inputs first (data, then k of any
+// type) and the outputs second (values, then Int64 indices) -- confirm against ImplSpec.
+REGISTRAR(TopKImpl_cpu,
+    {
+        {{DataType::Float32}, {DataType::Any}},
+        {{DataType::Float32}, {DataType::Int64}}
+    },
+    {ProdConso::inPlaceModel, Aidge::TopKImpl_cpu_forward_kernel<float, float>, nullptr});
+REGISTRAR(TopKImpl_cpu,
+    {
+        {{DataType::Float64}, {DataType::Any}},
+        {{DataType::Float64}, {DataType::Int64}}
+    },
+    {ProdConso::inPlaceModel, Aidge::TopKImpl_cpu_forward_kernel<double, double>, nullptr});
+REGISTRAR(TopKImpl_cpu,
+    {
+        {{DataType::Int32}, {DataType::Any}},
+        {{DataType::Int32}, {DataType::Int64}}
+    },
+    {ProdConso::inPlaceModel, Aidge::TopKImpl_cpu_forward_kernel<int32_t, int32_t>, nullptr});
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_TOPKIMPL_KERNELS_H_ */
--- a/include/aidge/backend/cpu/operator/WeightInterleavedImpl.hpp
+++ b/include/aidge/backend/cpu/operator/WeightInterleavedImpl.hpp
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_WEIGHTINTERLEAVINGIMPL_H_
+#define AIDGE_CPU_OPERATOR_WEIGHTINTERLEAVINGIMPL_H_
+
+#include <array>
+#include <memory>
+#include <vector>
+
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
+#include "aidge/operator/WeightInterleaving.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+namespace Aidge {
+// Operator implementation entry point for the backend
+// Forward kernel signature, in order: input interleaving size, number of
+// interleavings, output interleaving size, input data, output data. Data
+// buffers are passed type-erased. No backward signature is declared.
+using WeightInterleavedImpl_cpu = OperatorImpl_cpu<WeightInterleaving_Op,
+    void(const DimSize_t,
+        const DimSize_t,
+        const DimSize_t,
+        const void *,
+        void *)>;
+
+// Implementation entry point registration to Operator
+// Associates WeightInterleavedImpl_cpu::create with WeightInterleaving_Op under the "cpu" key.
+REGISTRAR(WeightInterleaving_Op, "cpu", Aidge::WeightInterleavedImpl_cpu::create);
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_WEIGHTINTERLEAVINGIMPL_H_ */
--- a/include/aidge/backend/cpu/operator/WeightInterleavedImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/WeightInterleavedImpl_kernels.hpp
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_WEIGHTINTERLEAVEDIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_WEIGHTINTERLEAVEDIMPL_KERNELS_H_
+
+#include <cstddef>  // std::size_t
+#include <cstdint>  // std::int8_t, std::uint8_t
+
+#include "aidge/backend/cpu/operator/WeightInterleavedImpl.hpp"
+#include "aidge/data/DataType.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/ErrorHandling.hpp"
+
+
+namespace Aidge {
+
+    /**
+     * @brief Packs low bit-width values, stored one per element, into full bytes.
+     *
+     * Only the `nb_bits` least significant bits of each element of `data` are
+     * kept. `8 / nb_bits` consecutive elements (the slots) are packed into one
+     * element of `compactData`, the first of them ending up in the most
+     * significant slot. Each slot is `8 / (8 / nb_bits)` bits wide:
+     *  - nb_bits = 3 or 4: 2 slots of 4 bits
+     *  - nb_bits = 2:      4 slots of 2 bits
+     *  - nb_bits = 1:      8 slots of 1 bit
+     * When `dataSize` is not a multiple of the slot count, the last output
+     * element holds the remaining values in its most significant slots and
+     * zeros in the others.
+     *
+     * @tparam T 8-bit storage type of both buffers (instantiated below with int8_t and uint8_t).
+     * @param data The input array of values to be compacted.
+     * @param dataSize The number of elements in `data`.
+     * @param compactData The output array; receives `ceil(dataSize / (8 / nb_bits))` elements.
+     * @param nb_bits The number of bits kept from each `data` element (must be in [1, 4]).
+     */
+    template <typename T>
+    void compact_data(const T* data, std::size_t dataSize, T* compactData, std::uint8_t nb_bits) {
+        AIDGE_ASSERT(nb_bits > 0 && nb_bits < 5, "Cannot compact with the given nb_bits"); // Ensure valid bit width
+
+        // Mask to extract `nb_bits` from each data element
+        const unsigned int mask = (1U << nb_bits) - 1;
+
+        // Number of `nb_bits` segments that fit into an 8-bit compacted value
+        const std::size_t nbSlot = 8 / nb_bits;
+
+        // Width of one slot: 4 for nb_bits = 3 or 4, 2 for nb_bits = 2, 1 for nb_bits = 1
+        const unsigned int shift = static_cast<unsigned int>(8 / nbSlot);
+
+        const std::size_t nbFullCompactBytes = dataSize / nbSlot;
+        // Values left over when dataSize is not a multiple of nbSlot
+        const std::size_t remaining = dataSize % nbSlot;
+        const std::size_t nbCompactBytes = nbFullCompactBytes + ((remaining != 0) ? 1 : 0);
+
+        for (std::size_t i = 0; i < nbCompactBytes; ++i) {
+            // Only the last output element can be partially filled.
+            const std::size_t nbUsedSlots = (i < nbFullCompactBytes) ? nbSlot : remaining;
+            const T* group = data + i * nbSlot;
+
+            // Accumulate in an unsigned type so that shifts never act on a negative value.
+            unsigned int compact = 0;
+            for (std::size_t j = 0; j < nbUsedSlots; ++j) {
+                compact = (compact << shift) | (static_cast<unsigned int>(group[j]) & mask);
+            }
+            // Left-align a partially filled element; a no-op for full ones.
+            compact <<= shift * (nbSlot - nbUsedSlots);
+
+            // At most 8 bits are set at this point.
+            compactData[i] = static_cast<T>(compact);
+        }
+    }
+
+/**
+ * @brief Forward kernel of WeightInterleaving: compacts the input one interleaving at a time.
+ *
+ * The input is read as `nb_interleaving` consecutive groups of
+ * `input_interleaving` elements; each group is compacted with compact_data()
+ * into the matching group of `output_interleaving` elements of the output.
+ *
+ * @tparam I       element type of the input buffer
+ * @tparam O       element type of the output buffer
+ * @tparam nb_bits number of bits kept from each input element
+ * @param input_interleaving  number of input elements per group
+ * @param nb_interleaving     number of groups
+ * @param output_interleaving number of output elements per group
+ * @param input_              type-erased input data (read as const I*)
+ * @param output_             type-erased output data (written as O*)
+ */
+template <class I, class O, int nb_bits>
+void WeightInterleavedImpl_cpu_forward_kernel(const DimSize_t input_interleaving,
+                            const DimSize_t nb_interleaving,
+                            const DimSize_t output_interleaving,
+                            const void* input_,
+                            void* output_) {
+    const I* src = static_cast<const I*>(input_);
+    O* dst = static_cast<O*>(output_);
+
+    std::size_t group = 0;
+    while (group < nb_interleaving) {
+        const I* groupIn = src + (group * input_interleaving);
+        O* groupOut = dst + (group * output_interleaving);
+        compact_data(groupIn, input_interleaving, groupOut, static_cast<std::uint8_t>(nb_bits));
+        ++group;
+    }
+
+}
+
+
+// Kernels registration to implementation entry point
+// Every entry pairs a low bit-width data type with its WeightInterleavedType_v
+// counterpart, both in NHWC format, and provides no backward kernel (nullptr).
+// NOTE(review): the first IOSpec is presumably the input and the second the output -- confirm.
+// Signed types (Int4, Int3, Int2, Binary) use int8_t storage with 4, 3, 2 and 1 bits.
+REGISTRAR(WeightInterleavedImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Int4, DataFormat::NHWC}, ImplSpec::IOSpec{WeightInterleavedType_v<DataType::Int4>, DataFormat::NHWC}},
+    {ProdConso::defaultModel, Aidge::WeightInterleavedImpl_cpu_forward_kernel<int8_t, int8_t, 4>, nullptr});
+REGISTRAR(WeightInterleavedImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Int3, DataFormat::NHWC}, ImplSpec::IOSpec{WeightInterleavedType_v<DataType::Int3>, DataFormat::NHWC}},
+    {ProdConso::defaultModel, Aidge::WeightInterleavedImpl_cpu_forward_kernel<int8_t, int8_t, 3>, nullptr});
+REGISTRAR(WeightInterleavedImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Int2, DataFormat::NHWC}, ImplSpec::IOSpec{WeightInterleavedType_v<DataType::Int2>, DataFormat::NHWC}},
+    {ProdConso::defaultModel, Aidge::WeightInterleavedImpl_cpu_forward_kernel<int8_t, int8_t, 2>, nullptr});
+REGISTRAR(WeightInterleavedImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Binary, DataFormat::NHWC}, ImplSpec::IOSpec{WeightInterleavedType_v<DataType::Binary>, DataFormat::NHWC}},
+    {ProdConso::defaultModel, Aidge::WeightInterleavedImpl_cpu_forward_kernel<int8_t, int8_t, 1>, nullptr});
+
+// Unsigned types (UInt4, UInt3, UInt2) use uint8_t storage with 4, 3 and 2 bits.
+REGISTRAR(WeightInterleavedImpl_cpu,
+    {ImplSpec::IOSpec{DataType::UInt4, DataFormat::NHWC}, ImplSpec::IOSpec{WeightInterleavedType_v<DataType::UInt4>, DataFormat::NHWC}},
+    {ProdConso::defaultModel, Aidge::WeightInterleavedImpl_cpu_forward_kernel<uint8_t, uint8_t, 4>, nullptr});
+REGISTRAR(WeightInterleavedImpl_cpu,
+    {ImplSpec::IOSpec{DataType::UInt3, DataFormat::NHWC}, ImplSpec::IOSpec{WeightInterleavedType_v<DataType::UInt3>, DataFormat::NHWC}},
+    {ProdConso::defaultModel, Aidge::WeightInterleavedImpl_cpu_forward_kernel<uint8_t, uint8_t, 3>, nullptr});
+REGISTRAR(WeightInterleavedImpl_cpu,
+    {ImplSpec::IOSpec{DataType::UInt2, DataFormat::NHWC}, ImplSpec::IOSpec{WeightInterleavedType_v<DataType::UInt2>, DataFormat::NHWC}},
+    {ProdConso::defaultModel, Aidge::WeightInterleavedImpl_cpu_forward_kernel<uint8_t, uint8_t, 2>, nullptr});
+
+
+// REGISTRAR(WeightInterleavedImpl_cpu,
+//     {ImplSpec::IOSpec{DataType::Int4, DataFormat::NHWC}},
+//     {ProdConso::defaultModel, Aidge::WeightInterleavedImpl_cpu_forward_kernel<int8_t, int8_t, 4>, nullptr});
+// REGISTRAR(WeightInterleavedImpl_cpu,
+//     {ImplSpec::IOSpec{DataType::Int3, DataFormat::NHWC}},
+//     {ProdConso::defaultModel, Aidge::WeightInterleavedImpl_cpu_forward_kernel<int8_t, int8_t, 3>, nullptr});
+// REGISTRAR(WeightInterleavedImpl_cpu,
+//     {ImplSpec::IOSpec{DataType::Int2, DataFormat::NHWC}},
+//     {ProdConso::defaultModel, Aidge::WeightInterleavedImpl_cpu_forward_kernel<int8_t, int8_t, 2>, nullptr});
+
+
+}
+
+#endif /* AIDGE_CPU_OPERATOR_WEIGHTINTERLEAVEDIMPL_KERNELS_H_ */
\ No newline at end of file
No results found