Compare revisions

Commits on Source (254)
Showing 732 additions and 117 deletions
......@@ -4,6 +4,7 @@
# C++ Build
build*/
install*/
include/aidge/backend/cpu_version.h
# VSCode
.vscode
......
......@@ -12,19 +12,14 @@ stages:
- deploy
include:
- project: 'eclipse/aidge/gitlab_shared_files'
ref: 'main'
file:
# choose which jobs to run by including the corresponding files.
file: # choose which jobs to run by including the corresponding files.
- '.gitlab/ci/ubuntu_cpp.gitlab-ci.yml'
- '.gitlab/ci/ubuntu_python.gitlab-ci.yml'
- '.gitlab/ci/release/cibuildwheel_ubuntu.gitlab-ci.yml'
- '.gitlab/ci/windows_cpp.gitlab-ci.yml'
- '.gitlab/ci/windows_python.gitlab-ci.yml'
- '.gitlab/ci/release/cibuildwheel_windows.gitlab-ci.yml'
# Version 0.5.0 (January 31, 2025)
# Version 0.4.0 (December 6, 2024)
# Version 0.2.2 (May 14, 2024)
* Remove implementation for Operators solely handling memory and format
......
cmake_minimum_required(VERSION 3.18)
set(CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
file(STRINGS "${CMAKE_SOURCE_DIR}/version.txt" version)
# Parse version.txt to retrieve Major, Minor and Patch
string(REGEX MATCH "([0-9]+)\\.([0-9]+)\\.([0-9]+)" _ MATCHES ${version})
set(PROJECT_VERSION_MAJOR ${CMAKE_MATCH_1})
set(PROJECT_VERSION_MINOR ${CMAKE_MATCH_2})
set(PROJECT_VERSION_PATCH ${CMAKE_MATCH_3})
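# e.g. a version.txt containing "0.5.0" yields PROJECT_VERSION_MAJOR=0, PROJECT_VERSION_MINOR=5, PROJECT_VERSION_PATCH=0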
project(aidge_backend_cpu
VERSION ${version}
DESCRIPTION "CPU implementations of the operators of aidge framework."
LANGUAGES CXX)
message(STATUS "Project name: ${CMAKE_PROJECT_NAME}")
message(STATUS "Project version: ${version}")
add_definitions(-DPROJECT_VERSION="${version}")
# Retrieve latest git commit
execute_process(
COMMAND git rev-parse --short HEAD
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
......@@ -19,8 +25,13 @@ execute_process(
OUTPUT_STRIP_TRAILING_WHITESPACE
ERROR_QUIET
)
message(STATUS "Project name: ${CMAKE_PROJECT_NAME}")
message(STATUS "Project version: ${version}")
message(STATUS "Latest git commit: ${GIT_COMMIT_HASH}")
add_definitions(-DGIT_COMMIT_HASH="${GIT_COMMIT_HASH}")
# helper for LSP users
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
# Note : project name is ${CMAKE_PROJECT_NAME} and python module name is also ${CMAKE_PROJECT_NAME}
set(module_name _${CMAKE_PROJECT_NAME}) # target name
......@@ -53,6 +64,16 @@ if(NOT $ENV{AIDGE_INSTALL} STREQUAL "")
endif()
find_package(aidge_core REQUIRED)
find_package(OpenMP)
find_package(OpenSSL QUIET)
if(OpenSSL_FOUND)
message(STATUS "OpenSSL found: ${OPENSSL_VERSION}")
add_definitions(-DWITH_OPENSSL)
else()
message(WARNING "OpenSSL not found, SHA256 will not be available.")
endif()
##############################################
# Create target and set properties
file(GLOB_RECURSE src_files "src/*.cpp")
......@@ -61,10 +82,22 @@ file(GLOB_RECURSE inc_files "include/*.hpp")
add_library(${module_name} ${src_files} ${inc_files})
target_link_libraries(${module_name}
PRIVATE
fmt::fmt
PUBLIC
_aidge_core # _ is added because we link the exported target and not the project
)
if(OpenMP_CXX_FOUND)
target_link_libraries(${module_name} PRIVATE OpenMP::OpenMP_CXX)
set(AIDGE_REQUIRES_OPENMP TRUE)
endif()
# Add definition _USE_MATH_DEFINES to enable math constant definitions from math.h/cmath.
if (WIN32)
target_compile_definitions(${module_name} PRIVATE _USE_MATH_DEFINES)
endif()
#Set target properties
set_property(TARGET ${module_name} PROPERTY POSITION_INDEPENDENT_CODE ON)
......@@ -99,6 +132,12 @@ target_include_directories(${module_name}
${CMAKE_CURRENT_SOURCE_DIR}/src
)
set(AIDGE_REQUIRES_OPENSSL FALSE)
if(OpenSSL_FOUND)
target_link_libraries(${module_name} PRIVATE OpenSSL::SSL OpenSSL::Crypto)
set(AIDGE_REQUIRES_OPENSSL TRUE)
endif()
target_compile_features(${module_name} PRIVATE cxx_std_14)
target_compile_options(${module_name} PRIVATE
......@@ -112,6 +151,13 @@ if(CMAKE_COMPILER_IS_GNUCXX AND COVERAGE)
append_coverage_compiler_flags()
endif()
message(STATUS "Creating ${CMAKE_CURRENT_SOURCE_DIR}/include/aidge/backend/cpu_version.h")
# Generate version.h file from config file version.h.in
configure_file(
"${CMAKE_CURRENT_SOURCE_DIR}/include/aidge/backend/version.h.in"
"${CMAKE_CURRENT_SOURCE_DIR}/include/aidge/backend/cpu_version.h"
)
##############################################
# Installation instructions
include(GNUInstallDirs)
......
......@@ -2,6 +2,14 @@
include(CMakeFindDependencyMacro)
find_dependency(aidge_core)
set(AIDGE_REQUIRES_OPENMP @AIDGE_REQUIRES_OPENMP@)
if (AIDGE_REQUIRES_OPENMP)
find_dependency(OpenMP)
endif()
set(AIDGE_REQUIRES_OPENSSL @AIDGE_REQUIRES_OPENSSL@)
if (AIDGE_REQUIRES_OPENSSL)
find_dependency(OpenSSL)
endif()
include(CMakeFindDependencyMacro)
......
import aidge_core
from aidge_backend_cpu.aidge_backend_cpu import * # import so generated by PyBind
from ._version import *
from . import benchmark
import time
import numpy as np
import aidge_core
def prepare_model_scheduler_inputs(model: aidge_core.GraphView, input_data: list[str, np.ndarray]) -> tuple[aidge_core.GraphView, aidge_core.SequentialScheduler]:
# update model and inputs backend
model.set_backend("cpu")
ordered_inputs = [aidge_core.Tensor(i[1]) for i in input_data]
for ordered_input in ordered_inputs:
ordered_input.set_backend("cpu")
scheduler = aidge_core.SequentialScheduler(model)
scheduler.generate_scheduling()
return model, scheduler, ordered_inputs
def measure_inference_time(model: aidge_core.GraphView, input_data: list[str, np.ndarray], nb_warmup: int = 10, nb_iterations: int = 50) -> list[float]:
model, scheduler, ordered_inputs = prepare_model_scheduler_inputs(model, input_data)
timings = []
# Warm-up runs.
for i in range(nb_warmup + nb_iterations):
if i < nb_warmup:
scheduler.forward(forward_dims=False, data=ordered_inputs)
else:
start = time.process_time()
scheduler.forward(forward_dims=False, data=ordered_inputs)
end = time.process_time()
timings.append((end - start))
return timings
def compute_output(model: aidge_core.GraphView, input_data: list[str, np.ndarray]) -> list[np.ndarray]:
model, scheduler, ordered_inputs = prepare_model_scheduler_inputs(model, input_data)
scheduler.forward(forward_dims=False, data=ordered_inputs)
return [np.array(t[0].get_operator().get_output(t[1])) for t in model.get_ordered_outputs()]
\ No newline at end of file
......@@ -13,16 +13,17 @@ class test_scheduler(unittest.TestCase):
pass
def test_relu_forward(self):
values = np.arange(6) - 3
input_node = aidge_core.Producer(aidge_core.Tensor(values), "Input")
t = aidge_core.Tensor(np.arange(6, dtype=np.int32) - 3)
input_node = aidge_core.Producer(t)
relu = aidge_core.ReLU()
input_node.add_child(relu)
gv = aidge_core.GraphView()
gv.add(relu)
gv.add(input_node)
input_node.add_child(relu)
gv.set_datatype(aidge_core.dtype.int32)
gv.set_backend("cpu")
......@@ -34,7 +35,7 @@ class test_scheduler(unittest.TestCase):
out_tensor = relu.get_operator().get_output(0)
expected_out = [0,0,0,0,1,2]
for i in range(len(expected_out)):
self.assertEqual(expected_out[i], out_tensor[i])
self.assertEqual(expected_out[i], out_tensor[i], f"On idx {i}")
def test_sequential_scheduling(self):
input_data = np.array([0]).astype(np.float32)
......@@ -56,9 +57,9 @@ class test_scheduler(unittest.TestCase):
scheduler = aidge_core.SequentialScheduler(graph_view)
scheduler.generate_scheduling()
self.assertEqual(len(scheduler.get_static_scheduling()), 10)
self.assertEqual(len(scheduler.get_sequential_static_scheduling()), 10)
# Do not care about the order of execution of the producers
self.assertListEqual([i.name() for i in scheduler.get_static_scheduling()[-3:]], EXPECTED_SCHEDULE)
self.assertListEqual([i.name() for i in scheduler.get_sequential_static_scheduling()[-3:]], EXPECTED_SCHEDULE)
def test_parallel_scheduling(self):
......@@ -69,7 +70,7 @@ class test_scheduler(unittest.TestCase):
aidge_core.Producer(input_tensor, "X"),
aidge_core.FC(1, 50, name='0'),
aidge_core.parallel([aidge_core.FC(50, 50, name='1'), aidge_core.FC(50, 50, name='3')]),
aidge_core.Add(2, name='2'),
aidge_core.Add(name='2'),
])
EXPECTED_SCHEDULE = [['0', '1', '3', '2'], ['0', '3', '1', '2']] # Both schedules are valid!
......@@ -82,9 +83,9 @@ class test_scheduler(unittest.TestCase):
scheduler = aidge_core.SequentialScheduler(graph_view)
scheduler.generate_scheduling()
self.assertEqual(len(scheduler.get_static_scheduling()), 11)
self.assertEqual(len(scheduler.get_sequential_static_scheduling()), 11)
# Do not care about the order of execution of the producers
self.assertTrue([i.name() for i in scheduler.get_static_scheduling()[-4:]] in EXPECTED_SCHEDULE)
self.assertTrue([i.name() for i in scheduler.get_sequential_static_scheduling()[-4:]] in EXPECTED_SCHEDULE)
if __name__ == '__main__':
unittest.main()
function(generate_python_binding pybind_module_name target_to_bind)
find_package(Python COMPONENTS Interpreter Development.Module)
Include(FetchContent)
set(PYBIND_VERSION v2.10.4)
set(PYBIND_VERSION v2.13.6)
message(STATUS "Retrieving pybind ${PYBIND_VERSION} from git")
FetchContent_Declare(
......
......@@ -12,40 +12,58 @@
#ifndef AIDGE_CPU_IMPORTS_H_
#define AIDGE_CPU_IMPORTS_H_
#include "aidge/backend/cpu_version.h"
#include "aidge/backend/cpu/operator/AbsImpl.hpp"
#include "aidge/backend/cpu/operator/AddImpl.hpp"
#include "aidge/backend/cpu/operator/AndImpl.hpp"
#include "aidge/backend/cpu/operator/AtanImpl.hpp"
#include "aidge/backend/cpu/operator/ArgMaxImpl.hpp"
#include "aidge/backend/cpu/operator/AvgPoolingImpl.hpp"
#include "aidge/backend/cpu/operator/MaxPoolingImpl.hpp"
#include "aidge/backend/cpu/operator/BatchNormImpl.hpp"
#include "aidge/backend/cpu/operator/BitShiftImpl.hpp"
#include "aidge/backend/cpu/operator/ClipImpl.hpp"
#include "aidge/backend/cpu/operator/ConvDepthWiseImpl.hpp"
#include "aidge/backend/cpu/operator/ConvImpl.hpp"
#include "aidge/backend/cpu/operator/ConvTransposeImpl.hpp"
#include "aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp"
#include "aidge/backend/cpu/operator/CryptoHashImpl.hpp"
#include "aidge/backend/cpu/operator/DivImpl.hpp"
#include "aidge/backend/cpu/operator/DropoutImpl.hpp"
#include "aidge/backend/cpu/operator/EqualImpl.hpp"
#include "aidge/backend/cpu/operator/ErfImpl.hpp"
#include "aidge/backend/cpu/operator/ExpandImpl.hpp"
#include "aidge/backend/cpu/operator/FCImpl.hpp"
#include "aidge/backend/cpu/operator/FoldImpl.hpp"
#include "aidge/backend/cpu/operator/GlobalAveragePoolingImpl.hpp"
#include "aidge/backend/cpu/operator/HardmaxImpl.hpp"
#include "aidge/backend/cpu/operator/HeavisideImpl.hpp"
#include "aidge/backend/cpu/operator/LRNImpl.hpp"
#include "aidge/backend/cpu/operator/LeakyReLUImpl.hpp"
#include "aidge/backend/cpu/operator/LnImpl.hpp"
#include "aidge/backend/cpu/operator/MatMulImpl.hpp"
#include "aidge/backend/cpu/operator/MaxPoolingImpl.hpp"
#include "aidge/backend/cpu/operator/ModImpl.hpp"
#include "aidge/backend/cpu/operator/MulImpl.hpp"
#include "aidge/backend/cpu/operator/PadImpl.hpp"
#include "aidge/backend/cpu/operator/PaddedConvImpl.hpp"
#include "aidge/backend/cpu/operator/PowImpl.hpp"
#include "aidge/backend/cpu/operator/ReduceMeanImpl.hpp"
#include "aidge/backend/cpu/operator/ReduceSumImpl.hpp"
#include "aidge/backend/cpu/operator/ResizeImpl.hpp"
#include "aidge/backend/cpu/operator/ReLUImpl.hpp"
#include "aidge/backend/cpu/operator/ScalingImpl.hpp"
#include "aidge/backend/cpu/operator/RoundImpl.hpp"
#include "aidge/backend/cpu/operator/SigmoidImpl.hpp"
#include "aidge/backend/cpu/operator/SqrtImpl.hpp"
#include "aidge/backend/cpu/operator/SliceImpl.hpp"
#include "aidge/backend/cpu/operator/SoftmaxImpl.hpp"
#include "aidge/backend/cpu/operator/SubImpl.hpp"
#include "aidge/backend/cpu/operator/TopKImpl.hpp"
#include "aidge/backend/cpu/operator/TanhImpl.hpp"
#include "aidge/backend/cpu/operator/WeightInterleavedImpl.hpp"
#include "aidge/backend/cpu/data/TensorImpl.hpp"
#endif /* AIDGE_CPU_IMPORTS_H_ */
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_DATA_INTERPOLATION_H_
#define AIDGE_CPU_DATA_INTERPOLATION_H_
#include <vector>
#include <aidge/data/Interpolation.hpp>
#include <aidge/utils/Types.h>
namespace Aidge {
class InterpolationCPU : public Interpolation {
public:
/**
* @brief Interpolates values given via input in the given mode.
*
* Values are contiguously arranged in a "square" shape around the point to
* interpolate; the extent of this neighbourhood depends on the interpolation
* mode. The point that will be interpolated is located right in the
* middle of all points.
* Immediate neighbours :
* 1D interp : 2D interp :
* . . . . . .
* . . 1 2 . . . . . . . .
* . . 1 2 . .
* . . 3 4 . .
* . . . . . .
* . . . . . .
*
* 2 neighbours :
* 1D interp : 2D interp :
* . . . . . . . .
* . . . . . . . .
* . . 1 2 3 4 . . . . 1 2 3 4 . .
* . . 5 6 7 8 . .
* . . 9 10 11 12 . .
* . . 13 14 15 16 . .
* . . . . . . . .
* . . . . . . . .
*
* @param[in] originalCoords: coord of the point to interpolate in the
* original picture. These coords are generated with
* Interpolation::untransformCoords(coordsInInterpolatedTensor)
* @param[in] points : points to interpolate, arranged in a set of
* pairs ((point_coord), value) :
* [[[X1, X2, ..., XN], Xval], ...., [[A1, A2, ..., AN],Aval]].
* With :
* - N: the number of dimensions.
* - A: the number of points of the grid to interpolate.
* - All coordinates expressed in originalTensor frame.
* @param[in] interpMode: interpolation mode
* @return interpolated value
*/
template <typename T>
static T interpolate(const std::vector<float> &coordsToInterpolate,
const std::set<Point<T>> &points,
const Mode interpMode = Interpolation::Mode::Linear);
/**
* @brief performs linear interpolation on given points.
* @param[in] points: values to interpolate; since we only do an average of
* all values, their indices aren't used.
* @return interpolated value
*/
template <typename T>
static T linear(const std::vector<float> &originalCoords,
const std::set<Point<T>> &points);
/**
* @brief performs nearest interpolation on given points.
* @note it is a wrapper for linearRecurse() private method
* @param[in] coordsToInterpolate: coordinates to interpolate
* @param[in] points: points to interpolate
* @param[in] interpMode: interpolation method; must be one of the Nearest...
* modes, otherwise the function will throw an error.
* @return interpolated value
*/
template <typename T>
static T nearest(const std::vector<float> &coordsToInterpolate,
const std::set<Point<T>> &points,
const Interpolation::Mode nearestMode);
private:
/**
* @brief actual linear interpolation function.
* It will:
* - Split all points along each dimension depending on whether their coord at
* idx alongDim is above or below coordsToInterpolate, until they are
* matched 1-to-1.
* - Perform interpolation on the 2 leftover points and return the interpolated
* point to the parent call as a set of size 1.
* - Repeat until all dimensions have been interpolated.
* @param[in] coordsToInterpolate: coordinates to interpolate
* @param[in] points: points to interpolate
* @param[in] alongDim: dimension along which the points are currently being
* split.
* @return set containing the interpolated point(s)
*/
template <typename T>
static std::set<Interpolation::Point<T>>
linearRecurse(const std::vector<float> &coordsToInterpolate,
const std::set<Point<T>> &points,
const DimIdx_t alongDim = 0);
};
} // namespace Aidge
#endif // AIDGE_CPU_DATA_INTERPOLATION_H_
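A minimal usage sketch of the interpolate() entry point declared above; it assumes Point<T> is a ((coords), value) pair ordered in a std::set, as the doc comment describes:
// Hypothetical example: bilinear interpolation at the centre of a 2x2 grid.
std::set<Aidge::Interpolation::Point<float>> points = {
    {{0, 0}, 1.0f}, {{0, 1}, 2.0f}, {{1, 0}, 3.0f}, {{1, 1}, 4.0f}};
const float value = Aidge::InterpolationCPU::interpolate<float>(
    {0.5f, 0.5f},                        // coords in the original tensor frame
    points,
    Aidge::Interpolation::Mode::Linear); // expected result: 2.5f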
......@@ -20,14 +20,14 @@
namespace Aidge {
template <class I, class O>
void AbsImpl_cpu_forward_kernel(std::size_t inputLenght,
void AbsImpl_cpu_forward_kernel(std::size_t inputLength,
const void* input_,
void* output_) {
const I* input = static_cast<const I*>(input_);
O* output = static_cast<O*>(output_);
for (std::size_t i = 0; i < inputLenght; ++i) {
for (std::size_t i = 0; i < inputLength; ++i) {
output[i] = std::abs(input[i]);
}
}
......
......@@ -25,7 +25,17 @@
namespace Aidge {
// Operator implementation entry point for the backend
using AddImpl_cpu = OperatorImpl_cpu<Add_Op,
void(const std::vector<const void*>, const std::vector<std::vector<std::size_t>>&, const std::size_t, const std::vector<std::size_t>&, void*)>;
void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*, void*),
void(const std::size_t,
const std::size_t,
const std::size_t,
const std::vector<std::size_t>&,
const std::vector<std::size_t>&,
const std::vector<std::size_t>&,
const void*,
void*,
void*)
>;
// Implementation entry point registration to Operator
REGISTRAR(Add_Op, "cpu", Aidge::AddImpl_cpu::create);
......
......@@ -14,46 +14,204 @@
#include "aidge/utils/Registrar.hpp"
#include <cstdint> // std::int32_t, std::int64_t
#include <cstddef> // std::size_t
#include "aidge/backend/cpu/data/Broadcasting.hpp"
#include "aidge/backend/cpu/operator/AddImpl.hpp"
namespace Aidge {
namespace {
// suppose values are contiguous in memory
template <class I, class O>
void AddImpl_cpu_forward_kernel(const std::vector<const void*> inputs_, const std::vector<std::vector<std::size_t>>& inputDims, const std::size_t outputLength, const std::vector<std::size_t>& outDims, void* output_) {
// FIXME: missing Add attributes as arguments
std::vector<const I*> inputs;
for (const auto& input_ : inputs_) {
inputs.push_back(static_cast<const I*>(input_));
void add_contiguous_arrays(const std::size_t input1size,
const std::size_t input2size,
const std::size_t output1size,
const I* input1,
const I* input2,
O* output)
{
for (std::size_t i = 0; i < output1size; ++i)
{
const std::size_t in1_id = (input1size != 1) ? i : 0;
const std::size_t in2_id = (input2size != 1) ? i : 0;
output[i] = static_cast<O>(input1[in1_id] + input2[in2_id]);
}
}
}
template <class I, class O>
void AddImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
std::vector<std::size_t> dims1,
const std::vector<std::size_t>& outputDims,
const void* input0_,
const void* input1_,
void* output_) {
const I* input_0 = static_cast<const I*>(input0_);
const I* input_1 = static_cast<const I*>(input1_);
O* output = static_cast<O*>(output_);
for (std::size_t oIndex = 0; oIndex < outputLength; ++oIndex)
{
output[oIndex] = 0;
std::vector<size_t> indexes = getMultiDimIndices(outDims, oIndex);
for(std::size_t iIndex = 0; iIndex < inputs.size(); ++iIndex) {
std::size_t idx = getFlattenedIndex(inputDims[iIndex], indexes);
output[oIndex] += inputs[iIndex][idx];
}
}
// [5,2,1,7] & [2,6,7]
// 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7]
// 2. Find the highest equal dimension -> 3
// Exception: if the first diverging dimension is the last one, then -> 4 (dims.size())
// 3. Compute the highest number of contiguous data -> 7
// 4. Compute stride and offset step for the broadcast mechanism
// 5. Call a simple kernel
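// Worked example for [5,2,1,7] & [2,6,7]:
//   padded dims0 = [5,2,1,7], dims1 = [1,2,6,7], outputDims = [5,2,6,7]
//   contiguousIdx = 3, contiguous chunk sizes: 7 (input0), 7 (input1), 7 (output)
//   nbStacks = 5*2*6 = 60 calls to add_contiguous_arrays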
// special case for equal dimensions, the kernel is called with the entire arrays at once
if (dims0 == dims1) {
const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
for (std::size_t i = 0; i < input0_contiguous_size; ++i)
{
output[i] = static_cast<O>(input_0[i] + input_1[i]);
}
return;
}
// set dimensions to be of equal size by filling the smallest one with ones.
if (dims0.size() > dims1.size()) {
dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1));
}
else if (dims1.size() > dims0.size()) {
dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1));
}
const std::size_t nbDims = dims0.size();
// Find the highest equal dimension
// std::size_t contiguousIdx = nbDims - 1;
std::size_t contiguousIdx = nbDims;
while (contiguousIdx-- > 0) {
// for (; contiguousIdx+1 > 0; --contiguousIdx) {
if (dims0[contiguousIdx] != dims1[contiguousIdx]) {
if (contiguousIdx == (nbDims -1)) { // last dimensions of one of the input Tensor are of size 1
const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1;
while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) {
--contiguousIdx;
}
}
break;
}
}
++contiguousIdx;
// Compute the highest number of contiguous data for each Tensor
const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
// initialize strides to iterate through data because of broadcasting
std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx);
if (contiguousIdx > 0) {
stride_post0[contiguousIdx - 1] = 1;
stride_post1[contiguousIdx - 1] = 1;
for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) {
stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]);
stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]);
}
for (std::size_t i = 0; i != contiguousIdx; ++i) {
stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1;
stride_step1[i] = (dims1[i] == 1) ? 1 - stride_post1[i] : 1;
}
}
// variables for arrays offsets
std::size_t offsetIn0 = 0;
std::size_t offsetIn1 = 0;
std::size_t offsetOut = 0;
std::size_t dim = contiguousIdx - 1;
const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>());
for (std::size_t stack = 0; stack < nbStacks;) {
add_contiguous_arrays<I,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
input_0 + offsetIn0*input0_contiguous_size,
input_1 + offsetIn1*input1_contiguous_size,
output + offsetOut*output_contiguous_size);
if (++stack < nbStacks) {
std::size_t tmp_stack = stack;
while(tmp_stack % outputDims[dim] == 0) {
tmp_stack /= outputDims[dim];
dim--;
}
offsetIn0 += stride_step0[dim];
offsetIn1 += stride_step1[dim];
++offsetOut;
dim = contiguousIdx - 1;
}
}
}
template <class I, class O>
void AddImpl_cpu_backward_kernel(const std::size_t input0Length,
const std::size_t input1Length,
const std::size_t gradOutputLength,
const std::vector<std::size_t>& dims0,
const std::vector<std::size_t>& dims1,
const std::vector<std::size_t>& outputDims,
const void* grad_output_,
void* gradientInput0_,
void* gradientInput1_)
{
// TODO: Remove input0/1 from the function
const O* gradOutput = static_cast<const O*>(grad_output_);
auto* gradInput0 = static_cast<I*>(gradientInput0_);
auto* gradInput1 = static_cast<I*>(gradientInput1_);
std::fill_n(gradInput0, input0Length, static_cast<I>(0));
std::fill_n(gradInput1, input1Length, static_cast<I>(0));
auto broadcastedDims0 = getBroadcastedDims(outputDims, dims0);
auto broadcastedDims1 = getBroadcastedDims(outputDims, dims1);
for (std::size_t i = 0; i < gradOutputLength; ++i) {
auto idxOutputGrad = getMultiDimIndices(outputDims, i);
std::vector<std::size_t> idxInput0(broadcastedDims0.size());
std::vector<std::size_t> idxInput1(broadcastedDims1.size());
for (std::size_t dimension = 0; dimension < broadcastedDims0.size(); ++dimension) {
idxInput0[dimension] = (broadcastedDims0[dimension] == 1) ? 0 : idxOutputGrad[dimension];
}
for (std::size_t dimension = 0; dimension < broadcastedDims1.size(); ++dimension) {
idxInput1[dimension] = (broadcastedDims1[dimension] == 1) ? 0 : idxOutputGrad[dimension];
}
auto idx0 = getFlattenedIndex(broadcastedDims0, idxInput0);
auto idx1 = getFlattenedIndex(broadcastedDims1, idxInput1);
// For addition: gradient of both inputs is just the output gradient
// (unlike multiplication where we need to multiply by the other input,
// or subtraction where we need to negate one of them)
gradInput0[idx0] += static_cast<I>(gradOutput[i]);
gradInput1[idx1] += static_cast<I>(gradOutput[i]);
}
}
// Kernels registration to implementation entry point
REGISTRAR(AddImpl_cpu,
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float32}},
{ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<float, float>, nullptr});
{ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<float, float>, Aidge::AddImpl_cpu_backward_kernel<float, float>});
REGISTRAR(AddImpl_cpu,
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float64}},
{ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<double, double>, nullptr});
{ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<double, double>, Aidge::AddImpl_cpu_backward_kernel<double, double>});
REGISTRAR(AddImpl_cpu,
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int8}},
{ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::int8_t, std::int8_t>, Aidge::AddImpl_cpu_backward_kernel<std::int8_t, std::int8_t>});
REGISTRAR(AddImpl_cpu,
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::UInt8}},
{ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::uint8_t, std::uint8_t>, Aidge::AddImpl_cpu_backward_kernel<std::uint8_t, std::uint8_t>});
REGISTRAR(AddImpl_cpu,
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int32}},
{ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr});
{ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, Aidge::AddImpl_cpu_backward_kernel<std::int32_t, std::int32_t>});
REGISTRAR(AddImpl_cpu,
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int64}},
{ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, nullptr});
{ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, Aidge::AddImpl_cpu_backward_kernel<std::int64_t, std::int64_t>});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_ADDIMPL_CPU_KERNELS_H_ */
\ No newline at end of file
#endif /* AIDGE_CPU_OPERATOR_ADDIMPL_CPU_KERNELS_H_ */
......@@ -23,7 +23,7 @@
namespace Aidge {
// Operator implementation entry point for the backend
using AndImpl_cpu = OperatorImpl_cpu<And_Op,
void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*,void*)>;
void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*, void*)>;
// Implementation entry point registration to Operator
REGISTRAR(And_Op, "cpu", Aidge::AndImpl_cpu::create);
......
......@@ -12,52 +12,151 @@
#ifndef AIDGE_CPU_OPERATOR_ANDIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_ANDIMPL_KERNELS_H_
#include "aidge/backend/cpu/data/Broadcasting.hpp"
#include "aidge/backend/cpu/operator/AndImpl.hpp"
#include "aidge/utils/Registrar.hpp"
namespace Aidge {
template <class I1, class I2, class O>
void AndImpl_cpu_forward_kernel(const std::vector<std::size_t>& input1Dims,
const std::vector<std::size_t>& input2Dims,
namespace {
// suppose values are contiguous in memory
template <class I, class O>
void and_contiguous_arrays(const std::size_t input1size,
const std::size_t input2size,
const std::size_t output1size,
const I* input1,
const I* input2,
O* output)
{
for (std::size_t i = 0; i < output1size; ++i)
{
const std::size_t in1_id = (input1size != 1) ? i : 0;
const std::size_t in2_id = (input2size != 1) ? i : 0;
output[i] = static_cast<O>(input1[in1_id] && input2[in2_id]);
}
}
}
template <class I, class O>
void AndImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
std::vector<std::size_t> dims1,
const std::vector<std::size_t>& outputDims,
const void* input0_,
const void* input1_,
const void* input2_,
void* output_) {
const I1* input_1 = static_cast<const I1*>(input1_);
const I2* input_2 = static_cast<const I2*>(input2_);
const I* input_0 = static_cast<const I*>(input0_);
const I* input_1 = static_cast<const I*>(input1_);
O* output = static_cast<O*>(output_);
size_t totalElements = 1;
for (size_t dimSize : outputDims) {
totalElements *= dimSize;
// [5,2,1,7] & [2,6,7]
// 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7]
// 2. Find the highest equal dimension -> 3
// Exception: if the first diverging dimension is the last one, then -> 4 (dims.size())
// 3. Compute the highest number of contiguous data -> 7
// 4. Compute stride and offset step for the broadcast mechanism
// 5. Call a simple kernel
// special case for equal dimensions, the kernel is called with the entire arrays at once
if (dims0 == dims1) {
const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
for (std::size_t i = 0; i < input0_contiguous_size; ++i) {
output[i] = static_cast<O>(input_0[i] && input_1[i]);
}
return;
}
for (std::size_t oIndex = 0; oIndex < totalElements; ++oIndex)
{
std::vector<size_t> indexes = getMultiDimIndices(outputDims, oIndex);
// set dimensions to be of equal size by filling the smallest one with ones.
if (dims0.size() > dims1.size()) {
dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1));
}
else if (dims1.size() > dims0.size()) {
dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1));
}
std::size_t idx1 = getFlattenedIndex(input1Dims, indexes);
std::size_t idx2 = getFlattenedIndex(input2Dims, indexes);
const std::size_t nbDims = dims0.size();
output[oIndex] = static_cast<O>(input_1[idx1] == input_2[idx2]);
// Find the highest equal dimension
// std::size_t contiguousIdx = nbDims - 1;
std::size_t contiguousIdx = nbDims;
while (contiguousIdx-- > 0) {
// for (; contiguousIdx+1 > 0; --contiguousIdx) {
if (dims0[contiguousIdx] != dims1[contiguousIdx]) {
if (contiguousIdx == (nbDims -1)) { // last dimensions of one of the input Tensor are of size 1
const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1;
while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) {
--contiguousIdx;
}
}
break;
}
}
++contiguousIdx;
// Compute the highest number of contiguous data for each Tensor
const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
// initialize strides to iterate through data because of broadcasting
std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx);
if (contiguousIdx > 0) {
stride_post0[contiguousIdx - 1] = 1;
stride_post1[contiguousIdx - 1] = 1;
for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) {
stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]);
stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]);
}
for (std::size_t i = 0; i != contiguousIdx; ++i) {
stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1;
stride_step1[i] = (dims1[i] == 1) ? 1 - stride_post1[i] : 1;
}
}
// variables for arrays offsets
std::size_t offsetIn0 = 0;
std::size_t offsetIn1 = 0;
std::size_t offsetOut = 0;
std::size_t dim = contiguousIdx - 1;
const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>());
for (std::size_t stack = 0; stack < nbStacks;) {
and_contiguous_arrays<I,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
input_0 + offsetIn0*input0_contiguous_size,
input_1 + offsetIn1*input1_contiguous_size,
output + offsetOut*output_contiguous_size);
if (++stack < nbStacks) {
std::size_t tmp_stack = stack;
while(tmp_stack % outputDims[dim] == 0) {
tmp_stack /= outputDims[dim];
dim--;
}
offsetIn0 += stride_step0[dim];
offsetIn1 += stride_step1[dim];
++offsetOut;
dim = contiguousIdx - 1;
}
}
}
// Kernels registration to implementation entry point
REGISTRAR(AndImpl_cpu,
{DataType::Float32},
{ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<float, float, float>, nullptr});
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float32}},
{ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<float, float>, nullptr});
REGISTRAR(AndImpl_cpu,
{DataType::Float64},
{ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<double, double, double>, nullptr});
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float64}},
{ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<double, double>, nullptr});
REGISTRAR(AndImpl_cpu,
{DataType::Int32},
{ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t>, nullptr});
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int32}},
{ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr});
REGISTRAR(AndImpl_cpu,
{DataType::Int64},
{ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<std::int64_t, std::int64_t, std::int64_t>, nullptr});
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int64}},
{ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_ANDIMPL_KERNELS_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_ATAN_H_
#define AIDGE_CPU_OPERATOR_ATAN_H_
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/Atan.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include <memory>
#include <vector>
namespace Aidge {
// Operator implementation entry point for the backend
using AtanImpl_cpu = OperatorImpl_cpu<Atan_Op,
void(const std::size_t, const void*, void*),
void(const std::size_t, const void*, const void*, void*)>;
// Implementation entry point registration to Operator
REGISTRAR(Atan_Op, "cpu", Aidge::AtanImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_ATAN_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_ATANIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_ATANIMPL_KERNELS_H_
#include "aidge/utils/Registrar.hpp"
#include "aidge/backend/cpu/operator/AtanImpl.hpp"
#include <cmath> // For atan()
namespace Aidge {
template <class I, class O>
void AtanImpl_cpu_forward_kernel(std::size_t inputLength,
const void* input_,
void* output_) {
const I* input = static_cast<const I*>(input_);
O* output = static_cast<O*>(output_);
for (size_t i = 0; i < inputLength; ++i) {
output[i] = static_cast<O>(atan(input[i]));
}
}
template <class O, class GI, class GO>
void AtanImpl_cpu_backward_kernel(const std::size_t inputLength,
const void* output_, const void* grad_output_,
void* grad_input_) {
const O* output = static_cast<const O*>(output_);
const GO* grad_output = static_cast<const GO*>(grad_output_);
GI* grad_input = static_cast<GI*>(grad_input_);
// Apply the derivative of atan for each element in the input array
for (size_t i = 0; i < inputLength; ++i) {
// dx = dy * (1 / (1 + x^2))
grad_input[i] += grad_output[i] * static_cast<O>(1.0 / (1.0 + output[i] * output[i]));
}
}
// Kernels registration to implementation entry point
REGISTRAR(AtanImpl_cpu,
{DataType::Float32},
{ProdConso::inPlaceModel, Aidge::AtanImpl_cpu_forward_kernel<float, float>, Aidge::AtanImpl_cpu_backward_kernel<float, float, float>});
REGISTRAR(AtanImpl_cpu,
{DataType::Float64},
{ProdConso::inPlaceModel, Aidge::AtanImpl_cpu_forward_kernel<double, double>, Aidge::AtanImpl_cpu_backward_kernel<double, double, double>});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_ATANIMPL_KERNELS_H_ */
......@@ -28,8 +28,11 @@ namespace Aidge {
using AvgPooling2D_Op = AvgPooling_Op<2>;
using AvgPoolingImpl2D_cpu = OperatorImpl_cpu<AvgPooling_Op<2>,
void(const std::array<DimSize_t, 2>&,
const std::array<DimSize_t, 2>&,
const std::array<DimSize_t, 2>&,
const std::array<DimSize_t, 4>&,
bool,
RoundingMode,
const void *,
void *)>;
......
......@@ -23,6 +23,22 @@
#include "aidge/utils/Types.h"
namespace Aidge {
template <typename T>
using Acc_T = typename std::conditional<std::is_floating_point<T>::value, T, double>::type;
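// e.g. Acc_T<float> == float, Acc_T<std::int32_t> == double: integer pooling
// sums are accumulated in double before being rounded back to the output type.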
template <typename T>
typename std::enable_if<std::is_floating_point<T>::value, T>::type
castFromFloat(T value, RoundingMode /*roundingMode*/) {
return value;
}
template <typename T>
typename std::enable_if<!std::is_floating_point<T>::value, T>::type
castFromFloat(double value, RoundingMode roundingMode) {
return static_cast<T>(round(value, roundingMode));
}
/**
* @brief Forward kernel for 2D AvgPooling on CPU backend.
* @tparam I Input data type.
......@@ -35,66 +51,72 @@ namespace Aidge {
template <class I, class O>
void AvgPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
const std::array<DimSize_t, 2>& kernelDims,
const std::array<DimSize_t, 2>& dilations,
const std::array<DimSize_t, 4> &dims,
bool ceilMode,
RoundingMode roundingMode,
const void *input_,
void *output_) {
// FIXME: missing convolution attributes as arguments
const I *input = static_cast<const I *>(input_);
O *output = static_cast<O *>(output_);
// output H size
const std::size_t oxSize =
static_cast<std::size_t>(std::floor(static_cast<float>(dims[2] - kernelDims[0] + strideDims[0]) /
static_cast<float>(strideDims[0])));
const std::size_t oxSize =
ceilMode
? static_cast<std::size_t>(std::ceil(static_cast<float>(dims[2] - (kernelDims[0] - 1) * dilations[0] - 1 + strideDims[0]) /
static_cast<float>(strideDims[0])))
: static_cast<std::size_t>(std::floor(static_cast<float>(dims[2] - (kernelDims[0] - 1) * dilations[0] - 1 + strideDims[0]) /
static_cast<float>(strideDims[0])));
// output W size
const std::size_t oySize =
static_cast<std::size_t>(std::floor(static_cast<float>(dims[3] - kernelDims[1] + strideDims[1]) /
static_cast<float>(strideDims[1])));
// TODO: kernel computation
// output (batch, outCh, Xout, Yout)
// input (batch, ch, Xin, Yin)
// weight (outCh, ch, kernelX, kernelY)
// does not take Dilation attribute into account
const std::size_t oySize =
ceilMode
? static_cast<std::size_t>(std::ceil(static_cast<float>(dims[3] - (kernelDims[1] - 1) * dilations[1] - 1 + strideDims[1]) /
static_cast<float>(strideDims[1])))
: static_cast<std::size_t>(std::floor(static_cast<float>(dims[3] - (kernelDims[1] - 1) * dilations[1] - 1 + strideDims[1]) /
static_cast<float>(strideDims[1])));
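// Example with hypothetical values: dims[3] = 6, kernelDims[1] = 3, strideDims[1] = 2, dilations[1] = 1
// -> (6 - 2 - 1 + 2) / 2 = 2.5, i.e. oySize = 2 in floor mode and 3 in ceil mode.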
using signedsize = std::make_signed<std::size_t>::type;
for (std::size_t batch = 0; batch < dims[0]; ++batch) {
for (std::size_t ch = 0; ch < dims[1]; ++ch) {
const std::size_t oIndex = (ch + batch*dims[1]) * oxSize * oySize;
const std::size_t iIndex = (ch + batch*dims[1]) * dims[2] * dims[3];
std::fill(output + oIndex, output+(oIndex+oxSize*oySize), 0);
#ifdef _OPENMP
#pragma omp parallel for collapse(2) if (dims[0] * dims[1] >= 16)
#endif
for (int batch = 0; batch < static_cast<int>(dims[0]); ++batch) {
for (int ch = 0; ch < static_cast<int>(dims[1]); ++ch) {
const std::size_t oIndex = (ch + batch * dims[1]) * oxSize * oySize;
const std::size_t iIndex = (ch + batch * dims[1]) * dims[2] * dims[3];
for (std::size_t ox = 0; ox < oxSize; ++ox) {
const signedsize difx = static_cast<signedsize>(- ox * strideDims[0]);
const signedsize difx = static_cast<signedsize>(-ox * strideDims[0]);
const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0)));
const std::size_t sxMax = (static_cast<signedsize>(dims[2]) + difx) < 0 ? 0 : ((dims[2] + difx) > kernelDims[0] ? kernelDims[0] : dims[2] + difx);
for (std::size_t oy = 0; oy < oySize; ++oy) {
const signedsize dify = static_cast<signedsize>(- oy * strideDims[1]);
const signedsize dify = static_cast<signedsize>(-oy * strideDims[1]);
const std::size_t syMin = static_cast<std::size_t>(std::max(dify, signedsize(0)));
const std::size_t syMax = (static_cast<signedsize>(dims[3]) + dify) < 0 ? 0 : ((dims[3] + dify) > kernelDims[1] ? kernelDims[1] : dims[3] + dify);
const std::size_t oIndexFull = oIndex + ox*oySize + oy;
const std::size_t oIndexFull = oIndex + ox * oySize + oy;
const std::size_t ix = ox * strideDims[0];
const std::size_t iy = oy * strideDims[1];
if (sxMin == 0 && syMin == 0 && sxMax == 3 && syMax == 3) {
output[oIndexFull] += static_cast<O>(
input[iIndex + (ix+0)*dims[3] + (iy+0)] +
input[iIndex + (ix+0)*dims[3] + (iy+1)] +
input[iIndex + (ix+0)*dims[3] + (iy+2)] +
input[iIndex + (ix+1)*dims[3] + (iy+0)] +
input[iIndex + (ix+1)*dims[3] + (iy+1)] +
input[iIndex + (ix+1)*dims[3] + (iy+2)] +
input[iIndex + (ix+2)*dims[3] + (iy+0)] +
input[iIndex + (ix+2)*dims[3] + (iy+1)] +
input[iIndex + (ix+2)*dims[3] + (iy+2)]) / O(9);
} else {
for (std::size_t sx = sxMin; sx < sxMax; ++sx) {
for (std::size_t sy = syMin; sy < syMax; ++sy) {
output[oIndexFull] += input[iIndex + (ix+sx)*dims[3] + (iy+sy)];
Acc_T<I> sum = static_cast<Acc_T<I>>(0);
std::size_t count = 0;
for (unsigned int sy = syMin; sy < syMax; ++sy) {
for (unsigned int sx = sxMin; sx < sxMax; ++sx) {
// Apply dilation factor
const std::size_t dilated_sx = sx * dilations[0];
const std::size_t dilated_sy = sy * dilations[1];
// Ensure within bounds
if ((ix + dilated_sx) < dims[2] && (iy + dilated_sy) < dims[3]) {
sum += static_cast<Acc_T<I>>(input[iIndex + (ix + dilated_sx) * dims[3] + (iy + dilated_sy)]);
++count;
}
}
// padding not used
output[oIndexFull] /= (sxMax - sxMin) * (syMax - syMin);
}
output[oIndexFull] = count > 0 ? castFromFloat<O>(sum / count, roundingMode) : 0;
}
}
}
......