diff --git a/.gitignore b/.gitignore
index f37378e300efeb5362882eb8d6eb59f028563a0e..0e14676b900cb1418593019be70cc4d20aba2883 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,6 @@
+# common
+.cache
+
 # C++ Build
 build*/
 install*/
@@ -11,6 +14,8 @@ __pycache__
 *.pyc
 *.egg-info
 dist*/
+wheelhouse/
+aidge_backend_cpu/_version.py
 
 # Mermaid
 *.mmd
@@ -19,4 +24,4 @@ dist*/
 xml*/
 
 # ONNX
-*.onnx
\ No newline at end of file
+*.onnx
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 420442101a3892683f52e28e3bc9c8022abbcab5..97fcaa704b72922d35ad70feb923633fa194c850 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -4,18 +4,27 @@
 ###############################################################################
 
 stages:
-  # Analyse code
   - static_analysis
-  # Build Aidge
   - build
-  # Unit test stage
   - test
-  # Code coverage
   - coverage
+  - release
+  - deploy
 
 include:
-  - local: '/.gitlab/ci/_global.gitlab-ci.yml'
-  # - local: '/.gitlab/ci/static_analysis.gitlab-ci.yml'
-  - local: '/.gitlab/ci/build.gitlab-ci.yml'
-  - local: '/.gitlab/ci/test.gitlab-ci.yml'
-  # - local: '/.gitlab/ci/coverage.gitlab-ci.yml'
+  - project: 'eclipse/aidge/gitlab_shared_files' 
+    ref: 'main'
+    file: 
+      # choose which jobs to run by including the corresponding files.
+      - '.gitlab/ci/ubuntu_cpp.gitlab-ci.yml'
+
+      - '.gitlab/ci/ubuntu_python.gitlab-ci.yml'
+      - '.gitlab/ci/release/cibuildwheel_ubuntu.gitlab-ci.yml'   
+
+      - '.gitlab/ci/windows_cpp.gitlab-ci.yml'
+
+      - '.gitlab/ci/windows_python.gitlab-ci.yml'   
+      - '.gitlab/ci/release/cibuildwheel_windows.gitlab-ci.yml'   
+
+    
+
diff --git a/.gitlab/ci/_global.gitlab-ci.yml b/.gitlab/ci/_global.gitlab-ci.yml
deleted file mode 100644
index 331373fe0f27e7750183eb2e76fe83300cf316a8..0000000000000000000000000000000000000000
--- a/.gitlab/ci/_global.gitlab-ci.yml
+++ /dev/null
@@ -1,24 +0,0 @@
-################################################################################
-# Centralized definitions of common job parameter values.                      #
-# Parameters with many optional configurations may be in separate files.       #
-#                                                                              #
-################################################################################
-variables:
-  GIT_SUBMODULE_STRATEGY: recursive
-  OMP_NUM_THREADS: 4
-  GIT_SSL_NO_VERIFY: 1
-  DEBIAN_FRONTEND: noninteractive
-
-# See https://docs.gitlab.com/ee/ci/yaml/workflow.html#switch-between-branch-pipelines-and-merge-request-pipelines
-workflow:
-  rules:
-    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
-    - if: $CI_COMMIT_BRANCH && $CI_OPEN_MERGE_REQUESTS
-      when: never
-    - if: $CI_COMMIT_BRANCH
-
-default:
-  image: nvidia/cuda:12.2.0-devel-ubuntu22.04
-  before_script:
-    - apt update
-    - apt install -y cmake cppcheck python-is-python3 pip git gcovr unzip curl
diff --git a/.gitlab/ci/build.gitlab-ci.yml b/.gitlab/ci/build.gitlab-ci.yml
deleted file mode 100644
index 18963ced1084c56c1e4c04dceec735126bba962a..0000000000000000000000000000000000000000
--- a/.gitlab/ci/build.gitlab-ci.yml
+++ /dev/null
@@ -1,214 +0,0 @@
-include:
-  - remote: 'https://gitlab.eclipse.org/eclipse/aidge/gitlab_shared_files/-/raw/main/.gitlab/ci/shared_script.gitlab-ci.yml'
-
-build:ubuntu_cpp:
-  stage: build
-  needs: []
-  tags:
-    - docker
-  script:
-    # Download dependencies
-    # aidge_core
-    - DEPENDENCY_NAME="aidge_core"
-    - DEPENDENCY_JOB="build:ubuntu_cpp"
-    - !reference [.download_dependency, script]
-
-    # Build current module
-    - export CMAKE_PREFIX_PATH=../install_cpp
-    - mkdir -p build_cpp
-    - cd build_cpp
-    - cmake -DCMAKE_INSTALL_PREFIX:PATH=../install_cpp -DCMAKE_BUILD_TYPE=Debug -DWERROR=ON -DCOVERAGE=ON ..
-    - make -j4 all install
-
-  artifacts:
-    expire_in: 1 week
-    paths:
-      - build_cpp/
-      - install_cpp/
-
-build:ubuntu_cpp_g++10:
-  stage: build
-  needs: []
-  tags:
-    - docker
-
-  script:
-    # Download dependencies
-    # aidge_core
-    - DEPENDENCY_NAME="aidge_core"
-    - DEPENDENCY_JOB="build:ubuntu_cpp"
-    - !reference [.download_dependency, script]
-
-    # Build current module
-    - export CMAKE_PREFIX_PATH=../install_cpp
-    - apt install -y g++-10
-    - mkdir -p build_cpp
-    - mkdir -p install_cpp
-    - cd build_cpp
-    - export CXX=/usr/bin/g++-10
-    - cmake -DCMAKE_INSTALL_PREFIX:PATH=../install_cpp -DCMAKE_BUILD_TYPE=Debug -DWERROR=ON -DCOVERAGE=ON ..
-    - make -j4 all install
-
-build:ubuntu_cpp_g++12:
-  stage: build
-  needs: []
-  tags:
-    - docker
-
-  script:
-    # Download dependencies
-    # aidge_core
-    - DEPENDENCY_NAME="aidge_core"
-    - DEPENDENCY_JOB="build:ubuntu_cpp"
-    - !reference [.download_dependency, script]
-
-    # Build current module
-    - export CMAKE_PREFIX_PATH=../install_cpp
-    - apt install -y g++-12
-    - mkdir -p build_cpp
-    - mkdir -p install_cpp
-    - cd build_cpp
-    - export CXX=/usr/bin/g++-12
-    - cmake -DCMAKE_INSTALL_PREFIX:PATH=../install_cpp -DCMAKE_BUILD_TYPE=Debug -DWERROR=ON -DCOVERAGE=ON ..
-    - make -j4 all install
-
-build:ubuntu_cpp_clang12:
-  stage: build
-  needs: []
-  tags:
-    - docker
-
-  script:
-    # Download dependencies
-    # aidge_core
-    - DEPENDENCY_NAME="aidge_core"
-    - DEPENDENCY_JOB="build:ubuntu_cpp"
-    - !reference [.download_dependency, script]
-
-    # Build current module
-    - export CMAKE_PREFIX_PATH=../install_cpp
-    - apt install -y clang-12
-    - mkdir -p build_cpp
-    - mkdir -p install_cpp
-    - cd build_cpp
-    - export CXX=/usr/bin/clang++-12
-    - cmake -DCMAKE_INSTALL_PREFIX:PATH=../install_cpp -DCMAKE_BUILD_TYPE=Debug -DWERROR=ON -DCOVERAGE=ON ..
-    - make -j4 all install
-
-build:ubuntu_cpp_clang15:
-  stage: build
-  needs: []
-  tags:
-    - docker
-
-  script:
-    # Download dependencies
-    # aidge_core
-    - DEPENDENCY_NAME="aidge_core"
-    - DEPENDENCY_JOB="build:ubuntu_cpp"
-    - !reference [.download_dependency, script]
-
-    # Build current module
-    - export CMAKE_PREFIX_PATH=../install_cpp
-    - apt install -y clang-15
-    - mkdir -p build_cpp
-    - mkdir -p install_cpp
-    - cd build_cpp
-    - export CXX=/usr/bin/clang++-15
-    - cmake -DCMAKE_INSTALL_PREFIX:PATH=../install_cpp -DCMAKE_BUILD_TYPE=Debug -DWERROR=ON -DCOVERAGE=ON ..
-    - make -j4 all install
-
-build:ubuntu_python:
-  stage: build
-  needs: []
-  tags:
-    - docker
-
-  script:
-    # Download dependencies
-    # aidge_core (Python)
-    - DEPENDENCY_NAME="aidge_core"
-    - DEPENDENCY_JOB="build:ubuntu_python"
-    - !reference [.download_dependency, script]
-
-    - python3 -m pip install virtualenv
-    - virtualenv venv
-    - source venv/bin/activate
-    - python3 -m pip install -r requirements.txt
-    - python3 -m pip install .
-    - python3 -m pip install numpy unittest-xml-reporting
-    - python3 -m pip list
-  artifacts:
-    expire_in: 1 week
-    paths:
-      - venv/
-
-build:windows_cpp:
-  stage: build
-  needs: []
-  tags:
-    - windows
-
-  image: buildtools
-  before_script:
-    # Install Chocolatey
-    - Set-ExecutionPolicy Bypass -Scope Process -Force; [System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor 3072; iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1'))
-    # Install dependencies
-    - choco install cmake.install --installargs '"ADD_CMAKE_TO_PATH=System"' -Y
-    - choco install git -Y
-    - choco install python -Y
-    # Update PATH
-    - $env:Path = [System.Environment]::GetEnvironmentVariable("Path","Machine") + ";" + [System.Environment]::GetEnvironmentVariable("Path","User")
-  script:
-    # Download dependencies
-    # aidge_core
-    - $DEPENDENCY_NAME="aidge_core"
-    - $DEPENDENCY_JOB="build:windows_cpp"
-    - !reference [.download_dependency_windows, script]
-    - Remove-Item .\build_cpp\ -Recurse -Force -ErrorAction Ignore
-
-    - $env:CMAKE_PREFIX_PATH = '../install_cpp'
-    - mkdir -p build_cpp
-    - cd build_cpp
-    - cmake -DCMAKE_INSTALL_PREFIX:PATH=../install_cpp -DCMAKE_BUILD_TYPE=Debug ..
-    - cmake --build . -j2
-    - cmake --install . --config Debug
-
-  artifacts:
-    expire_in: 1 week
-    paths:
-      - build_cpp/
-      - install_cpp/
-
-build:windows_python:
-  stage: build
-  needs: []
-  tags:
-    - windows
-
-  image: buildtools
-  before_script:
-    # Install Chocolatey
-    - Set-ExecutionPolicy Bypass -Scope Process -Force; [System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor 3072; iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1'))
-    # Install dependencies
-    - choco install cmake.install --installargs '"ADD_CMAKE_TO_PATH=System"' -Y
-    - choco install git -Y
-    - choco install python -Y
-    # Update PATH
-    - $env:Path = [System.Environment]::GetEnvironmentVariable("Path","Machine") + ";" + [System.Environment]::GetEnvironmentVariable("Path","User")
-  script:
-    # Download dependencies
-    # aidge_core (Python)
-    - $DEPENDENCY_NAME="aidge_core"
-    - $DEPENDENCY_JOB="build:windows_python"
-    - !reference [.download_dependency_windows, script]
-
-    - python -m pip install virtualenv
-    - virtualenv venv
-    - venv\Scripts\Activate.ps1
-    - python -m pip install -r requirements.txt
-    - python -m pip install .
-  artifacts:
-    expire_in: 1 week
-    paths:
-      - venv/
diff --git a/.gitlab/ci/cibuildwheel_build_deps_before_build_wheel.ps1 b/.gitlab/ci/cibuildwheel_build_deps_before_build_wheel.ps1
new file mode 100755
index 0000000000000000000000000000000000000000..12e1b7566cf8ea534ea71f8416630dae9267e0cc
--- /dev/null
+++ b/.gitlab/ci/cibuildwheel_build_deps_before_build_wheel.ps1
@@ -0,0 +1,23 @@
+$ErrorActionPreference = "Stop"
+
+# Retrieve and clean the dependencies string from the environment variable
+$AIDGE_DEPENDENCIES = $env:AIDGE_DEPENDENCIES -split ' '
+Write-Host "Aidge dependencies : $AIDGE_DEPENDENCIES"
+if ( $($AIDGE_DEPENDENCIES.Length) -eq 0) {
+        Write-Host "- No dependencies provided for current repository"
+        New-Item -ItemType Directory -Force -Path ".\build" | Out-Null
+        Remove-Item -Path ".\build\*" -Recurse -Force
+    } else {
+        Write-Host "Retrieving given dependencies to build current package : $AIDGE_DEPENDENCIES"
+    foreach ($dep in $($AIDGE_DEPENDENCIES -split " ")) {
+        Write-Host "Retrieving : $dep"
+        $curr_loc=$(Get-Location)
+        Set-Location $dep
+        Get-Location 
+        Get-ChildItem .
+        New-Item -Path ".\build" -ItemType Directory -Force | Out-Null
+        Get-ChildItem -Path ".\build" -File | Remove-Item -Force
+        python -m pip install . -v
+        Set-Location $curr_loc
+    }
+}
diff --git a/.gitlab/ci/cibuildwheel_build_deps_before_build_wheel.sh b/.gitlab/ci/cibuildwheel_build_deps_before_build_wheel.sh
new file mode 100755
index 0000000000000000000000000000000000000000..4f74488ae41714a4ce03ba7514bf93842768c5ae
--- /dev/null
+++ b/.gitlab/ci/cibuildwheel_build_deps_before_build_wheel.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+set -e
+if [[ "$1" == "" ]]; then 
+  echo "build aidge deps in cibuildwheel container before building wheel."
+  echo "search path defines where the dependencies will be searched."
+  echo "Hint : In wheel containers, files are mounted on /host by default."
+  echo "\nusage : ./cibuildwheel_build_deps_before_build_wheel.sh $search_path"
+fi
+set -x
+if [[ $AIDGE_DEPENDENCIES ==  "" ]]; then # case for aidge_core
+  mkdir -p build # creating build if it's not already there to hold the build of cpp files
+  rm -rf build/* # build from scratch
+else 
+  for repo in $AIDGE_DEPENDENCIES ; do # case for other projects
+    search_path=$1
+    REPO_PATH=$(find $search_path ! -writable -prune -o  -type d     \
+                                    -name "$repo"                    \
+                                    -not -path "*/install/*"         \
+                                    -not -path "*/.git/*"            \
+                                    -not -path "*/miniconda/*"       \
+                                    -not -path "*/conda/*"           \
+                                    -not -path "*/.local/*"          \
+                                    -not -path "*/lib/*"             \
+                                    -not -path "*/$repo/$repo/*"     \
+                                    -not -path "*/proc/*"            \
+                                    -print -quit)
+    if [[ -z "$REPO_PATH" ]]; then 
+      echo "ERROR : dependency $repo not found in search_path \"$search_path\". ABORTING."
+      exit -1
+    fi
+
+    cd $REPO_PATH
+    mkdir -p build # creating build if it's not already there to hold the build of cpp files
+    rm -rf build/* # build from scratch
+    pip install . -v
+    cd -
+  done
+fi
+set +x
+set +e
diff --git a/.gitlab/ci/coverage.gitlab-ci.yml b/.gitlab/ci/coverage.gitlab-ci.yml
deleted file mode 100644
index 33547fc3f52771c456fba3d34a6e8d96eebafd8a..0000000000000000000000000000000000000000
--- a/.gitlab/ci/coverage.gitlab-ci.yml
+++ /dev/null
@@ -1,41 +0,0 @@
-coverage:ubuntu_cpp:
-  stage: coverage
-  needs: ["build:ubuntu_cpp"]
-  tags:
-    - docker
-  script:
-    - cd build_cpp
-    - ctest --output-on-failure
-    # HTML report for visualization
-    - gcovr --html-details --exclude-unreachable-branches -o coverage.html --root ${CI_PROJECT_DIR} --filter '\.\./include/' --filter '\.\./src/'
-    # Coberta XML report for Gitlab integration
-    - gcovr --xml-pretty --exclude-unreachable-branches --print-summary -o coverage.xml --root ${CI_PROJECT_DIR} --filter '\.\./include/' --filter '\.\./src/'
-  coverage: /^\s*lines:\s*\d+.\d+\%/
-  artifacts:
-    name: ${CI_JOB_NAME}-${CI_COMMIT_REF_NAME}-${CI_COMMIT_SHA}
-    expire_in: 2 days
-    reports:
-      coverage_report:
-        coverage_format: cobertura
-        path: build_cpp/coverage.xml
-
-coverage:ubuntu_python:
-  stage: coverage
-  needs: ["build:ubuntu_python"]
-  tags:
-    - docker
-  script:
-    - source venv/bin/activate
-    - python3 -m pip install numpy coverage
-    - cd ${CI_PROJECT_NAME}
-    # Retrieve the installation path of the module, since it is installed with pip.
-    - export MODULE_LOCATION=`python -c "import ${CI_PROJECT_NAME} as _; print(_.__path__[0])"`
-    - python3 -m coverage run --source=$MODULE_LOCATION -m unittest discover -s unit_tests/ -v -b
-    - python3 -m coverage report
-    - python3 -m coverage xml
-  coverage: '/(?i)total.*? (100(?:\.0+)?\%|[1-9]?\d(?:\.\d+)?\%)$/'
-  artifacts:
-    reports:
-      coverage_report:
-        coverage_format: cobertura
-        path: ${CI_PROJECT_NAME}/coverage.xml
diff --git a/.gitlab/ci/static_analysis.gitlab-ci.yml b/.gitlab/ci/static_analysis.gitlab-ci.yml
deleted file mode 100644
index 0ea9b711885442e7f260ae86e313464b592127a0..0000000000000000000000000000000000000000
--- a/.gitlab/ci/static_analysis.gitlab-ci.yml
+++ /dev/null
@@ -1,37 +0,0 @@
-static_analysis:cpp:
-  stage: static_analysis
-  tags:
-    - static_analysis
-  allow_failure: true
-  script:
-    - mkdir -p $CI_COMMIT_REF_NAME
-    - cppcheck -j 4 --enable=all --inconclusive --force --xml --xml-version=2 . 2> cppcheck-result.xml
-    - python -m pip install Pygments
-    - cppcheck-htmlreport --file=cppcheck-result.xml --report-dir=$CI_COMMIT_REF_NAME --source-dir=src
-    - python3 -m pip install -U cppcheck_codequality
-    - cppcheck-codequality --input-file=cppcheck-result.xml --output-file=cppcheck.json
-    - mkdir -p public/cpp
-    - mv $CI_COMMIT_REF_NAME public/cpp/
-  artifacts:
-    paths: 
-      - public
-    reports:
-      codequality: cppcheck.json
-
-static_analysis:python:
-  stage: static_analysis
-  tags:
-    - static_analysis
-  allow_failure: true
-  script:
-    - pip install pylint
-    - pip install pylint-gitlab
-    - pylint --rcfile=.pylintrc --exit-zero --output-format=pylint_gitlab.GitlabCodeClimateReporter ${CI_PROJECT_NAME}/ > codeclimate.json
-    - pylint --rcfile=.pylintrc --exit-zero --output-format=pylint_gitlab.GitlabPagesHtmlReporter ${CI_PROJECT_NAME}/ > pylint.html
-    - mkdir -p public/python/$CI_COMMIT_REF_NAME
-    - mv pylint.html public/python/$CI_COMMIT_REF_NAME/
-  artifacts:
-    paths:
-      - public
-    reports:
-      codequality: codeclimate.json
\ No newline at end of file
diff --git a/.gitlab/ci/test.gitlab-ci.yml b/.gitlab/ci/test.gitlab-ci.yml
deleted file mode 100644
index 3cada635eb25b3eb87e8318eb6e26723f7a27dd6..0000000000000000000000000000000000000000
--- a/.gitlab/ci/test.gitlab-ci.yml
+++ /dev/null
@@ -1,48 +0,0 @@
-test:ubuntu_cpp:
-  stage: test
-  needs: ["build:ubuntu_cpp"]
-  tags:
-    - docker
-  script:
-    - cd build_cpp
-    - ctest --output-junit ctest-results.xml --output-on-failure
-  artifacts:
-    reports:
-      junit: build_cpp/ctest-results.xml
-
-test:ubuntu_python:
-  stage: test
-  needs: ["build:ubuntu_python"]
-  tags:
-    - docker
-  script:
-    - source venv/bin/activate
-    - cd ${CI_PROJECT_NAME}
-
-    # Run on discovery all tests located in core/unit_tests/python and discard the stdout
-    # only to show the errors/warnings and the results of the tests
-    - python3 -m xmlrunner discover -s unit_tests/ -v -b --output-file xmlrunner-results.xml
-  artifacts:
-    reports:
-      junit: ${CI_PROJECT_NAME}/xmlrunner-results.xml
-
-test:windows_cpp:
-  stage: test
-  needs: ["build:windows_cpp"]
-  tags:
-    - windows
-  image: buildtools
-  before_script:
-    # Install Chocolatey
-    - Set-ExecutionPolicy Bypass -Scope Process -Force; [System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor 3072; iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1'))
-    # Install dependencies
-    - choco install cmake.install --installargs '"ADD_CMAKE_TO_PATH=System"' -Y
-    - choco install python -Y
-    # Update PATH
-    - $env:Path = [System.Environment]::GetEnvironmentVariable("Path","Machine") + ";" + [System.Environment]::GetEnvironmentVariable("Path","User")
-  script:
-    - cd build_cpp
-    - ctest --output-junit ctest-results.xml --output-on-failure
-  artifacts:
-    reports:
-      junit: build_cpp/ctest-results.xml
diff --git a/.pylintrc b/.pylintrc
deleted file mode 100644
index 20ad01971c6e02397253a115490a3afc458b546d..0000000000000000000000000000000000000000
--- a/.pylintrc
+++ /dev/null
@@ -1,644 +0,0 @@
-[MASTER]
-
-# A comma-separated list of package or module names from where C extensions may
-# be loaded. Extensions are loading into the active Python interpreter and may
-# run arbitrary code.
-extension-pkg-allow-list= aidge_core, aidge_backend_cpu, torch, tensorflow
-
-# A comma-separated list of package or module names from where C extensions may
-# be loaded. Extensions are loading into the active Python interpreter and may
-# run arbitrary code. (This is an alternative name to extension-pkg-allow-list
-# for backward compatibility.)
-extension-pkg-whitelist=
-
-# Return non-zero exit code if any of these messages/categories are detected,
-# even if score is above --fail-under value. Syntax same as enable. Messages
-# specified are enabled, while categories only check already-enabled messages.
-fail-on=
-
-# Specify a score threshold to be exceeded before program exits with error.
-fail-under=0.0
-
-# Files or directories to be skipped. They should be base names, not paths.
-ignore=CVS
-
-# Add files or directories matching the regex patterns to the ignore-list. The
-# regex matches against paths.
-ignore-paths=
-
-# Files or directories matching the regex patterns are skipped. The regex
-# matches against base names, not paths.
-ignore-patterns=
-
-# Python code to execute, usually for sys.path manipulation such as
-# pygtk.require().
-#init-hook=
-
-# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the
-# number of processors available to use.
-jobs=1
-
-# Control the amount of potential inferred values when inferring a single
-# object. This can help the performance when dealing with large functions or
-# complex, nested conditions.
-limit-inference-results=100
-
-# List of plugins (as comma separated values of python module names) to load,
-# usually to register additional checkers.
-load-plugins=
-
-# Pickle collected data for later comparisons.
-persistent=yes
-
-# When enabled, pylint would attempt to guess common misconfiguration and emit
-# user-friendly hints instead of false-positive error messages.
-suggestion-mode=yes
-
-# Allow loading of arbitrary C extensions. Extensions are imported into the
-# active Python interpreter and may run arbitrary code.
-unsafe-load-any-extension=no
-
-
-[MESSAGES CONTROL]
-
-# Only show warnings with the listed confidence levels. Leave empty to show
-# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED.
-confidence=
-
-# Disable the message, report, category or checker with the given id(s). You
-# can either give multiple identifiers separated by comma (,) or put this
-# option multiple times (only on the command line, not in the configuration
-# file where it should appear only once). You can also use "--disable=all" to
-# disable everything first and then reenable specific checks. For example, if
-# you want to run only the similarities checker, you can use "--disable=all
-# --enable=similarities". If you want to run only the classes checker, but have
-# no Warning level messages displayed, use "--disable=all --enable=classes
-# --disable=W".
-disable=print-statement,
-        parameter-unpacking,
-        unpacking-in-except,
-        old-raise-syntax,
-        backtick,
-        long-suffix,
-        old-ne-operator,
-        old-octal-literal,
-        import-star-module-level,
-        non-ascii-bytes-literal,
-        raw-checker-failed,
-        bad-inline-option,
-        locally-disabled,
-        file-ignored,
-        suppressed-message,
-        useless-suppression,
-        deprecated-pragma,
-        use-symbolic-message-instead,
-        apply-builtin,
-        basestring-builtin,
-        buffer-builtin,
-        cmp-builtin,
-        coerce-builtin,
-        execfile-builtin,
-        file-builtin,
-        long-builtin,
-        raw_input-builtin,
-        reduce-builtin,
-        standarderror-builtin,
-        unicode-builtin,
-        xrange-builtin,
-        coerce-method,
-        delslice-method,
-        getslice-method,
-        setslice-method,
-        no-absolute-import,
-        old-division,
-        dict-iter-method,
-        dict-view-method,
-        next-method-called,
-        metaclass-assignment,
-        indexing-exception,
-        raising-string,
-        reload-builtin,
-        oct-method,
-        hex-method,
-        nonzero-method,
-        cmp-method,
-        input-builtin,
-        round-builtin,
-        intern-builtin,
-        unichr-builtin,
-        map-builtin-not-iterating,
-        zip-builtin-not-iterating,
-        range-builtin-not-iterating,
-        filter-builtin-not-iterating,
-        using-cmp-argument,
-        eq-without-hash,
-        div-method,
-        idiv-method,
-        rdiv-method,
-        exception-message-attribute,
-        invalid-str-codec,
-        sys-max-int,
-        bad-python3-import,
-        deprecated-string-function,
-        deprecated-str-translate-call,
-        deprecated-itertools-function,
-        deprecated-types-field,
-        next-method-defined,
-        dict-items-not-iterating,
-        dict-keys-not-iterating,
-        dict-values-not-iterating,
-        deprecated-operator-function,
-        deprecated-urllib-function,
-        xreadlines-attribute,
-        deprecated-sys-function,
-        exception-escape,
-        comprehension-escape,
-        c-extension-no-member,
-        too-many-locals,
-        missing-class-docstring,
-        missing-function-docstring,
-        too-many-ancestor,
-        too-many-arguments,
-        protected-access,
-        too-many-branches,
-        too-many-ancestors,
-        wrong-import-order,
-        wrong-import-position,
-
-# Enable the message, report, category or checker with the given id(s). You can
-# either give multiple identifier separated by comma (,) or put this option
-# multiple time (only on the command line, not in the configuration file where
-# it should appear only once). See also the "--disable" option for examples.
-enable=c-extension-no-member
-
-
-[REPORTS]
-
-# Python expression which should return a score less than or equal to 10. You
-# have access to the variables 'error', 'warning', 'refactor', and 'convention'
-# which contain the number of messages in each category, as well as 'statement'
-# which is the total number of statements analyzed. This score is used by the
-# global evaluation report (RP0004).
-evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
-
-# Template used to display messages. This is a python new-style format string
-# used to format the message information. See doc for all details.
-#msg-template=
-
-# Set the output format. Available formats are text, parseable, colorized, json
-# and msvs (visual studio). You can also give a reporter class, e.g.
-# mypackage.mymodule.MyReporterClass.
-output-format=text
-
-# Tells whether to display a full report or only the messages.
-reports=no
-
-# Activate the evaluation score.
-score=yes
-
-
-[REFACTORING]
-
-# Maximum number of nested blocks for function / method body
-max-nested-blocks=5
-
-# Complete name of functions that never returns. When checking for
-# inconsistent-return-statements if a never returning function is called then
-# it will be considered as an explicit return statement and no message will be
-# printed.
-never-returning-functions=sys.exit,argparse.parse_error
-
-
-[BASIC]
-
-# Naming style matching correct argument names.
-argument-naming-style=snake_case
-
-# Regular expression matching correct argument names. Overrides argument-
-# naming-style.
-#argument-rgx=
-
-# Naming style matching correct attribute names.
-attr-naming-style=snake_case
-
-# Regular expression matching correct attribute names. Overrides attr-naming-
-# style.
-#attr-rgx=
-
-# Bad variable names which should always be refused, separated by a comma.
-bad-names=foo,
-          bar,
-          baz,
-          toto,
-          tutu,
-          tata
-
-# Bad variable names regexes, separated by a comma. If names match any regex,
-# they will always be refused
-bad-names-rgxs=
-
-# Naming style matching correct class attribute names.
-class-attribute-naming-style=any
-
-# Regular expression matching correct class attribute names. Overrides class-
-# attribute-naming-style.
-#class-attribute-rgx=
-
-# Naming style matching correct class constant names.
-class-const-naming-style=UPPER_CASE
-
-# Regular expression matching correct class constant names. Overrides class-
-# const-naming-style.
-#class-const-rgx=
-
-# Naming style matching correct class names.
-class-naming-style=PascalCase
-
-# Regular expression matching correct class names. Overrides class-naming-
-# style.
-#class-rgx=
-
-# Naming style matching correct constant names.
-const-naming-style=UPPER_CASE
-
-# Regular expression matching correct constant names. Overrides const-naming-
-# style.
-#const-rgx=
-
-# Minimum line length for functions/classes that require docstrings, shorter
-# ones are exempt.
-docstring-min-length=-1
-
-# Naming style matching correct function names.
-function-naming-style=snake_case
-
-# Regular expression matching correct function names. Overrides function-
-# naming-style.
-#function-rgx=
-
-# Good variable names which should always be accepted, separated by a comma.
-good-names=i,
-           j,
-           k,
-           ex,
-           Run,
-           _,
-
-# Good variable names regexes, separated by a comma. If names match any regex,
-# they will always be accepted
-good-names-rgxs=
-
-# Include a hint for the correct naming format with invalid-name.
-include-naming-hint=no
-
-# Naming style matching correct inline iteration names.
-inlinevar-naming-style=any
-
-# Regular expression matching correct inline iteration names. Overrides
-# inlinevar-naming-style.
-#inlinevar-rgx=
-
-# Naming style matching correct method names.
-method-naming-style=snake_case
-
-# Regular expression matching correct method names. Overrides method-naming-
-# style.
-#method-rgx=
-
-# Naming style matching correct module names.
-module-naming-style=snake_case
-
-# Regular expression matching correct module names. Overrides module-naming-
-# style.
-#module-rgx=
-
-# Colon-delimited sets of names that determine each other's naming style when
-# the name regexes allow several styles.
-name-group=
-
-# Regular expression which should only match function or class names that do
-# not require a docstring.
-no-docstring-rgx=^_
-
-# List of decorators that produce properties, such as abc.abstractproperty. Add
-# to this list to register other decorators that produce valid properties.
-# These decorators are taken in consideration only for invalid-name.
-property-classes=abc.abstractproperty
-
-# Naming style matching correct variable names.
-variable-naming-style=snake_case
-
-# Regular expression matching correct variable names. Overrides variable-
-# naming-style.
-#variable-rgx=
-
-
-[FORMAT]
-
-# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
-expected-line-ending-format=
-
-# Regexp for a line that is allowed to be longer than the limit.
-ignore-long-lines=^\s*(# )?<?https?://\S+>?$
-
-# Number of spaces of indent required inside a hanging or continued line.
-indent-after-paren=4
-
-# String used as indentation unit. This is usually "    " (4 spaces) or "\t" (1
-# tab).
-indent-string='    '
-
-# Maximum number of characters on a single line.
-max-line-length=200
-
-# Maximum number of lines in a module.
-max-module-lines=1000
-
-# Allow the body of a class to be on the same line as the declaration if body
-# contains single statement.
-single-line-class-stmt=no
-
-# Allow the body of an if to be on the same line as the test if there is no
-# else.
-single-line-if-stmt=no
-
-
-[LOGGING]
-
-# The type of string formatting that logging methods do. `old` means using %
-# formatting, `new` is for `{}` formatting.
-logging-format-style=old
-
-# Logging modules to check that the string format arguments are in logging
-# function parameter format.
-logging-modules=logging
-
-
-[MISCELLANEOUS]
-
-# List of note tags to take in consideration, separated by a comma.
-notes=FIXME,
-      XXX,
-      TODO
-
-# Regular expression of note tags to take in consideration.
-#notes-rgx=
-
-
-[SIMILARITIES]
-
-# Comments are removed from the similarity computation
-ignore-comments=yes
-
-# Docstrings are removed from the similarity computation
-ignore-docstrings=yes
-
-# Imports are removed from the similarity computation
-ignore-imports=no
-
-# Signatures are removed from the similarity computation
-ignore-signatures=no
-
-# Minimum lines number of a similarity.
-min-similarity-lines=4
-
-
-[SPELLING]
-
-# Limits count of emitted suggestions for spelling mistakes.
-max-spelling-suggestions=4
-
-# Spelling dictionary name. Available dictionaries: none. To make it work,
-# install the 'python-enchant' package.
-spelling-dict=
-
-# List of comma separated words that should be considered directives if they
-# appear and the beginning of a comment and should not be checked.
-spelling-ignore-comment-directives=fmt: on,fmt: off,noqa:,noqa,nosec,isort:skip,mypy:
-
-# List of comma separated words that should not be checked.
-spelling-ignore-words=
-
-# A path to a file that contains the private dictionary; one word per line.
-spelling-private-dict-file=
-
-# Tells whether to store unknown words to the private dictionary (see the
-# --spelling-private-dict-file option) instead of raising a message.
-spelling-store-unknown-words=no
-
-
-[STRING]
-
-# This flag controls whether inconsistent-quotes generates a warning when the
-# character used as a quote delimiter is used inconsistently within a module.
-check-quote-consistency=no
-
-# This flag controls whether the implicit-str-concat should generate a warning
-# on implicit string concatenation in sequences defined over several lines.
-check-str-concat-over-line-jumps=no
-
-
-[TYPECHECK]
-
-# List of decorators that produce context managers, such as
-# contextlib.contextmanager. Add to this list to register other decorators that
-# produce valid context managers.
-contextmanager-decorators=contextlib.contextmanager
-
-# List of members which are set dynamically and missed by pylint inference
-# system, and so shouldn't trigger E1101 when accessed. Python regular
-# expressions are accepted.
-generated-members=
-
-# Tells whether missing members accessed in mixin class should be ignored. A
-# mixin class is detected if its name ends with "mixin" (case insensitive).
-ignore-mixin-members=yes
-
-# Tells whether to warn about missing members when the owner of the attribute
-# is inferred to be None.
-ignore-none=yes
-
-# This flag controls whether pylint should warn about no-member and similar
-# checks whenever an opaque object is returned when inferring. The inference
-# can return multiple potential results while evaluating a Python object, but
-# some branches might not be evaluated, which results in partial inference. In
-# that case, it might be useful to still emit no-member and other checks for
-# the rest of the inferred objects.
-ignore-on-opaque-inference=yes
-
-# List of class names for which member attributes should not be checked (useful
-# for classes with dynamically set attributes). This supports the use of
-# qualified names.
-ignored-classes=optparse.Values,
-                thread._local,
-                _thread._local,
-                aidge.global_variables,
-                aidge.cells.abstract_cell.Trainable,
-                torch,
-                tensorflow,
-
-# List of module names for which member attributes should not be checked
-# (useful for modules/projects where namespaces are manipulated during runtime
-# and thus existing member attributes cannot be deduced by static analysis). It
-# supports qualified module names, as well as Unix pattern matching.
-ignored-modules= aidge_core, aidge_backend_cpu
-
-# Show a hint with possible names when a member name was not found. The aspect
-# of finding the hint is based on edit distance.
-missing-member-hint=yes
-
-# The minimum edit distance a name should have in order to be considered a
-# similar match for a missing member name.
-missing-member-hint-distance=1
-
-# The total number of similar names that should be taken in consideration when
-# showing a hint for a missing member.
-missing-member-max-choices=1
-
-# List of decorators that change the signature of a decorated function.
-signature-mutators=
-
-
-[VARIABLES]
-
-# List of additional names supposed to be defined in builtins. Remember that
-# you should avoid defining new builtins when possible.
-additional-builtins=
-
-# Tells whether unused global variables should be treated as a violation.
-allow-global-unused-variables=yes
-
-# List of names allowed to shadow builtins
-allowed-redefined-builtins=
-
-# List of strings which can identify a callback function by name. A callback
-# name must start or end with one of those strings.
-callbacks=cb_,
-          _cb
-
-# A regular expression matching the name of dummy variables (i.e. expected to
-# not be used).
-dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_
-
-# Argument names that match this expression will be ignored. Default to name
-# with leading underscore.
-ignored-argument-names=_.*|^ignored_|^unused_
-
-# Tells whether we should check for unused import in __init__ files.
-init-import=no
-
-# List of qualified module names which can have objects that can redefine
-# builtins.
-redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io
-
-
-[CLASSES]
-
-# Warn about protected attribute access inside special methods
-check-protected-access-in-special-methods=no
-
-# List of method names used to declare (i.e. assign) instance attributes.
-defining-attr-methods=__init__,
-                      __new__,
-                      setUp,
-                      __post_init__
-
-# List of member names, which should be excluded from the protected access
-# warning.
-exclude-protected=_asdict,
-                  _fields,
-                  _replace,
-                  _source,
-                  _make
-
-# List of valid names for the first argument in a class method.
-valid-classmethod-first-arg=cls
-
-# List of valid names for the first argument in a metaclass class method.
-valid-metaclass-classmethod-first-arg=cls
-
-
-[DESIGN]
-
-# List of qualified class names to ignore when countint class parents (see
-# R0901)
-ignored-parents=
-
-# Maximum number of arguments for function / method.
-max-args=5
-
-# Maximum number of attributes for a class (see R0902).
-max-attributes=7
-
-# Maximum number of boolean expressions in an if statement (see R0916).
-max-bool-expr=5
-
-# Maximum number of branch for function / method body.
-max-branches=12
-
-# Maximum number of locals for function / method body.
-max-locals=15
-
-# Maximum number of parents for a class (see R0901).
-max-parents=7
-
-# Maximum number of public methods for a class (see R0904).
-max-public-methods=20
-
-# Maximum number of return / yield for function / method body.
-max-returns=6
-
-# Maximum number of statements in function / method body.
-max-statements=50
-
-# Minimum number of public methods for a class (see R0903).
-min-public-methods=2
-
-
-[IMPORTS]
-
-# List of modules that can be imported at any level, not just the top level
-# one.
-allow-any-import-level=
-
-# Allow wildcard imports from modules that define __all__.
-allow-wildcard-with-all=no
-
-# Analyse import fallback blocks. This can be used to support both Python 2 and
-# 3 compatible code, which means that the block might have code that exists
-# only in one or another interpreter, leading to false positives when analysed.
-analyse-fallback-blocks=no
-
-# Deprecated modules which should not be used, separated by a comma.
-deprecated-modules=
-
-# Output a graph (.gv or any supported image format) of external dependencies
-# to the given file (report RP0402 must not be disabled).
-ext-import-graph=
-
-# Output a graph (.gv or any supported image format) of all (i.e. internal and
-# external) dependencies to the given file (report RP0402 must not be
-# disabled).
-import-graph=
-
-# Output a graph (.gv or any supported image format) of internal dependencies
-# to the given file (report RP0402 must not be disabled).
-int-import-graph=
-
-# Force import order to recognize a module as part of the standard
-# compatibility libraries.
-known-standard-library=
-
-# Force import order to recognize a module as part of a third party library.
-known-third-party=enchant
-
-# Couples of modules and preferred modules, separated by a comma.
-preferred-modules=
-
-
-[EXCEPTIONS]
-
-# Exceptions that will emit a warning when being caught. Defaults to
-# "BaseException, Exception".
-overgeneral-exceptions=BaseException,
-                       Exception
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a9603c550f89f106fcc9da818a7bd67492ec863f..3574e25cec5977bc2249c7d756041c09650f9b11 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,33 +1,34 @@
-cmake_minimum_required(VERSION 3.15)
-
+cmake_minimum_required(VERSION 3.18)
+set(CXX_STANDARD 14)
 
 file(STRINGS "${CMAKE_SOURCE_DIR}/version.txt" version)
-add_definitions(-DPROJECT_VERSION="${version}")
-file(STRINGS "${CMAKE_SOURCE_DIR}/project_name.txt" project)
 
-message(STATUS "Project name: ${project}")
+project(aidge_backend_cpu
+        VERSION ${version}
+        DESCRIPTION "CPU implementations of the operators of aidge framework."
+        LANGUAGES CXX)
+
+message(STATUS "Project name: ${CMAKE_PROJECT_NAME}")
 message(STATUS "Project version: ${version}")
+add_definitions(-DPROJECT_VERSION="${version}")
 
 execute_process(
     COMMAND git rev-parse --short HEAD
     WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
     OUTPUT_VARIABLE GIT_COMMIT_HASH
     OUTPUT_STRIP_TRAILING_WHITESPACE
+    ERROR_QUIET
 )
 message(STATUS "Latest git commit: ${GIT_COMMIT_HASH}")
-
-# Define a preprocessor macro with the Git commit version
 add_definitions(-DGIT_COMMIT_HASH="${GIT_COMMIT_HASH}")
 
-# Note : project name is {project} and python module name is also {project}
-set(module_name _${project}) # target name
-
-project(${project})
-set(CXX_STANDARD 14)
+# Note : project name is ${CMAKE_PROJECT_NAME} and python module name is also ${CMAKE_PROJECT_NAME}
+set(module_name _${CMAKE_PROJECT_NAME}) # target name
+set(pybind_module_name ${CMAKE_PROJECT_NAME}) # name of submodule for python bindings
 
 ##############################################
 # Define options
-option(PYBIND "python binding" ON)
+option(PYBIND "python binding" OFF)
 option(WERROR "Warning as error" OFF)
 option(TEST "Enable tests" ON)
 option(COVERAGE "Enable coverage" OFF)
@@ -36,14 +37,20 @@ option(ENABLE_ASAN "Enable ASan (AddressSanitizer) for runtime analysis of memor
 ##############################################
 # Import utils CMakeLists
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")
-include(PybindModuleCreation)
 
 if(CMAKE_COMPILER_IS_GNUCXX AND COVERAGE)
     Include(CodeCoverage)
 endif()
 
 ##############################################
-# Find system dependencies
+# FIND Dependencies
+if(NOT $ENV{AIDGE_INSTALL} STREQUAL "")
+    set(CMAKE_INSTALL_PREFIX $ENV{AIDGE_INSTALL})
+    list(APPEND CMAKE_PREFIX_PATH $ENV{AIDGE_INSTALL})
+    message(WARNING "Env var AIDGE_INSTALL detected : $ENV{AIDGE_INSTALL}. Set CMAKE_INSTALL_PREFIX to AIDGE_INSTALL & added to CMAKE_PREFIX_PATH"
+                    "\n\tCMAKE_INSTALL_PREFIX = ${CMAKE_INSTALL_PREFIX}"
+                    "\n\tCMAKE_PREFIX_PATH = ${CMAKE_PREFIX_PATH}")
+endif()
 find_package(aidge_core REQUIRED)
 
 ##############################################
@@ -52,14 +59,25 @@ file(GLOB_RECURSE src_files "src/*.cpp")
 file(GLOB_RECURSE inc_files "include/*.hpp")
 
 add_library(${module_name} ${src_files} ${inc_files})
+
 target_link_libraries(${module_name}
     PUBLIC
-        _aidge_core # _ is added because we link the target not the project
+        _aidge_core # _ is added because we link the exported target and not the project
 )
 
 #Set target properties
 set_property(TARGET ${module_name} PROPERTY POSITION_INDEPENDENT_CODE ON)
 
+# PYTHON BINDING
+if (PYBIND)
+    # Python binding lib is by default installed in <prefix>/python_packages/<package>/
+    # When installed from python, setup.py should set it to the python package dir
+    set(PYBIND_INSTALL_PREFIX python_packages/${pybind_module_name} CACHE PATH "Python package install prefix")
+
+    include(PybindModuleCreation)
+    generate_python_binding(${pybind_module_name} ${module_name})
+endif()
+
 if( ${ENABLE_ASAN} )
     message("Building ${module_name} with ASAN.")
     set(SANITIZE_FLAGS -fsanitize=address -fno-omit-frame-pointer)
@@ -81,20 +99,6 @@ target_include_directories(${module_name}
         ${CMAKE_CURRENT_SOURCE_DIR}/src
 )
 
-# PYTHON BINDING
-if (PYBIND)
-    generate_python_binding(${project} ${module_name})
-
-    # Handles Python + pybind11 headers dependencies
-    target_link_libraries(${module_name}
-        PUBLIC
-            pybind11::pybind11
-        PRIVATE
-            Python::Python
-        )
-endif()
-
-target_link_libraries(${module_name} PUBLIC fmt::fmt)
 target_compile_features(${module_name} PRIVATE cxx_std_14)
 
 target_compile_options(${module_name} PRIVATE
@@ -110,22 +114,25 @@ endif()
 
 ##############################################
 # Installation instructions
-
 include(GNUInstallDirs)
-set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/${project})
+set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/${CMAKE_PROJECT_NAME})
 
-install(TARGETS ${module_name} EXPORT ${project}-targets
+install(TARGETS ${module_name} EXPORT ${CMAKE_PROJECT_NAME}-targets
   LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
   ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
   RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
 )
-
 install(DIRECTORY include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
 
-#Export the targets to a script
+if (PYBIND)
+    install(TARGETS ${pybind_module_name}
+        DESTINATION ${PYBIND_INSTALL_PREFIX}
+    )
+endif()
 
-install(EXPORT ${project}-targets
- FILE "${project}-targets.cmake"
+#Export the targets to a script
+install(EXPORT ${CMAKE_PROJECT_NAME}-targets
+ FILE "${CMAKE_PROJECT_NAME}-targets.cmake"
  DESTINATION ${INSTALL_CONFIGDIR}
  COMPONENT ${module_name}
 )
@@ -133,32 +140,37 @@ install(EXPORT ${project}-targets
 #Create a ConfigVersion.cmake file
 include(CMakePackageConfigHelpers)
 write_basic_package_version_file(
-    "${CMAKE_CURRENT_BINARY_DIR}/${project}-config-version.cmake"
+    "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_PROJECT_NAME}-config-version.cmake"
     VERSION ${version}
     COMPATIBILITY AnyNewerVersion
 )
 
-configure_package_config_file("${project}-config.cmake.in"
-    "${CMAKE_CURRENT_BINARY_DIR}/${project}-config.cmake"
+configure_package_config_file("${CMAKE_PROJECT_NAME}-config.cmake.in"
+    "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_PROJECT_NAME}-config.cmake"
     INSTALL_DESTINATION ${INSTALL_CONFIGDIR}
 )
 
 #Install the config, configversion and custom find modules
 install(FILES
-    "${CMAKE_CURRENT_BINARY_DIR}/${project}-config.cmake"
-    "${CMAKE_CURRENT_BINARY_DIR}/${project}-config-version.cmake"
+    "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_PROJECT_NAME}-config.cmake"
+    "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_PROJECT_NAME}-config-version.cmake"
     DESTINATION ${INSTALL_CONFIGDIR}
 )
 
 ##############################################
 ## Exporting from the build tree
-export(EXPORT ${project}-targets
-    FILE "${CMAKE_CURRENT_BINARY_DIR}/${project}-targets.cmake")
+message(STATUS "Exporting created targets to use them in another build")
+export(EXPORT ${CMAKE_PROJECT_NAME}-targets
+    FILE "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_PROJECT_NAME}-targets.cmake")
 
 
 ##############################################
 ## Add test
 if(TEST)
-    enable_testing()
-    add_subdirectory(unit_tests)
+    if (AIDGE_REQUIRES_PYTHON AND NOT AIDGE_PYTHON_HAS_EMBED)
+        message(WARNING "Skipping compilation of tests: missing Python embedded interpreter")
+    else()
+        enable_testing()
+        add_subdirectory(unit_tests)
+    endif()
 endif()
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000000000000000000000000000000000000..61f22a8c8ed0c92dab03c0533d7617d96d42c03d
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,8 @@
+include README.md LICENSE
+recursive-include aidge_backend_cpu *.py 
+recursive-exclude aidge_backend_cpu/unit_tests *.py
+
+recursive-include include *.hpp
+recursive-include src *.cpp
+recursive-include python_binding *.cpp
+include CMakeLists.txt
diff --git a/README.md b/README.md
index e67b9d8bade1a862beee8f7dbe59ceac1469efe3..96283603759f03415b7dc1b99f3905550427f633 100644
--- a/README.md
+++ b/README.md
@@ -5,47 +5,53 @@
 You can find in this folder the library that implements the CPU operators. <br>
 Those operators can be used on any machine with an Linux OS.
 
-So far be sure to have the correct requirements to use this library
-- GCC
-- Make
-- CMake
-- aidge_core
-- Python (optional, if you have no intend to use this library in python with pybind)
+[TOC]
 
-## Pip installation
+## Installation
 
-You will need to install first the ``aidge_core`` library before installing ``aidge_backend_cpu``.
+### Dependencies
+- `GCC`
+- `Make`/`Ninja`
+- `CMake`
+- `Python` (optional, if you do not intend to use this library in python with pybind)
 
-If you have set a custom install path for the ``aidge_core`` library, make sure to use the same one here.
+#### Aidge dependencies
+ - `aidge_core`
 
-Then run in your python environnement :
+### Pip installation
 ``` bash
 pip install . -v
 ```
+> **TIPS :** Use environment variables to change compilation options :
+> - `AIDGE_INSTALL` : to set the installation folder. Defaults to `<python_prefix>/lib/libAidge`. :warning: This path must be identical to aidge_core install path.
+> - `AIDGE_PYTHON_BUILD_TYPE` : to set the compilation mode to **Debug** or **Release** or "" (for default flags). Defaults to **Release**.
+> - `AIDGE_BUILD_GEN` : to set the build backend (for development mode) or "" for the cmake default. Default to "".
 
-## Standard C++ Compilation
+## Pip installation for development
 
-You will need to compile first the Core library before compiling the CPU one.
-The makefile is designed to do it for you.
+To setup using pip in development (or editable mode), use the `--no-build-isolation -e` options to pip.
 
-To only compile the CPU library, run
-```
-make cpu_only
+For instance, run the following command in your python environment for a typical setup:
+``` bash
+export AIDGE_PYTHON_BUILD_TYPE=         # default flags (no debug info but fastest build time)
+export AIDGE_PYTHON_BUILD_TYPE=Debug    # or if one really need to debug the C++ code
+pip install -U pip setuptools setuptools_scm[toml] cmake   # Pre-install build requirements (refer to the pyproject.toml [build-system] section)
+pip install -v --no-build-isolation -e .
 ```
 
-To compile the CPU library + the associated unitary tests, run
-```
-make cpu_tests
-```
+Refer to `aidge_core/README.md` for more details on development build options.
 
-To compile the CPU library with the python binding, run
-```
-make cpu_with_pybind
-```
-Important: this command can also be run with `make`.
+### Standard C++ Compilation
 
+You will need to compile and install the [Core Library](https://gitlab.eclipse.org/eclipse/aidge/aidge_core) before compiling the CPU one.
 
-To compile the CPU library with the python binding + the associated unitary tests, run
-```
-make cpu_with_pybind_tests
+Once this has been done, you'll need to run CMake with the
+`CMAKE_INSTALL_PREFIX:PATH` flag, in order to indicate to CMake where
+`aidge_core` has been installed : 
+```sh
+cmake -DCMAKE_INSTALL_PREFIX:PATH=$(path_to_install_folder) $(CMAKE PARAMETERS) $(project_root)
+
+make all
 ```
+
+More detailed information is available in the [Aidge User Guide](https://eclipse.dev/aidge/source/GetStarted/install.html)
diff --git a/aidge_backend_cpu-config.cmake.in b/aidge_backend_cpu-config.cmake.in
index f3604be11c27d86caf1ad8a48b333b9bd8f30625..d8e1372bc8a7b79bd09c79b654af4291c995ac58 100644
--- a/aidge_backend_cpu-config.cmake.in
+++ b/aidge_backend_cpu-config.cmake.in
@@ -1,3 +1,10 @@
+@PACKAGE_INIT@
+
+include(CMakeFindDependencyMacro)
+find_dependency(aidge_core)
+
+include(CMakeFindDependencyMacro)
+
 include(${CMAKE_CURRENT_LIST_DIR}/aidge_backend_cpu-config-version.cmake)
 
 include(${CMAKE_CURRENT_LIST_DIR}/aidge_backend_cpu-targets.cmake)
diff --git a/aidge_backend_cpu/__init__.py b/aidge_backend_cpu/__init__.py
index 8f18440daf33134718273b4b3fdd4d99039b6ddf..a7fe1ea3abdea25b18af6e7e0a1958f01f928433 100644
--- a/aidge_backend_cpu/__init__.py
+++ b/aidge_backend_cpu/__init__.py
@@ -1 +1,3 @@
-from aidge_backend_cpu.aidge_backend_cpu import * # import so generated by PyBind
\ No newline at end of file
+import aidge_core
+from aidge_backend_cpu.aidge_backend_cpu import * # import so generated by PyBind
+from ._version import *
diff --git a/cmake/PybindModuleCreation.cmake b/cmake/PybindModuleCreation.cmake
index 87e70fc38c9e4ec4ddb44cbe5d7fb2a31c2e94d6..a520039f6505a7178acefaca076fa3f659e41bcb 100644
--- a/cmake/PybindModuleCreation.cmake
+++ b/cmake/PybindModuleCreation.cmake
@@ -1,21 +1,24 @@
-function(generate_python_binding name target_to_bind) 
-    add_definitions(-DPYBIND)
+function(generate_python_binding pybind_module_name target_to_bind) 
+
+    find_package(Python COMPONENTS Interpreter Development.Module)
+
     Include(FetchContent)
 
+    set(PYBIND_VERSION v2.10.4)
+    message(STATUS "Retrieving pybind ${PYBIND_VERSION} from git")
+
     FetchContent_Declare(
-    PyBind11
-    GIT_REPOSITORY https://github.com/pybind/pybind11.git
-    GIT_TAG        v2.10.4 # or a later release
+        PyBind11
+        GIT_REPOSITORY https://github.com/pybind/pybind11.git
+        GIT_TAG        ${PYBIND_VERSION} # or a later release
     )
 
-    # Use the New FindPython mode, recommanded. Requires CMake 3.15+
-    find_package(Python COMPONENTS Interpreter Development)
     FetchContent_MakeAvailable(PyBind11)
 
-    message(STATUS "Creating binding for module ${name}")
+    message(STATUS "Creating binding for module ${pybind_module_name}")
     file(GLOB_RECURSE pybind_src_files "python_binding/*.cpp")
 
-    pybind11_add_module(${name} MODULE ${pybind_src_files} "NO_EXTRAS") # NO EXTRA recquired for pip install
-    target_include_directories(${name} PUBLIC "python_binding")
-    target_link_libraries(${name} PUBLIC ${target_to_bind})
+    pybind11_add_module(${pybind_module_name} MODULE ${pybind_src_files} "NO_EXTRAS") # NO_EXTRAS required for pip install
+    target_include_directories(${pybind_module_name} PRIVATE "python_binding")
+    target_link_libraries(${pybind_module_name} PRIVATE ${target_to_bind})
 endfunction()
diff --git a/include/aidge/backend/cpu.hpp b/include/aidge/backend/cpu.hpp
index 11f9c264098d5a238d0d1f8e6bc4fac0cc099549..b45aa1cb4151d8d6c5268d4a94da97bb25a89a40 100644
--- a/include/aidge/backend/cpu.hpp
+++ b/include/aidge/backend/cpu.hpp
@@ -12,22 +12,30 @@
 #ifndef AIDGE_CPU_IMPORTS_H_
 #define AIDGE_CPU_IMPORTS_H_
 
+#include "aidge/backend/cpu/operator/AbsImpl.hpp"
 #include "aidge/backend/cpu/operator/AddImpl.hpp"
+#include "aidge/backend/cpu/operator/AndImpl.hpp"
+#include "aidge/backend/cpu/operator/ArgMaxImpl.hpp"
 #include "aidge/backend/cpu/operator/AvgPoolingImpl.hpp"
 #include "aidge/backend/cpu/operator/MaxPoolingImpl.hpp"
 #include "aidge/backend/cpu/operator/BatchNormImpl.hpp"
+#include "aidge/backend/cpu/operator/BitShiftImpl.hpp"
 #include "aidge/backend/cpu/operator/ConvDepthWiseImpl.hpp"
 #include "aidge/backend/cpu/operator/ConvImpl.hpp"
+#include "aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp"
 #include "aidge/backend/cpu/operator/DivImpl.hpp"
 #include "aidge/backend/cpu/operator/ErfImpl.hpp"
 #include "aidge/backend/cpu/operator/FCImpl.hpp"
+#include "aidge/backend/cpu/operator/FoldImpl.hpp"
 #include "aidge/backend/cpu/operator/GlobalAveragePoolingImpl.hpp"
 #include "aidge/backend/cpu/operator/LeakyReLUImpl.hpp"
+#include "aidge/backend/cpu/operator/LnImpl.hpp"
 #include "aidge/backend/cpu/operator/MatMulImpl.hpp"
 #include "aidge/backend/cpu/operator/MulImpl.hpp"
 #include "aidge/backend/cpu/operator/PadImpl.hpp"
 #include "aidge/backend/cpu/operator/PowImpl.hpp"
 #include "aidge/backend/cpu/operator/ReduceMeanImpl.hpp"
+#include "aidge/backend/cpu/operator/ReduceSumImpl.hpp"
 #include "aidge/backend/cpu/operator/ReLUImpl.hpp"
 #include "aidge/backend/cpu/operator/ScalingImpl.hpp"
 #include "aidge/backend/cpu/operator/SigmoidImpl.hpp"
diff --git a/include/aidge/backend/cpu/operator/AbsImpl.hpp b/include/aidge/backend/cpu/operator/AbsImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..8233d47c4d1e2dc7bf724600ec083bcaa0d667e9
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/AbsImpl.hpp
@@ -0,0 +1,31 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_ABSIMPL_H_
+#define AIDGE_CPU_OPERATOR_ABSIMPL_H_
+
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
+#include "aidge/operator/Abs.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+#include <memory>
+#include <vector>
+
+namespace Aidge {
+// Operator implementation entry point for the backend
+using AbsImpl_cpu = OperatorImpl_cpu<Abs_Op,
+    void(const std::size_t, const void*, void*)>;
+
+// Implementation entry point registration to Operator
+REGISTRAR(Abs_Op, "cpu", Aidge::AbsImpl_cpu::create);
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_ABSIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/TanhImpl_forward_kernels.hpp b/include/aidge/backend/cpu/operator/AbsImpl_kernels.hpp
similarity index 50%
rename from include/aidge/backend/cpu/operator/TanhImpl_forward_kernels.hpp
rename to include/aidge/backend/cpu/operator/AbsImpl_kernels.hpp
index 9e57b6dfcb0da322f5b21944fb10ec7a10cd0ab8..16e5f9dee26a6f8b760e14a1ad66a40d8f0f7e93 100644
--- a/include/aidge/backend/cpu/operator/TanhImpl_forward_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/AbsImpl_kernels.hpp
@@ -9,34 +9,39 @@
  *
  ********************************************************************************/
 
-#ifndef AIDGE_CPU_OPERATOR_TANHIMPL_FORWARD_KERNEL_H_
-#define AIDGE_CPU_OPERATOR_TANHIMPL_FORWARD_KERNEL_H_
+#ifndef AIDGE_CPU_OPERATOR_ABSIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_ABSIMPL_KERNELS_H_
+
+#include <cmath>
 
 #include "aidge/utils/Registrar.hpp"
 
-#include "aidge/backend/cpu/operator/TanhImpl.hpp"
+#include "aidge/backend/cpu/operator/AbsImpl.hpp"
 
 namespace Aidge {
 template <class I, class O>
-void TanhImpl_cpu_forward_kernel(std::size_t inputLenght,
+void AbsImpl_cpu_forward_kernel(std::size_t inputLenght,
                                      const void* input_,
                                      void* output_) {
 
     const I* input = static_cast<const I*>(input_);
     O* output = static_cast<O*>(output_);
 
-//#pragma omp parallel for if (inputLenght > 1024)
     for (std::size_t i = 0; i < inputLenght; ++i) {
-        output[i] = std::tanh(input[i]);
+        output[i] = std::abs(input[i]);
     }
 }
 
-namespace {
-static Registrar<TanhImplForward_cpu> registrarTanhImplForward_cpu_Float32(
-        {DataType::Float32, DataType::Float32}, Aidge::TanhImpl_cpu_forward_kernel<float, float>);
-static Registrar<TanhImplForward_cpu> registrarTanhImplForward_cpu_Float64(
-        {DataType::Float64, DataType::Float64}, Aidge::TanhImpl_cpu_forward_kernel<double, double>);
-}  // namespace
+// Kernels registration to implementation entry point
+REGISTRAR(AbsImpl_cpu,
+    {DataType::Float32},
+    {ProdConso::inPlaceModel, Aidge::AbsImpl_cpu_forward_kernel<float, float>, nullptr});
+REGISTRAR(AbsImpl_cpu,
+    {DataType::Float64},
+    {ProdConso::inPlaceModel, Aidge::AbsImpl_cpu_forward_kernel<double, double>, nullptr});
+REGISTRAR(AbsImpl_cpu,
+    {DataType::Int32},
+    {ProdConso::inPlaceModel, Aidge::AbsImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr});
 }  // namespace Aidge
 
-#endif /* AIDGE_CPU_OPERATOR_TANHIMPL_FORWARD_KERNEL_H_ */
+#endif /* AIDGE_CPU_OPERATOR_ABSIMPL_KERNELS_H_ */
diff --git a/include/aidge/backend/cpu/operator/AddImpl.hpp b/include/aidge/backend/cpu/operator/AddImpl.hpp
index 7a1497a2f4a2ae0e6005897ae504502505bbe60a..5e795922a67be178dde588e8e5e346ec268efe86 100644
--- a/include/aidge/backend/cpu/operator/AddImpl.hpp
+++ b/include/aidge/backend/cpu/operator/AddImpl.hpp
@@ -17,36 +17,18 @@
 #include <string>
 #include <vector>
 
-#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
 #include "aidge/operator/Add.hpp"
 #include "aidge/utils/Registrar.hpp"
 #include "aidge/utils/Types.h"
 
 namespace Aidge {
+// Operator implementation entry point for the backend
+using AddImpl_cpu = OperatorImpl_cpu<Add_Op,
+    void(const std::vector<const void*>, const std::vector<std::vector<std::size_t>>&, const std::size_t, const std::vector<std::size_t>&, void*)>;
 
-// compute kernel registry for forward and backward
-class AddImplForward_cpu
-    : public Registrable<AddImplForward_cpu, std::tuple<DataType, DataType>, void(const std::vector<const void*>, const std::vector<std::vector<std::size_t>>&, const std::size_t, const std::vector<std::size_t>&, void*)> {};
-
-class AddImplBackward_cpu
-    : public Registrable<AddImplBackward_cpu, std::tuple<DataType, DataType>, void(const std::vector<const void*>, const std::vector<std::vector<std::size_t>>&, const std::size_t, const std::vector<std::size_t>&, void*)> {};
-
-
-class AddImpl_cpu : public OperatorImpl {
-public:
-    AddImpl_cpu(const Add_Op& op) : OperatorImpl(op, "cpu") {}
-
-    static std::unique_ptr<AddImpl_cpu> create(const Add_Op& op) {
-        return std::make_unique<AddImpl_cpu>(op);
-    }
-
-    Elts_t getNbRequiredProtected(const IOIndex_t /*inputIdx*/) const override final;
-    void forward() override;
-};
-
-namespace {
-static Registrar<Add_Op> registrarAddImpl_cpu("cpu", Aidge::AddImpl_cpu::create);
-}  // namespace
+// Implementation entry point registration to Operator
+REGISTRAR(Add_Op, "cpu", Aidge::AddImpl_cpu::create);
 }  // namespace Aidge
 
 #endif /* AIDGE_CPU_OPERATOR_ADDIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/AddImpl_forward_kernels.hpp b/include/aidge/backend/cpu/operator/AddImpl_kernels.hpp
similarity index 60%
rename from include/aidge/backend/cpu/operator/AddImpl_forward_kernels.hpp
rename to include/aidge/backend/cpu/operator/AddImpl_kernels.hpp
index 94b22dcc7fc8251f8ca907ab0b060b0275309c9d..4a4ba2a8999c4dc33fc743b5a3a7dad023f9e0dd 100644
--- a/include/aidge/backend/cpu/operator/AddImpl_forward_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/AddImpl_kernels.hpp
@@ -9,8 +9,8 @@
  *
  ********************************************************************************/
 
-#ifndef AIDGE_CPU_OPERATOR_ADDIMPL_FORWARD_KERNEL_H_
-#define AIDGE_CPU_OPERATOR_ADDIMPL_FORWARD_KERNEL_H_
+#ifndef AIDGE_CPU_OPERATOR_ADDIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_ADDIMPL_KERNELS_H_
 
 #include "aidge/utils/Registrar.hpp"
 
@@ -41,16 +41,19 @@ void AddImpl_cpu_forward_kernel(const std::vector<const void*> inputs_, const st
 	}
 }
 
-namespace {
-static Registrar<AddImplForward_cpu> registrarAddImplForward_cpu_Float32(
-        {DataType::Float32, DataType::Float32}, Aidge::AddImpl_cpu_forward_kernel<float, float>);
-static Registrar<AddImplForward_cpu> registrarAddImplForward_cpu_Float64(
-        {DataType::Float64, DataType::Float64}, Aidge::AddImpl_cpu_forward_kernel<double, double>);
-static Registrar<AddImplForward_cpu> registrarAddImplForward_cpu_Int32(
-        {DataType::Int32, DataType::Int32}, Aidge::AddImpl_cpu_forward_kernel<std::int32_t, std::int32_t>);
-static Registrar<AddImplForward_cpu> registrarAddImplForward_cpu_Int64(
-        {DataType::Int64, DataType::Int64}, Aidge::AddImpl_cpu_forward_kernel<std::int64_t, std::int64_t>);
-}  // namespace
+// Kernels registration to implementation entry point
+REGISTRAR(AddImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float32}},
+    {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<float, float>, nullptr});
+REGISTRAR(AddImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float64}},
+    {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<double, double>, nullptr});
+REGISTRAR(AddImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int32}},
+    {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr});
+REGISTRAR(AddImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int64}},
+    {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, nullptr});
 }  // namespace Aidge
 
-#endif /* AIDGE_CPU_OPERATOR_ADDIMPL_CPU_FORWARD_KERNEL_H_ */
\ No newline at end of file
+#endif /* AIDGE_CPU_OPERATOR_ADDIMPL_CPU_KERNELS_H_ */
\ No newline at end of file
diff --git a/include/aidge/backend/cpu/operator/AndImpl.hpp b/include/aidge/backend/cpu/operator/AndImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..316a2fb922596642088d133a7fec49c988739bb7
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/AndImpl.hpp
@@ -0,0 +1,32 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_ANDIMPL_H_
+#define AIDGE_CPU_OPERATOR_ANDIMPL_H_
+
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
+#include "aidge/operator/And.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+#include <memory>
+#include <vector>
+
+namespace Aidge {
+// Operator implementation entry point for the backend
+using AndImpl_cpu = OperatorImpl_cpu<And_Op,
+    void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*,void*)>;
+
+// Implementation entry point registration to Operator
+REGISTRAR(And_Op, "cpu", Aidge::AndImpl_cpu::create);
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_ANDIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/PowImpl_forward_kernels.hpp b/include/aidge/backend/cpu/operator/AndImpl_kernels.hpp
similarity index 53%
rename from include/aidge/backend/cpu/operator/PowImpl_forward_kernels.hpp
rename to include/aidge/backend/cpu/operator/AndImpl_kernels.hpp
index 1146cfa77464f8bd1c33a0ec0113415dcf599b53..197e829f3527ce2f36c3ef5ee812a26477633703 100644
--- a/include/aidge/backend/cpu/operator/PowImpl_forward_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/AndImpl_kernels.hpp
@@ -1,63 +1,63 @@
-/********************************************************************************
- * Copyright (c) 2023 CEA-List
- *
- * This program and the accompanying materials are made available under the
- * terms of the Eclipse Public License 2.0 which is available at
- * http://www.eclipse.org/legal/epl-2.0.
- *
- * SPDX-License-Identifier: EPL-2.0
- *
- ********************************************************************************/
-
-#ifndef AIDGE_CPU_OPERATOR_POWIMPL_FORWARD_KERNEL_H_
-#define AIDGE_CPU_OPERATOR_POWIMPL_FORWARD_KERNEL_H_
-
-#include "aidge/utils/Registrar.hpp"
-#include <cmath>
-
-#include "aidge/backend/cpu/data/Broadcasting.hpp"
-#include "aidge/backend/cpu/operator/PowImpl.hpp"
-
-namespace Aidge {
-template <class I1, class I2, class O>
-void PowImpl_cpu_forward_kernel(const std::vector<std::size_t>& input1Dims,
-                                const std::vector<std::size_t>& input2Dims,
-                                const std::vector<std::size_t>& outputDims,
-                                const void* input1_,
-                                const void* input2_,
-                                void* output_) {
-
-    const I1* input_1 = static_cast<const I1*>(input1_);
-    const I2* input_2 = static_cast<const I2*>(input2_);
-    O* output = static_cast<O*>(output_);
-
-    size_t totalElements = 1;
-    for (size_t dimSize : outputDims) {
-        totalElements *= dimSize;
-    }
-
-	for (std::size_t oIndex = 0; oIndex < totalElements; ++oIndex) 
-	{
-		std::vector<size_t> indexes = getMultiDimIndices(outputDims, oIndex);
-
-		std::size_t idx1 = getFlattenedIndex(input1Dims, indexes);
-		std::size_t idx2 = getFlattenedIndex(input2Dims, indexes);
-		
-        output[oIndex] = std::pow(input_1[idx1], input_2[idx2]);
-	}
-}
-
-namespace {
-static Registrar<PowImplForward_cpu> registrarPowImplForward_cpu_Float32(
-        {DataType::Float32, DataType::Float32, DataType::Float32},
-        Aidge::PowImpl_cpu_forward_kernel<float, float, float>);
-static Registrar<PowImplForward_cpu> registrarPowImplForward_cpu_Int32(
-        {DataType::Int32, DataType::Int32, DataType::Int32},
-        Aidge::PowImpl_cpu_forward_kernel<int, int, int>);
-static Registrar<PowImplForward_cpu> registrarPowImplForward_cpu_Float64(
-        {DataType::Float64, DataType::Float64, DataType::Float64},
-        Aidge::PowImpl_cpu_forward_kernel<double, double, double>);
-}  // namespace
-}  // namespace Aidge
-
-#endif /* AIDGE_CPU_OPERATOR_POWIMPL_FORWARD_KERNEL_H_ */
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_ANDIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_ANDIMPL_KERNELS_H_
+
+#include "aidge/backend/cpu/data/Broadcasting.hpp"
+#include "aidge/backend/cpu/operator/AndImpl.hpp"
+#include "aidge/utils/Registrar.hpp"
+
+namespace Aidge {
+template <class I1, class I2, class O>
+void AndImpl_cpu_forward_kernel(const std::vector<std::size_t>& input1Dims,
+                                const std::vector<std::size_t>& input2Dims,
+                                const std::vector<std::size_t>& outputDims,
+                                const void* input1_,
+                                const void* input2_,
+                                void* output_) {
+
+    const I1* input_1 = static_cast<const I1*>(input1_);
+    const I2* input_2 = static_cast<const I2*>(input2_);
+    O* output = static_cast<O*>(output_);
+
+    size_t totalElements = 1;
+    for (size_t dimSize : outputDims) {
+        totalElements *= dimSize;
+    }
+
+	for (std::size_t oIndex = 0; oIndex < totalElements; ++oIndex)
+	{
+		std::vector<size_t> indexes = getMultiDimIndices(outputDims, oIndex);
+
+		std::size_t idx1 = getFlattenedIndex(input1Dims, indexes);
+		std::size_t idx2 = getFlattenedIndex(input2Dims, indexes);
+
+        output[oIndex] = static_cast<O>(input_1[idx1] == input_2[idx2]);
+    }
+}
+
+// Kernels registration to implementation entry point
+REGISTRAR(AndImpl_cpu,
+    {DataType::Float32},
+    {ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<float, float, float>, nullptr});
+REGISTRAR(AndImpl_cpu,
+    {DataType::Float64},
+    {ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<double, double, double>, nullptr});
+REGISTRAR(AndImpl_cpu,
+    {DataType::Int32},
+    {ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t>, nullptr});
+REGISTRAR(AndImpl_cpu,
+    {DataType::Int64},
+    {ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<std::int64_t, std::int64_t, std::int64_t>, nullptr});
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_ANDIMPL_KERNELS_H_ */
diff --git a/include/aidge/backend/cpu/operator/ArgMaxImpl.hpp b/include/aidge/backend/cpu/operator/ArgMaxImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b1a2d5168013e4f9595f4275b98143cfc3509629
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/ArgMaxImpl.hpp
@@ -0,0 +1,38 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_ARGMAXIMPL_H_
+#define AIDGE_CPU_OPERATOR_ARGMAXIMPL_H_
+
+#include <array>
+#include <memory>
+#include <tuple>
+#include <vector>
+
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
+#include "aidge/operator/ArgMax.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+namespace Aidge {
+// Operator implementation entry point for the backend
+using ArgMaxImpl_cpu = OperatorImpl_cpu<ArgMax_Op,
+    void(std::int32_t,
+        DimSize_t,
+        const std::vector<DimSize_t>&,
+        const void *,
+        void *)>;
+
+// Implementation entry point registration to Operator
+REGISTRAR(ArgMax_Op, "cpu", Aidge::ArgMaxImpl_cpu::create);
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_ARGMAXIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/ArgMaxImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ArgMaxImpl_kernels.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..1bedec701766fc59fac233a1c400df1042369c5a
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/ArgMaxImpl_kernels.hpp
@@ -0,0 +1,87 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_ARGMAXIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_ARGMAXIMPL_KERNELS_H_
+
+#include <algorithm>   // std::for_each
+#include <cstddef>     // std::size_t
+#include <cstdint>     // std::int32_t
+#include <functional>  //std::multiplies
+#include <numeric>     //std::accumulate
+#include <vector>
+#include <limits>
+
+#include "aidge/backend/cpu/operator/ArgMaxImpl.hpp"
+#include "aidge/data/Data.hpp"
+#include "aidge/operator/ArgMax.hpp"
+#include "aidge/utils/Registrar.hpp"
+
+namespace Aidge {
+template <class I, class O>
+void ArgMaxImpl_cpu_forward_kernel(std::int32_t axis_,
+                                    DimSize_t select_last_index,
+                                    const std::vector<DimSize_t>& inputDims,
+                                    const void* input_,
+                                    void* output_) {
+
+    const I* input = static_cast<const I*>(input_);
+    O* output = static_cast<O*>(output_);
+
+    const std::size_t axis = static_cast<std::size_t>(axis_);
+
+    std::size_t stride_post = 1;
+    for (std::size_t i = axis + 1; i < inputDims.size(); ++i) {
+        stride_post *= inputDims[i];
+    }
+    std::size_t stride_pre = 1;
+    for (std::size_t i = 0; i < axis; ++i) {
+        stride_pre *= inputDims[i];
+    }
+    const std::size_t dim_i = inputDims[axis];
+    for (std::size_t pre = 0; pre < stride_pre; ++pre) {
+        for (std::size_t post = 0; post < stride_post; ++post) {
+            const std::size_t idx_i = pre * dim_i * stride_post + post;
+            const std::size_t idx_o = pre * stride_post + post;
+            I max = std::numeric_limits<I>::lowest(); // lowest(), not min(): for floating types min() is the smallest positive value, which breaks all-negative inputs
+            for (std::size_t i = 0; i < dim_i; ++i) {
+                I curr_value = input[idx_i + i*stride_post];
+                if (select_last_index) {
+                    if (curr_value>=max) {
+                        output[idx_o] = i;
+                        max = curr_value;
+                    }
+                }
+                else {
+                    if (curr_value > max) {
+                        output[idx_o] = i;
+                        max = curr_value;
+                    }
+                }
+            }
+        }
+    }
+
+}
+
+// Kernels registration to implementation entry point
+REGISTRAR(ArgMaxImpl_cpu,
+    {DataType::Float32},
+    {ProdConso::defaultModel, Aidge::ArgMaxImpl_cpu_forward_kernel<float, float>, nullptr});
+REGISTRAR(ArgMaxImpl_cpu,
+    {DataType::Float64},
+    {ProdConso::defaultModel, Aidge::ArgMaxImpl_cpu_forward_kernel<double, double>, nullptr});
+REGISTRAR(ArgMaxImpl_cpu,
+    {DataType::Int32},
+    {ProdConso::defaultModel, Aidge::ArgMaxImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr});
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_ARGMAXIMPL_KERNELS_H_ */
diff --git a/include/aidge/backend/cpu/operator/AvgPoolingImpl.hpp b/include/aidge/backend/cpu/operator/AvgPoolingImpl.hpp
index 12a5dc334619c16e6ad3a77f0cd76f4db7a87b77..adea96ca43a1ad9d2a49777426913ca4676e4f32 100644
--- a/include/aidge/backend/cpu/operator/AvgPoolingImpl.hpp
+++ b/include/aidge/backend/cpu/operator/AvgPoolingImpl.hpp
@@ -17,49 +17,24 @@
 #include <tuple>
 #include <vector>
 
-#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
 #include "aidge/operator/AvgPooling.hpp"
 #include "aidge/utils/Registrar.hpp"
 #include "aidge/utils/Types.h"
 #include "aidge/backend/cpu/data/GetCPUPtr.h"
 
 namespace Aidge {
-// class AvgPooling_Op;
-
-// compute kernel registry for forward and backward
-class AvgPoolingImpl2DForward_cpu
-    : public Registrable<AvgPoolingImpl2DForward_cpu,
-                        std::tuple<DataType, DataType>,
-                        void(const std::array<DimSize_t, 2>&,
-                            const std::array<DimSize_t, 2>&,
-                            const std::array<DimSize_t, 4>&,
-                            const void *,
-                            void *)> {};
-class AvgPoolingImpl2DBackward_cpu
-    : public Registrable<AvgPoolingImpl2DBackward_cpu,
-                        std::tuple<DataType, DataType>,
-                        void(const std::array<DimSize_t, 2>&,
-                            const std::array<DimSize_t, 2>&,
-                            const std::array<DimSize_t, 4>&,
-                            const void *,
-                            void *)> {};
-
-class AvgPoolingImpl2D_cpu : public OperatorImpl {
-public:
-    AvgPoolingImpl2D_cpu(const AvgPooling_Op<2> &op) : OperatorImpl(op, "cpu") {}
-
-    static std::unique_ptr<AvgPoolingImpl2D_cpu> create(const AvgPooling_Op<2> &op) {
-        return std::make_unique<AvgPoolingImpl2D_cpu>(op);
-    }
-
-    Elts_t getNbRequiredProtected(const IOIndex_t inputIdx) const override final;
-    void forward() override;
-};
-
-namespace {
-// add cpu backend to AvgPooling_Op<2> implementation registry
-static Registrar<AvgPooling_Op<2>> registrarAvgPoolingImpl2D_cpu("cpu", Aidge::AvgPoolingImpl2D_cpu::create);
-}  // namespace
+// Operator implementation entry point for the backend
+using AvgPooling2D_Op = AvgPooling_Op<2>;
+using AvgPoolingImpl2D_cpu = OperatorImpl_cpu<AvgPooling_Op<2>,
+    void(const std::array<DimSize_t, 2>&,
+        const std::array<DimSize_t, 2>&,
+        const std::array<DimSize_t, 4>&,
+        const void *,
+        void *)>;
+
+// Implementation entry point registration to Operator
+REGISTRAR(AvgPooling2D_Op, "cpu", Aidge::AvgPoolingImpl2D_cpu::create);
 }  // namespace Aidge
 
 #endif /* AIDGE_CPU_OPERATOR_AVGPOOLINGIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/AvgPoolingImpl_forward_kernels.hpp b/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp
similarity index 85%
rename from include/aidge/backend/cpu/operator/AvgPoolingImpl_forward_kernels.hpp
rename to include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp
index c7d9f86235c3bf1d7d01cf429cab29d156592fb5..f6da9dcb026101b93de862499d42ae8734532d52 100644
--- a/include/aidge/backend/cpu/operator/AvgPoolingImpl_forward_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp
@@ -9,8 +9,8 @@
  *
  ********************************************************************************/
 
-#ifndef AIDGE_CPU_OPERATOR_AVGPOOLINGIMPL_FORWARD_KERNEL_H_
-#define AIDGE_CPU_OPERATOR_AVGPOOLINGIMPL_FORWARD_KERNEL_H_
+#ifndef AIDGE_CPU_OPERATOR_AVGPOOLINGIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_AVGPOOLINGIMPL_KERNELS_H_
 
 #include <array>
 #include <tuple>
@@ -101,17 +101,16 @@ void AvgPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideD
     }
 }
 
-namespace {
-static Registrar<AvgPoolingImpl2DForward_cpu> registrarAvgPoolingImpl2DForward_cpu_Float32(
-        std::tuple<DataType, DataType>({DataType::Float32, DataType::Float32}),
-        Aidge::AvgPoolingImpl2D_cpu_forward_kernel<float, float>);
-static Registrar<AvgPoolingImpl2DForward_cpu> registrarAvgPoolingImpl2DForward_cpu_Int32(
-        {DataType::Int32, DataType::Int32},
-        Aidge::AvgPoolingImpl2D_cpu_forward_kernel<int, int>);
-static Registrar<AvgPoolingImpl2DForward_cpu> registrarAvgPoolingImpl2DForward_cpu_Float64(
-        {DataType::Float64, DataType::Float64},
-        Aidge::AvgPoolingImpl2D_cpu_forward_kernel<double, double>);
-}  // namespace
+// Kernels registration to implementation entry point
+REGISTRAR(AvgPoolingImpl2D_cpu,
+    {{DataType::Float32, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}},
+    {ProdConso::inPlaceModel, Aidge::AvgPoolingImpl2D_cpu_forward_kernel<float, float>, nullptr});
+REGISTRAR(AvgPoolingImpl2D_cpu,
+    {{DataType::Int32, DataFormat::NCHW}, {DataType::Int32, DataFormat::NCHW}},
+    {ProdConso::inPlaceModel, Aidge::AvgPoolingImpl2D_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr});
+REGISTRAR(AvgPoolingImpl2D_cpu,
+    {{DataType::Float64, DataFormat::NCHW}, {DataType::Float64, DataFormat::NCHW}},
+    {ProdConso::inPlaceModel, Aidge::AvgPoolingImpl2D_cpu_forward_kernel<double, double>, nullptr});
 }  // namespace Aidge
 
-#endif /* AIDGE_CPU_OPERATOR_AVGPOOLINGIMPL_FORWARD_KERNEL_H_ */
+#endif /* AIDGE_CPU_OPERATOR_AVGPOOLINGIMPL_KERNELS_H_ */
diff --git a/include/aidge/backend/cpu/operator/BatchNormImpl.hpp b/include/aidge/backend/cpu/operator/BatchNormImpl.hpp
index 93bdab2d3f37e3bd8dc1e68ab68a05de8c8015ed..36a100b21edc6cd63a0176c89f2f1e57c10001c7 100644
--- a/include/aidge/backend/cpu/operator/BatchNormImpl.hpp
+++ b/include/aidge/backend/cpu/operator/BatchNormImpl.hpp
@@ -17,58 +17,29 @@
 #include <tuple>
 #include <vector>
 
-#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
 #include "aidge/operator/BatchNorm.hpp"
 #include "aidge/utils/Registrar.hpp"
 #include "aidge/utils/Types.h"
 #include "aidge/backend/cpu/data/GetCPUPtr.h"
 
 namespace Aidge {
-// class BatchNorm_Op;
-
-// compute kernel registry for forward and backward
-class BatchNormImpl2DForward_cpu
-    : public Registrable<BatchNormImpl2DForward_cpu,
-                         std::tuple<DataType, DataType, DataType>,
-                         void(float,
-                            float,
-                            const std::array<DimSize_t, 4> &,
-                            const void *,
-                            const void *,
-                            const void *,
-                            void *,
-                            void *,
-                            void *,
-                            const bool)> {};
-class BatchNormImpl2DBackward_cpu
-    : public Registrable<BatchNormImpl2DBackward_cpu,
-                         std::tuple<DataType, DataType, DataType>,
-                         void(float,
-                            float,
-                            const std::array<DimSize_t, 4> &,
-                            const void *,
-                            const void *,
-                            const void *,
-                            void *,
-                            void *,
-                            void *)> {};
-
-class BatchNormImpl2D_cpu : public OperatorImpl {
-public:
-    BatchNormImpl2D_cpu(const BatchNorm_Op<2> &op) : OperatorImpl(op, "cpu") {}
-
-    static std::unique_ptr<BatchNormImpl2D_cpu> create(const BatchNorm_Op<2> &op) {
-        return std::make_unique<BatchNormImpl2D_cpu>(op);
-    }
-
-    Elts_t getNbRequiredProtected(const IOIndex_t inputIdx) const override final;
-    void forward() override;
-};
-
-namespace {
-// add cpu backend to BatchNorm_Op<2> implementation registry
-static Registrar<BatchNorm_Op<2>> registrarBatchNormImpl2D_cpu("cpu", Aidge::BatchNormImpl2D_cpu::create);
-}  // namespace
+// Operator implementation entry point for the backend
+using BatchNorm2D_Op = BatchNorm_Op<2>;
+using BatchNormImpl2D_cpu = OperatorImpl_cpu<BatchNorm_Op<2>,
+    void(float,
+        float,
+        const std::array<DimSize_t, 4> &,
+        const void *,
+        const void *,
+        const void *,
+        void *,
+        void *,
+        void *,
+        const bool)>;
+
+// Implementation entry point registration to Operator
+REGISTRAR(BatchNorm2D_Op, "cpu", Aidge::BatchNormImpl2D_cpu::create);
 }  // namespace Aidge
 
 #endif /* AIDGE_CPU_OPERATOR_BATCHNORMIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/BatchNormImpl_forward_kernels.hpp b/include/aidge/backend/cpu/operator/BatchNormImpl_kernels.hpp
similarity index 90%
rename from include/aidge/backend/cpu/operator/BatchNormImpl_forward_kernels.hpp
rename to include/aidge/backend/cpu/operator/BatchNormImpl_kernels.hpp
index 19f232a783bccf0a800d41f2bc566ccf6e04f05e..ec71e3b8e37e344c551fd643dc7b3957bdddcb67 100644
--- a/include/aidge/backend/cpu/operator/BatchNormImpl_forward_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/BatchNormImpl_kernels.hpp
@@ -9,8 +9,8 @@
  *
  ********************************************************************************/
 
-#ifndef AIDGE_CPU_OPERATOR_BATCHNORMIMPL_FORWARD_KERNEL_H_
-#define AIDGE_CPU_OPERATOR_BATCHNORMIMPL_FORWARD_KERNEL_H_
+#ifndef AIDGE_CPU_OPERATOR_BATCHNORMIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_BATCHNORMIMPL_KERNELS_H_
 
 #include "aidge/utils/Registrar.hpp"
 
@@ -96,15 +96,10 @@ void BatchNormImpl2D_cpu_forward_kernel(float epsilon, float momentum, const std
     }
 }
 
-
-
-
-
-namespace {
-static Registrar<BatchNormImpl2DForward_cpu> registrarBatchNormImpl2DForward_cpu_Float32(
-        {DataType::Float32, DataType::Float32, DataType::Float32},
-        Aidge::BatchNormImpl2D_cpu_forward_kernel<float, float, float>);
-}  // namespace
+// Kernels registration to implementation entry point
+REGISTRAR(BatchNormImpl2D_cpu,
+    {{DataType::Float32, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}},
+    {ProdConso::inPlaceModel, Aidge::BatchNormImpl2D_cpu_forward_kernel<float, float, float>, nullptr});
 }  // namespace Aidge
 
-#endif /* AIDGE_CPU_OPERATOR_BATCHNORMIMPL_FORWARD_KERNEL_H_ */
+#endif /* AIDGE_CPU_OPERATOR_BATCHNORMIMPL_KERNELS_H_ */
diff --git a/include/aidge/backend/cpu/operator/BitShiftImpl.hpp b/include/aidge/backend/cpu/operator/BitShiftImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..6da67bb7dd4469b6ca609c5aea1ae70dfca3f939
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/BitShiftImpl.hpp
@@ -0,0 +1,38 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_BITSHIFTIMPL_H_
+#define AIDGE_CPU_OPERATOR_BITSHIFTIMPL_H_
+
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
+#include "aidge/operator/BitShift.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+#include <memory>
+#include <vector>
+
+namespace Aidge {
+// Operator implementation entry point for the backend
+using BitShiftImpl_cpu = OperatorImpl_cpu<BitShift_Op,
+    void(const BitShift_Op::BitShiftDirection,
+    const std::vector<std::size_t>&, 
+    const std::vector<std::size_t>&, 
+    const std::vector<std::size_t>&, 
+    const void*, 
+    const void*,
+    void*)>;
+    
+    // Implementation entry point registration to Operator
+    REGISTRAR(BitShift_Op,"cpu",Aidge::BitShiftImpl_cpu::create);
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_BITSHIFTIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/BitShiftImpl_kernels.hpp b/include/aidge/backend/cpu/operator/BitShiftImpl_kernels.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f815e946ea2e4abaff48a6e5155368d564e88e8c
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/BitShiftImpl_kernels.hpp
@@ -0,0 +1,70 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_BITSHIFTIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_BITSHIFTIMPL_KERNELS_H_
+
+#include "aidge/utils/Registrar.hpp"
+
+#include <cstdint>     // std::int32_t, std::int64_t
+#include "aidge/operator/BitShift.hpp"
+
+#include "aidge/backend/cpu/data/Broadcasting.hpp"
+#include "aidge/backend/cpu/operator/BitShiftImpl.hpp"
+
+
+
+namespace Aidge {
+template <class I1, class I2, class O>
+void BitShiftImpl_cpu_forward_kernel(
+                                const BitShift_Op::BitShiftDirection direction,
+                                const std::vector<std::size_t>& input1Dims,
+                                const std::vector<std::size_t>& input2Dims,
+                                const std::vector<std::size_t>& outputDims,
+                                const void* input1_,
+                                const void* input2_,
+                                void* output_
+                                ) {
+
+    const I1* input_1 = static_cast<const I1*>(input1_);
+    const I2* input_2 = static_cast<const I2*>(input2_);
+    O* output = static_cast<O*>(output_);
+
+    const size_t totalElements = std::accumulate(outputDims.begin(), outputDims.end(), std::size_t(1), std::multiplies<std::size_t>());
+    
+    for (std::size_t oIndex = 0; oIndex < totalElements; ++oIndex)
+    {
+        std::vector<size_t> indexes = getMultiDimIndices(outputDims, oIndex);
+        std::size_t idx1 = getFlattenedIndex(input1Dims, indexes);
+        std::size_t idx2 = getFlattenedIndex(input2Dims, indexes);
+        if(direction == BitShift_Op::BitShiftDirection::right)
+
+        {
+                output[oIndex]= input_1[idx1] >> input_2[idx2];
+        }
+        else
+        {
+                output[oIndex] = input_1[idx1] << input_2[idx2];
+        }
+    }
+}
+
+REGISTRAR(BitShiftImpl_cpu,
+{DataType::Int32},
+{ProdConso::inPlaceModel,Aidge::BitShiftImpl_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t>,nullptr});
+REGISTRAR(BitShiftImpl_cpu,
+{DataType::Int64},
+{ProdConso::inPlaceModel,Aidge::BitShiftImpl_cpu_forward_kernel<std::int64_t, std::int64_t, std::int64_t>,nullptr});
+
+
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_BITSHIFTIMPL_KERNELS_H_ */
\ No newline at end of file
diff --git a/include/aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp b/include/aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..83e7e030f526e0db3cff4741eabe39e287130562
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp
@@ -0,0 +1,34 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_CONSTANTOFSHAPEIMPL_H_
+#define AIDGE_CPU_OPERATOR_CONSTANTOFSHAPEIMPL_H_
+
+#include <cstddef>
+#include <memory>
+#include <vector>
+
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
+#include "aidge/operator/ConstantOfShape.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+namespace Aidge {
+// Operator implementation entry point for the backend
+using ConstantOfShapeImpl_cpu = OperatorImpl_cpu<ConstantOfShape_Op,
+    void(const std::vector<DimSize_t>, const Tensor&, void *)>;
+
+// Implementation entry point registration to Operator
+REGISTRAR(ConstantOfShape_Op, "cpu", Aidge::ConstantOfShapeImpl_cpu::create);
+} // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_CONSTANTOFSHAPEIMPL_H_ */
+
diff --git a/include/aidge/backend/cpu/operator/ConstantOfShapeImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ConstantOfShapeImpl_kernels.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..18ab9c0a77c4545c955fc4fe1f1fc1cbcb763bf7
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/ConstantOfShapeImpl_kernels.hpp
@@ -0,0 +1,71 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_CONSTANTOFSHAPEIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_CONSTANTOFSHAPEIMPL_KERNELS_H_
+
+#include <aidge/data/Tensor.hpp>
+#include <aidge/data/half.hpp>
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <functional> // std::multiplies
+#include <numeric>    // std::accumulate
+#include <vector>
+
+#include "aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp"
+#include "aidge/data/Data.hpp"
+#include "aidge/utils/ErrorHandling.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+namespace Aidge {
+template <class O>
+void ConstantOfShapeimpl_cpu_forward_kernel(
+    const std::vector<DimSize_t> output_dims, const Tensor &value,
+    void *output_) {
+
+  O *output = static_cast<O *>(output_);
+  O val;
+  std::copy(static_cast<O *>(value.getImpl()->hostPtr()),
+            static_cast<O *>(value.getImpl()->hostPtr()) +
+                static_cast<NbElts_t>(1),
+            &val);
+  const size_t output_size = std::accumulate(
+      output_dims.begin(), output_dims.end(), 1, std::multiplies<DimSize_t>());
+  for (size_t i = 0; i < output_size; ++i) {
+    output[i] = val;
+  }
+}
+
+// Kernels registration to implementation entry point
+REGISTRAR(ConstantOfShapeImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Int64}, ImplSpec::IOSpec{DataType::Float16}},
+    {ProdConso::defaultModel, Aidge::ConstantOfShapeimpl_cpu_forward_kernel<half_float::half>, nullptr});
+REGISTRAR(ConstantOfShapeImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Int64}, ImplSpec::IOSpec{DataType::Float32}},
+    {ProdConso::defaultModel, Aidge::ConstantOfShapeimpl_cpu_forward_kernel<float>, nullptr});
+REGISTRAR(ConstantOfShapeImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Int64}, ImplSpec::IOSpec{DataType::Float64}},
+    {ProdConso::defaultModel, Aidge::ConstantOfShapeimpl_cpu_forward_kernel<double>, nullptr});
+REGISTRAR(ConstantOfShapeImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Int64}, ImplSpec::IOSpec{DataType::Int16}},
+    {ProdConso::defaultModel, Aidge::ConstantOfShapeimpl_cpu_forward_kernel<std::int16_t>, nullptr});
+REGISTRAR(ConstantOfShapeImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Int64}, ImplSpec::IOSpec{DataType::Int32}},
+    {ProdConso::defaultModel, Aidge::ConstantOfShapeimpl_cpu_forward_kernel<std::int32_t>, nullptr});
+REGISTRAR(ConstantOfShapeImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Int64}, ImplSpec::IOSpec{DataType::Int64}},
+    {ProdConso::defaultModel, Aidge::ConstantOfShapeimpl_cpu_forward_kernel<std::int64_t>, nullptr});
+} // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_CONSTANTOFSHAPEIMPL_KERNELS_H_ */
+
diff --git a/include/aidge/backend/cpu/operator/ConvDepthWiseImpl.hpp b/include/aidge/backend/cpu/operator/ConvDepthWiseImpl.hpp
index ec886a310dd2edc616ced6ee447665eab3ce301a..5b985accfb7b9778993b557524de7b60060ad437 100644
--- a/include/aidge/backend/cpu/operator/ConvDepthWiseImpl.hpp
+++ b/include/aidge/backend/cpu/operator/ConvDepthWiseImpl.hpp
@@ -17,85 +17,39 @@
 #include <tuple>
 #include <vector>
 
-#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
 #include "aidge/operator/ConvDepthWise.hpp"
 #include "aidge/utils/Registrar.hpp"
 #include "aidge/utils/Types.h"
 #include "aidge/backend/cpu/data/GetCPUPtr.h"
 
 namespace Aidge {
-// class ConvDepthWise_Op;
-// compute kernel registry for forward and backward
-class ConvDepthWiseImpl1DForward_cpu
-    : public Registrable<ConvDepthWiseImpl1DForward_cpu,
-                         std::tuple<DataType, DataType, DataType, DataType>,
-                         void(const std::array<DimSize_t, 1>&,
-                            const std::array<DimSize_t, 1>&,
-                            const std::array<DimSize_t, 1>&,
-                            const std::array<DimSize_t, 3>&,
-                            const void *,
-                            const void *,
-                            const void *,
-                            void *)> {};
-
-class ConvDepthWiseImpl1D_cpu : public OperatorImpl {
-public:
-    ConvDepthWiseImpl1D_cpu(const ConvDepthWise_Op<1> &op) : OperatorImpl(op, "cpu") {}
-
-    static std::unique_ptr<ConvDepthWiseImpl1D_cpu> create(const ConvDepthWise_Op<1> &op) {
-        return std::make_unique<ConvDepthWiseImpl1D_cpu>(op);
-    }
-
-    Elts_t getNbRequiredProtected(const IOIndex_t inputIdx) const override final;
-    void forward() override;
-};
-
-namespace {
-// add cpu backend to ConvDepthWise_Op<1> implementation registry
-static Registrar<ConvDepthWise_Op<1>> registrarConvDepthWiseImpl1D_cpu("cpu", Aidge::ConvDepthWiseImpl1D_cpu::create);
-}  // namespace
-
-// compute kernel registry for forward and backward
-class ConvDepthWiseImpl2DForward_cpu
-    : public Registrable<ConvDepthWiseImpl2DForward_cpu,
-                         std::tuple<DataType, DataType, DataType, DataType>,
-                         void(const std::array<DimSize_t, 2>&,
-                            const std::array<DimSize_t, 2>&,
-                            const std::array<DimSize_t, 2>&,
-                            const std::array<DimSize_t, 4> &,
-                            const void *,
-                            const void *,
-                            const void *,
-                            void *)> {};
-class ConvDepthWiseImpl2DBackward_cpu
-    : public Registrable<ConvDepthWiseImpl2DBackward_cpu,
-                         std::tuple<DataType, DataType, DataType, DataType>,
-                         void(const std::array<DimSize_t, 2>&,
-                            const std::array<DimSize_t, 2>&,
-                            const std::array<DimSize_t, 2>&,
-                            bool,
-                            const std::array<DimSize_t, 4> &,
-                            const void *,
-                            const void *,
-                            const void *,
-                            void *)> {};
-
-class ConvDepthWiseImpl2D_cpu : public OperatorImpl {
-public:
-    ConvDepthWiseImpl2D_cpu(const ConvDepthWise_Op<2> &op) : OperatorImpl(op, "cpu") {}
-
-    static std::unique_ptr<ConvDepthWiseImpl2D_cpu> create(const ConvDepthWise_Op<2> &op) {
-        return std::make_unique<ConvDepthWiseImpl2D_cpu>(op);
-    }
-
-    Elts_t getNbRequiredProtected(const IOIndex_t inputIdx) const override final;
-    void forward() override;
-};
-
-namespace {
-// add cpu backend to ConvDepthWise_Op<2> implementation registry
-static Registrar<ConvDepthWise_Op<2>> registrarConvDepthWiseImpl2D_cpu("cpu", Aidge::ConvDepthWiseImpl2D_cpu::create);
-}  // namespace
+// Operator implementation entry point for the backend
+using ConvDepthWise1D_Op = ConvDepthWise_Op<1>;
+using ConvDepthWiseImpl1D_cpu = OperatorImpl_cpu<ConvDepthWise_Op<1>,
+    void(const std::array<DimSize_t, 1>&,
+        const std::array<DimSize_t, 1>&,
+        const std::array<DimSize_t, 1>&,
+        const std::array<DimSize_t, 3>&,
+        const void *,
+        const void *,
+        const void *,
+        void *)>;
+
+using ConvDepthWise2D_Op = ConvDepthWise_Op<2>;
+using ConvDepthWiseImpl2D_cpu = OperatorImpl_cpu<ConvDepthWise_Op<2>,
+    void(const std::array<DimSize_t, 2>&,
+        const std::array<DimSize_t, 2>&,
+        const std::array<DimSize_t, 2>&,
+        const std::array<DimSize_t, 4> &,
+        const void *,
+        const void *,
+        const void *,
+        void *)>;
+
+// Implementation entry point registration to Operator
+REGISTRAR(ConvDepthWise1D_Op, "cpu", Aidge::ConvDepthWiseImpl1D_cpu::create);
+REGISTRAR(ConvDepthWise2D_Op, "cpu", Aidge::ConvDepthWiseImpl2D_cpu::create);
 }  // namespace Aidge
 
 #endif /* AIDGE_CPU_OPERATOR_CONVDEPTHWISEIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_forward_kernels.hpp b/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp
similarity index 83%
rename from include/aidge/backend/cpu/operator/ConvDepthWiseImpl_forward_kernels.hpp
rename to include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp
index a02aa672b92f089790ef1903af8b804f816f3baa..ff9bb148fa68d75e2d4b00804e13f063e3ca2cc0 100644
--- a/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_forward_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp
@@ -9,8 +9,8 @@
  *
  ********************************************************************************/
 
-#ifndef AIDGE_CPU_OPERATOR_CONVDEPTHWISEIMPL_FORWARD_KERNEL_H_
-#define AIDGE_CPU_OPERATOR_CONVDEPTHWISEIMPL_FORWARD_KERNEL_H_
+#ifndef AIDGE_CPU_OPERATOR_CONVDEPTHWISEIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_CONVDEPTHWISEIMPL_KERNELS_H_
 
 #include <algorithm>
 #include <array>
@@ -86,17 +86,16 @@ void ConvDepthWiseImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 1>& stri
     }
 }
 
-namespace {
-static Registrar<ConvDepthWiseImpl1DForward_cpu> registrarConvDepthWiseImpl1DForward_cpu_Float32(
-        {DataType::Float32, DataType::Float32, DataType::Float32, DataType::Float32},
-        Aidge::ConvDepthWiseImpl1D_cpu_forward_kernel<float, float, float, float>);
-static Registrar<ConvDepthWiseImpl1DForward_cpu> registrarConvDepthWiseImpl1DForward_cpu_Int32(
-        {DataType::Int32, DataType::Int32, DataType::Int32, DataType::Int32},
-        Aidge::ConvDepthWiseImpl1D_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t, std::int32_t>);
-static Registrar<ConvDepthWiseImpl1DForward_cpu> registrarConvDepthWiseImpl1DForward_cpu_Float64(
-        {DataType::Float64, DataType::Float64, DataType::Float64, DataType::Float64},
-        Aidge::ConvDepthWiseImpl1D_cpu_forward_kernel<double, double, double, double>);
-}  // namespace
+// Kernels registration to implementation entry point
+REGISTRAR(ConvDepthWiseImpl1D_cpu,
+    {{DataType::Any, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}},
+    {ProdConso::inPlaceModel, Aidge::ConvDepthWiseImpl1D_cpu_forward_kernel<float, float, float, float>, nullptr});
+REGISTRAR(ConvDepthWiseImpl1D_cpu,
+    {{DataType::Any, DataFormat::NCHW}, {DataType::Int32, DataFormat::NCHW}},
+    {ProdConso::inPlaceModel, Aidge::ConvDepthWiseImpl1D_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t, std::int32_t>, nullptr});
+REGISTRAR(ConvDepthWiseImpl1D_cpu,
+    {{DataType::Any, DataFormat::NCHW}, {DataType::Float64, DataFormat::NCHW}},
+    {ProdConso::inPlaceModel, Aidge::ConvDepthWiseImpl1D_cpu_forward_kernel<double, double, double, double>, nullptr});
 
 
 /**
@@ -187,17 +186,16 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri
     }
 }
 
-namespace {
-static Registrar<ConvDepthWiseImpl2DForward_cpu> registrarConvDepthWiseImpl2DForward_cpu_Float32(
-        {DataType::Float32, DataType::Float32, DataType::Float32, DataType::Float32},
-        Aidge::ConvDepthWiseImpl2D_cpu_forward_kernel<float, float, float, float>);
-static Registrar<ConvDepthWiseImpl2DForward_cpu> registrarConvDepthWiseImpl2DForward_cpu_Int32(
-        {DataType::Int32, DataType::Int32, DataType::Int32, DataType::Int32},
-        Aidge::ConvDepthWiseImpl2D_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t, std::int32_t>);
-static Registrar<ConvDepthWiseImpl2DForward_cpu> registrarConvDepthWiseImpl2DForward_cpu_Float64(
-        {DataType::Float64, DataType::Float64, DataType::Float64, DataType::Float64},
-        Aidge::ConvDepthWiseImpl2D_cpu_forward_kernel<double, double, double, double>);
-}  // namespace
+// Kernels registration to implementation entry point
+REGISTRAR(ConvDepthWiseImpl2D_cpu,
+    {{DataType::Any, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}},
+    {ProdConso::inPlaceModel, Aidge::ConvDepthWiseImpl2D_cpu_forward_kernel<float, float, float, float>, nullptr});
+REGISTRAR(ConvDepthWiseImpl2D_cpu,
+    {{DataType::Any, DataFormat::NCHW}, {DataType::Int32, DataFormat::NCHW}},
+    {ProdConso::inPlaceModel, Aidge::ConvDepthWiseImpl2D_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t, std::int32_t>, nullptr});
+REGISTRAR(ConvDepthWiseImpl2D_cpu,
+    {{DataType::Any, DataFormat::NCHW}, {DataType::Float64, DataFormat::NCHW}},
+    {ProdConso::inPlaceModel, Aidge::ConvDepthWiseImpl2D_cpu_forward_kernel<double, double, double, double>, nullptr});
 }  // namespace Aidge
 
-#endif /* AIDGE_CPU_OPERATOR_CONVDEPTHWISEIMPL_FORWARD_KERNEL_H_ */
+#endif /* AIDGE_CPU_OPERATOR_CONVDEPTHWISEIMPL_KERNELS_H_ */
diff --git a/include/aidge/backend/cpu/operator/ConvImpl.hpp b/include/aidge/backend/cpu/operator/ConvImpl.hpp
index d7be46c251a82d1b631f4ad50e7175fa2f896d03..c06d0912f419909013f930867ce3c3238c1a5555 100644
--- a/include/aidge/backend/cpu/operator/ConvImpl.hpp
+++ b/include/aidge/backend/cpu/operator/ConvImpl.hpp
@@ -17,91 +17,41 @@
 #include <tuple>
 #include <vector>
 
-#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
 #include "aidge/operator/Conv.hpp"
 #include "aidge/utils/Registrar.hpp"
 #include "aidge/utils/Types.h"
 #include "aidge/backend/cpu/data/GetCPUPtr.h"
 
 namespace Aidge {
-// class Conv_Op;
-
-// compute kernel registry for forward and backward
-// Conv 1D
-class ConvImpl1DForward_cpu
-    : public Registrable<ConvImpl1DForward_cpu,
-                         std::tuple<DataType, DataType, DataType, DataType>,
-                         void(const std::array<DimSize_t, 1>&,
-                            const std::array<DimSize_t, 1>&,
-                            const std::array<DimSize_t, 1>&,
-                            const std::array<DimSize_t, 3> &,
-                            DimSize_t,
-                            const void *,
-                            const void *,
-                            const void *,
-                            void *)> {};
-
-class ConvImpl1D_cpu : public OperatorImpl {
-   public:
-    ConvImpl1D_cpu(const Conv_Op<1>& op) : OperatorImpl(op, "cpu") {}
-
-    static std::unique_ptr<ConvImpl1D_cpu> create(const Conv_Op<1> &op) {
-        return std::make_unique<ConvImpl1D_cpu>(op);
-    }
-
-   public:
-    Elts_t getNbRequiredProtected(const IOIndex_t inputIdx) const override final;
-    void forward() override;
-};
-
-namespace {
-// add cpu backend to Conv_Op<1> implementation registry
-static Registrar<Conv_Op<1>> registrarConvImpl1D_cpu("cpu", Aidge::ConvImpl1D_cpu::create);
-}  // namespace
-
-// Conv 2D
-class ConvImpl2DForward_cpu
-    : public Registrable<ConvImpl2DForward_cpu,
-                         std::tuple<DataType, DataType, DataType, DataType>,
-                         void(const std::array<DimSize_t, 2>&,
-                            const std::array<DimSize_t, 2>&,
-                            const std::array<DimSize_t, 2>&,
-                            const std::array<DimSize_t, 4> &,
-                            DimSize_t,
-                            const void *,
-                            const void *,
-                            const void *,
-                            void *)> {};
-class ConvImpl2DBackward_cpu
-    : public Registrable<ConvImpl2DBackward_cpu,
-                         std::tuple<DataType, DataType, DataType, DataType>,
-                         void(const std::array<DimSize_t, 2>&,
-                            const std::array<DimSize_t, 2>&,
-                            const std::array<DimSize_t, 2>&,
-                            bool,
-                            const std::array<DimSize_t, 4> &,
-                            const void *,
-                            const void *,
-                            const void *,
-                            void *)> {};
-
-class ConvImpl2D_cpu : public OperatorImpl {
-   public:
-    ConvImpl2D_cpu(const Conv_Op<2>& op) : OperatorImpl(op, "cpu") {}
-
-    static std::unique_ptr<ConvImpl2D_cpu> create(const Conv_Op<2> &op) {
-        return std::make_unique<ConvImpl2D_cpu>(op);
-    }
-
-   public:
-    Elts_t getNbRequiredProtected(const IOIndex_t inputIdx) const override final;
-    void forward() override;
-};
-
-namespace {
-// add cpu backend to Conv_Op<2> implementation registry
-static Registrar<Conv_Op<2>> registrarConvImpl2D_cpu("cpu", Aidge::ConvImpl2D_cpu::create);
-}  // namespace
+// Operator implementation entry point for the backend
+using Conv1D_Op = Conv_Op<1>;
+using ConvImpl1D_cpu = OperatorImpl_cpu<Conv_Op<1>,
+    void(const std::array<DimSize_t, 1>&,
+        const std::array<DimSize_t, 1>&,
+        const std::array<DimSize_t, 1>&,
+        const std::array<DimSize_t, 3> &,
+        DimSize_t,
+        const void *,
+        const void *,
+        const void *,
+        void *)>;
+
+using Conv2D_Op = Conv_Op<2>;
+using ConvImpl2D_cpu = OperatorImpl_cpu<Conv_Op<2>,
+    void(const std::array<DimSize_t, 2>&,
+        const std::array<DimSize_t, 2>&,
+        const std::array<DimSize_t, 2>&,
+        const std::array<DimSize_t, 4> &,
+        DimSize_t,
+        const void *,
+        const void *,
+        const void *,
+        void *)>;
+
+// Implementation entry point registration to Operator
+REGISTRAR(Conv1D_Op, "cpu", Aidge::ConvImpl1D_cpu::create);
+REGISTRAR(Conv2D_Op, "cpu", Aidge::ConvImpl2D_cpu::create);
 }  // namespace Aidge
 
 #endif /* AIDGE_CPU_OPERATOR_CONVIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/ConvImpl_forward_kernels.hpp b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
similarity index 70%
rename from include/aidge/backend/cpu/operator/ConvImpl_forward_kernels.hpp
rename to include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
index 88a71c47244788f2da5e576c8ad5170a92561909..cc3bd57cb17f2a0feb6a79af2c291e6f960467d8 100644
--- a/include/aidge/backend/cpu/operator/ConvImpl_forward_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
@@ -9,18 +9,20 @@
  *
  ********************************************************************************/
 
-#ifndef AIDGE_CPU_OPERATOR_CONVIMPL_FORWARD_KERNEL_H_
-#define AIDGE_CPU_OPERATOR_CONVIMPL_FORWARD_KERNEL_H_
+#ifndef AIDGE_CPU_OPERATOR_CONVIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_CONVIMPL_KERNELS_H_
 
-#include <algorithm>
 #include <array>
-#include <cmath>
+#include <memory>
+#include <tuple>
+#include <vector>
 
-#include "aidge/backend/cpu/data/GetCPUPtr.h"
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
 #include "aidge/backend/cpu/operator/ConvImpl.hpp"
-#include "aidge/data/half.hpp"
+#include "aidge/operator/Conv.hpp"
 #include "aidge/utils/Registrar.hpp"
 #include "aidge/utils/Types.h"
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
 
 namespace Aidge {
 /**
@@ -90,20 +92,19 @@ void ConvImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 1>& strideDims,
     }
 }
 
-namespace {
-static Registrar<ConvImpl1DForward_cpu> registrarConvImpl1DForward_cpu_Float32(
-        {DataType::Float32, DataType::Float32, DataType::Float32, DataType::Float32},
-        Aidge::ConvImpl1D_cpu_forward_kernel<float, float, float, float>);
-static Registrar<ConvImpl1DForward_cpu> registrarConvImpl1DForward_cpu_Float16(
-        {DataType::Float16, DataType::Float16, DataType::Float16, DataType::Float16},
-        Aidge::ConvImpl1D_cpu_forward_kernel<half_float::half, half_float::half, half_float::half, half_float::half>);
-static Registrar<ConvImpl1DForward_cpu> registrarConvImpl1DForward_cpu_Int32(
-        {DataType::Int32, DataType::Int32, DataType::Int32, DataType::Int32},
-        Aidge::ConvImpl1D_cpu_forward_kernel<int, int, int, int>);
-static Registrar<ConvImpl1DForward_cpu> registrarConvImpl1DForward_cpu_Float64(
-        {DataType::Float64, DataType::Float64, DataType::Float64, DataType::Float64},
-        Aidge::ConvImpl1D_cpu_forward_kernel<double, double, double, double>);
-}  // namespace
+// Kernels registration to implementation entry point
+REGISTRAR(ConvImpl1D_cpu,
+    {{DataType::Any, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}},
+    {ProdConso::inPlaceModel, Aidge::ConvImpl1D_cpu_forward_kernel<float, float, float, float>, nullptr});
+REGISTRAR(ConvImpl1D_cpu,
+    {{DataType::Any, DataFormat::NCHW}, {DataType::Float16, DataFormat::NCHW}},
+    {ProdConso::inPlaceModel, Aidge::ConvImpl1D_cpu_forward_kernel<half_float::half, half_float::half, half_float::half, half_float::half>, nullptr});
+REGISTRAR(ConvImpl1D_cpu,
+    {{DataType::Any, DataFormat::NCHW}, {DataType::Int32, DataFormat::NCHW}},
+    {ProdConso::inPlaceModel, Aidge::ConvImpl1D_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t, std::int32_t>, nullptr});
+REGISTRAR(ConvImpl1D_cpu,
+    {{DataType::Any, DataFormat::NCHW}, {DataType::Float64, DataFormat::NCHW}},
+    {ProdConso::inPlaceModel, Aidge::ConvImpl1D_cpu_forward_kernel<double, double, double, double>, nullptr});
 
 
 /**
@@ -135,49 +136,6 @@ void ConvImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
     const W *weights = static_cast<const W *>(weights_);
     const B *biases = static_cast<const B *>(biases_);
     O *output = static_cast<O *>(output_);
-/*
-    // output H size
-    const std::size_t oxSize =
-            static_cast<std::size_t>(static_cast<float>(inputDims[0] - kernelDims[0] + strideDims[0]) /
-                                static_cast<float>(strideDims[0]));
-    // output W size
-    const std::size_t oySize =
-            static_cast<std::size_t>(static_cast<float>(inputDims[1] - kernelDims[1] + strideDims[1]) /
-                                static_cast<float>(strideDims[1]));
-
-    // TODO: kernel computation
-    // output (Xout, Yout, outCh, batch)
-    // input  (Xin, Yin, inCh, batch)
-    // weight (kernelX, kernelY, inCh, outCh)
-    // does not take Dilation attribute into account
-    for (std::size_t ox = 0; ox < oxSize; ++ox) {
-        for (std::size_t oy = 0; oy < oySize; ++oy) {
-            const std::size_t ix = ox * strideDims[0];
-            const std::size_t iy = oy * strideDims[1];
-
-            for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
-                const std::size_t oIndex = inputDims[3] * (outCh + outChannels * (oy + oySize * ox));
-                B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
-                for (std::size_t batch = 0; batch < inputDims[3]; ++batch) {
-                    output[oIndex + batch] = biasVal;
-                }
-                for (std::size_t inCh = 0; inCh < inputDims[2]; ++inCh) {
-                    for (std::size_t sx = 0; sx < kernelDims[0]; ++sx) {
-                        for (std::size_t sy = 0; sy < kernelDims[1]; ++sy) {
-                            const std::size_t wIndex =
-                                    outCh + outChannels * (inCh + inputDims[2] * (sy + kernelDims[1] * sx));
-                            std::size_t iIndex = inputDims[3] * (inCh + inputDims[2] * ((iy + sy) + inputDims[1] * (ix + sx)));
-                            for (std::size_t batch = 0; batch < inputDims[3]; ++batch) {
-                                output[oIndex + batch] += weights[wIndex] * input[iIndex + batch];
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-*/
-
 
     // output H size
     const std::size_t oxSize =
@@ -240,20 +198,19 @@ void ConvImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
     }
 }
 
-namespace {
-static Registrar<ConvImpl2DForward_cpu> registrarConvImpl2DForward_cpu_Float32(
-        {DataType::Float32, DataType::Float32, DataType::Float32, DataType::Float32},
-        Aidge::ConvImpl2D_cpu_forward_kernel<float, float, float, float>);
-static Registrar<ConvImpl2DForward_cpu> registrarConvImpl2DForward_cpu_Float16(
-        {DataType::Float16, DataType::Float16, DataType::Float16, DataType::Float16},
-        Aidge::ConvImpl2D_cpu_forward_kernel<half_float::half, half_float::half, half_float::half, half_float::half>);
-static Registrar<ConvImpl2DForward_cpu> registrarConvImpl2DForward_cpu_Int32(
-        {DataType::Int32, DataType::Int32, DataType::Int32, DataType::Int32},
-        Aidge::ConvImpl2D_cpu_forward_kernel<int, int, int, int>);
-static Registrar<ConvImpl2DForward_cpu> registrarConvImpl2DForward_cpu_Float64(
-        {DataType::Float64, DataType::Float64, DataType::Float64, DataType::Float64},
-        Aidge::ConvImpl2D_cpu_forward_kernel<double, double, double, double>);
-}  // namespace
+// Kernels registration to implementation entry point
+REGISTRAR(ConvImpl2D_cpu,
+    {{DataType::Any, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}},
+    {ProdConso::inPlaceModel, Aidge::ConvImpl2D_cpu_forward_kernel<float, float, float, float>, nullptr});
+REGISTRAR(ConvImpl2D_cpu,
+    {{DataType::Any, DataFormat::NCHW}, {DataType::Float16, DataFormat::NCHW}},
+    {ProdConso::inPlaceModel, Aidge::ConvImpl2D_cpu_forward_kernel<half_float::half, half_float::half, half_float::half, half_float::half>, nullptr});
+REGISTRAR(ConvImpl2D_cpu,
+    {{DataType::Any, DataFormat::NCHW}, {DataType::Int32, DataFormat::NCHW}},
+    {ProdConso::inPlaceModel, Aidge::ConvImpl2D_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t, std::int32_t>, nullptr});
+REGISTRAR(ConvImpl2D_cpu,
+    {{DataType::Any, DataFormat::NCHW}, {DataType::Float64, DataFormat::NCHW}},
+    {ProdConso::inPlaceModel, Aidge::ConvImpl2D_cpu_forward_kernel<double, double, double, double>, nullptr});
 }  // namespace Aidge
 
-#endif /* AIDGE_CPU_OPERATOR_CONVIMPL_FORWARD_KERNEL_H_ */
+#endif /* AIDGE_CPU_OPERATOR_CONVIMPL_KERNELS_H_ */
diff --git a/include/aidge/backend/cpu/operator/DivImpl.hpp b/include/aidge/backend/cpu/operator/DivImpl.hpp
index 3a19d7303464e3543bd1ce83e334c4a6bdb713a2..40c1b678a78713d6c3b27629ae898c715797b9b2 100644
--- a/include/aidge/backend/cpu/operator/DivImpl.hpp
+++ b/include/aidge/backend/cpu/operator/DivImpl.hpp
@@ -16,38 +16,18 @@
 #include <tuple>
 #include <vector>
 
-#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
 #include "aidge/operator/Div.hpp"
 #include "aidge/utils/Registrar.hpp"
 #include "aidge/utils/Types.h"
 
 namespace Aidge {
+// Operator implementation entry point for the backend
+using DivImpl_cpu = OperatorImpl_cpu<Div_Op,
+    void(const std::size_t, const std::size_t, const std::size_t, const void*, const void*,void*)>;
 
-// compute kernel registry for forward and backward
-class DivImplForward_cpu
-    // : public Registrable<DivImplForward_cpu, std::tuple<DataType, DataType, DataType>, void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*,void*)> {
-    : public Registrable<DivImplForward_cpu, std::tuple<DataType, DataType, DataType>, void(const std::size_t, const std::size_t, const std::size_t, const void*, const void*,void*)> {
-};
-class DivImplBackward_cpu
-    : public Registrable<DivImplBackward_cpu, std::tuple<DataType, DataType, DataType>, void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*, void*)> {
-};
-
-class DivImpl_cpu : public OperatorImpl {
-public:
-    DivImpl_cpu(const Div_Op& op) : OperatorImpl(op, "cpu") {}
-
-    static std::unique_ptr<DivImpl_cpu> create(const Div_Op& op) {
-        return std::make_unique<DivImpl_cpu>(op);
-    }
-
-    Elts_t getNbRequiredProtected(const IOIndex_t inputIdx) const override final;
-
-    void forward() override final;
-};
-
-namespace {
-static Registrar<Div_Op> registrarDivImpl_cpu("cpu", Aidge::DivImpl_cpu::create);
-}
+// Implementation entry point registration to Operator
+REGISTRAR(Div_Op, "cpu", Aidge::DivImpl_cpu::create);
 }  // namespace Aidge
 
 #endif /* AIDGE_CPU_OPERATOR_DIVIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/DivImpl_forward_kernels.hpp b/include/aidge/backend/cpu/operator/DivImpl_kernels.hpp
similarity index 77%
rename from include/aidge/backend/cpu/operator/DivImpl_forward_kernels.hpp
rename to include/aidge/backend/cpu/operator/DivImpl_kernels.hpp
index 74db1128c111ae62bedb6fa61682abca62429cdb..ed6e55a79acbe23a689a67c22477f64f785a3aef 100644
--- a/include/aidge/backend/cpu/operator/DivImpl_forward_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/DivImpl_kernels.hpp
@@ -9,8 +9,8 @@
  *
  ********************************************************************************/
 
-#ifndef AIDGE_CPU_OPERATOR_DIVIMPL_FORWARD_KERNEL_H_
-#define AIDGE_CPU_OPERATOR_DIVIMPL_FORWARD_KERNEL_H_
+#ifndef AIDGE_CPU_OPERATOR_DIVIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_DIVIMPL_KERNELS_H_
 
 #include <numeric>     // std::accumulate
 #include <cstddef>     // std::size_t
@@ -69,19 +69,16 @@ constexpr void DivImpl_cpu_forward_kernel(const std::size_t input1size_,
     }
 }
 
-
-
-namespace {
-static Registrar<DivImplForward_cpu> registrarDivImplForward_cpu_Float32(
-        {DataType::Float32, DataType::Float32, DataType::Float32},
-        Aidge::DivImpl_cpu_forward_kernel<float, float, float>);
-static Registrar<DivImplForward_cpu> registrarDivImplForward_cpu_Int32(
-        {DataType::Int32, DataType::Int32, DataType::Int32},
-        Aidge::DivImpl_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t>);
-static Registrar<DivImplForward_cpu> registrarDivImplForward_cpu_Float64(
-        {DataType::Float64, DataType::Float64, DataType::Float64},
-        Aidge::DivImpl_cpu_forward_kernel<double, double, double>);
-}  // namespace
+// Kernels registration to implementation entry point
+REGISTRAR(DivImpl_cpu,
+    {DataType::Float32},
+    {ProdConso::inPlaceModel, Aidge::DivImpl_cpu_forward_kernel<float, float, float>, nullptr});
+REGISTRAR(DivImpl_cpu,
+    {DataType::Float64},
+    {ProdConso::inPlaceModel, Aidge::DivImpl_cpu_forward_kernel<double, double, double>, nullptr});
+REGISTRAR(DivImpl_cpu,
+    {DataType::Int32},
+    {ProdConso::inPlaceModel, Aidge::DivImpl_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t>, nullptr});
 }  // namespace Aidge
 
-#endif /* AIDGE_CPU_OPERATOR_DIVIMPL_FORWARD_KERNEL_H_ */
+#endif /* AIDGE_CPU_OPERATOR_DIVIMPL_KERNELS_H_ */
diff --git a/include/aidge/backend/cpu/operator/ErfImpl.hpp b/include/aidge/backend/cpu/operator/ErfImpl.hpp
index 6864803a542e4beed0259be9c4722d4215bec449..3d2835600367e81499cbe6af81a8475a0cd1b61e 100644
--- a/include/aidge/backend/cpu/operator/ErfImpl.hpp
+++ b/include/aidge/backend/cpu/operator/ErfImpl.hpp
@@ -12,7 +12,7 @@
 #ifndef AIDGE_CPU_OPERATOR_ERFIMPL_H_
 #define AIDGE_CPU_OPERATOR_ERFIMPL_H_
 
-#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
 #include "aidge/operator/Erf.hpp"
 #include "aidge/utils/Registrar.hpp"
 #include "aidge/utils/Types.h"
@@ -20,31 +20,12 @@
 #include <vector>
 
 namespace Aidge {
-// class Erf_Op;
+// Operator implementation entry point for the backend
+using ErfImpl_cpu = OperatorImpl_cpu<Erf_Op,
+    void(const std::size_t, const void*, void*)>;
 
-// compute kernel registry for forward and backward
-class ErfImplForward_cpu
-    : public Registrable<ErfImplForward_cpu, std::tuple<DataType, DataType>, void(const std::size_t, const void*, void*)> {
-};
-class ErfImplBackward_cpu
-    : public Registrable<ErfImplBackward_cpu, std::tuple<DataType, DataType>, void(const std::size_t, const void*, void*)> {
-};
-
-class ErfImpl_cpu : public OperatorImpl {
-public:
-    ErfImpl_cpu(const Erf_Op& op) : OperatorImpl(op, "cpu") {}
-
-    static std::unique_ptr<ErfImpl_cpu> create(const Erf_Op& op) {
-        return std::make_unique<ErfImpl_cpu>(op);
-    }
-
-    Elts_t getNbRequiredProtected(const IOIndex_t inputIdx) const override final;
-    void forward() override;
-};
-
-namespace {
-static Registrar<Erf_Op> registrarErfImpl_cpu("cpu", Aidge::ErfImpl_cpu::create);
-}
+// Implementation entry point registration to Operator
+REGISTRAR(Erf_Op, "cpu", Aidge::ErfImpl_cpu::create);
 }  // namespace Aidge
 
 #endif /* AIDGE_CPU_OPERATOR_ERFIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/ErfImpl_forward_kernels.hpp b/include/aidge/backend/cpu/operator/ErfImpl_kernels.hpp
similarity index 57%
rename from include/aidge/backend/cpu/operator/ErfImpl_forward_kernels.hpp
rename to include/aidge/backend/cpu/operator/ErfImpl_kernels.hpp
index bb92401b6e72b1528d0342474bf394a7c29a4042..02041f55ce9a1b2476db575b40340b1bb6517ce1 100644
--- a/include/aidge/backend/cpu/operator/ErfImpl_forward_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ErfImpl_kernels.hpp
@@ -9,8 +9,8 @@
  *
  ********************************************************************************/
 
-#ifndef AIDGE_CPU_OPERATOR_ERFIMPL_FORWARD_KERNEL_H_
-#define AIDGE_CPU_OPERATOR_ERFIMPL_FORWARD_KERNEL_H_
+#ifndef AIDGE_CPU_OPERATOR_ERFIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_ERFIMPL_KERNELS_H_
 
 #include <cmath>
 
@@ -32,14 +32,16 @@ void ErfImpl_cpu_forward_kernel(std::size_t inputLenght,
     }
 }
 
-namespace {
-static Registrar<ErfImplForward_cpu> registrarErfImplForward_cpu_Float32(
-        {DataType::Float32, DataType::Float32}, Aidge::ErfImpl_cpu_forward_kernel<float, float>);
-static Registrar<ErfImplForward_cpu> registrarErfImplForward_cpu_Int32(
-        {DataType::Int32, DataType::Int32}, Aidge::ErfImpl_cpu_forward_kernel<int, int>);
-static Registrar<ErfImplForward_cpu> registrarErfImplForward_cpu_Float64(
-        {DataType::Float64, DataType::Float64}, Aidge::ErfImpl_cpu_forward_kernel<double, double>);
-}  // namespace
+// Kernels registration to implementation entry point
+REGISTRAR(ErfImpl_cpu,
+    {DataType::Float32},
+    {ProdConso::inPlaceModel, Aidge::ErfImpl_cpu_forward_kernel<float, float>, nullptr});
+REGISTRAR(ErfImpl_cpu,
+    {DataType::Float64},
+    {ProdConso::inPlaceModel, Aidge::ErfImpl_cpu_forward_kernel<double, double>, nullptr});
+REGISTRAR(ErfImpl_cpu,
+    {DataType::Int32},
+    {ProdConso::inPlaceModel, Aidge::ErfImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr});
 }  // namespace Aidge
 
-#endif /* AIDGE_CPU_OPERATOR_ERFIMPL_FORWARD_KERNEL_H_ */
+#endif /* AIDGE_CPU_OPERATOR_ERFIMPL_KERNELS_H_ */
diff --git a/include/aidge/backend/cpu/operator/FCImpl.hpp b/include/aidge/backend/cpu/operator/FCImpl.hpp
index f21cd0ff330f61b942eb55f036c7b23458a5959a..e82352d9cba60440efef87faf97dfd4ed66565b6 100644
--- a/include/aidge/backend/cpu/operator/FCImpl.hpp
+++ b/include/aidge/backend/cpu/operator/FCImpl.hpp
@@ -16,57 +16,33 @@
 #include <memory>
 #include <vector>
 
-#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
 #include "aidge/operator/FC.hpp"
 #include "aidge/utils/Registrar.hpp"
 #include "aidge/utils/Types.h"
 
 namespace Aidge {
-// class FC_Op;
-
-// compute kernel registry for forward and backward
-class FCImplForward_cpu : public Registrable<FCImplForward_cpu,
-                                             std::tuple<DataType,
-                                                        DataType,
-                                                        DataType,
-                                                        DataType>,
-                                             void(const DimSize_t,
-                                                const DimSize_t,
-                                                const DimSize_t,
-                                                const void *,
-                                                const void *,
-                                                const void *,
-                                                void *)> {};
-class FCImplBackward_cpu : public Registrable<FCImplBackward_cpu,
-                                              std::tuple<DataType,
-                                                         DataType,
-                                                         DataType,
-                                                         DataType>,
-                                              void(const DimSize_t,
-                                                const DimSize_t,
-                                                const DimSize_t,
-                                                const void *,
-                                                const void *,
-                                                const void *,
-                                                void *,
-                                                void *,
-                                                void *)> {};
-
-class FCImpl_cpu : public OperatorImpl {
-public:
-    FCImpl_cpu(const FC_Op &op) : OperatorImpl(op, "cpu") {}
-
-    static std::unique_ptr<FCImpl_cpu> create(const FC_Op &op) {
-        return std::make_unique<FCImpl_cpu>(op);
-    }
-
-    void forward() override final;
-    void backward() override final;
-};
-
-namespace {
-static Registrar<FC_Op> registrarFCImpl_cpu("cpu", Aidge::FCImpl_cpu::create);
-}
+// Operator implementation entry point for the backend
+using FCImpl_cpu = OperatorImpl_cpu<FC_Op,
+    void(const DimSize_t,
+        const DimSize_t,
+        const DimSize_t,
+        const void *,
+        const void *,
+        const void *,
+        void *),
+    void(const DimSize_t,
+        const DimSize_t,
+        const DimSize_t,
+        const void *,
+        const void *,
+        const void *,
+        void *,
+        void *,
+        void *)>;
+
+// Implementation entry point registration to Operator
+REGISTRAR(FC_Op, "cpu", Aidge::FCImpl_cpu::create);
 }  // namespace Aidge
 
 #endif /* AIDGE_CPU_OPERATOR_FCIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/FCImpl_backward_kernels.hpp b/include/aidge/backend/cpu/operator/FCImpl_backward_kernels.hpp
deleted file mode 100644
index c93a44d922dce2dc18df94bf903134ddadf5256f..0000000000000000000000000000000000000000
--- a/include/aidge/backend/cpu/operator/FCImpl_backward_kernels.hpp
+++ /dev/null
@@ -1,92 +0,0 @@
-/********************************************************************************
- * Copyright (c) 2023 CEA-List
- *
- * This program and the accompanying materials are made available under the
- * terms of the Eclipse Public License 2.0 which is available at
- * http://www.eclipse.org/legal/epl-2.0.
- *
- * SPDX-License-Identifier: EPL-2.0
- *
- ********************************************************************************/
-
-#ifndef AIDGE_CPU_OPERATOR_FCIMPL_BACKWARD_KERNEL_H_
-#define AIDGE_CPU_OPERATOR_FCIMPL_BACKWARD_KERNEL_H_
-
-#include "aidge/utils/Registrar.hpp"
-#include <algorithm>
-
-#include "aidge/backend/cpu/operator/FCImpl.hpp"
-
-namespace Aidge {
-template <class I, class O, class W, class B>
-void FCImpl_cpu_backward_kernel(const DimSize_t batchSize,
-                                const DimSize_t inputFeatureSize,
-                                const DimSize_t outputFeatureSize,
-                                const void* input_,
-                                const void* originalInput_,
-                                const void* weight_,
-                                void* output_,
-                                void* weightGrad_,
-                                void* biasesGrad_)
-{
-    // FIXME: missing FC attributes as arguments
-    const I* input  = static_cast<const I*>(input_);
-    const I* originalInput  = static_cast<const I*>(originalInput_);
-    const W* weight = static_cast<const W*>(weight_);
-    O* output       = static_cast<O*>(output_);
-    W* weightGrad   = static_cast<W*>(weightGrad_);
-    B* biasesGrad   = static_cast<B*>(biasesGrad_);
-
-
-    // bias grad
-    if (biasesGrad == nullptr) { // no bias
-        std::fill(biasesGrad, biasesGrad + outputFeatureSize, B(0));
-    } else {
-        for (std::size_t o = 0; o < outputFeatureSize; ++o) { // nb outputs
-            B sum{0};
-            for (std::size_t b = 0; b < batchSize; ++b) {
-                sum += input[b*outputFeatureSize + o];
-            }
-            biasesGrad[o] = sum;
-        }
-    }
-
-    // weight grad
-    for (std::size_t o = 0; o < outputFeatureSize; ++o) {
-        for (std::size_t c = 0; c < inputFeatureSize; ++c) {
-            W sum{0};
-            for (std::size_t b = 0; b < batchSize; ++b) {
-                sum += originalInput[b*inputFeatureSize + c]*input[b*outputFeatureSize + o];
-            }
-            weightGrad[o*inputFeatureSize + c] = sum;
-        }
-    }
-
-    // input grad
-    for (std::size_t b = 0; b < batchSize; ++b) {
-        for (std::size_t c = 0; c < inputFeatureSize; ++c) {
-            O sum{0};
-            for (std::size_t o = 0; o < outputFeatureSize; ++o) {
-                sum += weight[o*inputFeatureSize + c] * input[b*outputFeatureSize + o];
-            }
-            output[b*inputFeatureSize + c] = sum;
-        }
-    }
-}
-
-
-namespace {
-static Registrar<FCImplBackward_cpu> registrarFCImpl2DBackward_cpu_Float32(
-        {DataType::Float32, DataType::Float32, DataType::Float32, DataType::Float32},
-        Aidge::FCImpl_cpu_backward_kernel<float, float, float, float>);
-static Registrar<FCImplBackward_cpu> registrarFCImpl2DBackward_cpu_Int32(
-        {DataType::Int32, DataType::Int32, DataType::Int32, DataType::Int32},
-        Aidge::FCImpl_cpu_backward_kernel<int, int, int, int>);
-static Registrar<FCImplBackward_cpu> registrarFCImpl2DBackward_cpu_Float64(
-        {DataType::Float64, DataType::Float64, DataType::Float64, DataType::Float64},
-        Aidge::FCImpl_cpu_backward_kernel<double, double, double, double>);
-}  // namespace
-
-}  // namespace Aidge
-
-#endif /* AIDGE_CPU_OPERATOR_FCIMPL_BACKWARD_KERNEL_H_ */
diff --git a/include/aidge/backend/cpu/operator/FCImpl_forward_kernels.hpp b/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp
similarity index 62%
rename from include/aidge/backend/cpu/operator/FCImpl_forward_kernels.hpp
rename to include/aidge/backend/cpu/operator/FCImpl_kernels.hpp
index caeacd1bda2fde086fd649c50a733e790fc2c000..c57f86e6ac6e74acebb48f471991e7181920f7c3 100644
--- a/include/aidge/backend/cpu/operator/FCImpl_forward_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp
@@ -9,8 +9,8 @@
  *
  ********************************************************************************/
 
-#ifndef AIDGE_CPU_OPERATOR_FCIMPL_FORWARD_KERNEL_H_
-#define AIDGE_CPU_OPERATOR_FCIMPL_FORWARD_KERNEL_H_
+#ifndef AIDGE_CPU_OPERATOR_FCIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_FCIMPL_KERNELS_H_
 
 #include <algorithm>
 
@@ -115,19 +115,72 @@ void FCImpl_cpu_forward_kernel(const DimSize_t batchSize,
     }
 }
 
+template <class I, class O, class W, class B>
+void FCImpl_cpu_backward_kernel(const DimSize_t batchSize,
+                                const DimSize_t inputFeatureSize,
+                                const DimSize_t outputFeatureSize,
+                                const void* input_,
+                                const void* originalInput_,
+                                const void* weight_,
+                                void* output_,
+                                void* weightGrad_,
+                                void* biasesGrad_)
+{
+    // FIXME: missing FC attributes as arguments
+    const I* input  = static_cast<const I*>(input_);
+    const I* originalInput  = static_cast<const I*>(originalInput_);
+    const W* weight = static_cast<const W*>(weight_);
+    O* output       = static_cast<O*>(output_);
+    W* weightGrad   = static_cast<W*>(weightGrad_);
+    B* biasesGrad   = static_cast<B*>(biasesGrad_);
+
+
+    // bias grad (skipped entirely when the layer has no bias:
+    // writing through a null biasesGrad pointer would be UB)
+    if (biasesGrad != nullptr) {
+        for (std::size_t o = 0; o < outputFeatureSize; ++o) { // nb outputs
+            B sum{0};
+            for (std::size_t b = 0; b < batchSize; ++b) {
+                sum += input[b*outputFeatureSize + o];
+            }
+            biasesGrad[o] = sum;
+        }
+    }
+
 
-namespace {
-static Registrar<FCImplForward_cpu> registrarFCImpl2DForward_cpu_Float32(
-        {DataType::Float32, DataType::Float32, DataType::Float32, DataType::Float32},
-        Aidge::FCImpl_cpu_forward_kernel<float, float, float, float>);
-static Registrar<FCImplForward_cpu> registrarFCImpl2DForward_cpu_Int32(
-        {DataType::Int32, DataType::Int32, DataType::Int32, DataType::Int32},
-        Aidge::FCImpl_cpu_forward_kernel<int, int, int, int>);
-static Registrar<FCImplForward_cpu> registrarFCImpl2DForward_cpu_Float64(
-        {DataType::Float64, DataType::Float64, DataType::Float64, DataType::Float64},
-        Aidge::FCImpl_cpu_forward_kernel<double, double, double, double>);
-}  // namespace
+    // weight grad
+    for (std::size_t o = 0; o < outputFeatureSize; ++o) {
+        for (std::size_t c = 0; c < inputFeatureSize; ++c) {
+            W sum{0};
+            for (std::size_t b = 0; b < batchSize; ++b) {
+                sum += originalInput[b*inputFeatureSize + c]*input[b*outputFeatureSize + o];
+            }
+            weightGrad[o*inputFeatureSize + c] = sum;
+        }
+    }
+
+    // input grad
+    for (std::size_t b = 0; b < batchSize; ++b) {
+        for (std::size_t c = 0; c < inputFeatureSize; ++c) {
+            O sum{0};
+            for (std::size_t o = 0; o < outputFeatureSize; ++o) {
+                sum += weight[o*inputFeatureSize + c] * input[b*outputFeatureSize + o];
+            }
+            output[b*inputFeatureSize + c] = sum;
+        }
+    }
+}
 
+// Kernels registration to implementation entry point
+REGISTRAR(FCImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float32}},
+    {ProdConso::defaultModel, Aidge::FCImpl_cpu_forward_kernel<float, float, float, float>, Aidge::FCImpl_cpu_backward_kernel<float, float, float, float>});
+REGISTRAR(FCImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float64}},
+    {ProdConso::defaultModel, Aidge::FCImpl_cpu_forward_kernel<double, double, double, double>, Aidge::FCImpl_cpu_backward_kernel<double, double, double, double>});
+REGISTRAR(FCImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int32}},
+    {ProdConso::defaultModel, Aidge::FCImpl_cpu_forward_kernel<int32_t, int32_t, int32_t, int32_t>, Aidge::FCImpl_cpu_backward_kernel<int32_t, int32_t, int32_t, int32_t>});
 }  // namespace Aidge
 
-#endif /* AIDGE_CPU_OPERATOR_FCIMPL_FORWARD_KERNEL_H_ */
+#endif /* AIDGE_CPU_OPERATOR_FCIMPL_KERNELS_H_ */
diff --git a/include/aidge/backend/cpu/operator/FoldImpl.hpp b/include/aidge/backend/cpu/operator/FoldImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..94ddbdcba8e33e12108968d536037ab1ccab2c8d
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/FoldImpl.hpp
@@ -0,0 +1,42 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_FOLDIMPL_H_
+#define AIDGE_CPU_OPERATOR_FOLDIMPL_H_
+
+#include <array>
+#include <memory>
+#include <tuple>
+#include <vector>
+
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
+#include "aidge/operator/Fold.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+
+namespace Aidge {
+// Operator implementation entry point for the backend
+using Fold2D_Op = Fold_Op<2>;
+using FoldImpl2D_cpu = OperatorImpl_cpu<Fold_Op<2>,
+    void(const std::array<DimSize_t, 2>&,
+        const std::array<DimSize_t, 2>&,
+        const std::array<DimSize_t, 2>&,
+        const std::array<DimSize_t, 2>&,
+        const std::vector<DimSize_t> &,
+        const void *,
+        void *)>;
+
+// Implementation entry point registration to Operator
+REGISTRAR(Fold2D_Op, "cpu", Aidge::FoldImpl2D_cpu::create);
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_FOLDIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/FoldImpl_kernels.hpp b/include/aidge/backend/cpu/operator/FoldImpl_kernels.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..8cced8958f49f1cc4215c7cf463cc3391fb29246
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/FoldImpl_kernels.hpp
@@ -0,0 +1,86 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_FOLDIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_FOLDIMPL_KERNELS_H_
+
+#include "aidge/utils/Registrar.hpp"
+
+#include "aidge/backend/cpu/operator/FoldImpl.hpp"
+#include "aidge/utils/Types.h"
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+#include <cmath>
+#include <array>
+#include <algorithm>
+
+namespace Aidge {
+template <class I, class O>
+void FoldImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& outputDims,
+                                    const std::array<DimSize_t, 2>& strideDims,
+                                    const std::array<DimSize_t, 2>& dilationDims,
+                                    const std::array<DimSize_t, 2>& kernelDims,
+                                    const std::vector<DimSize_t> &dims,
+                                    const void *input_, void *output_)
+{
+    const I *input = static_cast<const I *>(input_);
+    O *output = static_cast<O *>(output_);
+
+    const DimSize_t inHeight = outputDims[0];
+    const DimSize_t inWidth = outputDims[1];
+
+    const DimSize_t kernelExtentHeight = dilationDims[0] *
+                                            (kernelDims[0] - 1) + 1;
+    const DimSize_t outHeight = 1 + static_cast<DimSize_t>(
+                    floor(static_cast<float>(inHeight - kernelExtentHeight) /
+                            static_cast<float>(strideDims[0])));
+    const DimSize_t kernelExtentWidth = dilationDims[1] *
+                                            (kernelDims[1] - 1) + 1;
+    const DimSize_t outWidth = 1 + static_cast<DimSize_t>(
+                    floor(static_cast<float>(inWidth - kernelExtentWidth) /
+                            static_cast<float>(strideDims[1])));
+    const DimSize_t outChannels = dims[dims.size() - 2];
+    const DimSize_t inChannels = outChannels / kernelDims[0] / kernelDims[1];
+
+    std::fill_n(output, dims[0] * outHeight * outWidth * outChannels, O(0));
+
+    for (DimSize_t n = 0; n < dims[0]; ++n) {
+        for (DimSize_t outC = 0; outC < outChannels; ++outC) {
+            const auto inOffsetW = outC % kernelDims[1];
+            const auto inOffsetH = (outC / kernelDims[1]) % kernelDims[0];
+            const auto inC = outC / kernelDims[0] / kernelDims[1];
+
+            for (DimSize_t outH = 0; outH < outHeight; ++outH) {
+                const auto inH = outH * strideDims[0] + inOffsetH * dilationDims[0];
+
+                for (DimSize_t outW = 0; outW < outWidth; ++outW) {
+                    const auto inW = outW * strideDims[1] + inOffsetW * dilationDims[1];
+
+                    output[((n * inChannels + inC) * inHeight + inH) * inWidth + inW] +=
+                        input[((n * outChannels + outC) * outHeight + outH) * outWidth + outW];
+                }
+            }
+        }
+    }
+}
+
+// Kernels registration to implementation entry point
+REGISTRAR(FoldImpl2D_cpu,
+    {DataType::Float32},
+    {ProdConso::defaultModel, Aidge::FoldImpl2D_cpu_forward_kernel<float, float>, nullptr});
+REGISTRAR(FoldImpl2D_cpu,
+    {DataType::Float64},
+    {ProdConso::defaultModel, Aidge::FoldImpl2D_cpu_forward_kernel<double, double>, nullptr});
+REGISTRAR(FoldImpl2D_cpu,
+    {DataType::Int32},
+    {ProdConso::defaultModel, Aidge::FoldImpl2D_cpu_forward_kernel<int32_t, int32_t>, nullptr});
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_FOLDIMPL_KERNELS_H_ */
diff --git a/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl.hpp b/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl.hpp
index 758535de4cc506b8de4adf7004afbbfdd8185941..4e04b1a595a8660b1528e49921e7e3e7a567829a 100644
--- a/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl.hpp
+++ b/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl.hpp
@@ -15,41 +15,18 @@
 #include <memory>
 #include <vector>
 
-#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
 #include "aidge/operator/GlobalAveragePooling.hpp"
 #include "aidge/utils/Registrar.hpp"
 #include "aidge/utils/Types.h"
 
 namespace Aidge {
-// class GlobalAveragePooling_Op;
+// Operator implementation entry point for the backend
+using GlobalAveragePoolingImpl_cpu = OperatorImpl_cpu<GlobalAveragePooling_Op,
+    void(const std::vector<DimSize_t> &, const void *, void *)>;
 
-class GlobalAveragePoolingImplForward_cpu
-    : public Registrable<
-          GlobalAveragePoolingImplForward_cpu, std::tuple<DataType, DataType>,
-          void(const std::vector<DimSize_t> &, const void *, void *)> {};
-
-class GlobalAveragePoolingImplBackward_cpu
-    : public Registrable<
-          GlobalAveragePoolingImplBackward_cpu, std::tuple<DataType, DataType>,
-          void(const std::vector<DimSize_t> &, const void *, void *)> {};
-
-class GlobalAveragePoolingImpl_cpu : public OperatorImpl {
-public:
-  GlobalAveragePoolingImpl_cpu(const GlobalAveragePooling_Op &op)
-      : OperatorImpl(op, "cpu") {}
-
-  static std::unique_ptr<GlobalAveragePoolingImpl_cpu>
-  create(const GlobalAveragePooling_Op &op) {
-    return std::make_unique<GlobalAveragePoolingImpl_cpu>(op);
-  }
-
-  void forward() override;
-};
-
-namespace {
-static Registrar<GlobalAveragePooling_Op> registrarGlobalAveragePoolingImpl_cpu(
-    "cpu", Aidge::GlobalAveragePoolingImpl_cpu::create);
-}
+// Implementation entry point registration to Operator
+REGISTRAR(GlobalAveragePooling_Op, "cpu", Aidge::GlobalAveragePoolingImpl_cpu::create);
 } // namespace Aidge
 
 #endif /* _AIDGE_CPU_OPERATOR_GLOBALAVERAGEPOOLINGIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_forward_kernels.hpp b/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp
similarity index 68%
rename from include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_forward_kernels.hpp
rename to include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp
index 81f10975cc107a23448da3df14b88f6b31d55146..ed838a94cc0c0238a870427c3b774b29f7818b09 100644
--- a/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_forward_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp
@@ -9,8 +9,8 @@
  *
  ********************************************************************************/
 
-#ifndef AIDGE_CPU_OPERATOR_GLOBALAVERAGEPOOLINGIMPL_FORWARD_KERNEL_H_
-#define AIDGE_CPU_OPERATOR_GLOBALAVERAGEPOOLINGIMPL_FORWARD_KERNEL_H_
+#ifndef AIDGE_CPU_OPERATOR_GLOBALAVERAGEPOOLINGIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_GLOBALAVERAGEPOOLINGIMPL_KERNELS_H_
 
 #include <cstddef>
 #include <functional>  // std::multiplies
@@ -59,21 +59,16 @@ void GlobalAveragePoolingImpl_cpu_forward_kernel(
   }
 }
 
-// Then we add the Registrar declaration for different input/output types
-namespace {
-static Registrar<GlobalAveragePoolingImplForward_cpu>
-    registrarGlobalAveragePoolingImplForward_cpu_Float32(
-        {DataType::Float32, DataType::Float32},
-        Aidge::GlobalAveragePoolingImpl_cpu_forward_kernel<float, float>);
-static Registrar<GlobalAveragePoolingImplForward_cpu>
-    registrarGlobalAveragePoolingImplForward_cpu_Int32(
-        {DataType::Int32, DataType::Int32},
-        Aidge::GlobalAveragePoolingImpl_cpu_forward_kernel<int, int>);
-static Registrar<GlobalAveragePoolingImplForward_cpu>
-    registrarGlobalAveragePoolingImplForward_cpu_Float64(
-        {DataType::Float64, DataType::Float64},
-        Aidge::GlobalAveragePoolingImpl_cpu_forward_kernel<double, double>);
-} // namespace
+// Kernels registration to implementation entry point
+REGISTRAR(GlobalAveragePoolingImpl_cpu,
+    {DataType::Float32},
+    {ProdConso::defaultModel, Aidge::GlobalAveragePoolingImpl_cpu_forward_kernel<float, float>, nullptr});
+REGISTRAR(GlobalAveragePoolingImpl_cpu,
+    {DataType::Float64},
+    {ProdConso::defaultModel, Aidge::GlobalAveragePoolingImpl_cpu_forward_kernel<double, double>, nullptr});
+REGISTRAR(GlobalAveragePoolingImpl_cpu,
+    {DataType::Int32},
+    {ProdConso::defaultModel, Aidge::GlobalAveragePoolingImpl_cpu_forward_kernel<int32_t, int32_t>, nullptr});
 } // namespace Aidge
 
-#endif /* AIDGE_CPU_OPERATOR_GLOBALAVERAGEPOOLINGIMPL_FORWARD_KERNEL_H_ */
+#endif /* AIDGE_CPU_OPERATOR_GLOBALAVERAGEPOOLINGIMPL_KERNELS_H_ */
diff --git a/include/aidge/backend/cpu/operator/GridSampleImpl.hpp b/include/aidge/backend/cpu/operator/GridSampleImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..697bb35a983bc108c2a5d65db3c08ef462ffcdbd
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/GridSampleImpl.hpp
@@ -0,0 +1,38 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_GRIDSAMPLEIMPL_H_
+#define AIDGE_CPU_OPERATOR_GRIDSAMPLEIMPL_H_
+
+#include <array>
+#include <memory>
+#include <tuple>
+#include <vector>
+
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
+#include "aidge/operator/GridSample.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+
+namespace Aidge {
+// Operator implementation entry point for the backend
+using GridSampleImpl_cpu = OperatorImpl_cpu<GridSample_Op,
+    void(const GridSample_Op&,
+        const std::shared_ptr<Tensor>&,
+        const std::shared_ptr<Tensor>&,
+        const std::shared_ptr<Tensor>&)>;
+
+// Implementation entry point registration to Operator
+REGISTRAR(GridSample_Op, "cpu", Aidge::GridSampleImpl_cpu::create);
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_GRIDSAMPLEIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/GridSampleImpl_kernels.hpp b/include/aidge/backend/cpu/operator/GridSampleImpl_kernels.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ea62fd010db8c155a3ff86ff8396797da5ebb6be
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/GridSampleImpl_kernels.hpp
@@ -0,0 +1,477 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_GRIDSAMPLEIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_GRIDSAMPLEIMPL_KERNELS_H_
+
+#include <algorithm>  // std::max, std::min
+#include <cmath>      // std::fabs, std::truncf, std::nearbyint
+#include <cstddef>    // std::size_t
+#include <cstdint>    // std::int64_t
+
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+#include "aidge/backend/cpu/operator/GridSampleImpl.hpp"
+#include "aidge/data/half.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+static bool in_bound(float coord, float lower_bound, float upper_bound) noexcept {
+    return (coord >= lower_bound) && (coord < upper_bound);
+}
+
+static float unnormalized_coord(float coord, float new_lower_bound, float new_upper_bound) noexcept {
+    return (coord + 1) / 2 * (new_upper_bound - new_lower_bound) + new_lower_bound;
+}
+
+// unused
+// static float normalized_coord(float coord, float prev_lower_bound, float prev_upper_bound) noexcept {
+//     return (coord + prev_lower_bound) / (prev_upper_bound-prev_lower_bound) * 2 - 1;
+// }
+
+static float unnormalize_grid_sample_coord(float coord, std::size_t size, bool align_corners) noexcept {
+    return align_corners ? unnormalized_coord(coord, 0.0f, static_cast<float>(size) - 1.0f)
+                         : unnormalized_coord(coord, -0.5f, static_cast<float>(size) - 0.5f);
+}
+
+// unused
+// static float normalize_grid_sample_coord(float coord, std::size_t size, bool align_corners) noexcept {
+//     return align_corners ? normalized_coord(coord, 0.0f, static_cast<float>(size) - 1.0f)
+//                          : normalized_coord(coord, -0.5f, static_cast<float>(size) - 0.5f);
+// }
+
+static float update_normalized_coord_with_padding(float coord, Aidge::GridSample_Op::PaddingMode padding_mode) {
+    if (!in_bound(coord, -1.0f, 1.0f)) {
+        if (padding_mode == Aidge::GridSample_Op::PaddingMode::Border) {
+            coord = std::min(std::max(-1.0f, coord), 1.0f);
+        }
+        else if (padding_mode == Aidge::GridSample_Op::PaddingMode::Reflection) {
+            float abs_coord = std::fabs(coord);
+            float int_coord = std::truncf(abs_coord);
+            std::int32_t nb_refl = static_cast<std::int32_t>((int_coord - 1) / 2);
+            float res = ((nb_refl + 1)*2) - abs_coord;
+            coord = (coord > 0) ? (nb_refl % 2 == 0 ? res : -res) \
+                            : (nb_refl % 2 == 0 ? -res : res);
+        }
+
+    }
+    return coord;
+}
+
+static inline std::int64_t update_unnormalized_coord_with_padding(std::int64_t coord, std::int64_t size, Aidge::GridSample_Op::PaddingMode padding_mode) {
+    if (!in_bound(coord, 0, size)) {
+        // out of bound. switch padding mode
+        if (padding_mode == Aidge::GridSample_Op::PaddingMode::Border) {
+            coord = std::min(std::max(std::int64_t(0), coord), size-std::int64_t(1));
+        } else if (padding_mode == Aidge::GridSample_Op::PaddingMode::Reflection) {
+            const std::int64_t quotient = coord / (size-1);
+            const std::int64_t remainer = std::abs(coord - quotient*(size-1));
+            coord = (quotient % 2 == 0) ? remainer : size - 1 - remainer;
+        }
+    }
+    return coord;
+}
+
+namespace Aidge {
+/**
+ * @brief Forward kernel for 1D GridSample on CPU backend.
+ * @tparam I Input data type.
+ * @tparam O Output data type.
+ * @param params tuple of Attributes from the Operator
+ * @param inputDims Array of input dimensions.
+ * @param input_ const input Tensor.
+ * @param grid_ const grid Tensor.
+ * @param output_ Output Tensor.
+ */
+template <class I, class O>
+void GridSampleImpl1D_cpu_forward_kernel(const GridSample_Op& op,
+                            const std::shared_ptr<Tensor>& in0,
+                            const std::shared_ptr<Tensor>& in1,
+                            const std::shared_ptr<Tensor>& out)
+{
+    const I* const input = static_cast<const I * const>(in0->getImpl()->rawPtr());
+    const I* input_ptr = input;
+    float* const grid = static_cast<float* const>(in1->getImpl()->rawPtr());
+    float* grid_ptr = grid;
+    O* const output = static_cast<O* const>(out->getImpl()->rawPtr());
+    O* output_ptr = output;
+
+    const std::size_t N = in0->dim(0);
+    const std::size_t C = in0->dim(1);
+    const std::size_t in_H = in0->dim(2);
+    const std::size_t grid_H = in1->dim(1);
+
+    const std::size_t in_N_s = in0->stride(0);
+    const std::size_t in_C_s = in0->stride(1);
+    const std::size_t in_H_s = in0->stride(2);
+    const std::size_t grid_N_s = in1->stride(0);
+    const std::size_t grid_H_s = in1->stride(1);
+    const std::size_t out_N_s = out->stride(0);
+    const std::size_t out_C_s = out->stride(1);
+    const std::size_t out_H_s = out->stride(2);
+
+    float* grid_ptr_N = grid;
+    const I* input_ptr_N = input;
+    O* output_ptr_N = output;
+    for (std::size_t n = 0; n < N; ++n) {
+        grid_ptr = grid_ptr_N;
+        for (std::size_t grid_x = 0; grid_x < grid_H; ++grid_x) {
+            output_ptr = output_ptr_N + grid_x*out_H_s;
+            /*
+            * change grid_x coord to match padding_mode
+            * Change range from [-1, 1] to [0, H-1] or [-0.5, H-0.5] according to align_corners
+            * Handle computation of interpolation
+            *   any value outside bounds is considered 0
+            *   if nearest:
+            *   else if linear:
+            *   else if cubic:
+            *   else : nothing
+            */
+            float x = *grid_ptr;
+            x = update_normalized_coord_with_padding(x, op.paddingMode());
+            x = unnormalize_grid_sample_coord(x, in_H, op.alignCorners());
+            if (op.mode() == GridSample_Op::Mode::Nearest) {
+                const std::int64_t x_rounded = std::nearbyintf(x);
+
+                if (in_bound(x_rounded, 0, in_H)) {
+                    input_ptr = input_ptr_N + x_rounded*in_H_s;
+                    for (std::size_t c = 0; c < C; ++c) {
+                        *output_ptr = *input_ptr;
+                        input_ptr += in_C_s;
+                        output_ptr += out_C_s;
+                    }
+                } else {
+                    for (std::size_t c = 0; c < C; ++c) {
+                        *output_ptr = O(0);
+                        output_ptr += out_C_s;
+                    }
+                }
+            } else if (op.mode() == GridSample_Op::Mode::Linear) {
+                const std::int64_t x_inf = update_unnormalized_coord_with_padding(static_cast<std::int64_t>(std::floor(x)), in_H, op.paddingMode());
+                const std::int64_t x_sup = update_unnormalized_coord_with_padding(x_inf + 1, in_H, op.paddingMode());
+
+                const I* input_ptr_NC = input_ptr_N;
+                for (std::size_t c = 0; c < C; ++c) {
+                    const I f_inf = in_bound(x_inf, 0, in_H) ?
+                        input_ptr_NC[static_cast<std::size_t>(x_inf)*in_H_s] : I(0);
+                    const I f_sup = in_bound(x_sup, 0, in_H) ?
+                        input_ptr_NC[static_cast<std::size_t>(x_sup)*in_H_s] : I(0);
+
+                    *output_ptr = static_cast<O>(static_cast<I>(x - x_inf)*f_inf \
+                            + static_cast<I>(x_sup - x)*f_sup);
+
+                    input_ptr_NC += in_C_s;
+                    output_ptr += out_C_s;
+                }
+            } else if (op.mode() == GridSample_Op::Mode::Cubic) {
+                const std::int64_t x_inf = update_unnormalized_coord_with_padding(static_cast<std::int64_t>(std::floor(x)), in_H, op.paddingMode());
+                const std::int64_t x_sup = update_unnormalized_coord_with_padding(x_inf + 1, in_H, op.paddingMode());
+                const std::int64_t x_inf_inf = update_unnormalized_coord_with_padding(x_inf - 1, in_H, op.paddingMode());
+                const std::int64_t x_sup_sup = update_unnormalized_coord_with_padding(x_sup + 1, in_H, op.paddingMode());
+
+                const I x1 = static_cast<I>(x - static_cast<float>(x_inf));
+                const I x2 = x1 * x1;
+                const I x3 = x1 * x2;
+
+                const I* input_ptr_NC = input_ptr_N;
+                for (std::size_t c = 0; c < C; ++c) {
+                    const I f_inf_inf = in_bound(x_inf_inf, 0, in_H) ? input_ptr_NC[x_inf_inf*in_H_s] : I(0);
+                    const I f_inf = in_bound(x_inf, 0, in_H) ? input_ptr_NC[x_inf*in_H_s] : I(0);
+                    const I f_sup = in_bound(x_sup, 0, in_H) ? input_ptr_NC[x_sup*in_H_s] : I(0);
+                    const I f_sup_sup = in_bound(x_sup_sup, 0, in_H) ? input_ptr_NC[x_sup_sup*in_H_s] : I(0);
+
+                    const I m_inf = (f_sup - f_inf_inf) / I(2);
+                    const I m_sup = (f_sup_sup - f_inf) / I(2);
+
+                    *output_ptr = f_inf \
+                        + x1 * m_inf \
+                        + x2 * (3 * (f_sup - f_inf) - 2 * m_inf - m_sup) \
+                        + x3 * (2*(f_inf - f_sup) + m_inf + m_sup);
+
+                    input_ptr_NC += in_C_s;
+                    output_ptr += out_C_s;
+                }
+            }
+
+            grid_ptr += grid_H_s;
+        }
+
+        input_ptr_N += in_N_s;
+        grid_ptr_N += grid_N_s;
+        output_ptr_N += out_N_s;
+    }
+}
+
// Kernels registration to implementation entry point
// only accept 1st input with only 1 spatial feat. (nb dims = 1)
// Each REGISTRAR entry maps an input spec (any dtype/format, one spatial dim;
// -1 presumably means "any size" — TODO confirm against the registrar impl)
// plus an output dtype to a {ProdConso model, forward kernel, backward kernel}
// triplet. No backward kernel is provided (nullptr).
REGISTRAR(GridSampleImpl_cpu,
    {{{DataType::Any, DataFormat::Any, {{-1, -1}}}, {DataType::Any}}, {{DataType::Float16}}},
    {ProdConso::defaultModel, Aidge::GridSampleImpl1D_cpu_forward_kernel<half_float::half, half_float::half>, nullptr});
REGISTRAR(GridSampleImpl_cpu,
    {{{DataType::Any, DataFormat::Any, {{-1, -1}}}, {DataType::Any}}, {{DataType::Float32}}},
    {ProdConso::defaultModel, Aidge::GridSampleImpl1D_cpu_forward_kernel<float, float>, nullptr});
REGISTRAR(GridSampleImpl_cpu,
    {{{DataType::Any, DataFormat::Any, {{-1, -1}}}, {DataType::Any}}, {{DataType::Float64}}},
    {ProdConso::defaultModel, Aidge::GridSampleImpl1D_cpu_forward_kernel<double, double>, nullptr});
REGISTRAR(GridSampleImpl_cpu,
    {{{DataType::Any, DataFormat::Any, {{-1, -1}}}, {DataType::Any}}, {{DataType::Int32}}},
    {ProdConso::defaultModel, Aidge::GridSampleImpl1D_cpu_forward_kernel<int32_t, int32_t>, nullptr});
+
+
+/**
+ * @brief Forward kernel for 1D GridSample on CPU backend.
+ * @tparam I Input data type.
+ * @tparam O Output data type.
+ * @param params tuple of Attributes from the Operator
+ * @param inputDims Array of input dimensions.
+ * @param input_ const input Tensor.
+ * @param grid_ const grid Tensor.
+ * @param output_ Output Tensor.
+ */
+template <class I, class O>
+void GridSampleImpl2D_cpu_forward_kernel(const GridSample_Op& op,
+                            const std::shared_ptr<Tensor>& in0,
+                            const std::shared_ptr<Tensor>& in1,
+                            const std::shared_ptr<Tensor>& out)
+{
+    const I* input = static_cast<const I *>(in0->getImpl()->rawPtr());
+    const I* input_ptr = input;
+    float* const grid = static_cast<float* const>(in0->getImpl()->rawPtr());
+    float* grid_ptr = grid;
+    O* const output = static_cast<O* const>(out->getImpl()->rawPtr());
+
+    const std::size_t N = in0->dim(0);
+    const std::size_t C = in0->dim(1);
+    const std::size_t in_H = in0->dim(2);
+    const std::size_t in_W = in0->dim(3);
+    const std::size_t grid_H = in1->dim(1);
+    const std::size_t grid_W = in1->dim(2);
+
+    const std::size_t in_N_s = in0->stride(0);
+    const std::size_t in_C_s = in0->stride(1);
+    const std::size_t in_H_s = in0->stride(2);
+    const std::size_t in_W_s = in0->stride(3);
+    const std::size_t grid_N_s = in1->stride(0);
+    const std::size_t grid_H_s = in1->stride(1);
+    const std::size_t grid_W_s = in1->stride(2);
+    const std::size_t grid_Coord_s = in1->stride(3);
+    const std::size_t out_N_s = out->stride(0);
+    const std::size_t out_C_s = out->stride(1);
+    const std::size_t out_H_s = out->stride(2);
+    const std::size_t out_W_s = out->stride(3);
+
+
+    float* grid_ptr_N = grid;
+    const I* input_ptr_N = input;
+    O* output_ptr_N = output;
+    for (std::size_t n = 0; n < N; ++n) {
+        for (std::size_t grid_y = 0; grid_y < grid_H; ++grid_y) {
+            for (std::size_t grid_x = 0; grid_x < grid_W; ++grid_x) {
+                O* output_ptr = output_ptr_N + grid_y*out_H_s + grid_y*out_W_s;
+                grid_ptr = grid_ptr_N + grid_y*grid_H_s + grid_x*grid_W_s;
+                /*
+                * change grid_x coord to match padding_mode
+                * Change range from [-1, 1] to [0, H-1] or [-0.5, H-0.5] according to align_corners
+                * Handle computation of interpolation
+                *   any value outside bounds is considered 0
+                *   if nearest:
+                *   else if linear:
+                *   else if cubic:
+                *   else : nothing
+                */
+                float x = *grid_ptr;
+                float y = grid_ptr[grid_Coord_s];
+                x = update_normalized_coord_with_padding(x, op.paddingMode());
+                x = unnormalize_grid_sample_coord(x, in_W, op.alignCorners());
+                y = update_normalized_coord_with_padding(y, op.paddingMode());
+                y = unnormalize_grid_sample_coord(y, in_H, op.alignCorners());
+                if (op.mode() == GridSample_Op::Mode::Nearest) {
+                    const std::int64_t x_rounded = std::nearbyintf(x);
+                    const std::int64_t y_rounded = std::nearbyintf(y);
+
+                    if (in_bound(x_rounded, 0, in_W) && in_bound(y_rounded, 0, in_H)) {
+                        input_ptr = input_ptr_N + y_rounded*in_H_s + x_rounded*in_W_s;
+                        for (std::size_t c = 0; c < C; ++c) {
+                            *output_ptr = *input_ptr;
+                            input_ptr += in_C_s;
+                            output_ptr += out_C_s;
+                        }
+                    } else {
+                        for (std::size_t c = 0; c < C; ++c) {
+                            *output_ptr = O(0);
+                            output_ptr += out_C_s;
+                        }
+                    }
+                } else if (op.mode() == GridSample_Op::Mode::Linear) {
+                    const std::int64_t x_r = update_unnormalized_coord_with_padding(static_cast<std::int64_t>(std::floor(x)), in_W, op.paddingMode()); // right
+                    const std::int64_t x_l = update_unnormalized_coord_with_padding(x_r + 1, in_W, op.paddingMode()); // left
+
+                    const std::int64_t y_t = update_unnormalized_coord_with_padding(static_cast<std::int64_t>(std::floor(y)), in_H, op.paddingMode()); // top
+                    const std::int64_t y_b = update_unnormalized_coord_with_padding(y_t + 1, in_H, op.paddingMode()); // bottom
+
+                    const I* input_ptr_NC = input_ptr_N;
+                    for (std::size_t c = 0; c < C; ++c) {
+
+                        const I f_tr = (in_bound(x_r, 0, in_W) && in_bound(y_t, 0, in_H)) ?
+                            input_ptr_NC[static_cast<std::size_t>(y_t)*in_H_s
+                                         + static_cast<std::size_t>(x_r)*in_W_s]
+                                : I(0);
+                        const I f_tl = (in_bound(x_l, 0, in_W) && in_bound(y_t, 0, in_H)) ?
+                            input_ptr_NC[static_cast<std::size_t>(y_t)*in_H_s
+                                         + static_cast<std::size_t>(x_l)*in_W_s]
+                                : I(0);
+                        const I f_br = (in_bound(x_r, 0, in_W) && in_bound(y_b, 0, in_H)) ?
+                            input_ptr_NC[static_cast<std::size_t>(y_b)*in_H_s
+                                         + static_cast<std::size_t>(x_r)*in_W_s]
+                                : I(0);
+                        const I f_bl = (in_bound(x_l, 0, in_W) && in_bound(y_b, 0, in_H)) ?
+                            input_ptr_NC[static_cast<std::size_t>(y_b)*in_H_s
+                                         + static_cast<std::size_t>(x_l)*in_W_s]
+                                : I(0);
+
+                        // compute weighted sum of the 4 corners
+                        const I w_tr = static_cast<I>((y - static_cast<float>(y_t))*(static_cast<float>(x_r) - x));
+                        const I w_tl = static_cast<I>((y - static_cast<float>(y_t))*(x - static_cast<float>(x_l)));
+                        const I w_br = static_cast<I>((static_cast<float>(y_b) - y)*(static_cast<float>(x_r) - x));
+                        const I w_bl = static_cast<I>((static_cast<float>(y_b) - y)*(x - static_cast<float>(x_l)));
+
+                        *output_ptr = static_cast<O>(w_tr*f_tr + w_tl*f_tl + w_br*f_br + w_bl*f_bl);
+
+                        input_ptr_NC += in_C_s;
+                        output_ptr += out_C_s;
+                    }
+                } else if (op.mode() == GridSample_Op::Mode::Cubic) {
+                    /*
+                    *  .. .. .. .. .. ..
+                    *  .. 00 01 02 03 ..
+                    *  .. 10 11 12 13 ..
+                    *  .. 20 21 22 23 ..
+                    *  .. 30 31 32 33 ..
+                    *  .. .. .. .. .. ..
+                    */
+                    const std::int64_t x_1 = update_unnormalized_coord_with_padding(static_cast<std::int64_t>(std::floor(x)), in_W, op.paddingMode());
+                    const std::int64_t x_0 = update_unnormalized_coord_with_padding(x_1 - 1, in_W, op.paddingMode());
+                    const std::int64_t x_2 = update_unnormalized_coord_with_padding(x_1 + 1, in_W, op.paddingMode());
+                    const std::int64_t x_3 = update_unnormalized_coord_with_padding(x_1 + 2, in_W, op.paddingMode());
+
+                    const std::int64_t y_1 = update_unnormalized_coord_with_padding(static_cast<std::int64_t>(std::floor(y)), in_H, op.paddingMode());
+                    const std::int64_t y_0 = update_unnormalized_coord_with_padding(y_1 - 1, in_H, op.paddingMode());
+                    const std::int64_t y_2 = update_unnormalized_coord_with_padding(y_1 + 1, in_H, op.paddingMode());
+                    const std::int64_t y_3 = update_unnormalized_coord_with_padding(y_1 + 2, in_H, op.paddingMode());
+
+                    const I* input_ptr_NC = input_ptr_N;
+
+                    for (std::size_t c = 0; c < C; ++c) {
+                        const I f_00 = in_bound(x_0, 0, in_W) && in_bound(y_0, 0, in_H) ?
+                                        input_ptr_NC[x_0*in_W_s + y_0*in_H_s] : I(0);
+                        const I f_01 = in_bound(x_0, 0, in_W) && in_bound(y_1, 0, in_H) ?
+                                        input_ptr_NC[x_0*in_W_s + y_1*in_H_s] : I(0);
+                        const I f_02 = in_bound(x_0, 0, in_W) && in_bound(y_2, 0, in_H) ?
+                                        input_ptr_NC[x_0*in_W_s + y_2*in_H_s] : I(0);
+                        const I f_03 = in_bound(x_0, 0, in_W) && in_bound(y_3, 0, in_H) ?
+                                        input_ptr_NC[x_0*in_W_s + y_3*in_H_s] : I(0);
+                        const I f_10 = in_bound(x_1, 0, in_W) && in_bound(y_0, 0, in_H) ?
+                                        input_ptr_NC[x_1*in_W_s + y_0*in_H_s] : I(0);
+                        const I f_20 = in_bound(x_2, 0, in_W) && in_bound(y_0, 0, in_H) ?
+                                        input_ptr_NC[x_2*in_W_s + y_0*in_H_s] : I(0);
+                        const I f_30 = in_bound(x_3, 0, in_W) && in_bound(y_0, 0, in_H) ?
+                                        input_ptr_NC[x_3*in_W_s + y_0*in_H_s] : I(0);
+                        const I f_11 = in_bound(x_1, 0, in_W) && in_bound(y_1, 0, in_H) ?
+                                        input_ptr_NC[x_1*in_W_s + y_1*in_H_s] : I(0);
+                        const I f_12 = in_bound(x_1, 0, in_W) && in_bound(y_2, 0, in_H) ?
+                                        input_ptr_NC[x_1*in_W_s + y_2*in_H_s] : I(0);
+                        const I f_13 = in_bound(x_1, 0, in_W) && in_bound(y_3, 0, in_H) ?
+                                        input_ptr_NC[x_1*in_W_s + y_3*in_H_s] : I(0);
+                        const I f_21 = in_bound(x_2, 0, in_W) && in_bound(y_1, 0, in_H) ?
+                                        input_ptr_NC[x_2*in_W_s + y_1*in_H_s] : I(0);
+                        const I f_22 = in_bound(x_2, 0, in_W) && in_bound(y_2, 0, in_H) ?
+                                        input_ptr_NC[x_2*in_W_s + y_2*in_H_s] : I(0);
+                        const I f_23 = in_bound(x_2, 0, in_W) && in_bound(y_3, 0, in_H) ?
+                                        input_ptr_NC[x_2*in_W_s + y_3*in_H_s] : I(0);
+                        const I f_31 = in_bound(x_3, 0, in_W) && in_bound(y_1, 0, in_H) ?
+                                        input_ptr_NC[x_3*in_W_s + y_1*in_H_s] : I(0);
+                        const I f_32 = in_bound(x_3, 0, in_W) && in_bound(y_2, 0, in_H) ?
+                                        input_ptr_NC[x_3*in_W_s + y_2*in_H_s] : I(0);
+                        const I f_33 = in_bound(x_3, 0, in_W) && in_bound(y_3, 0, in_H) ?
+                                        input_ptr_NC[x_3*in_W_s + y_3*in_H_s] : I(0);
+
+                        const I mx_11 = (f_21 - f_01) / I(2);
+                        const I mx_12 = (f_22 - f_02) / I(2);
+                        const I mx_21 = (f_31 - f_11) / I(2);
+                        const I mx_22 = (f_32 - f_12) / I(2);
+
+                        const I my_11 = (f_12 - f_10) / I(2);
+                        const I my_12 = (f_13 - f_11) / I(2);
+                        const I my_21 = (f_22 - f_20) / I(2);
+                        const I my_22 = (f_23 - f_21) / I(2);
+
+                        const I mxy_11 = (f_22 - f_20 - f_02 - + f_00) / I(4);
+                        const I mxy_12 = (f_23 - f_21 - f_03 - + f_01) / I(4);
+                        const I mxy_21 = (f_32 - f_30 - f_12 - + f_10) / I(4);
+                        const I mxy_22 = (f_33 - f_31 - f_13 - + f_11) / I(4);
+
+                        const I a_00 = f_11;
+                        const I a_10 = mx_11;
+                        const I a_20 = I(3)*(f_21 - f_11) - I(2)*mx_11 - mx_21;
+                        const I a_30 = I(2)*(f_11 - f_21) + mx_11 + mx_21;
+                        const I a_01 = my_11;
+                        const I a_11 = mxy_11;
+                        const I a_21 = I(3)*(my_21 - my_11) - I(2)*mxy_11 - mxy_21;
+                        const I a_31 = I(2)*(my_11 - my_21) + mxy_11 + mxy_21;
+                        const I a_02 = I(3)*(f_12 - f_11) - I(2)*my_11 - my_12;
+                        const I a_12 = I(3)*(mx_12 - mx_11) - I(2)*mxy_11 - mxy_12;
+                        const I a_22 = I(9)*(f_11 + f_22 - f_21 - f_12) + I(3)*(I(2)*(mx_11 - mx_12 + my_11 - my_21) + mx_21 - mx_22 + my_12 - my_22) + mxy_22 + I(2)*(mxy_12 + mxy_21 + I(2)*mxy_11);
+                        const I a_32 = - mxy_12 - mxy_22 + I(2)*(my_22 - my_12 - mxy_11 - mxy_21 + I(2)*(my_21 - my_11) + I(3)*(f_21 + f_12 - f_11 - f_22)) + I(3)*(mx_12 + mx_22 - mx_11 - mx_21);
+                        const I a_03 = I(2)*(f_11 - f_12) + my_11 + my_12;
+                        const I a_13 = I(2)*(mx_11 - mx_12) + mxy_11 + mxy_12;
+                        const I a_23 = - mxy_21 - mxy_22 + I(2)*(-mx_21 + mx_22 - mxy_11 - mxy_12 + I(2)*(mx_12 - mx_11) + I(3)*(f_12 + f_21 - f_11 - f_22)) + I(3)*(my_21 + my_22 - my_11 - my_12);
+                        const I a_33 = mxy_11 + mxy_21 + mxy_12 + mxy_22 + I(2)*(mx_11 + mx_21 - mx_12 - mx_22 + my_11 - my_21 + my_12 - my_22 + I(2)*(f_11 - f_21 - f_12 + f_22));
+
+                        const I x2 = static_cast<I>(x*x);
+                        const I x3 = static_cast<I>(x*x*x);
+                        const I y2 = static_cast<I>(y*y);
+                        const I y3 = static_cast<I>(y*y*y);
+
+                        *output_ptr = static_cast<O>( \
+                            a_00 + a_10*x + a_20*x2 + a_30*x3 \
+                            + a_01*y + a_11*x*y + a_21*x2*y + a_31*x3*y \
+                            + a_02*y2 + a_12*x*y2 + a_22*x2*y2 + a_32*x3*y2 \
+                            + a_03*y3 + a_13*x*y3 + a_23*x2*y3 + a_33*x3*y3);
+
+                        input_ptr_NC += in_C_s;
+                        output_ptr += out_C_s;
+                    }
+                }
+            }
+        }
+
+        input_ptr_N += in_N_s;
+        grid_ptr_N += grid_N_s;
+        output_ptr_N += out_N_s;
+    }
+}
+
// Kernels registration to implementation entry point
// only accept 1st input with only 2 spatial feat. (nb dims = 2)
// Each REGISTRAR entry maps an input spec (any dtype/format, two spatial dims;
// -1 presumably means "any size" — TODO confirm against the registrar impl)
// plus an output dtype to a {ProdConso model, forward kernel, backward kernel}
// triplet. No backward kernel is provided (nullptr).
REGISTRAR(GridSampleImpl_cpu,
    {{{DataType::Any, DataFormat::Any, {{-1, -1}, {-1, -1}}}, {DataType::Any}}, {{DataType::Float16}}},
    {ProdConso::defaultModel, Aidge::GridSampleImpl2D_cpu_forward_kernel<half_float::half, half_float::half>, nullptr});
REGISTRAR(GridSampleImpl_cpu,
    {{{DataType::Any, DataFormat::Any, {{-1, -1}, {-1, -1}}}, {DataType::Any}}, {{DataType::Float32}}},
    {ProdConso::defaultModel, Aidge::GridSampleImpl2D_cpu_forward_kernel<float, float>, nullptr});
REGISTRAR(GridSampleImpl_cpu,
    {{{DataType::Any, DataFormat::Any, {{-1, -1}, {-1, -1}}}, {DataType::Any}}, {{DataType::Float64}}},
    {ProdConso::defaultModel, Aidge::GridSampleImpl2D_cpu_forward_kernel<double, double>, nullptr});
REGISTRAR(GridSampleImpl_cpu,
    {{{DataType::Any, DataFormat::Any, {{-1, -1}, {-1, -1}}}, {DataType::Any}}, {{DataType::Int32}}},
    {ProdConso::defaultModel, Aidge::GridSampleImpl2D_cpu_forward_kernel<int32_t, int32_t>, nullptr});
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_CONVIMPL_KERNELS_H_ */
diff --git a/include/aidge/backend/cpu/operator/LeakyReLUImpl.hpp b/include/aidge/backend/cpu/operator/LeakyReLUImpl.hpp
index c9ad909eee631189a81067eda076c0b8cbb13377..1e8c1a14435f53ad7a63b327944e0bb8c70c8661 100644
--- a/include/aidge/backend/cpu/operator/LeakyReLUImpl.hpp
+++ b/include/aidge/backend/cpu/operator/LeakyReLUImpl.hpp
@@ -16,47 +16,26 @@
 #include <tuple>
 #include <vector>
 
-#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
 #include "aidge/operator/LeakyReLU.hpp"
 #include "aidge/utils/Registrar.hpp"
 #include "aidge/utils/Types.h"
 #include "aidge/backend/cpu/data/GetCPUPtr.h"
 
 namespace Aidge {
-// compute kernel registry for forward and backward
-class LeakyReLUImplForward_cpu
-    : public Registrable<LeakyReLUImplForward_cpu,
-                        std::tuple<DataType, DataType>,
-                        void(const float,
-                            std::size_t,
-                            const void*,
-                            void*)> {};
-class LeakyReLUImplBackward_cpu
-    : public Registrable<LeakyReLUImplBackward_cpu,
-                        std::tuple<DataType, DataType>,
-                        void(const float,
-                            std::size_t,
-                            const void*,
-                            void*)> {};
-
-class LeakyReLUImpl_cpu : public OperatorImpl {
-public:
-    LeakyReLUImpl_cpu(const LeakyReLU_Op& op) : OperatorImpl(op, "cpu") {}
-
-    static std::unique_ptr<LeakyReLUImpl_cpu> create(const LeakyReLU_Op& op) {
-        return std::make_unique<LeakyReLUImpl_cpu>(op);
-    }
-
-    Elts_t getNbRequiredProtected(const IOIndex_t inputIdx) const override final;
-
-    void forward() override final;
-
-    void backward() override final;
-};
-
-namespace {
-static Registrar<LeakyReLU_Op> registrarLeakyReLUImpl_cpu("cpu", Aidge::LeakyReLUImpl_cpu::create);
-}
+// Operator implementation entry point for the backend
+using LeakyReLUImpl_cpu = OperatorImpl_cpu<LeakyReLU_Op,
+    void(const float,
+        std::size_t,
+        const void*,
+        void*),
+    void(const float,
+        std::size_t,
+        const void*,
+        void*)>;
+
+// Implementation entry point registration to Operator
+REGISTRAR(LeakyReLU_Op, "cpu", Aidge::LeakyReLUImpl_cpu::create);
 }  // namespace Aidge
 
 #endif /* AIDGE_CPU_OPERATOR_LEAKYRELUIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/LeakyReLUImpl_backward_kernels.hpp b/include/aidge/backend/cpu/operator/LeakyReLUImpl_backward_kernels.hpp
deleted file mode 100644
index e308d940890101ad396c7ed20541bbc4f8b035cf..0000000000000000000000000000000000000000
--- a/include/aidge/backend/cpu/operator/LeakyReLUImpl_backward_kernels.hpp
+++ /dev/null
@@ -1,45 +0,0 @@
-/********************************************************************************
- * Copyright (c) 2023 CEA-List
- *
- * This program and the accompanying materials are made available under the
- * terms of the Eclipse Public License 2.0 which is available at
- * http://www.eclipse.org/legal/epl-2.0.
- *
- * SPDX-License-Identifier: EPL-2.0
- *
- ********************************************************************************/
-
-#ifndef AIDGE_CPU_OPERATOR_LEAKYRELUIMPL_BACKWARD_KERNEL_H_
-#define AIDGE_CPU_OPERATOR_LEAKYRELUIMPL_BACKWARD_KERNEL_H_
-
-#include "aidge/utils/Registrar.hpp"
-
-#include "aidge/backend/cpu/operator/LeakyReLUImpl.hpp"
-
-namespace Aidge {
-template <class I, class O>
-void LeakyReLUImpl_cpu_backward_kernel(const float negativeSlope_,
-                                     std::size_t inputLenght,
-                                     const void* input_,
-                                     void* output_) {
-
-    const I* input = static_cast<const I*>(input_);
-    O* output = static_cast<O*>(output_);
-    const I negativeSlope = static_cast<const I>(negativeSlope_);
-
-    for (std::size_t i = 0; i < inputLenght; ++i) {
-        output[i] = (input[i] > 0) ? input[i] : negativeSlope*input[i];
-    }
-}
-
-namespace {
-static Registrar<LeakyReLUImplBackward_cpu> registrarLeakyReLUImplBackward_cpu_Float32(
-        {DataType::Float32, DataType::Float32}, Aidge::LeakyReLUImpl_cpu_backward_kernel<float, float>);
-static Registrar<LeakyReLUImplBackward_cpu> registrarLeakyReLUImplBackward_cpu_Int32(
-        {DataType::Int32, DataType::Int32}, Aidge::LeakyReLUImpl_cpu_backward_kernel<int, int>);
-static Registrar<LeakyReLUImplBackward_cpu> registrarLeakyReLUImplBackward_cpu_Float64(
-        {DataType::Float64, DataType::Float64}, Aidge::LeakyReLUImpl_cpu_backward_kernel<double, double>);
-}  // namespace
-}  // namespace Aidge
-
-#endif /* AIDGE_CPU_OPERATOR_LEAKYRELUIMPL_BACKWARD_KERNEL_H_ */
diff --git a/include/aidge/backend/cpu/operator/LeakyReLUImpl_forward_kernels.hpp b/include/aidge/backend/cpu/operator/LeakyReLUImpl_forward_kernels.hpp
deleted file mode 100644
index 450d0bf4ace4879f90e0104e14b5bf61366e96c2..0000000000000000000000000000000000000000
--- a/include/aidge/backend/cpu/operator/LeakyReLUImpl_forward_kernels.hpp
+++ /dev/null
@@ -1,45 +0,0 @@
-/********************************************************************************
- * Copyright (c) 2023 CEA-List
- *
- * This program and the accompanying materials are made available under the
- * terms of the Eclipse Public License 2.0 which is available at
- * http://www.eclipse.org/legal/epl-2.0.
- *
- * SPDX-License-Identifier: EPL-2.0
- *
- ********************************************************************************/
-
-#ifndef AIDGE_CPU_OPERATOR_LEAKYRELUIMPL_FORWARD_KERNEL_H_
-#define AIDGE_CPU_OPERATOR_LEAKYRELUIMPL_FORWARD_KERNEL_H_
-
-#include "aidge/utils/Registrar.hpp"
-
-#include "aidge/backend/cpu/operator/LeakyReLUImpl.hpp"
-
-namespace Aidge {
-template <class I, class O>
-void LeakyReLUImpl_cpu_forward_kernel(const float negativeSlope_,
-                                     std::size_t inputLenght,
-                                     const void* input_,
-                                     void* output_) {
-
-    const I* input = static_cast<const I*>(input_);
-    O* output = static_cast<O*>(output_);
-    const I negativeSlope = static_cast<const I>(negativeSlope_);
-
-    for (std::size_t i = 0; i < inputLenght; ++i) {
-        output[i] = (input[i] >= 0) ? input[i] : input[i] * negativeSlope;
-    }
-}
-
-namespace {
-static Registrar<LeakyReLUImplForward_cpu> registrarLeakyReLUImplForward_cpu_Float32(
-        {DataType::Float32, DataType::Float32}, Aidge::LeakyReLUImpl_cpu_forward_kernel<float, float>);
-static Registrar<LeakyReLUImplForward_cpu> registrarLeakyReLUImplForward_cpu_Int32(
-        {DataType::Int32, DataType::Int32}, Aidge::LeakyReLUImpl_cpu_forward_kernel<int, int>);
-static Registrar<LeakyReLUImplForward_cpu> registrarLeakyReLUImplForward_cpu_Float64(
-        {DataType::Float64, DataType::Float64}, Aidge::LeakyReLUImpl_cpu_forward_kernel<double, double>);
-}  // namespace
-}  // namespace Aidge
-
-#endif /* AIDGE_CPU_OPERATOR_LEAKYRELUIMPL_FORWARD_KERNEL_H_ */
diff --git a/include/aidge/backend/cpu/operator/LeakyReLUImpl_kernels.hpp b/include/aidge/backend/cpu/operator/LeakyReLUImpl_kernels.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..bc856f703aee8ba422887d43cb96db2132fc4603
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/LeakyReLUImpl_kernels.hpp
@@ -0,0 +1,62 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_LEAKYRELUIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_LEAKYRELUIMPL_KERNELS_H_
+
+#include "aidge/utils/Registrar.hpp"
+
+#include "aidge/backend/cpu/operator/LeakyReLUImpl.hpp"
+
+namespace Aidge {
/**
 * @brief Element-wise LeakyReLU forward kernel.
 *
 * Non-negative elements are copied through unchanged; negative elements are
 * multiplied by the (type-converted) negative slope.
 *
 * @tparam I Input element type.
 * @tparam O Output element type.
 * @param negativeSlope_ Slope applied to negative inputs (converted to I).
 * @param inputLenght Number of elements to process.
 * @param input_ Pointer to the input buffer (read as I*).
 * @param output_ Pointer to the output buffer (written as O*).
 */
template <class I, class O>
void LeakyReLUImpl_cpu_forward_kernel(const float negativeSlope_,
                                     std::size_t inputLenght,
                                     const void* input_,
                                     void* output_) {

    const I slope = static_cast<I>(negativeSlope_);
    const I* const in = static_cast<const I*>(input_);
    O* const out = static_cast<O*>(output_);

    std::size_t i = 0;
    while (i < inputLenght) {
        const I v = in[i];
        out[i] = (v >= 0) ? v : v * slope;
        ++i;
    }
}
+
/**
 * @brief Element-wise LeakyReLU backward kernel.
 *
 * Strictly-positive elements are copied through unchanged; zero and negative
 * elements are multiplied by the (type-converted) negative slope.
 *
 * NOTE(review): the signature carries no incoming-gradient tensor, so this
 * kernel applies the LeakyReLU formula to its single input rather than
 * scaling a gradient — confirm the intended backward semantics upstream.
 *
 * @tparam I Input element type.
 * @tparam O Output element type.
 * @param negativeSlope_ Slope applied to non-positive inputs (converted to I).
 * @param inputLenght Number of elements to process.
 * @param input_ Pointer to the input buffer (read as I*).
 * @param output_ Pointer to the output buffer (written as O*).
 */
template <class I, class O>
void LeakyReLUImpl_cpu_backward_kernel(const float negativeSlope_,
                                     std::size_t inputLenght,
                                     const void* input_,
                                     void* output_) {

    const I slope = static_cast<I>(negativeSlope_);
    const I* src = static_cast<const I*>(input_);
    O* dst = static_cast<O*>(output_);

    for (const I* const end = src + inputLenght; src != end; ++src, ++dst) {
        const I v = *src;
        *dst = (v > 0) ? v : slope * v;
    }
}
+
// Kernels registration to implementation entry point
// One registration per supported element type; each entry provides a
// {ProdConso model, forward kernel, backward kernel} triplet.
// inPlaceModel: the name suggests the element-wise output may reuse the
// input buffer — TODO confirm against ProdConso's documentation.
REGISTRAR(LeakyReLUImpl_cpu,
    {DataType::Float32},
    {ProdConso::inPlaceModel, Aidge::LeakyReLUImpl_cpu_forward_kernel<float, float>, Aidge::LeakyReLUImpl_cpu_backward_kernel<float, float>});
REGISTRAR(LeakyReLUImpl_cpu,
    {DataType::Float64},
    {ProdConso::inPlaceModel, Aidge::LeakyReLUImpl_cpu_forward_kernel<double, double>, Aidge::LeakyReLUImpl_cpu_backward_kernel<double, double>});
REGISTRAR(LeakyReLUImpl_cpu,
    {DataType::Int32},
    {ProdConso::inPlaceModel, Aidge::LeakyReLUImpl_cpu_forward_kernel<int32_t, int32_t>, Aidge::LeakyReLUImpl_cpu_backward_kernel<int32_t, int32_t>});
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_LEAKYRELUIMPL_KERNELS_H_ */
diff --git a/include/aidge/backend/cpu/operator/LnImpl.hpp b/include/aidge/backend/cpu/operator/LnImpl.hpp
index faa03855a4f881f2a644ebc4023871b7acd6275c..d48a7ae437d9ed1c7769d3628691993c1e9dcb90 100755
--- a/include/aidge/backend/cpu/operator/LnImpl.hpp
+++ b/include/aidge/backend/cpu/operator/LnImpl.hpp
@@ -12,7 +12,7 @@
 #ifndef AIDGE_CPU_OPERATOR_LNIMPL_H_
 #define AIDGE_CPU_OPERATOR_LNIMPL_H_
 
-#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
 #include "aidge/operator/Ln.hpp"
 #include "aidge/utils/Registrar.hpp"
 #include "aidge/utils/Types.h"
@@ -21,34 +21,13 @@
 #include <vector>
 
 namespace Aidge {
-// class Ln_Op;
+// Operator implementation entry point for the backend
+using LnImpl_cpu = OperatorImpl_cpu<Ln_Op,
+    void(const std::size_t, const void*, void*),
+    void(const std::size_t, const void*, const void*, void*)>;
 
-// compute kernel registry for forward and backward
-class LnImplForward_cpu
-    : public Registrable<LnImplForward_cpu, std::tuple<DataType, DataType>, void(const std::size_t, const void*, void*)> {
-};
-class LnImplBackward_cpu
-    : public Registrable<LnImplBackward_cpu, std::tuple<DataType, DataType, DataType>, void(const std::size_t, const void*, const void*, void*)> {
-};
-
-class LnImpl_cpu : public OperatorImpl {
-public:
-    LnImpl_cpu(const Ln_Op& op) : OperatorImpl(op, "cpu") {}
-
-    static std::unique_ptr<LnImpl_cpu> create(const Ln_Op& op) {
-        return std::make_unique<LnImpl_cpu>(op);
-    }
-
-    Elts_t getNbRequiredProtected(const IOIndex_t inputIdx) const override final;
-	
-    void forward() override final;
-
-    void backward() override final;
-};
-
-namespace {
-static Registrar<Ln_Op> registrarLnImpl_cpu("cpu", Aidge::LnImpl_cpu::create);
-}
+// Implementation entry point registration to Operator
+REGISTRAR(Ln_Op, "cpu", Aidge::LnImpl_cpu::create);
 }  // namespace Aidge
 
 #endif /* AIDGE_CPU_OPERATOR_LNIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/LnImpl_forward_kernels.hpp b/include/aidge/backend/cpu/operator/LnImpl_forward_kernels.hpp
deleted file mode 100755
index ebb975512a6e7c0f7225c305372f0ec6e7060786..0000000000000000000000000000000000000000
--- a/include/aidge/backend/cpu/operator/LnImpl_forward_kernels.hpp
+++ /dev/null
@@ -1,47 +0,0 @@
-/********************************************************************************
- * Copyright (c) 2023 CEA-List
- *
- * This program and the accompanying materials are made available under the
- * terms of the Eclipse Public License 2.0 which is available at
- * http://www.eclipse.org/legal/epl-2.0.
- *
- * SPDX-License-Identifier: EPL-2.0
- *
- ********************************************************************************/
-
-#ifndef AIDGE_CPU_OPERATOR_LNIMPL_FORWARD_KERNEL_H_
-#define AIDGE_CPU_OPERATOR_LNIMPL_FORWARD_KERNEL_H_
-
-#include "aidge/utils/Registrar.hpp"
-
-#include "aidge/backend/cpu/operator/LnImpl.hpp"
-
-namespace Aidge {
-template <class I, class O>
-void LnImpl_cpu_forward_kernel(std::size_t inputLenght,
-                               const void* input_,
-                               void* output_) {
-
-    const I* input = static_cast<const I*>(input_);
-    O* output = static_cast<O*>(output_);
-	const float eps = 1.0e-20f;
-
-//#pragma omp parallel for if (inputLenght > 1024)
-    for (std::size_t i = 0; i < inputLenght; ++i) {
-		if (input[i] > I(eps)) {
-			output[i] = std::log(input[i]);
-		} else {
-			output[i] = std::log(I(eps));
-		}
-    }
-}
-
-namespace {
-static Registrar<LnImplForward_cpu> registrarLnImplForward_cpu_Float32(
-        {DataType::Float32, DataType::Float32}, Aidge::LnImpl_cpu_forward_kernel<float, float>);
-static Registrar<LnImplForward_cpu> registrarLnImplForward_cpu_Float64(
-        {DataType::Float64, DataType::Float64}, Aidge::LnImpl_cpu_forward_kernel<double, double>);
-}  // namespace
-}  // namespace Aidge
-
-#endif /* AIDGE_CPU_OPERATOR_LNIMPL_FORWARD_KERNEL_H_ */
diff --git a/include/aidge/backend/cpu/operator/LnImpl_backward_kernels.hpp b/include/aidge/backend/cpu/operator/LnImpl_kernels.hpp
similarity index 50%
rename from include/aidge/backend/cpu/operator/LnImpl_backward_kernels.hpp
rename to include/aidge/backend/cpu/operator/LnImpl_kernels.hpp
index 5fb82e35f8855d9d6e2eb85e9ab380c9f1fc9b90..b30b05bb806de08d4e70c67e66979fb3138980df 100755
--- a/include/aidge/backend/cpu/operator/LnImpl_backward_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/LnImpl_kernels.hpp
@@ -1,50 +1,67 @@
-/********************************************************************************
- * Copyright (c) 2023 CEA-List
- *
- * This program and the accompanying materials are made available under the
- * terms of the Eclipse Public License 2.0 which is available at
- * http://www.eclipse.org/legal/epl-2.0.
- *
- * SPDX-License-Identifier: EPL-2.0
- *
- ********************************************************************************/
-
-#ifndef AIDGE_CPU_OPERATOR_LNIMPL_BACKWARD_KERNEL_H_
-#define AIDGE_CPU_OPERATOR_LNIMPL_BACKWARD_KERNEL_H_
-
-#include <cstddef>  // std::size_t
-
-#include "aidge/backend/cpu/operator/LnImpl.hpp"
-#include "aidge/utils/Registrar.hpp"
-
-namespace Aidge {
-template <class I, class GI, class GO>
-void LnImpl_cpu_backward_kernel(const std::size_t inputLenght,
-                                const void* input_, const void* grad_output_,
-	                            void* grad_input_) {
-						 
-    const I* input = static_cast<const I*>(input_);
-    const GO* grad_output = static_cast<const GO*>(grad_output_);
-    GI* grad_input = static_cast<GI*>(grad_input_);
-	const float eps = 1.0e-20f;
-	
-    for (std::size_t i = 0; i < inputLenght; ++i) {
-		if (input[i] > I(eps)) {
-			grad_input[i] = grad_output[i] / input[i];
-		} else {
-			grad_input[i] = GI(0);
-		}
-    }
-}
-
-namespace {
-static Registrar<LnImplBackward_cpu> registrarLnImplBackward_cpu_Float32(
-    {DataType::Float32, DataType::Float32, DataType::Float32},
-    Aidge::LnImpl_cpu_backward_kernel<float, float, float>);	
-static Registrar<LnImplBackward_cpu> registrarLnImplBackward_cpu_Float64(
-    {DataType::Float64, DataType::Float64, DataType::Float64},
-    Aidge::LnImpl_cpu_backward_kernel<double, double, double>);
-}  // namespace
-}  // namespace Aidge
-
-#endif /* AIDGE_CPU_OPERATOR_LNIMPL_BACKWARD_KERNEL_H_ */
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_LNIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_LNIMPL_KERNELS_H_
+
+#include "aidge/utils/Registrar.hpp"
+
+#include "aidge/backend/cpu/operator/LnImpl.hpp"
+
+namespace Aidge {
template <class I, class O>
void LnImpl_cpu_forward_kernel(std::size_t inputLenght,
                               const void* input_,
                               void* output_) {
    // Element-wise natural logarithm. Inputs are clamped to a small positive
    // epsilon so ln() is never evaluated at zero or a negative value.
    const I* in = static_cast<const I*>(input_);
    O* out = static_cast<O*>(output_);
    constexpr float eps = 1.0e-20f;

//#pragma omp parallel for if (inputLenght > 1024)
    for (std::size_t i = 0; i < inputLenght; ++i) {
        const I clamped = (in[i] > I(eps)) ? in[i] : I(eps);
        out[i] = std::log(clamped);
    }
}
+
template <class I, class GI, class GO>
void LnImpl_cpu_backward_kernel(const std::size_t inputLenght,
                                const void* input_, const void* grad_output_,
	                            void* grad_input_) {
    // d/dx ln(x) = 1/x, so the input gradient is grad_output / input.
    // Inputs at or below epsilon were clamped in the forward pass; their
    // gradient is defined to be zero.
    const I* in = static_cast<const I*>(input_);
    const GO* gradOut = static_cast<const GO*>(grad_output_);
    GI* gradIn = static_cast<GI*>(grad_input_);
    constexpr float eps = 1.0e-20f;

    for (std::size_t i = 0; i < inputLenght; ++i) {
        gradIn[i] = (in[i] > I(eps)) ? static_cast<GI>(gradOut[i] / in[i]) : GI(0);
    }
}
+
+// Kernels registration to implementation entry point
+REGISTRAR(LnImpl_cpu,
+    {DataType::Float32},
+    {ProdConso::inPlaceModel, Aidge::LnImpl_cpu_forward_kernel<float, float>, Aidge::LnImpl_cpu_backward_kernel<float, float, float>});
+REGISTRAR(LnImpl_cpu,
+    {DataType::Float64},
+    {ProdConso::inPlaceModel, Aidge::LnImpl_cpu_forward_kernel<double, double>, Aidge::LnImpl_cpu_backward_kernel<double, double, double>});
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_LNIMPL_KERNELS_H_ */
diff --git a/include/aidge/backend/cpu/operator/MatMulImpl.hpp b/include/aidge/backend/cpu/operator/MatMulImpl.hpp
index e4b76d64baadbcb1baa7d24180c4bb13ed47215b..c07aa5f8ffa62f5fffe3ca02638cc3c66cdaeedb 100644
--- a/include/aidge/backend/cpu/operator/MatMulImpl.hpp
+++ b/include/aidge/backend/cpu/operator/MatMulImpl.hpp
@@ -16,37 +16,20 @@
 #include <memory>
 #include <vector>
 
-#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
 #include "aidge/operator/MatMul.hpp"
 #include "aidge/utils/Registrar.hpp"
 #include "aidge/utils/Types.h"
 #include "aidge/backend/cpu/data/GetCPUPtr.h"
 
 namespace Aidge {
+// Operator implementation entry point for the backend
+using MatMulImpl_cpu = OperatorImpl_cpu<MatMul_Op,
+    void(const std::size_t, const std::size_t, const std::size_t,
+                              const void *, const void *, void *)>;
 
-class MatMulImplForward_cpu
-    : public Registrable<MatMulImplForward_cpu, std::tuple<DataType, DataType>,
-                         void(const std::size_t, const std::size_t, const std::size_t,
-                              const void *, const void *, void *)> {};
-class MatMulImplBackward_cpu
-    : public Registrable<MatMulImplBackward_cpu, std::tuple<DataType, DataType>,
-                         void(const std::vector<DimSize_t>&, const std::vector<DimSize_t>&,
-                              const void *, const void *, void *)> {};
-
-class MatMulImpl_cpu : public OperatorImpl {
-public:
-    MatMulImpl_cpu(const MatMul_Op &op): OperatorImpl(op, "cpu") {}
-
-    static std::unique_ptr<MatMulImpl_cpu> create(const MatMul_Op &op) {
-        return std::make_unique<MatMulImpl_cpu>(op);
-    }
-
-    void forward() override;
-};
-
-namespace {
-static Registrar<MatMul_Op> registrarMatMulImpl_cpu("cpu", Aidge::MatMulImpl_cpu::create);
-}
+// Implementation entry point registration to Operator
+REGISTRAR(MatMul_Op, "cpu", Aidge::MatMulImpl_cpu::create);
 }  // namespace Aidge
 
 #endif /* AIDGE_CPU_OPERATOR_MATMULIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/MatMulImpl_forward_kernels.hpp b/include/aidge/backend/cpu/operator/MatMulImpl_forward_kernels.hpp
deleted file mode 100644
index 5045580fa599aac64f2c1414bfdf2b87ea57e313..0000000000000000000000000000000000000000
--- a/include/aidge/backend/cpu/operator/MatMulImpl_forward_kernels.hpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/********************************************************************************
- * Copyright (c) 2023 CEA-List
- *
- * This program and the accompanying materials are made available under the
- * terms of the Eclipse Public License 2.0 which is available at
- * http://www.eclipse.org/legal/epl-2.0.
- *
- * SPDX-License-Identifier: EPL-2.0
- *
- ********************************************************************************/
-
-#ifndef AIDGE_CPU_OPERATOR_MATMULIMPL_FORWARD_KERNEL_H_
-#define AIDGE_CPU_OPERATOR_MATMULIMPL_FORWARD_KERNEL_H_
-
-#include "aidge/backend/cpu/operator/MatMulImpl.hpp"
-
-namespace Aidge {
-
-template <class I, class O>
-void MatMulImpl_cpu_forward_kernel(const std::size_t n, const std::size_t k, const std::size_t m,
-                                    const void* input1_, const void* input2_, void* output_) {
-    // FIXME: missing MatMul parameters as arguments
-    const I* input1 = static_cast<const I*>(input1_);
-    const I* input2 = static_cast<const I*>(input2_);
-    O* output = static_cast<O*>(output_);
-
-    for (std::size_t i = 0; i < n; ++i) {
-        for (std::size_t j = 0; j < m; ++j) {
-            O sum = O(0);
-            for (std::size_t l = 0; l < k; ++l) {
-                sum += static_cast<O>(input1[i*k + l] * input2[l*m + j]);
-            }
-            output[i*m + j] = sum;
-        }
-    }
-}
-
-namespace {
-static Registrar<MatMulImplForward_cpu> registrarMatMulImpl2DForward_cpu_Float32(
-        {DataType::Float32, DataType::Float32},
-        Aidge::MatMulImpl_cpu_forward_kernel<float, float>);
-static Registrar<MatMulImplForward_cpu> registrarMatMulImpl2DForward_cpu_Int32(
-        {DataType::Int32, DataType::Int32},
-        Aidge::MatMulImpl_cpu_forward_kernel<int, int>);
-static Registrar<MatMulImplForward_cpu> registrarMatMulImpl2DForward_cpu_Float64(
-        {DataType::Float64, DataType::Float64},
-        Aidge::MatMulImpl_cpu_forward_kernel<double, double>);
-}  // namespace
-
-}  // namespace Aidge
-
-#endif /* AIDGE_CPU_OPERATOR_MATMULIMPL_FORWARD_KERNEL_H_ */
diff --git a/include/aidge/backend/cpu/operator/MatMulImpl_kernels.hpp b/include/aidge/backend/cpu/operator/MatMulImpl_kernels.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..5fc13baf49b1d0606eb4af5a54eec83fa5dce22a
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/MatMulImpl_kernels.hpp
@@ -0,0 +1,50 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_MATMULIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_MATMULIMPL_KERNELS_H_
+
+#include "aidge/backend/cpu/operator/MatMulImpl.hpp"
+
+namespace Aidge {
+
template <class I, class O>
void MatMulImpl_cpu_forward_kernel(const std::size_t n, const std::size_t k, const std::size_t m,
                                    const void* input1_, const void* input2_, void* __restrict output_) {
    // Dense (n x k) * (k x m) matrix product into an (n x m) output.
    // FIXME: missing MatMul parameters as arguments
    const I* input1 = static_cast<const I*>(input1_);
    const I* input2 = static_cast<const I*>(input2_);
    // __restrict goes on the declared pointer: putting it inside the
    // static_cast type-id is non-standard and rejected by some compilers.
    O* __restrict output = static_cast<O*>(output_);

    // Type-safe zeroing instead of std::memset: memset writes raw bytes (its
    // value argument is an int, so O(0) was misleading) and <cstring> was not
    // included by this header.
    for (std::size_t idx = 0, end = n * m; idx < end; ++idx) {
        output[idx] = O(0);
    }

    // i-l-j loop order: the innermost loop walks input2 and output
    // contiguously, which is cache-friendlier than the classic i-j-l order.
    for (std::size_t i = 0; i < n; ++i) {
        for (std::size_t l = 0; l < k; ++l) {
            for (std::size_t j = 0; j < m; ++j) {
                output[i*m + j] += static_cast<O>(input1[i*k + l] * input2[l*m + j]);
            }
        }
    }
}
+
+// Kernels registration to implementation entry point
+REGISTRAR(MatMulImpl_cpu,
+    {DataType::Float32},
+    {ProdConso::defaultModel, Aidge::MatMulImpl_cpu_forward_kernel<float, float>, nullptr});
+REGISTRAR(MatMulImpl_cpu,
+    {DataType::Float64},
+    {ProdConso::defaultModel, Aidge::MatMulImpl_cpu_forward_kernel<double, double>, nullptr});
+REGISTRAR(MatMulImpl_cpu,
+    {DataType::Int32},
+    {ProdConso::defaultModel, Aidge::MatMulImpl_cpu_forward_kernel<int32_t, int32_t>, nullptr});
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_MATMULIMPL_KERNELS_H_ */
diff --git a/include/aidge/backend/cpu/operator/MaxPoolingImpl.hpp b/include/aidge/backend/cpu/operator/MaxPoolingImpl.hpp
index 4dd30e1fb939837f6861313eda04d7d05f3c8110..68cc3621514de97d9837e10bcf90218abe559aaa 100644
--- a/include/aidge/backend/cpu/operator/MaxPoolingImpl.hpp
+++ b/include/aidge/backend/cpu/operator/MaxPoolingImpl.hpp
@@ -17,51 +17,25 @@
 #include <tuple>
 #include <vector>
 
-#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
 #include "aidge/operator/MaxPooling.hpp"
 #include "aidge/utils/Registrar.hpp"
 #include "aidge/utils/Types.h"
 #include "aidge/backend/cpu/data/GetCPUPtr.h"
 
 namespace Aidge {
-// class MaxPooling_Op;
-
-// compute kernel registry for forward and backward
-class MaxPoolingImpl2DForward_cpu
-    : public Registrable<MaxPoolingImpl2DForward_cpu,
-                        std::tuple<DataType, DataType>,
-                        void(const std::array<DimSize_t, 2>&,
-                            const std::array<DimSize_t, 2>&,
-                            const bool,
-                            const std::array<DimSize_t, 4> &,
-                            const void *,
-                            void *)> {};
-class MaxPoolingImpl2DBackward_cpu
-    : public Registrable<MaxPoolingImpl2DBackward_cpu,
-                        std::tuple<DataType, DataType>,
-                        void(const std::array<DimSize_t, 2>&,
+// Operator implementation entry point for the backend
+using MaxPooling2D_Op = MaxPooling_Op<2>;
+using MaxPoolingImpl2D_cpu = OperatorImpl_cpu<MaxPooling_Op<2>,
+    void(const std::array<DimSize_t, 2>&,
                             const std::array<DimSize_t, 2>&,
                             const bool,
                             const std::array<DimSize_t, 4> &,
                             const void *,
-                            void *)> {};
-
-class MaxPoolingImpl2D_cpu : public OperatorImpl {
-public:
-    MaxPoolingImpl2D_cpu(const MaxPooling_Op<2> &op) : OperatorImpl(op, "cpu") {}
-
-    static std::unique_ptr<MaxPoolingImpl2D_cpu> create(const MaxPooling_Op<2> &op) {
-        return std::make_unique<MaxPoolingImpl2D_cpu>(op);
-    }
-
-    Elts_t getNbRequiredProtected(const IOIndex_t inputIdx) const override final;
-    void forward() override;
-};
+                            void *)>;
 
-namespace {
-// add cpu backend to MaxPooling_Op<2> implementation registry
-static Registrar<MaxPooling_Op<2>> registrarMaxPoolingImpl2D_cpu("cpu", Aidge::MaxPoolingImpl2D_cpu::create);
-}  // namespace
+// Implementation entry point registration to Operator
+REGISTRAR(MaxPooling2D_Op, "cpu", Aidge::MaxPoolingImpl2D_cpu::create);
 }  // namespace Aidge
 
 #endif /* AIDGE_CPU_OPERATOR_MaxPOOLINGIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/MaxPoolingImpl_forward_kernels.hpp b/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
similarity index 91%
rename from include/aidge/backend/cpu/operator/MaxPoolingImpl_forward_kernels.hpp
rename to include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
index 79a7bd154f4d4e19a71d719597992466c37c6a9f..7b6f04f141eb701849a8d436561bcf9e37471cfa 100644
--- a/include/aidge/backend/cpu/operator/MaxPoolingImpl_forward_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
@@ -9,8 +9,8 @@
  *
  ********************************************************************************/
 
-#ifndef AIDGE_CPU_OPERATOR_MaxPOOLINGIMPL_FORWARD_KERNEL_H_
-#define AIDGE_CPU_OPERATOR_MaxPOOLINGIMPL_FORWARD_KERNEL_H_
+#ifndef AIDGE_CPU_OPERATOR_MaxPOOLINGIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_MaxPOOLINGIMPL_KERNELS_H_
 
 #include <array>
 #include <cmath>
@@ -199,17 +199,16 @@ void N2D2::PoolCell_Frame_Kernels::forwardMax(const T* alpha,
 
 */
 
-namespace {
-static Registrar<MaxPoolingImpl2DForward_cpu> registrarMaxPoolingImpl2DForward_cpu_Float32(
-        std::tuple<DataType, DataType>({DataType::Float32, DataType::Float32}),
-        Aidge::MaxPoolingImpl2D_cpu_forward_kernel<float, float>);
-static Registrar<MaxPoolingImpl2DForward_cpu> registrarMaxPoolingImpl2DForward_cpu_Int32(
-        {DataType::Int32, DataType::Int32},
-        Aidge::MaxPoolingImpl2D_cpu_forward_kernel<int, int>);
-static Registrar<MaxPoolingImpl2DForward_cpu> registrarMaxPoolingImpl2DForward_cpu_Float64(
-        {DataType::Float64, DataType::Float64},
-        Aidge::MaxPoolingImpl2D_cpu_forward_kernel<double, double>);
-}  // namespace
+// Kernels registration to implementation entry point
+REGISTRAR(MaxPoolingImpl2D_cpu,
+    {DataType::Float32},
+    {ProdConso::inPlaceModel, Aidge::MaxPoolingImpl2D_cpu_forward_kernel<float, float>, nullptr});
+REGISTRAR(MaxPoolingImpl2D_cpu,
+    {DataType::Float64},
+    {ProdConso::inPlaceModel, Aidge::MaxPoolingImpl2D_cpu_forward_kernel<double, double>, nullptr});
+REGISTRAR(MaxPoolingImpl2D_cpu,
+    {DataType::Int32},
+    {ProdConso::inPlaceModel, Aidge::MaxPoolingImpl2D_cpu_forward_kernel<int32_t, int32_t>, nullptr});
 }  // namespace Aidge
 
-#endif /* AIDGE_CPU_OPERATOR_MaxPOOLINGIMPL_FORWARD_KERNEL_H_ */
+#endif /* AIDGE_CPU_OPERATOR_MaxPOOLINGIMPL_KERNELS_H_ */
diff --git a/include/aidge/backend/cpu/operator/MulImpl.hpp b/include/aidge/backend/cpu/operator/MulImpl.hpp
index 2d42194c417bd7d57c00f4325a4585cf59d95b24..05fceba17471229d83d9f8738614b2e747121b49 100644
--- a/include/aidge/backend/cpu/operator/MulImpl.hpp
+++ b/include/aidge/backend/cpu/operator/MulImpl.hpp
@@ -12,7 +12,7 @@
 #ifndef AIDGE_CPU_OPERATOR_MULIMPL_H_
 #define AIDGE_CPU_OPERATOR_MULIMPL_H_
 
-#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
 #include "aidge/operator/Mul.hpp"
 #include "aidge/utils/Registrar.hpp"
 #include "aidge/utils/Types.h"
@@ -21,31 +21,27 @@
 #include <vector>
 
 namespace Aidge {
-// class Mul_Op;
-
-// compute kernel registry for forward and backward
-class MulImplForward_cpu
-    : public Registrable<MulImplForward_cpu, std::tuple<DataType, DataType, DataType>, void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*,void*)> {
-};
-class MulImplBackward_cpu
-    : public Registrable<MulImplBackward_cpu, std::tuple<DataType, DataType, DataType>, void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*, void*)> {
-};
-
-class MulImpl_cpu : public OperatorImpl {
-public:
-    MulImpl_cpu(const Mul_Op& op) : OperatorImpl(op, "cpu") {}
-
-    static std::unique_ptr<MulImpl_cpu> create(const Mul_Op& op) {
-        return std::make_unique<MulImpl_cpu>(op);
-    }
-
-    Elts_t getNbRequiredProtected(const IOIndex_t inputIdx) const override final;
-    void forward() override;
-};
-
-namespace {
-static Registrar<Mul_Op> registrarMulImpl_cpu("cpu", Aidge::MulImpl_cpu::create);
-}
+// Operator implementation entry point for the backend
+using MulImpl_cpu = OperatorImpl_cpu<Mul_Op,
+    void(const std::vector<std::size_t>&,
+        const std::vector<std::size_t>&, 
+        const std::vector<std::size_t>&, 
+        const void*, 
+        const void*,
+        void*),
+    void(const std::size_t, 
+        const std::size_t, 
+        const std::size_t,
+        const std::vector<std::size_t>,
+        const std::vector<std::size_t>,
+        const void*, 
+        const void*, 
+        const void*, 
+        void*, 
+        void*)>;
+
+// Implementation entry point registration to Operator
+REGISTRAR(Mul_Op, "cpu", Aidge::MulImpl_cpu::create);
 }  // namespace Aidge
 
 #endif /* AIDGE_CPU_OPERATOR_MULIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/MulImpl_forward_kernels.hpp b/include/aidge/backend/cpu/operator/MulImpl_forward_kernels.hpp
deleted file mode 100644
index c44199ba4797682362f4a7cb223435d6d1585443..0000000000000000000000000000000000000000
--- a/include/aidge/backend/cpu/operator/MulImpl_forward_kernels.hpp
+++ /dev/null
@@ -1,67 +0,0 @@
-/********************************************************************************
- * Copyright (c) 2023 CEA-List
- *
- * This program and the accompanying materials are made available under the
- * terms of the Eclipse Public License 2.0 which is available at
- * http://www.eclipse.org/legal/epl-2.0.
- *
- * SPDX-License-Identifier: EPL-2.0
- *
- ********************************************************************************/
-
-#ifndef AIDGE_CPU_OPERATOR_MULIMPL_FORWARD_KERNEL_H_
-#define AIDGE_CPU_OPERATOR_MULIMPL_FORWARD_KERNEL_H_
-
-#include "aidge/utils/Registrar.hpp"
-
-#include <cstdint>     // std::int32_t, std::int64_t
-
-#include "aidge/backend/cpu/data/Broadcasting.hpp"
-#include "aidge/backend/cpu/operator/MulImpl.hpp"
-
-namespace Aidge {
-template <class I1, class I2, class O>
-void MulImpl_cpu_forward_kernel(const std::vector<std::size_t>& input1Dims,
-                                const std::vector<std::size_t>& input2Dims,
-                                const std::vector<std::size_t>& outputDims,
-                                const void* input1_,
-                                const void* input2_,
-                                void* output_) {
-
-    const I1* input_1 = static_cast<const I1*>(input1_);
-    const I2* input_2 = static_cast<const I2*>(input2_);
-    O* output = static_cast<O*>(output_);
-
-    size_t totalElements = 1;
-    for (size_t dimSize : outputDims) {
-        totalElements *= dimSize;
-    }
-
-	for (std::size_t oIndex = 0; oIndex < totalElements; ++oIndex)
-	{
-		std::vector<size_t> indexes = getMultiDimIndices(outputDims, oIndex);
-
-		std::size_t idx1 = getFlattenedIndex(input1Dims, indexes);
-		std::size_t idx2 = getFlattenedIndex(input2Dims, indexes);
-
-        output[oIndex] = input_1[idx1] * input_2[idx2];
-    }
-}
-
-namespace {
-static Registrar<MulImplForward_cpu> registrarMulImplForward_cpu_Float32(
-        {DataType::Float32, DataType::Float32, DataType::Float32},
-        Aidge::MulImpl_cpu_forward_kernel<float, float, float>);
-static Registrar<MulImplForward_cpu> registrarMulImplForward_cpu_Float64(
-        {DataType::Float64, DataType::Float64, DataType::Float64},
-        Aidge::MulImpl_cpu_forward_kernel<double, double, double>);
-static Registrar<MulImplForward_cpu> registrarMulImplForward_cpu_Int32(
-        {DataType::Int32, DataType::Int32, DataType::Int32},
-        Aidge::MulImpl_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t>);
-static Registrar<MulImplForward_cpu> registrarMulImplForward_cpu_Int64(
-        {DataType::Int64, DataType::Int64, DataType::Int64},
-        Aidge::MulImpl_cpu_forward_kernel<std::int64_t, std::int64_t, std::int64_t>);
-}  // namespace
-}  // namespace Aidge
-
-#endif /* AIDGE_CPU_OPERATOR_MULIMPL_FORWARD_KERNEL_H_ */
diff --git a/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp b/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c015b8f0182608fecd3da94220e9411decfd186c
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp
@@ -0,0 +1,126 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_MULIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_MULIMPL_KERNELS_H_
+
+#include "aidge/utils/Registrar.hpp"
+
+#include <cstdint>     // std::int32_t, std::int64_t
+
+#include "aidge/backend/cpu/data/Broadcasting.hpp"
+#include "aidge/backend/cpu/operator/MulImpl.hpp"
+
+namespace Aidge {
+template <class I1, class I2, class O>
+void MulImpl_cpu_forward_kernel(const std::vector<std::size_t>& input1Dims,
+                                const std::vector<std::size_t>& input2Dims,
+                                const std::vector<std::size_t>& outputDims,
+                                const void* input1_,
+                                const void* input2_,
+                                void* output_) {
+
+    const I1* input_1 = static_cast<const I1*>(input1_);
+    const I2* input_2 = static_cast<const I2*>(input2_);
+    O* output = static_cast<O*>(output_);
+
+    size_t totalElements = 1;
+    for (size_t dimSize : outputDims) {
+        totalElements *= dimSize;
+    }
+
+	for (std::size_t oIndex = 0; oIndex < totalElements; ++oIndex)
+	{
+		std::vector<size_t> indexes = getMultiDimIndices(outputDims, oIndex);
+
+		std::size_t idx1 = getFlattenedIndex(input1Dims, indexes);
+		std::size_t idx2 = getFlattenedIndex(input2Dims, indexes);
+
+        output[oIndex] = input_1[idx1] * input_2[idx2];
+    }
+}
+
+template <class I1, class I2, class O>
+void MulImpl_cpu_backward_kernel(const std::size_t input0Length, 
+                                 const std::size_t input1Length,
+                                 const std::size_t grad0Length,
+                                 const std::vector<std::size_t> input0Dims,
+                                 const std::vector<std::size_t> input1Dims,
+                                 const void* input0_, 
+                                 const void* input1_, 
+                                 const void* grad_output_, 
+                                 void* gradientInput0,
+                                 void* gradientInput1)
+{
+    const auto* input0 = static_cast<const I1*>(input0_);
+    const auto* input1 = static_cast<const I1*>(input1_);
+    const auto* grad_output = static_cast<const O*>(grad_output_);
+    auto* grad_input_0 = static_cast<I1*>(gradientInput0);
+    auto* grad_input_1 = static_cast<I2*>(gradientInput1);
+
+
+    if(input0Dims.size() >= input1Dims.size())
+    {
+        AIDGE_ASSERT(input0Length == grad0Length, "Incorrect dimensions between Mul input and output tensors");
+
+        for(auto i = 0U; i < input0Length; ++i)
+        {
+            const auto indices = getMultiDimIndices(input1Dims, i);
+            const auto flattenedIndex = getFlattenedIndex(input1Dims, indices);
+
+            grad_input_0[i] = input1[flattenedIndex] * grad_output[i];
+        }
+
+        for(std::size_t i = 0 ; i < grad0Length; ++i)
+        {
+            const auto indices = getMultiDimIndices(input1Dims, i);
+            const auto flattenedIndex = getFlattenedIndex(input1Dims, indices);
+
+            grad_input_1[flattenedIndex] += input0[i] * grad_output[i];
+        }
+
+    } else {
+        AIDGE_ASSERT(input1Length == grad0Length, "Incorrect dimensions between Mul input and output tensors");
+
+        for(auto i = 0U; i < input1Length; ++i)
+        {
+            const auto indices = getMultiDimIndices(input0Dims, i);
+            const auto flattenedIndex = getFlattenedIndex(input0Dims, indices);
+
+            grad_input_1[i] = input0[flattenedIndex] * grad_output[i];
+        }
+
+        for(std::size_t i = 0 ; i < grad0Length; ++i)
+        {
+            const auto indices = getMultiDimIndices(input0Dims, i);
+            const auto flattenedIndex = getFlattenedIndex(input0Dims, indices);
+
+            grad_input_0[flattenedIndex] += input1[i] * grad_output[i];
+        }
+    }
+}
+
+// Kernels registration to implementation entry point
+REGISTRAR(MulImpl_cpu,
+    {DataType::Float32},
+    {ProdConso::inPlaceModel, Aidge::MulImpl_cpu_forward_kernel<float, float, float>, Aidge::MulImpl_cpu_backward_kernel<float, float, float>});
+REGISTRAR(MulImpl_cpu,
+    {DataType::Float64},
+    {ProdConso::inPlaceModel, Aidge::MulImpl_cpu_forward_kernel<double, double, double>, Aidge::MulImpl_cpu_backward_kernel<double, double, double>});
+REGISTRAR(MulImpl_cpu,
+    {DataType::Int32},
+    {ProdConso::inPlaceModel, Aidge::MulImpl_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t>, Aidge::MulImpl_cpu_backward_kernel<std::int32_t, std::int32_t, std::int32_t>});
+REGISTRAR(MulImpl_cpu,
+    {DataType::Int64},
+    {ProdConso::inPlaceModel, Aidge::MulImpl_cpu_forward_kernel<std::int64_t, std::int64_t, std::int64_t>, Aidge::MulImpl_cpu_backward_kernel<std::int64_t, std::int64_t, std::int64_t>});
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_MULIMPL_KERNELS_H_ */
diff --git a/include/aidge/backend/cpu/operator/OperatorImpl.hpp b/include/aidge/backend/cpu/operator/OperatorImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..abf94ab9069a07e8f87819cb29c027b1adbfd9c6
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/OperatorImpl.hpp
@@ -0,0 +1,50 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_IMPL_H_
+#define AIDGE_CPU_OPERATOR_IMPL_H_
+
#include <cstddef>  // std::size_t
#include <memory>
#include <set>      // std::set (getAvailableImplSpecs return type)
#include <tuple>    // std::tuple
#include <vector>

#include "aidge/backend/OperatorImpl.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
+
+namespace Aidge {
// Generic CPU implementation entry point: one alias/instantiation per operator
// type. It is both an OperatorImpl (so the framework can schedule and run it)
// and a Registrable, so typed forward/backward kernels can be registered
// against an ImplSpec key and looked up at execution time.
template <class Op, class FwdFunc, class BwdFunc = void()>
class OperatorImpl_cpu : public OperatorImpl,
    public Registrable<OperatorImpl_cpu<Op, FwdFunc, BwdFunc>, ImplSpec, Impl<FwdFunc, BwdFunc>>
{
public:
    OperatorImpl_cpu(const Op& op) : OperatorImpl(op, "cpu") {}

    // Factory used by REGISTRAR(<Op>, "cpu", ...) to attach this backend
    // to the operator's implementation registry.
    static std::unique_ptr<OperatorImpl_cpu<Op, FwdFunc, BwdFunc>> create(const Op& op) {
        return std::make_unique<OperatorImpl_cpu<Op, FwdFunc, BwdFunc>>(op);
    }

    // The producer/consumer model comes from the registered kernel entry that
    // best matches the required spec (e.g. ProdConso::inPlaceModel or
    // ProdConso::defaultModel passed at kernel registration).
    virtual std::shared_ptr<ProdConso> getProdConso() const override {
        const auto impl = Registrar<OperatorImpl_cpu>::create(getBestMatch(getRequiredSpec()));
        return impl.prodConso(mOp);
    }

    // All ImplSpecs for which a kernel was registered; used by the base class
    // for best-match selection against the operator's required spec.
    virtual std::set<ImplSpec> getAvailableImplSpecs() const override {
        return Registrar<OperatorImpl_cpu>::getKeys();
    }

    // Defined out-of-line (backend .cpp): fetch the best-matching registered
    // kernel and invoke it on the operator's tensors.
    void forward() override;
    void backward() override;
};
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_IMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/PadImpl.hpp b/include/aidge/backend/cpu/operator/PadImpl.hpp
index c6e41c29fd203fdd80b2acb9ad0dfcac91a0f66c..bc0bd8cad3b630b89f728d78b59652f31bbcf410 100644
--- a/include/aidge/backend/cpu/operator/PadImpl.hpp
+++ b/include/aidge/backend/cpu/operator/PadImpl.hpp
@@ -17,79 +17,46 @@
 #include <tuple>
 #include <vector>
 
-#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
 #include "aidge/operator/Pad.hpp"
 #include "aidge/utils/Registrar.hpp"
 #include "aidge/utils/Types.h"
 #include "aidge/backend/cpu/data/GetCPUPtr.h"
 
 namespace Aidge {
-// class Pad_Op;
-// compute kernel registry for forward and backward
-class PadImpl1DForward_cpu
-    : public Registrable<PadImpl1DForward_cpu,
-                         std::tuple<DataType, DataType>,
-                         void(const std::array<DimSize_t, 2>&,
-                            const PadBorderType,
-                            const double,
-                            const std::array<DimSize_t, 3> &,
-                            const void *,
-                            void *)> {};
-
-class PadImpl1D_cpu : public OperatorImpl {
+class Pad_ProdConso_cpu : public ProdConso {
 public:
-    PadImpl1D_cpu(const Pad_Op<1> &op) : OperatorImpl(op, "cpu") {}
+    Pad_ProdConso_cpu(const Operator& op): ProdConso(op) {}
 
-    static std::unique_ptr<PadImpl1D_cpu> create(const Pad_Op<1> &op) {
-        return std::make_unique<PadImpl1D_cpu>(op);
+    static std::unique_ptr<ProdConso> defaultModel(const Operator& op) {
+        return std::make_unique<Pad_ProdConso_cpu>(op);
     }
 
     Elts_t getNbRequiredProtected(const IOIndex_t inputIdx) const override final;
-    void forward() override;
 };
 
-namespace {
-// add cpu backend to Pad_Op<1> implementation registry
-static Registrar<Pad_Op<1>> registrarPadImpl1D_cpu("cpu", Aidge::PadImpl1D_cpu::create);
-}  // namespace
-
-
-// compute kernel registry for forward and backward
-class PadImpl2DForward_cpu
-    : public Registrable<PadImpl2DForward_cpu,
-                         std::tuple<DataType, DataType>,
-                         void(const std::array<DimSize_t, 4>&,
+// Operator implementation entry point for the backend
+using Pad1D_Op = Pad_Op<1>;
+using PadImpl1D_cpu = OperatorImpl_cpu<Pad_Op<1>,
+    void(const std::array<DimSize_t, 2>&,
                             const PadBorderType,
                             const double,
-                            const std::array<DimSize_t, 4> &,
+                            const std::array<DimSize_t, 3> &,
                             const void *,
-                            void *)> {};
-class PadImpl2DBackward_cpu
-    : public Registrable<PadImpl2DBackward_cpu,
-                         std::tuple<DataType, DataType>,
-                         void(const std::array<DimSize_t, 4>&,
+                            void *)>;
+
+using Pad2D_Op = Pad_Op<2>;
+using PadImpl2D_cpu = OperatorImpl_cpu<Pad_Op<2>,
+    void(const std::array<DimSize_t, 4>&,
                             const PadBorderType,
                             const double,
                             const std::array<DimSize_t, 4> &,
                             const void *,
-                            void *)> {};
-
-class PadImpl2D_cpu : public OperatorImpl {
-public:
-    PadImpl2D_cpu(const Pad_Op<2> &op) : OperatorImpl(op, "cpu") {}
-
-    static std::unique_ptr<PadImpl2D_cpu> create(const Pad_Op<2> &op) {
-        return std::make_unique<PadImpl2D_cpu>(op);
-    }
-
-    Elts_t getNbRequiredProtected(const IOIndex_t inputIdx) const override final;
-    void forward() override;
-};
+                            void *)>;
 
-namespace {
-// add cpu backend to Pad_Op<2> implementation registry
-static Registrar<Pad_Op<2>> registrarPadImpl2D_cpu("cpu", Aidge::PadImpl2D_cpu::create);
-}  // namespace
+// Implementation entry point registration to Operator
+REGISTRAR(Pad1D_Op, "cpu", Aidge::PadImpl1D_cpu::create);
+REGISTRAR(Pad2D_Op, "cpu", Aidge::PadImpl2D_cpu::create);
 }  // namespace Aidge
 
 #endif /* AIDGE_CPU_OPERATOR_PADIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/PadImpl_forward_kernels.hpp b/include/aidge/backend/cpu/operator/PadImpl_kernels.hpp
similarity index 78%
rename from include/aidge/backend/cpu/operator/PadImpl_forward_kernels.hpp
rename to include/aidge/backend/cpu/operator/PadImpl_kernels.hpp
index 26c873c8fe7f140b09b31d0f1a9d4125acbcf50f..a362be0944aa18c36dd74a2f0066aaa21a1fc4c0 100644
--- a/include/aidge/backend/cpu/operator/PadImpl_forward_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/PadImpl_kernels.hpp
@@ -9,8 +9,8 @@
  *
  ********************************************************************************/
 
-#ifndef AIDGE_CPU_OPERATOR_PADIMPL_FORWARD_KERNEL_H_
-#define AIDGE_CPU_OPERATOR_PADIMPL_FORWARD_KERNEL_H_
+#ifndef AIDGE_CPU_OPERATOR_PADIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_PADIMPL_KERNELS_H_
 
 #include <algorithm>  // std::max, std::min
 #include <array>
@@ -88,17 +88,16 @@ void PadImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 2>& beginEndBorder
     }
 }
 
-namespace {
-static Registrar<PadImpl1DForward_cpu> registrarPadImpl1DForward_cpu_Float32(
-        {DataType::Float32, DataType::Float32},
-        PadImpl1D_cpu_forward_kernel<cpptype_t<DataType::Float32>, cpptype_t<DataType::Float32>>);
-static Registrar<PadImpl1DForward_cpu> registrarPadImpl1DForward_cpu_Int32(
-        {DataType::Int32, DataType::Int32},
-        PadImpl1D_cpu_forward_kernel<cpptype_t<DataType::Int32>, cpptype_t<DataType::Int32>>);
-static Registrar<PadImpl1DForward_cpu> registrarPadImpl1DForward_cpu_Float64(
-        {DataType::Float64, DataType::Float64},
-        PadImpl1D_cpu_forward_kernel<cpptype_t<DataType::Float64>, cpptype_t<DataType::Float64>>);
-}  // namespace
+// Kernels registration to implementation entry point
+REGISTRAR(PadImpl1D_cpu,
+    {{DataType::Float32, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}},
+    {Pad_ProdConso_cpu::defaultModel, Aidge::PadImpl1D_cpu_forward_kernel<cpptype_t<DataType::Float32>, cpptype_t<DataType::Float32>>, nullptr});
+REGISTRAR(PadImpl1D_cpu,
+    {{DataType::Float64, DataFormat::NCHW}, {DataType::Float64, DataFormat::NCHW}},
+    {Pad_ProdConso_cpu::defaultModel, Aidge::PadImpl1D_cpu_forward_kernel<cpptype_t<DataType::Float64>, cpptype_t<DataType::Float64>>, nullptr});
+REGISTRAR(PadImpl1D_cpu,
+    {{DataType::Int32, DataFormat::NCHW}, {DataType::Int32, DataFormat::NCHW}},
+    {Pad_ProdConso_cpu::defaultModel, Aidge::PadImpl1D_cpu_forward_kernel<cpptype_t<DataType::Int32>, cpptype_t<DataType::Int32>>, nullptr});
 
 
 /**
@@ -131,7 +130,7 @@ void PadImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 4>& beginEndBorder
 
             for (std::uint32_t oy = 0; oy < oySize; ++oy) {
                 for (std::uint32_t ox = 0; ox < oxSize; ++ox) {
-                    const std::size_t oIndexFull = oIndex + ox*oySize + oy;
+                    const std::size_t oIndexFull = oIndex + oy*oxSize + ox;
 
                     O outputValue = static_cast<O>(borderValue);
 
@@ -140,14 +139,14 @@ void PadImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 4>& beginEndBorder
                         std::int32_t iy = static_cast<std::int32_t>(oy) - static_cast<std::int32_t>(beginEndBorders[1]);
 
                         if (ix >= 0  && ix < static_cast<std::int32_t>(dims[3]) && iy >= 0  && iy < static_cast<std::int32_t>(dims[2])) {
-                            outputValue = input[iIndex + static_cast<std::size_t>(ix)*dims[2] + static_cast<std::size_t>(iy)];
+                            outputValue = input[iIndex + static_cast<std::size_t>(iy)*dims[3] + static_cast<std::size_t>(ix)];
                         }
                     }
                     else if (borderType == PadBorderType::Edge) {
                         std::int32_t ix = std::max(0, std::min(static_cast<std::int32_t>(dims[3]) - 1, static_cast<std::int32_t>(ox) - static_cast<std::int32_t>(beginEndBorders[3])));
                         std::int32_t iy = std::max(0, std::min(static_cast<std::int32_t>(dims[2]) - 1, static_cast<std::int32_t>(oy) - static_cast<std::int32_t>(beginEndBorders[1])));
 
-                        outputValue = input[iIndex + static_cast<std::size_t>(ix)*dims[2] + static_cast<std::size_t>(iy)];
+                        outputValue = input[iIndex + static_cast<std::size_t>(iy)*dims[3] + static_cast<std::size_t>(ix)];
                     }
                     else if (borderType == PadBorderType::Reflect) {
                         std::int32_t ix = static_cast<std::int32_t>(ox) - static_cast<std::int32_t>(beginEndBorders[3]);
@@ -162,13 +161,13 @@ void PadImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 4>& beginEndBorder
                         if (iy >= static_cast<std::int32_t>(dims[2]))
                             iy = static_cast<std::int32_t>(dims[2]) - iy;
 
-                        outputValue = input[iIndex + static_cast<std::size_t>(ix)*dims[2] + static_cast<std::size_t>(iy)];
+                        outputValue = input[iIndex + static_cast<std::size_t>(iy)*dims[3] + static_cast<std::size_t>(ix)];
                     }
                     else if (borderType == PadBorderType::Wrap) {
                         std::int32_t ix = (static_cast<std::int32_t>(dims[3]) + static_cast<std::int32_t>(ox) - static_cast<std::int32_t>(beginEndBorders[3])) % static_cast<std::int32_t>(dims[3]);
                         std::int32_t iy = (static_cast<std::int32_t>(dims[2]) + static_cast<std::int32_t>(oy) - static_cast<std::int32_t>(beginEndBorders[1])) % static_cast<std::int32_t>(dims[2]);
 
-                        outputValue = input[iIndex + static_cast<std::size_t>(ix)*dims[2] + static_cast<std::size_t>(iy)];
+                        outputValue = input[iIndex + static_cast<std::size_t>(iy)*dims[3] + static_cast<std::size_t>(ix)];
                     }
 
                     output[oIndexFull] = outputValue;
@@ -178,17 +177,16 @@ void PadImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 4>& beginEndBorder
     }
 }
 
-namespace {
-static Registrar<PadImpl2DForward_cpu> registrarPadImpl2DForward_cpu_Float32(
-        {DataType::Float32, DataType::Float32},
-        Aidge::PadImpl2D_cpu_forward_kernel<float, float>);
-static Registrar<PadImpl2DForward_cpu> registrarPadImpl2DForward_cpu_Int32(
-        {DataType::Int32, DataType::Int32},
-        Aidge::PadImpl2D_cpu_forward_kernel<std::int32_t, std::int32_t>);
-static Registrar<PadImpl2DForward_cpu> registrarPadImpl2DForward_cpu_Float64(
-        {DataType::Float64, DataType::Float64},
-        Aidge::PadImpl2D_cpu_forward_kernel<double, double>);
-}  // namespace
+// Kernels registration to implementation entry point
+REGISTRAR(PadImpl2D_cpu,
+    {{DataType::Float32, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}},
+    {Pad_ProdConso_cpu::defaultModel, Aidge::PadImpl2D_cpu_forward_kernel<cpptype_t<DataType::Float32>, cpptype_t<DataType::Float32>>, nullptr});
+REGISTRAR(PadImpl2D_cpu,
+    {{DataType::Float64, DataFormat::NCHW}, {DataType::Float64, DataFormat::NCHW}},
+    {Pad_ProdConso_cpu::defaultModel, Aidge::PadImpl2D_cpu_forward_kernel<cpptype_t<DataType::Float64>, cpptype_t<DataType::Float64>>, nullptr});
+REGISTRAR(PadImpl2D_cpu,
+    {{DataType::Int32, DataFormat::NCHW}, {DataType::Int32, DataFormat::NCHW}},
+    {Pad_ProdConso_cpu::defaultModel, Aidge::PadImpl2D_cpu_forward_kernel<cpptype_t<DataType::Int32>, cpptype_t<DataType::Int32>>, nullptr});
 }  // namespace Aidge
 
-#endif /* AIDGE_CPU_OPERATOR_PADIMPL_FORWARD_KERNEL_H_ */
+#endif /* AIDGE_CPU_OPERATOR_PADIMPL_KERNELS_H_ */
diff --git a/include/aidge/backend/cpu/operator/PowImpl.hpp b/include/aidge/backend/cpu/operator/PowImpl.hpp
index 514e63af5ae5d1d1d00f7f328f5367df2bfa163d..cfbb8173d1f83162519016a8f2b3c3166977a5b7 100644
--- a/include/aidge/backend/cpu/operator/PowImpl.hpp
+++ b/include/aidge/backend/cpu/operator/PowImpl.hpp
@@ -12,7 +12,7 @@
 #ifndef AIDGE_CPU_OPERATOR_POWIMPL_H_
 #define AIDGE_CPU_OPERATOR_POWIMPL_H_
 
-#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
 #include "aidge/operator/Pow.hpp"
 #include "aidge/utils/Registrar.hpp"
 #include "aidge/utils/Types.h"
@@ -21,32 +21,14 @@
 #include <vector>
 
 namespace Aidge {
-// class Pow_Op;
+// Operator implementation entry point for the backend
+using PowImpl_cpu = OperatorImpl_cpu<Pow_Op,
+    void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*,void*),
+    void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*, const void*, void*, void*)>;
 
-// compute kernel registry for forward and backward
-class PowImplForward_cpu
-    : public Registrable<PowImplForward_cpu, std::tuple<DataType, DataType, DataType>, void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*,void*)> {
-};
-class PowImplBackward_cpu
-    : public Registrable<PowImplBackward_cpu, std::tuple<DataType, DataType, DataType>, void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*, void*)> {
-};
 
-class PowImpl_cpu : public OperatorImpl {
-public:
-    PowImpl_cpu(const Pow_Op& op) : OperatorImpl(op, "cpu") {}
-
-    static std::unique_ptr<PowImpl_cpu> create(const Pow_Op& op) {
-        return std::make_unique<PowImpl_cpu>(op);
-    }
-
-    Elts_t getNbRequiredProtected(const IOIndex_t inputIdx) const override final;
-    void forward() override;
-    void backward() override;
-};
-
-namespace {
-static Registrar<Pow_Op> registrarPowImpl_cpu("cpu", Aidge::PowImpl_cpu::create);
-}
+// Implementation entry point registration to Operator
+REGISTRAR(Pow_Op, "cpu", Aidge::PowImpl_cpu::create);
 }  // namespace Aidge
 
 #endif /* AIDGE_CPU_OPERATOR_POWIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/PowImpl_kernels.hpp b/include/aidge/backend/cpu/operator/PowImpl_kernels.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ab9b2ccc7b823842decd044b90a5c6364cedc9c9
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/PowImpl_kernels.hpp
@@ -0,0 +1,98 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_POWIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_POWIMPL_KERNELS_H_
+
+#include "aidge/utils/Registrar.hpp"
+#include <algorithm>   // std::fill
+#include <cmath>       // std::pow, std::log
+#include <functional>  // std::multiplies
+#include <numeric>     // std::accumulate
+
+#include "aidge/backend/cpu/data/Broadcasting.hpp"
+#include "aidge/backend/cpu/operator/PowImpl.hpp"
+
+namespace Aidge {
+template <class I1, class I2, class O>
+void PowImpl_cpu_forward_kernel(const std::vector<std::size_t>& input1Dims,
+                                const std::vector<std::size_t>& input2Dims,
+                                const std::vector<std::size_t>& outputDims,
+                                const void* input1_,
+                                const void* input2_,
+                                void* output_) {
+
+    const I1* input_1 = static_cast<const I1*>(input1_);
+    const I2* input_2 = static_cast<const I2*>(input2_);
+    O* output = static_cast<O*>(output_);
+
+    std::size_t totalElements = std::accumulate(outputDims.cbegin(), outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+    for (std::size_t oIndex = 0; oIndex < totalElements; ++oIndex)
+    {
+        std::vector<std::size_t> indexes = getMultiDimIndices(outputDims, oIndex);
+
+        std::size_t idx1 = getFlattenedIndex(input1Dims, indexes);
+        std::size_t idx2 = getFlattenedIndex(input2Dims, indexes);
+
+        output[oIndex] = std::pow(input_1[idx1], input_2[idx2]);
+    }
+}
+
+template <class I1, class I2, class O>
+void PowImpl_cpu_backward_kernel(const std::vector<std::size_t>& input0Dims,
+                                const std::vector<std::size_t>& input1Dims,
+                                const std::vector<std::size_t>& outputDims,
+                                const void* input0_,
+                                const void* input1_,
+                                const void* gradOutput_,
+                                void* gradientInput0_,
+                                void* gradientInput1_) {
+    const I1* input0 = static_cast<const I1*>(input0_);
+    I1* grad0 = static_cast<I1*>(gradientInput0_);
+    const I2* input1 = static_cast<const I2*>(input1_);
+    I2* grad1 = static_cast<I2*>(gradientInput1_);
+    const O* gradOut = static_cast<const O*>(gradOutput_);
+
+    // Fill input grads with zeros
+    std::size_t input0Elements = std::accumulate(input0Dims.cbegin(), input0Dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+    std::fill(grad0, grad0 + input0Elements, I1(0));
+    std::size_t input1Elements = std::accumulate(input1Dims.cbegin(), input1Dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+    std::fill(grad1, grad1 + input1Elements, I2(0));
+
+    std::size_t totalElements = std::accumulate(outputDims.cbegin(), outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+    for (std::size_t oIndex = 0; oIndex < totalElements; ++oIndex)
+    {
+        // Compute indexes in inputs 0 and 1 to support broadcasting
+        std::vector<std::size_t> indexes = getMultiDimIndices(outputDims, oIndex);
+        std::size_t idx0 = getFlattenedIndex(input0Dims, indexes);
+        std::size_t idx1 = getFlattenedIndex(input1Dims, indexes);
+
+        // grad0 = grad_output * (input1 * pow(input0, (input1 -1)))
+        grad0[idx0] += gradOut[oIndex]*input1[idx1]* std::pow(input0[idx0], input1[idx1]-1);
+
+        // grad1 = grad_output * (output * ln(input0))
+        grad1[idx1] += gradOut[oIndex] * std::pow(input0[idx0], input1[idx1]) * std::log(input0[idx0]);
+    }
+}
+
+// Kernels registration to implementation entry point
+REGISTRAR(PowImpl_cpu,
+    {DataType::Float32},
+    {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<float, float, float>, Aidge::PowImpl_cpu_backward_kernel<float, float, float>});
+REGISTRAR(PowImpl_cpu,
+    {DataType::Float64},
+    {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<double, double, double>, Aidge::PowImpl_cpu_backward_kernel<double, double, double>});
+REGISTRAR(PowImpl_cpu,
+    {DataType::Int32},
+    {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<int32_t, int32_t, int32_t>, Aidge::PowImpl_cpu_backward_kernel<int32_t, int32_t, int32_t>});
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_POWIMPL_KERNELS_H_ */
diff --git a/include/aidge/backend/cpu/operator/ReLUImpl.hpp b/include/aidge/backend/cpu/operator/ReLUImpl.hpp
index e2ebf44616db876b462157db650ff48362dd7bac..5b900618abce83ff1c3822d4f61cc62c93f5081f 100644
--- a/include/aidge/backend/cpu/operator/ReLUImpl.hpp
+++ b/include/aidge/backend/cpu/operator/ReLUImpl.hpp
@@ -17,40 +17,19 @@
 #include <tuple>    // std::tuple
 #include <vector>
 
-#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
 #include "aidge/operator/ReLU.hpp"
 #include "aidge/utils/Registrar.hpp"
 #include "aidge/utils/Types.h"
 
 namespace Aidge {
-// class ReLU_Op;
+// Operator implementation entry point for the backend
+using ReLUImpl_cpu = OperatorImpl_cpu<ReLU_Op,
+    void(const std::size_t, const void*, void*),
+    void(const std::size_t, const void*, const void*, void*)>;
 
-// compute kernel registry for forward and backward
-class ReLUImplForward_cpu
-    : public Registrable<ReLUImplForward_cpu, std::tuple<DataType, DataType>, void(const std::size_t, const void*, void*)> {
-};
-class ReLUImplBackward_cpu
-    : public Registrable<ReLUImplBackward_cpu, std::tuple<DataType, DataType, DataType>, void(const std::size_t, const void*, const void*, void*)> {
-};
-
-class ReLUImpl_cpu : public OperatorImpl {
-public:
-    ReLUImpl_cpu(const ReLU_Op& op) : OperatorImpl(op, "cpu") {}
-
-    static std::unique_ptr<ReLUImpl_cpu> create(const ReLU_Op& op) {
-        return std::make_unique<ReLUImpl_cpu>(op);
-    }
-
-    Elts_t getNbRequiredProtected(const IOIndex_t inputIdx) const override final;
-
-    void forward() override final;
-
-    void backward() override final;
-};
-
-namespace {
-static Registrar<ReLU_Op> registrarReLUImpl_cpu("cpu", Aidge::ReLUImpl_cpu::create);
-}
+// Implementation entry point registration to Operator
+REGISTRAR(ReLU_Op, "cpu", Aidge::ReLUImpl_cpu::create);
 }  // namespace Aidge
 
 #endif /* AIDGE_CPU_OPERATOR_RELUIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/ReLUImpl_backward_kernels.hpp b/include/aidge/backend/cpu/operator/ReLUImpl_backward_kernels.hpp
deleted file mode 100644
index 1bd932e43608d98f737cc9046aed74b2fec6abc6..0000000000000000000000000000000000000000
--- a/include/aidge/backend/cpu/operator/ReLUImpl_backward_kernels.hpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/********************************************************************************
- * Copyright (c) 2023 CEA-List
- *
- * This program and the accompanying materials are made available under the
- * terms of the Eclipse Public License 2.0 which is available at
- * http://www.eclipse.org/legal/epl-2.0.
- *
- * SPDX-License-Identifier: EPL-2.0
- *
- ********************************************************************************/
-
-#ifndef AIDGE_CPU_OPERATOR_RELUIMPL_BACKWARD_KERNEL_H_
-#define AIDGE_CPU_OPERATOR_RELUIMPL_BACKWARD_KERNEL_H_
-
-#include <cstddef>  // std::size_t
-
-#include "aidge/backend/cpu/operator/ReLUImpl.hpp"
-#include "aidge/utils/Registrar.hpp"
-
-namespace Aidge {
-template <class I, class GI, class GO>
-void ReLUImpl_cpu_backward_kernel(const std::size_t inputLenght,
-                                  const void* input_, const void* grad_output_,
-				  void* grad_input_) {
-    const I* input = static_cast<const I*>(input_);
-    const GO* grad_output = static_cast<const GO*>(grad_output_);
-    GI* grad_input = static_cast<GI*>(grad_input_);
-    for (std::size_t i = 0; i < inputLenght; ++i) {
-        grad_input[i] = (input[i] > 0) ? grad_output[i] : 0;
-    }
-}
-
-namespace {
-static Registrar<ReLUImplBackward_cpu> registrarReLUImplBackward_cpu_Float32(
-    {DataType::Float32, DataType::Float32, DataType::Float32},
-    Aidge::ReLUImpl_cpu_backward_kernel<float, float, float>);
-static Registrar<ReLUImplBackward_cpu> registrarReLUImplBackward_cpu_Int32(
-    {DataType::Int32, DataType::Int32, DataType::Int32},
-    Aidge::ReLUImpl_cpu_backward_kernel<int, int, int>);
-static Registrar<ReLUImplBackward_cpu> registrarReLUImplBackward_cpu_Float64(
-    {DataType::Float64, DataType::Float64, DataType::Float64},
-    Aidge::ReLUImpl_cpu_backward_kernel<double, double, double>);
-}  // namespace
-}  // namespace Aidge
-
-#endif /* AIDGE_CPU_OPERATOR_RELUIMPL_BACKWARD_KERNEL_H_ */
diff --git a/include/aidge/backend/cpu/operator/ReLUImpl_forward_kernels.hpp b/include/aidge/backend/cpu/operator/ReLUImpl_forward_kernels.hpp
deleted file mode 100644
index af9c65590c7182185c9d79669dde49e592cbeb5d..0000000000000000000000000000000000000000
--- a/include/aidge/backend/cpu/operator/ReLUImpl_forward_kernels.hpp
+++ /dev/null
@@ -1,44 +0,0 @@
-/********************************************************************************
- * Copyright (c) 2023 CEA-List
- *
- * This program and the accompanying materials are made available under the
- * terms of the Eclipse Public License 2.0 which is available at
- * http://www.eclipse.org/legal/epl-2.0.
- *
- * SPDX-License-Identifier: EPL-2.0
- *
- ********************************************************************************/
-
-#ifndef AIDGE_CPU_OPERATOR_RELUIMPL_FORWARD_KERNEL_H_
-#define AIDGE_CPU_OPERATOR_RELUIMPL_FORWARD_KERNEL_H_
-
-#include "aidge/utils/Registrar.hpp"
-
-#include "aidge/backend/cpu/operator/ReLUImpl.hpp"
-
-namespace Aidge {
-template <class I, class O>
-void ReLUImpl_cpu_forward_kernel(std::size_t inputLenght,
-                                     const void* input_,
-                                     void* output_) {
-
-    const I* input = static_cast<const I*>(input_);
-    O* output = static_cast<O*>(output_);
-
-//#pragma omp parallel for if (inputLenght > 1024)
-    for (std::size_t i = 0; i < inputLenght; ++i) {
-        output[i] = (input[i] > 0) ? input[i] : 0;
-    }
-}
-
-namespace {
-static Registrar<ReLUImplForward_cpu> registrarReLUImplForward_cpu_Float32(
-        {DataType::Float32, DataType::Float32}, Aidge::ReLUImpl_cpu_forward_kernel<float, float>);
-static Registrar<ReLUImplForward_cpu> registrarReLUImplForward_cpu_Int32(
-        {DataType::Int32, DataType::Int32}, Aidge::ReLUImpl_cpu_forward_kernel<int, int>);
-static Registrar<ReLUImplForward_cpu> registrarReLUImplForward_cpu_Float64(
-        {DataType::Float64, DataType::Float64}, Aidge::ReLUImpl_cpu_forward_kernel<double, double>);
-}  // namespace
-}  // namespace Aidge
-
-#endif /* AIDGE_CPU_OPERATOR_RELUIMPL_FORWARD_KERNEL_H_ */
diff --git a/include/aidge/backend/cpu/operator/ReLUImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ReLUImpl_kernels.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e39e9b7decd91e392c5db7e9e9bc4ed0f366829d
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/ReLUImpl_kernels.hpp
@@ -0,0 +1,66 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_RELUIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_RELUIMPL_KERNELS_H_
+
+#include <cstddef>  // std::size_t
+#include <memory>
+#include <tuple>    // std::tuple
+#include <vector>
+
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
+#include "aidge/backend/cpu/operator/ReLUImpl.hpp"
+#include "aidge/operator/ReLU.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+namespace Aidge {
+// Kernels
+template <class I, class O>
+void ReLUImpl_cpu_forward_kernel(std::size_t inputLength,
+                                     const void* input_,
+                                     void* output_) {
+
+    const I* input = static_cast<const I*>(input_);
+    O* output = static_cast<O*>(output_);
+
+//#pragma omp parallel for if (inputLength > 1024)
+    for (std::size_t i = 0; i < inputLength; ++i) {
+        output[i] = (input[i] > 0) ? input[i] : 0;
+    }
+}
+
+template <class I, class GI, class GO>
+void ReLUImpl_cpu_backward_kernel(const std::size_t inputLength,
+                                  const void* input_, const void* grad_output_,
+                                  void* grad_input_) {
+    const I* input = static_cast<const I*>(input_);
+    const GO* grad_output = static_cast<const GO*>(grad_output_);
+    GI* grad_input = static_cast<GI*>(grad_input_);
+    for (std::size_t i = 0; i < inputLength; ++i) {
+        grad_input[i] = (input[i] > 0) ? grad_output[i] : 0;
+    }
+}
+
+// Kernels registration to implementation entry point
+REGISTRAR(ReLUImpl_cpu,
+    {DataType::Float32},
+    {ProdConso::inPlaceModel, Aidge::ReLUImpl_cpu_forward_kernel<float, float>, Aidge::ReLUImpl_cpu_backward_kernel<float, float, float>});
+REGISTRAR(ReLUImpl_cpu,
+    {DataType::Float64},
+    {ProdConso::inPlaceModel, Aidge::ReLUImpl_cpu_forward_kernel<double, double>, Aidge::ReLUImpl_cpu_backward_kernel<double, double, double>});
+REGISTRAR(ReLUImpl_cpu,
+    {DataType::Int32},
+    {ProdConso::inPlaceModel, Aidge::ReLUImpl_cpu_forward_kernel<int32_t, int32_t>, Aidge::ReLUImpl_cpu_backward_kernel<int32_t, int32_t, int32_t>});
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_RELUIMPL_KERNELS_H_ */
diff --git a/include/aidge/backend/cpu/operator/ReduceMeanImpl.hpp b/include/aidge/backend/cpu/operator/ReduceMeanImpl.hpp
index 8d784c38dc006ea82f040dfe83b4bef05908dd68..1c50805d5af768dfc160488fda1e8fadfa798454 100644
--- a/include/aidge/backend/cpu/operator/ReduceMeanImpl.hpp
+++ b/include/aidge/backend/cpu/operator/ReduceMeanImpl.hpp
@@ -17,116 +17,22 @@
 #include <tuple>
 #include <vector>
 
-#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
 #include "aidge/operator/ReduceMean.hpp"
 #include "aidge/utils/Registrar.hpp"
 #include "aidge/utils/Types.h"
 
 namespace Aidge {
-// class ReduceMean_Op;
-
-// Every DIM
-class ReduceMeanImplForward_cpu
-    : public Registrable<ReduceMeanImplForward_cpu,
-                        std::tuple<DataType, DataType>,
-                        void(const std::vector<std::int32_t>&,
+// Operator implementation entry point for the backend
+using ReduceMeanImpl_cpu = OperatorImpl_cpu<ReduceMean_Op,
+    void(const std::vector<std::int32_t>&,
                             DimSize_t,
                             const std::vector<DimSize_t>&,
                             const void *,
-                            void *)> {};
-class ReduceMeanImpl1DBackward_cpu
-    : public Registrable<ReduceMeanImpl1DBackward_cpu,
-                        std::tuple<DataType, DataType>,
-                        void(const std::vector<std::int32_t>&,
-                            DimSize_t,
-                            const std::vector<DimSize_t>&,
-                            const void *,
-                            void *)> {};
-
-class ReduceMeanImpl_cpu : public OperatorImpl {
-   public:
-    ReduceMeanImpl_cpu(const ReduceMean_Op& op) : OperatorImpl(op, "cpu") {}
-
-    static std::unique_ptr<ReduceMeanImpl_cpu> create(const ReduceMean_Op &op) {
-        return std::make_unique<ReduceMeanImpl_cpu>(op);
-    }
-
-   public:
-    void forward() override;
-};
-
-// // compute kernel registry for forward and backward
-// // DIM 1
-// class ReduceMeanImpl1DForward_cpu
-//     : public Registrable<ReduceMeanImpl1DForward_cpu,
-//                          std::tuple<DataType, DataType>,
-//                          void(const ReduceMean_Op<1>::Attrs &, const std::vector<DimSize_t>&, const void *, void *)> {};
-// class ReduceMeanImpl1DBackward_cpu
-//     : public Registrable<ReduceMeanImpl1DBackward_cpu,
-//                          std::tuple<DataType, DataType>,
-//                          void(const ReduceMean_Op<1>::Attrs &, const std::vector<DimSize_t>&, const void *,  void *)> {};
-
-// // DIM 2
-// class ReduceMeanImpl2DForward_cpu
-//     : public Registrable<ReduceMeanImpl2DForward_cpu,
-//                          std::tuple<DataType, DataType>,
-//                          void(const ReduceMean_Op<2>::Attrs &, const std::vector<DimSize_t>&, const void *, void *)> {};
-// class ReduceMeanImpl2DBackward_cpu
-//     : public Registrable<ReduceMeanImpl2DBackward_cpu,
-//                          std::tuple<DataType, DataType>,
-//                          void(const ReduceMean_Op<2>::Attrs &, const std::vector<DimSize_t>&, const void *,  void *)> {};
-// // DIM 3
-// class ReduceMeanImpl3DForward_cpu
-//     : public Registrable<ReduceMeanImpl3DForward_cpu,
-//                          std::tuple<DataType, DataType>,
-//                          void(const ReduceMean_Op<3>::Attrs &, const std::vector<DimSize_t>&, const void *, void *)> {};
-// class ReduceMeanImpl3DBackward_cpu
-//     : public Registrable<ReduceMeanImpl3DBackward_cpu,
-//                          std::tuple<DataType, DataType>,
-//                          void(const ReduceMean_Op<3>::Attrs &, const std::vector<DimSize_t>&, const void *, void *)> {};
-
-// class ReduceMeanImpl1D_cpu : public OperatorImpl {
-//    public:
-//     ReduceMeanImpl1D_cpu(const ReduceMean_Op<1>& op) : OperatorImpl(op, "cpu") {}
-
-//     static std::unique_ptr<ReduceMeanImpl1D_cpu> create(const ReduceMean_Op<1> &op) {
-//         return std::make_unique<ReduceMeanImpl1D_cpu>(op);
-//     }
-
-//    public:
-//     void forward() override;
-// };
-
-// class ReduceMeanImpl2D_cpu : public OperatorImpl {
-//    public:
-//     ReduceMeanImpl2D_cpu(const ReduceMean_Op<2>& op) : OperatorImpl(op, "cpu") {}
-
-//     static std::unique_ptr<ReduceMeanImpl2D_cpu> create(const ReduceMean_Op<2> &op) {
-//         return std::make_unique<ReduceMeanImpl2D_cpu>(op);
-//     }
-
-//    public:
-//     void forward() override;
-// };
-
-// class ReduceMeanImpl3D_cpu : public OperatorImpl {
-//    public:
-//     ReduceMeanImpl3D_cpu(const ReduceMean_Op<3>& op) : OperatorImpl(op, "cpu") {}
-
-//     static std::unique_ptr<ReduceMeanImpl3D_cpu> create(const ReduceMean_Op<3> &op) {
-//         return std::make_unique<ReduceMeanImpl3D_cpu>(op);
-//     }
+                            void *)>;
 
-//    public:
-//     void forward() override;
-// };
-namespace {
-// add cpu backend to ReduceMean_Op<2> implementation registry
-static Registrar<ReduceMean_Op> registrarReduceMeanImpl_cpu("cpu", Aidge::ReduceMeanImpl_cpu::create);
-// static Registrar<ReduceMean_Op<1>> registrarReduceMeanImpl1D_cpu("cpu", Aidge::ReduceMeanImpl1D_cpu::create);
-// static Registrar<ReduceMean_Op<2>> registrarReduceMeanImpl2D_cpu("cpu", Aidge::ReduceMeanImpl2D_cpu::create);
-// static Registrar<ReduceMean_Op<3>> registrarReduceMeanImpl3D_cpu("cpu", Aidge::ReduceMeanImpl3D_cpu::create);
-}  // namespace
+// Implementation entry point registration to Operator
+REGISTRAR(ReduceMean_Op, "cpu", Aidge::ReduceMeanImpl_cpu::create);
 }  // namespace Aidge
 
 #endif /* AIDGE_CPU_OPERATOR_REDUCEMEANIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/ReduceMeanImpl_forward_kernels.hpp b/include/aidge/backend/cpu/operator/ReduceMeanImpl_kernels.hpp
similarity index 63%
rename from include/aidge/backend/cpu/operator/ReduceMeanImpl_forward_kernels.hpp
rename to include/aidge/backend/cpu/operator/ReduceMeanImpl_kernels.hpp
index bba355e16958bb1a22bde1d24304d992a658ade8..5a143164d7e4fa2585ea72c38eaaa123f215d21a 100644
--- a/include/aidge/backend/cpu/operator/ReduceMeanImpl_forward_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ReduceMeanImpl_kernels.hpp
@@ -9,8 +9,8 @@
  *
  ********************************************************************************/
 
-#ifndef AIDGE_CPU_OPERATOR_REDUCEMEANIMPL_FORWARD_KERNEL_H_
-#define AIDGE_CPU_OPERATOR_REDUCEMEANIMPL_FORWARD_KERNEL_H_
+#ifndef AIDGE_CPU_OPERATOR_REDUCEMEANIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_REDUCEMEANIMPL_KERNELS_H_
 
 #include <algorithm>   // std::for_each
 #include <cstddef>     // std::size_t
@@ -38,7 +38,10 @@ void ReduceMeanImpl_cpu_forward_kernel(const std::vector<std::int32_t>& axes,
     const std::size_t nb_dims = inputDims.size();
     const std::size_t totalElements = std::accumulate(inputDims.cbegin(), inputDims.cend(), 1, std::multiplies<std::size_t>());
 
-    if (axes.size() == 1) {
+    if (axes.empty()){
+        std::copy_n(input,totalElements, output);
+    }
+    else if (axes.size() == 1) {
         const std::size_t stride_pre = std::accumulate(inputDims.cbegin(), inputDims.cbegin() + axes[0], 1, std::multiplies<std::size_t>());
         const std::size_t stride_post = std::accumulate(inputDims.crbegin(), inputDims.crbegin() + nb_dims -1 - axes[0], 1, std::multiplies<std::size_t>());
 
@@ -104,38 +107,16 @@ void ReduceMeanImpl_cpu_forward_kernel(const std::vector<std::int32_t>& axes,
     }
 }
 
-namespace {
-static Registrar<ReduceMeanImplForward_cpu> registrarReduceMeanImplForward_cpu_Float32(
-        {DataType::Float32, DataType::Float32}, Aidge::ReduceMeanImpl_cpu_forward_kernel<float, float>);
-static Registrar<ReduceMeanImplForward_cpu> registrarReduceMeanImplForward_cpu_Int32(
-        {DataType::Int32, DataType::Int32}, Aidge::ReduceMeanImpl_cpu_forward_kernel<int, int>);
-static Registrar<ReduceMeanImplForward_cpu> registrarReduceMeanImplForward_cpu_Float64(
-        {DataType::Float64, DataType::Float64}, Aidge::ReduceMeanImpl_cpu_forward_kernel<double, double>);
-
-// // DIM = 1
-// static Registrar<ReduceMeanImpl1DForward_cpu> registrarReduceMeanImplForward_1D_cpu_Float32(
-//         {DataType::Float32, DataType::Float32}, Aidge::ReduceMeanImpl_cpu_forward_kernel<float, float,1>);
-// static Registrar<ReduceMeanImpl1DForward_cpu> registrarReduceMeanImplForward_1D_cpu_Int32(
-//         {DataType::Int32, DataType::Int32}, Aidge::ReduceMeanImpl_cpu_forward_kernel<int, int,1>);
-// static Registrar<ReduceMeanImpl1DForward_cpu> registrarReduceMeanImplForward_1D_cpu_Float64(
-//         {DataType::Float64, DataType::Float64}, Aidge::ReduceMeanImpl_cpu_forward_kernel<double, double,1>);
-
-// // DIM = 2
-// static Registrar<ReduceMeanImpl2DForward_cpu> registrarReduceMeanImplForward_2D_cpu_Float32(
-//         {DataType::Float32, DataType::Float32}, Aidge::ReduceMeanImpl_cpu_forward_kernel<float, float,2>);
-// static Registrar<ReduceMeanImpl2DForward_cpu> registrarReduceMeanImplForward_2D_cpu_Int32(
-//         {DataType::Int32, DataType::Int32}, Aidge::ReduceMeanImpl_cpu_forward_kernel<int, int,2>);
-// static Registrar<ReduceMeanImpl2DForward_cpu> registrarReduceMeanImplForward_2D_cpu_Float64(
-//         {DataType::Float64, DataType::Float64}, Aidge::ReduceMeanImpl_cpu_forward_kernel<double, double,2>);
-
-// // DIM = 3
-// static Registrar<ReduceMeanImpl3DForward_cpu> registrarReduceMeanImplForward_3D_cpu_Float32(
-//         {DataType::Float32, DataType::Float32}, Aidge::ReduceMeanImpl_cpu_forward_kernel<float, float,3>);
-// static Registrar<ReduceMeanImpl3DForward_cpu> registrarReduceMeanImplForward_3D_cpu_Int32(
-//         {DataType::Int32, DataType::Int32}, Aidge::ReduceMeanImpl_cpu_forward_kernel<int, int,3>);
-// static Registrar<ReduceMeanImpl3DForward_cpu> registrarReduceMeanImplForward_3D_cpu_Float64(
-//         {DataType::Float64, DataType::Float64}, Aidge::ReduceMeanImpl_cpu_forward_kernel<double, double,3>);
-}  // namespace
+// Kernels registration to implementation entry point
+REGISTRAR(ReduceMeanImpl_cpu,
+    {DataType::Float32},
+    {ProdConso::inPlaceModel, Aidge::ReduceMeanImpl_cpu_forward_kernel<float, float>, nullptr});
+REGISTRAR(ReduceMeanImpl_cpu,
+    {DataType::Float64},
+    {ProdConso::inPlaceModel, Aidge::ReduceMeanImpl_cpu_forward_kernel<double, double>, nullptr});
+REGISTRAR(ReduceMeanImpl_cpu,
+    {DataType::Int32},
+    {ProdConso::inPlaceModel, Aidge::ReduceMeanImpl_cpu_forward_kernel<int32_t, int32_t>, nullptr});
 }  // namespace Aidge
 
-#endif /* AIDGE_CPU_OPERATOR_REDUCEMEANIMPL_FORWARD_KERNEL_H_ */
+#endif /* AIDGE_CPU_OPERATOR_REDUCEMEANIMPL_KERNELS_H_ */
diff --git a/include/aidge/backend/cpu/operator/ReduceSumImpl.hpp b/include/aidge/backend/cpu/operator/ReduceSumImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..4138c62c24149c15cfad5e85e8f50889b2b6a433
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/ReduceSumImpl.hpp
@@ -0,0 +1,38 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_REDUCESUMIMPL_H_
+#define AIDGE_CPU_OPERATOR_REDUCESUMIMPL_H_
+
+#include <array>
+#include <memory>
+#include <tuple>
+#include <vector>
+
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
+#include "aidge/operator/ReduceSum.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+namespace Aidge {
+// Operator implementation entry point for the backend
+using ReduceSumImpl_cpu = OperatorImpl_cpu<ReduceSum_Op,
+    void(const std::vector<std::int32_t>&,
+                            DimSize_t,
+                            const std::vector<DimSize_t>&,
+                            const void *,
+                            void *)>;
+
+// Implementation entry point registration to Operator
+REGISTRAR(ReduceSum_Op, "cpu", Aidge::ReduceSumImpl_cpu::create);
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_REDUCESUMIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/ReduceSumImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ReduceSumImpl_kernels.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..72671421796a0d5e799e6f762dfcaf02457220f3
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/ReduceSumImpl_kernels.hpp
@@ -0,0 +1,120 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_REDUCESUMIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_REDUCESUMIMPL_KERNELS_H_
+
+#include <algorithm>   // std::for_each
+#include <cstddef>     // std::size_t
+#include <cstdint>     // std::int32_t
+#include <functional>  //std::multiplies
+#include <numeric>     //std::accumulate
+#include <vector>
+
+#include "aidge/backend/cpu/operator/ReduceSumImpl.hpp"
+#include "aidge/data/Data.hpp"
+#include "aidge/operator/ReduceSum.hpp"
+#include "aidge/utils/Registrar.hpp"
+
+namespace Aidge {
+template <class I, class O>
+void ReduceSumImpl_cpu_forward_kernel(const std::vector<std::int32_t>& axes,
+                                    DimSize_t /*keepDims*/,
+                                    const std::vector<DimSize_t>& inputDims,
+                                    const void* input_,
+                                    void* output_) {
+
+    const I* input = static_cast<const I*>(input_);
+    O* output = static_cast<O*>(output_);
+
+    const std::size_t nb_dims = inputDims.size();
+    const std::size_t totalElements = std::accumulate(inputDims.cbegin(), inputDims.cend(), 1, std::multiplies<std::size_t>());
+
+    if (axes.empty()){
+        std::copy_n(input,totalElements, output);
+    }
+    else if (axes.size() == 1) {
+        const std::size_t stride_pre = std::accumulate(inputDims.cbegin(), inputDims.cbegin() + axes[0], 1, std::multiplies<std::size_t>());
+        const std::size_t stride_post = std::accumulate(inputDims.crbegin(), inputDims.crbegin() + nb_dims -1 - axes[0], 1, std::multiplies<std::size_t>());
+
+        const std::size_t dim_i = inputDims[axes[0]];
+        for (std::size_t pre = 0; pre < stride_pre; ++pre) {
+            for (std::size_t post = 0; post < stride_post; ++post) {
+                const std::size_t idx_i = pre * dim_i * stride_post + post;
+                const std::size_t idx_o = pre * stride_post + post;
+                O sum = 0;
+                for (std::size_t i = 0; i < dim_i; ++i) {
+                    sum +=input[idx_i + i*stride_post];
+                }
+                output[idx_o]  = sum;
+            }
+        }
+    } else {
+        std::size_t outputElements = totalElements;
+
+        auto stride_post = std::unique_ptr<std::size_t[]>(new std::size_t[nb_dims]);
+        stride_post[nb_dims - 1] = 1;
+        for (std::size_t i = nb_dims-2; i != static_cast<std::size_t>(-1); --i) {
+            stride_post[i] = stride_post[i+1]*inputDims[i+1];
+        }
+        auto stride_pre = std::unique_ptr<std::size_t[]>(new std::size_t[nb_dims]);
+        stride_pre[0] = 1;
+        for (std::size_t i = 1; i < nb_dims; ++i) {
+            stride_pre[i] = stride_pre[i-1]*inputDims[i-1];
+        }
+
+        const I* inputAccumulation = input;
+        I* outputAccumulation = nullptr;
+
+        for (const auto& axisInt : axes) {
+            const std::size_t a = static_cast<std::size_t>(axisInt);
+            outputElements /= inputDims[a];
+            outputAccumulation = new I[outputElements];
+            const std::size_t dim_i = inputDims[a];
+            for (std::size_t pre = 0; pre < stride_pre[a]; ++pre) {
+                for (std::size_t post = 0; post < stride_post[a]; ++post) {
+                    const std::size_t idx_i = pre * dim_i * stride_post[a] + post;
+                    const std::size_t idx_o = pre * stride_post[a] + post;
+                    I sum = 0;
+                    for (std::size_t i = 0; i < dim_i; ++i) {
+                        sum += inputAccumulation[idx_i + i*stride_post[a]];
+                    }
+                    outputAccumulation[idx_o] = sum;
+                }
+            }
+            std::for_each(stride_pre.get()+a+1, stride_pre.get()+nb_dims, [dim_i] (std::size_t& val) { val /= dim_i; });
+            if (inputAccumulation != input) {
+                delete[] inputAccumulation;
+            }
+            inputAccumulation = outputAccumulation;
+        }
+
+        // Copy elements from inputAccumulation to output while dividing by divisor
+        std::copy(inputAccumulation, inputAccumulation + outputElements, output);
+        if (outputAccumulation) {
+            delete[] outputAccumulation;
+        }
+    }
+}
+
+// Kernels registration to implementation entry point
+REGISTRAR(ReduceSumImpl_cpu,
+    {DataType::Float32},
+    {ProdConso::inPlaceModel, Aidge::ReduceSumImpl_cpu_forward_kernel<float, float>, nullptr});
+REGISTRAR(ReduceSumImpl_cpu,
+    {DataType::Float64},
+    {ProdConso::inPlaceModel, Aidge::ReduceSumImpl_cpu_forward_kernel<double, double>, nullptr});
+REGISTRAR(ReduceSumImpl_cpu,
+    {DataType::Int32},
+    {ProdConso::inPlaceModel, Aidge::ReduceSumImpl_cpu_forward_kernel<int32_t, int32_t>, nullptr});
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_REDUCESUMIMPL_KERNELS_H_ */
diff --git a/include/aidge/backend/cpu/operator/ScalingImpl.hpp b/include/aidge/backend/cpu/operator/ScalingImpl.hpp
index 8590169272818a225fe4299150f873733cdd9cd9..c1cc247c548701d43e01b1e92d02f42a11cfc710 100644
--- a/include/aidge/backend/cpu/operator/ScalingImpl.hpp
+++ b/include/aidge/backend/cpu/operator/ScalingImpl.hpp
@@ -12,7 +12,7 @@
 #ifndef __AIDGE_CPU_OPERATOR_ScalingIMPL_H__
 #define __AIDGE_CPU_OPERATOR_ScalingIMPL_H__
 
-#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
 #include "aidge/operator/Scaling.hpp"
 #include "aidge/utils/Registrar.hpp"
 #include "aidge/utils/Types.h"
@@ -22,43 +22,17 @@
 #include <array>
 
 namespace Aidge {
-// class Scaling_Op;
-
-// compute kernel registry for forward and backward
-class ScalingImplForward_cpu
-    : public Registrable<ScalingImplForward_cpu,
-                        std::tuple<DataType, DataType>,
-                        void(const float,
-                            const std::size_t,
-                            const bool,
-                            std::size_t,
-                            const void*,
-                            void*)> {};
-class ScalingImplBackward_cpu
-    : public Registrable<ScalingImplBackward_cpu,
-                        std::tuple<DataType, DataType>,
-                        void(const float,
-                            const std::size_t,
-                            const bool,
-                            std::size_t,
-                            const void*,
-                            void*)> {};
-
-class ScalingImpl_cpu : public OperatorImpl {
-public:
-    ScalingImpl_cpu(const Scaling_Op& op) : OperatorImpl(op, "cpu") {}
-
-    static std::unique_ptr<ScalingImpl_cpu> create(const Scaling_Op& op) {
-        return std::make_unique<ScalingImpl_cpu>(op);
-    }
-
-    Elts_t getNbRequiredProtected(const IOIndex_t inputIdx) const override final;
-    void forward() override;
-};
-
-namespace {
-static Registrar<Scaling_Op> registrarScalingImpl_cpu("cpu", Aidge::ScalingImpl_cpu::create);
-}
+// Operator implementation entry point for the backend
+using ScalingImpl_cpu = OperatorImpl_cpu<Scaling_Op,
+    void(const float,
+        const std::size_t,
+        const bool,
+        std::size_t,
+        const void*,
+        void*)>;
+
+// Implementation entry point registration to Operator
+REGISTRAR(Scaling_Op, "cpu", Aidge::ScalingImpl_cpu::create);
 }  // namespace Aidge
 
 #endif /* __AIDGE_CPU_OPERATOR_ScalingIMPL_H__ */
\ No newline at end of file
diff --git a/include/aidge/backend/cpu/operator/ScalingImpl_forward_kernels.hpp b/include/aidge/backend/cpu/operator/ScalingImpl_kernels.hpp
similarity index 79%
rename from include/aidge/backend/cpu/operator/ScalingImpl_forward_kernels.hpp
rename to include/aidge/backend/cpu/operator/ScalingImpl_kernels.hpp
index c654265dd6f650129201037976d89da4b0f39d96..c758c9cf39e76bb370c6d03c28e3a670c280eefc 100644
--- a/include/aidge/backend/cpu/operator/ScalingImpl_forward_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ScalingImpl_kernels.hpp
@@ -9,8 +9,8 @@
  *
  ********************************************************************************/
 
-#ifndef AIDGE_CPU_OPERATOR_SCALINGIMPL_FORWARD_KERNEL_H_
-#define AIDGE_CPU_OPERATOR_SCALINGIMPL_FORWARD_KERNEL_H_
+#ifndef AIDGE_CPU_OPERATOR_SCALINGIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_SCALINGIMPL_KERNELS_H_
 
 #include <cmath>
 #include <cstddef>
@@ -92,14 +92,16 @@ void ScalingImpl_cpu_forward_kernel(const float scalingFactor,
     }
 }
 
-namespace {
-static Registrar<ScalingImplForward_cpu> registrarScalingImplForward_cpu_Float32(
-        {DataType::Float32, DataType::Float32}, Aidge::ScalingImpl_cpu_forward_kernel<float, float>);
-static Registrar<ScalingImplForward_cpu> registrarScalingImplForward_cpu_Int32(
-        {DataType::Int32, DataType::Int32}, Aidge::ScalingImpl_cpu_forward_kernel<int, int>);
-static Registrar<ScalingImplForward_cpu> registrarScalingImplForward_cpu_Float64(
-        {DataType::Float64, DataType::Float64}, Aidge::ScalingImpl_cpu_forward_kernel<double, double>);
-}  // namespace
+// Kernels registration to implementation entry point
+REGISTRAR(ScalingImpl_cpu,
+    {DataType::Float32},
+    {ProdConso::inPlaceModel, Aidge::ScalingImpl_cpu_forward_kernel<float, float>, nullptr});
+REGISTRAR(ScalingImpl_cpu,
+    {DataType::Float64},
+    {ProdConso::inPlaceModel, Aidge::ScalingImpl_cpu_forward_kernel<double, double>, nullptr});
+REGISTRAR(ScalingImpl_cpu,
+    {DataType::Int32},
+    {ProdConso::inPlaceModel, Aidge::ScalingImpl_cpu_forward_kernel<int32_t, int32_t>, nullptr});
 }  // namespace Aidge
 
-#endif /* AIDGE_CPU_OPERATOR_SCALINGIMPL_FORWARD_KERNEL_H_ */
\ No newline at end of file
+#endif /* AIDGE_CPU_OPERATOR_SCALINGIMPL_KERNELS_H_ */
\ No newline at end of file
diff --git a/include/aidge/backend/cpu/operator/SigmoidImpl.hpp b/include/aidge/backend/cpu/operator/SigmoidImpl.hpp
index 34340e6166a48b465c7723e85d91c195bfb42277..ee1c36edecbe50cc1765da59737509a2b6333caf 100644
--- a/include/aidge/backend/cpu/operator/SigmoidImpl.hpp
+++ b/include/aidge/backend/cpu/operator/SigmoidImpl.hpp
@@ -12,7 +12,7 @@
 #ifndef AIDGE_CPU_OPERATOR_SIGMOIDIMPL_H_
 #define AIDGE_CPU_OPERATOR_SIGMOIDIMPL_H_
 
-#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
 #include "aidge/operator/Sigmoid.hpp"
 #include "aidge/utils/Registrar.hpp"
 #include "aidge/utils/Types.h"
@@ -21,34 +21,13 @@
 #include <vector>
 
 namespace Aidge {
-// class Sigmoid_Op;
+// Operator implementation entry point for the backend
+using SigmoidImpl_cpu = OperatorImpl_cpu<Sigmoid_Op,
+    void(const std::size_t, const void*, void*),
+    void(const std::size_t, const void*, const void*, void*)>;
 
-// compute kernel registry for forward and backward
-class SigmoidImplForward_cpu
-    : public Registrable<SigmoidImplForward_cpu, std::tuple<DataType, DataType>, void(const std::size_t, const void*, void*)> {
-};
-class SigmoidImplBackward_cpu
-    : public Registrable<SigmoidImplBackward_cpu, std::tuple<DataType, DataType, DataType>, void(const std::size_t, const void*, const void*, void*)> {
-};
-
-class SigmoidImpl_cpu : public OperatorImpl {
-public:
-    SigmoidImpl_cpu(const Sigmoid_Op& op) : OperatorImpl(op, "cpu") {}
-
-    static std::unique_ptr<SigmoidImpl_cpu> create(const Sigmoid_Op& op) {
-        return std::make_unique<SigmoidImpl_cpu>(op);
-    }
-
-    Elts_t getNbRequiredProtected(const IOIndex_t inputIdx) const override final;
-	
-    void forward() override final;
-
-    void backward() override final;
-};
-
-namespace {
-static Registrar<Sigmoid_Op> registrarSigmoidImpl_cpu("cpu", Aidge::SigmoidImpl_cpu::create);
-}
+// Implementation entry point registration to Operator
+REGISTRAR(Sigmoid_Op, "cpu", Aidge::SigmoidImpl_cpu::create);
 }  // namespace Aidge
 
 #endif /* AIDGE_CPU_OPERATOR_SIGMOIDIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/SigmoidImpl_backward_kernels.hpp b/include/aidge/backend/cpu/operator/SigmoidImpl_backward_kernels.hpp
deleted file mode 100644
index 4ceb3bd7ed9a3fb739591eee488f8035770fef18..0000000000000000000000000000000000000000
--- a/include/aidge/backend/cpu/operator/SigmoidImpl_backward_kernels.hpp
+++ /dev/null
@@ -1,43 +0,0 @@
-/********************************************************************************
- * Copyright (c) 2023 CEA-List
- *
- * This program and the accompanying materials are made available under the
- * terms of the Eclipse Public License 2.0 which is available at
- * http://www.eclipse.org/legal/epl-2.0.
- *
- * SPDX-License-Identifier: EPL-2.0
- *
- ********************************************************************************/
-
-#ifndef AIDGE_CPU_OPERATOR_SIGMOIDIMPL_BACKWARD_KERNEL_H_
-#define AIDGE_CPU_OPERATOR_SIGMOIDIMPL_BACKWARD_KERNEL_H_
-
-#include <cstddef>  // std::size_t
-
-#include "aidge/backend/cpu/operator/SigmoidImpl.hpp"
-#include "aidge/utils/Registrar.hpp"
-
-namespace Aidge {
-template <class O, class GI, class GO>
-void SigmoidImpl_cpu_backward_kernel(const std::size_t inputLenght,
-                                     const void* output_, const void* grad_output_,
-				     void* grad_input_) {
-    const O* output = static_cast<const O*>(output_);
-    const GO* grad_output = static_cast<const GO*>(grad_output_);
-    GI* grad_input = static_cast<GI*>(grad_input_);
-    for (std::size_t i = 0; i < inputLenght; ++i) {
-        grad_input[i] = output[i] * (O(1) - output[i]) * grad_output[i];
-    }
-}
-
-namespace {
-static Registrar<SigmoidImplBackward_cpu> registrarSigmoidImplBackward_cpu_Float32(
-    {DataType::Float32, DataType::Float32, DataType::Float32},
-    Aidge::SigmoidImpl_cpu_backward_kernel<float, float, float>);
-static Registrar<SigmoidImplBackward_cpu> registrarSigmoidImplBackward_cpu_Float64(
-    {DataType::Float64, DataType::Float64, DataType::Float64},
-    Aidge::SigmoidImpl_cpu_backward_kernel<double, double, double>);
-}  // namespace
-}  // namespace Aidge
-
-#endif /* AIDGE_CPU_OPERATOR_SIGMOIDIMPL_BACKWARD_KERNEL_H_ */
diff --git a/include/aidge/backend/cpu/operator/SigmoidImpl_forward_kernels.hpp b/include/aidge/backend/cpu/operator/SigmoidImpl_forward_kernels.hpp
deleted file mode 100644
index 24ba11a0bca7f3fa15f9ac1e2c13e29f88eaf074..0000000000000000000000000000000000000000
--- a/include/aidge/backend/cpu/operator/SigmoidImpl_forward_kernels.hpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/********************************************************************************
- * Copyright (c) 2023 CEA-List
- *
- * This program and the accompanying materials are made available under the
- * terms of the Eclipse Public License 2.0 which is available at
- * http://www.eclipse.org/legal/epl-2.0.
- *
- * SPDX-License-Identifier: EPL-2.0
- *
- ********************************************************************************/
-
-#ifndef AIDGE_CPU_OPERATOR_SIGMOIDIMPL_FORWARD_KERNEL_H_
-#define AIDGE_CPU_OPERATOR_SIGMOIDIMPL_FORWARD_KERNEL_H_
-
-#include "aidge/utils/Registrar.hpp"
-
-#include "aidge/backend/cpu/operator/SigmoidImpl.hpp"
-
-namespace Aidge {
-template <class I, class O>
-void SigmoidImpl_cpu_forward_kernel(std::size_t inputLenght,
-                                    const void* input_,
-                                    void* output_) {
-
-    const I* input = static_cast<const I*>(input_);
-    O* output = static_cast<O*>(output_);
-
-//#pragma omp parallel for if (inputLenght > 1024)
-    for (std::size_t i = 0; i < inputLenght; ++i) {
-		if (input[i] > I(0)) {
-			output[i] = O(1) / (O(1) + std::exp(-input[i]));
-		} else {
-			output[i] = std::exp(input[i]) / (O(1) + std::exp(input[i]));
-		}
-    }
-}
-
-namespace {
-static Registrar<SigmoidImplForward_cpu> registrarSigmoidImplForward_cpu_Float32(
-        {DataType::Float32, DataType::Float32}, Aidge::SigmoidImpl_cpu_forward_kernel<float, float>);
-static Registrar<SigmoidImplForward_cpu> registrarSigmoidImplForward_cpu_Float64(
-        {DataType::Float64, DataType::Float64}, Aidge::SigmoidImpl_cpu_forward_kernel<double, double>);
-}  // namespace
-}  // namespace Aidge
-
-#endif /* AIDGE_CPU_OPERATOR_SIGMOIDIMPL_FORWARD_KERNEL_H_ */
diff --git a/include/aidge/backend/cpu/operator/SigmoidImpl_kernels.hpp b/include/aidge/backend/cpu/operator/SigmoidImpl_kernels.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..dfd71ce0a878efbeb779f3a67ad4ccc762bb8363
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/SigmoidImpl_kernels.hpp
@@ -0,0 +1,59 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_SIGMOIDIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_SIGMOIDIMPL_KERNELS_H_
+
+#include "aidge/utils/Registrar.hpp"
+
+#include "aidge/backend/cpu/operator/SigmoidImpl.hpp"
+
+namespace Aidge {
+template <class I, class O>
+void SigmoidImpl_cpu_forward_kernel(std::size_t inputLenght,
+                                    const void* input_,
+                                    void* output_) {
+
+    const I* input = static_cast<const I*>(input_);
+    O* output = static_cast<O*>(output_);
+
+//#pragma omp parallel for if (inputLenght > 1024)
+    for (std::size_t i = 0; i < inputLenght; ++i) {
+		if (input[i] > I(0)) {
+			output[i] = O(1) / (O(1) + std::exp(-input[i]));
+		} else {
+			output[i] = std::exp(input[i]) / (O(1) + std::exp(input[i]));
+		}
+    }
+}
+
+template <class O, class GI, class GO>
+void SigmoidImpl_cpu_backward_kernel(const std::size_t inputLenght,
+                                     const void* output_, const void* grad_output_,
+				     void* grad_input_) {
+    const O* output = static_cast<const O*>(output_);
+    const GO* grad_output = static_cast<const GO*>(grad_output_);
+    GI* grad_input = static_cast<GI*>(grad_input_);
+    for (std::size_t i = 0; i < inputLenght; ++i) {
+        grad_input[i] = output[i] * (O(1) - output[i]) * grad_output[i];
+    }
+}
+
+// Kernels registration to implementation entry point
+REGISTRAR(SigmoidImpl_cpu,
+    {DataType::Float32},
+    {ProdConso::inPlaceModel, Aidge::SigmoidImpl_cpu_forward_kernel<float, float>, Aidge::SigmoidImpl_cpu_backward_kernel<float, float, float>});
+REGISTRAR(SigmoidImpl_cpu,
+    {DataType::Float64},
+    {ProdConso::inPlaceModel, Aidge::SigmoidImpl_cpu_forward_kernel<double, double>, Aidge::SigmoidImpl_cpu_backward_kernel<double, double, double>});
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_SIGMOIDIMPL_KERNELS_H_ */
diff --git a/include/aidge/backend/cpu/operator/SliceImpl.hpp b/include/aidge/backend/cpu/operator/SliceImpl.hpp
index 61aed1553bfbd2e67fc837ec6ea8d80b26ef3558..fd98b38d7117eaa14e35fe3cb89abf95b2913997 100644
--- a/include/aidge/backend/cpu/operator/SliceImpl.hpp
+++ b/include/aidge/backend/cpu/operator/SliceImpl.hpp
@@ -16,52 +16,25 @@
 #include <vector>
 #include <array>
 
-#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
 #include "aidge/operator/Slice.hpp"
 #include "aidge/utils/Registrar.hpp"
 #include "aidge/utils/Types.h"
 #include "aidge/backend/cpu/data/GetCPUPtr.h"
 
 namespace Aidge {
-// class Slice_Op;
-
-// compute kernel registry for forward and backward
-class SliceImplForward_cpu
-    : public Registrable<SliceImplForward_cpu,
-                        std::tuple<DataType, DataType>,
-                        void(const std::vector<std::int64_t>&,
+// Operator implementation entry point for the backend
+using SliceImpl_cpu = OperatorImpl_cpu<Slice_Op,
+    void(const std::vector<std::int64_t>&,
                             const std::vector<std::int64_t>&,
                             const std::vector<std::int8_t>&,
                             const std::vector<std::int64_t>&,
                             const std::vector<DimSize_t>&,
                             const void*,
-                            void*)> {};
-class SliceImplBackward_cpu
-    : public Registrable<SliceImplBackward_cpu,
-                        std::tuple<DataType, DataType>,
-                        void(const std::vector<std::int64_t>&,
-                            const std::vector<std::int64_t>&,
-                            const std::vector<std::int8_t>&,
-                            const std::vector<std::int64_t>&,
-                            const std::vector<DimSize_t>&,
-                            const void*,
-                            void*)> {};
-
-class SliceImpl_cpu : public OperatorImpl {
-public:
-    SliceImpl_cpu(const Slice_Op& op) : OperatorImpl(op, "cpu") {}
-
-    static std::unique_ptr<SliceImpl_cpu> create(const Slice_Op& op) {
-        return std::make_unique<SliceImpl_cpu>(op);
-    }
-
-    Elts_t getNbRequiredProtected(const IOIndex_t inputIdx) const override final;
-    void forward() override;
-};
+                            void*)>;
 
-namespace {
-static Registrar<Slice_Op> registrarSliceImpl_cpu("cpu", Aidge::SliceImpl_cpu::create);
-}
+// Implementation entry point registration to Operator
+REGISTRAR(Slice_Op, "cpu", Aidge::SliceImpl_cpu::create);
 }  // namespace Aidge
 
 #endif /* __AIDGE_CPU_OPERATOR_SLICEIMPL_H__ */
diff --git a/include/aidge/backend/cpu/operator/SliceImpl_forward_kernels.hpp b/include/aidge/backend/cpu/operator/SliceImpl_kernels.hpp
similarity index 84%
rename from include/aidge/backend/cpu/operator/SliceImpl_forward_kernels.hpp
rename to include/aidge/backend/cpu/operator/SliceImpl_kernels.hpp
index 31e409369cc640bbda9f54c54652af7f72b509b6..1bf4c491723c570fa8bfd1774beca1630d2de9be 100644
--- a/include/aidge/backend/cpu/operator/SliceImpl_forward_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/SliceImpl_kernels.hpp
@@ -9,8 +9,8 @@
  *
  ********************************************************************************/
 
-#ifndef AIDGE_CPU_OPERATOR_SLICEIMPL_FORWARD_KERNEL_H_
-#define AIDGE_CPU_OPERATOR_SLICEIMPL_FORWARD_KERNEL_H_
+#ifndef AIDGE_CPU_OPERATOR_SLICEIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_SLICEIMPL_KERNELS_H_
 
 #include <algorithm>
 #include <cmath>
@@ -88,14 +88,15 @@ void SliceImpl_cpu_forward_kernel(const std::vector<std::int64_t>& starts,
     }
 }
 
-namespace {
-static Registrar<SliceImplForward_cpu> registrarSliceImplForward_cpu_Float32(
-        {DataType::Float32, DataType::Float32}, Aidge::SliceImpl_cpu_forward_kernel<float, float>);
-static Registrar<SliceImplForward_cpu> registrarSliceImplForward_cpu_Int32(
-        {DataType::Int32, DataType::Int32}, Aidge::SliceImpl_cpu_forward_kernel<int, int>);
-static Registrar<SliceImplForward_cpu> registrarSliceImplForward_cpu_Float64(
-        {DataType::Float64, DataType::Float64}, Aidge::SliceImpl_cpu_forward_kernel<double, double>);
-}  // namespace
+REGISTRAR(SliceImpl_cpu,
+    {DataType::Float32},
+    {ProdConso::inPlaceModel, Aidge::SliceImpl_cpu_forward_kernel<float, float>, nullptr});
+REGISTRAR(SliceImpl_cpu,
+    {DataType::Float64},
+    {ProdConso::inPlaceModel, Aidge::SliceImpl_cpu_forward_kernel<double, double>, nullptr});
+REGISTRAR(SliceImpl_cpu,
+    {DataType::Int32},
+    {ProdConso::inPlaceModel, Aidge::SliceImpl_cpu_forward_kernel<int32_t, int32_t>, nullptr});
 }  // namespace Aidge
 
-#endif /* AIDGE_CPU_OPERATOR_SLICEIMPL_FORWARD_KERNEL_H_ */
+#endif /* AIDGE_CPU_OPERATOR_SLICEIMPL_KERNELS_H_ */
diff --git a/include/aidge/backend/cpu/operator/SoftmaxImpl.hpp b/include/aidge/backend/cpu/operator/SoftmaxImpl.hpp
index 2b2fab485656efdc37ee134cb4ae574b6b403405..ec2c2696ed6e2ba8cad1536519298d9331921c07 100644
--- a/include/aidge/backend/cpu/operator/SoftmaxImpl.hpp
+++ b/include/aidge/backend/cpu/operator/SoftmaxImpl.hpp
@@ -12,7 +12,7 @@
 #ifndef AIDGE_CPU_OPERATOR_SOFTMAXIMPL_H_
 #define AIDGE_CPU_OPERATOR_SOFTMAXIMPL_H_
 
-#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
 #include "aidge/operator/Softmax.hpp"
 #include "aidge/utils/Registrar.hpp"
 #include "aidge/utils/Types.h"
@@ -21,31 +21,12 @@
 #include <vector>
 
 namespace Aidge {
-// class Softmax_Op;
+// Operator implementation entry point for the backend
+using SoftmaxImpl_cpu = OperatorImpl_cpu<Softmax_Op,
+    void(std::size_t, const std::vector<DimSize_t>&, const void*, void*)>;
 
-// compute kernel registry for forward and backward
-class SoftmaxImplForward_cpu
-    : public Registrable<SoftmaxImplForward_cpu, std::tuple<DataType, DataType>, void(std::size_t, const std::vector<DimSize_t>&, const void*, void*)> {
-};
-class SoftmaxImplBackward_cpu
-    : public Registrable<SoftmaxImplBackward_cpu, std::tuple<DataType, DataType>, void(std::size_t, const std::vector<DimSize_t>&, const void*, void*)> {
-};
-
-class SoftmaxImpl_cpu : public OperatorImpl {
-public:
-    SoftmaxImpl_cpu(const Softmax_Op& op) : OperatorImpl(op, "cpu") {}
-
-    static std::unique_ptr<SoftmaxImpl_cpu> create(const Softmax_Op& op) {
-        return std::make_unique<SoftmaxImpl_cpu>(op);
-    }
-
-    Elts_t getNbRequiredProtected(const IOIndex_t inputIdx) const override final;
-    void forward() override;
-};
-
-namespace {
-static Registrar<Softmax_Op> registrarSoftmaxImpl_cpu("cpu", Aidge::SoftmaxImpl_cpu::create);
-}
+// Implementation entry point registration to Operator
+REGISTRAR(Softmax_Op, "cpu", Aidge::SoftmaxImpl_cpu::create);
 }  // namespace Aidge
 
 #endif /* AIDGE_CPU_OPERATOR_SOFTMAXIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/SoftmaxImpl_forward_kernels.hpp b/include/aidge/backend/cpu/operator/SoftmaxImpl_kernels.hpp
similarity index 64%
rename from include/aidge/backend/cpu/operator/SoftmaxImpl_forward_kernels.hpp
rename to include/aidge/backend/cpu/operator/SoftmaxImpl_kernels.hpp
index cc384c38e34d01887fc328d11de383aeef39fb8e..07486a48f1b8cf29f6a6ef8aa934a9decdbafef7 100644
--- a/include/aidge/backend/cpu/operator/SoftmaxImpl_forward_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/SoftmaxImpl_kernels.hpp
@@ -9,8 +9,8 @@
  *
  ********************************************************************************/
 
-#ifndef AIDGE_CPU_OPERATOR_SOFTMAXIMPL_FORWARD_KERNEL_H_
-#define AIDGE_CPU_OPERATOR_SOFTMAXIMPL_FORWARD_KERNEL_H_
+#ifndef AIDGE_CPU_OPERATOR_SOFTMAXIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_SOFTMAXIMPL_KERNELS_H_
 
 #include "aidge/utils/Registrar.hpp"
 #include <cstddef>
@@ -39,30 +39,37 @@ void SoftmaxImpl_cpu_forward_kernel(std::size_t axisIdx, const std::vector<DimSi
 
     for (std::size_t i = 0; i < preAxisElems; ++i) {
         for (std::size_t j = 0; j < postAxisElems; ++j) {
+            I maxVal = input[i * inputDims[axisIdx] * postAxisElems + j];
+            for (std::size_t k = 1; k < inputDims[axisIdx]; ++k) {
+                std::size_t inIdx = i * inputDims[axisIdx] * postAxisElems + k * postAxisElems + j;
+                maxVal = std::max(maxVal, input[inIdx]);
+            }
+
             // Calculate sum of exponentials within the axis
             I sumExp = 0;
             for (std::size_t k = 0; k < inputDims[axisIdx]; ++k) {
                 std::size_t inIdx = i * inputDims[axisIdx] * postAxisElems + k * postAxisElems + j;
-                sumExp += std::exp(input[inIdx]);
+                sumExp += std::exp(input[inIdx] - maxVal);
             }
 
             // Calculate softmax for the current slice along the axis
             for (std::size_t  k = 0; k < inputDims[axisIdx]; ++k) {
                 std::size_t inIdx = i * inputDims[axisIdx] * postAxisElems + k * postAxisElems + j;
-                output[inIdx] = std::exp(input[inIdx]) / sumExp;
+                output[inIdx] = std::exp(input[inIdx] - maxVal) / sumExp;
             }
         }
     }
 }
 
-namespace {
-static Registrar<SoftmaxImplForward_cpu> registrarSoftmaxImplForward_cpu_Float32(
-        {DataType::Float32, DataType::Float32}, Aidge::SoftmaxImpl_cpu_forward_kernel<float, float>);
-static Registrar<SoftmaxImplForward_cpu> registrarSoftmaxImplForward_cpu_Int32(
-        {DataType::Int32, DataType::Int32}, Aidge::SoftmaxImpl_cpu_forward_kernel<int, int>);
-static Registrar<SoftmaxImplForward_cpu> registrarSoftmaxImplForward_cpu_Float64(
-        {DataType::Float64, DataType::Float64}, Aidge::SoftmaxImpl_cpu_forward_kernel<double, double>);
-}  // namespace
+REGISTRAR(SoftmaxImpl_cpu,
+    {DataType::Float32},
+    {ProdConso::inPlaceModel, Aidge::SoftmaxImpl_cpu_forward_kernel<float, float>, nullptr});
+REGISTRAR(SoftmaxImpl_cpu,
+    {DataType::Float64},
+    {ProdConso::inPlaceModel, Aidge::SoftmaxImpl_cpu_forward_kernel<double, double>, nullptr});
+REGISTRAR(SoftmaxImpl_cpu,
+    {DataType::Int32},
+    {ProdConso::inPlaceModel, Aidge::SoftmaxImpl_cpu_forward_kernel<int32_t, int32_t>, nullptr});
 }  // namespace Aidge
 
-#endif /* AIDGE_CPU_OPERATOR_SOFTMAXIMPL_FORWARD_KERNEL_H_ */
+#endif /* AIDGE_CPU_OPERATOR_SOFTMAXIMPL_KERNELS_H_ */
diff --git a/include/aidge/backend/cpu/operator/SqrtImpl.hpp b/include/aidge/backend/cpu/operator/SqrtImpl.hpp
index 1691d951678509274736d558360c8110958820a9..dba75d1c58fb19ab2284ee0e98a32bff7ac58557 100644
--- a/include/aidge/backend/cpu/operator/SqrtImpl.hpp
+++ b/include/aidge/backend/cpu/operator/SqrtImpl.hpp
@@ -17,39 +17,19 @@
 #include <tuple>
 #include <vector>
 
-#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
 #include "aidge/operator/Sqrt.hpp"
 #include "aidge/utils/Registrar.hpp"
 #include "aidge/utils/Types.h"
 
 namespace Aidge {
+// Operator implementation entry point for the backend
+using SqrtImpl_cpu = OperatorImpl_cpu<Sqrt_Op,
+    void(const std::size_t, const void*, void*),
+    void(const std::size_t, const void*, void*)>;
 
-// compute kernel registry for forward and backward
-class SqrtImplForward_cpu
-    : public Registrable<SqrtImplForward_cpu, std::tuple<DataType, DataType>, void(const std::size_t, const void*, void*)> {
-};
-class SqrtImplBackward_cpu
-    : public Registrable<SqrtImplBackward_cpu, std::tuple<DataType, DataType>, void(const std::size_t, const void*, void*)> {
-};
-
-class SqrtImpl_cpu : public OperatorImpl {
-public:
-    SqrtImpl_cpu(const Sqrt_Op& op) : OperatorImpl(op, "cpu") {}
-
-    static std::unique_ptr<SqrtImpl_cpu> create(const Sqrt_Op& op) {
-        return std::make_unique<SqrtImpl_cpu>(op);
-    }
-
-    Elts_t getNbRequiredProtected(const IOIndex_t inputIdx) const override final;
-
-    void forward() override final;
-
-    void backward() override final;
-};
-
-namespace {
-static Registrar<Sqrt_Op> registrarSqrtImpl_cpu("cpu", Aidge::SqrtImpl_cpu::create);
-}
+// Implementation entry point registration to Operator
+REGISTRAR(Sqrt_Op, "cpu", Aidge::SqrtImpl_cpu::create);
 }  // namespace Aidge
 
 #endif /* AIDGE_CPU_OPERATOR_SQRTIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/SqrtImpl_backward_kernels.hpp b/include/aidge/backend/cpu/operator/SqrtImpl_backward_kernels.hpp
deleted file mode 100644
index 9cf5118a5ac81520d7a180b6aba22417ca512890..0000000000000000000000000000000000000000
--- a/include/aidge/backend/cpu/operator/SqrtImpl_backward_kernels.hpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/********************************************************************************
- * Copyright (c) 2023 CEA-List
- *
- * This program and the accompanying materials are made available under the
- * terms of the Eclipse Public License 2.0 which is available at
- * http://www.eclipse.org/legal/epl-2.0.
- *
- * SPDX-License-Identifier: EPL-2.0
- *
- ********************************************************************************/
-
-#ifndef AIDGE_CPU_OPERATOR_SQRTIMPL_BACKWARD_KERNEL_H_
-#define AIDGE_CPU_OPERATOR_SQRTIMPL_BACKWARD_KERNEL_H_
-
-#include <cmath>    // std::sqrt
-#include <cstddef>  // std::size_t
-
-#include "aidge/utils/Registrar.hpp"
-
-#include "aidge/backend/cpu/operator/SqrtImpl.hpp"
-
-namespace Aidge {
-template <class I, class O>
-void SqrtImpl_cpu_backward_kernel(const std::size_t inputLenght,
-                                     const void* input_,
-                                     void* output_) {
-
-    const I* input = static_cast<const I*>(input_);
-    O* output = static_cast<O*>(output_);
-
-    for (std::size_t i = 0; i < inputLenght; ++i) {
-        output[i] = static_cast<O>(0.5/(std::sqrt(static_cast<float>(input[i]))));
-    }
-}
-
-namespace {
-static Registrar<SqrtImplBackward_cpu> registrarSqrtImplBackward_cpu_Float32(
-        {DataType::Float32, DataType::Float32}, Aidge::SqrtImpl_cpu_backward_kernel<float, float>);
-static Registrar<SqrtImplBackward_cpu> registrarSqrtImplBackward_cpu_Int32(
-        {DataType::Int32, DataType::Int32}, Aidge::SqrtImpl_cpu_backward_kernel<int, int>);
-static Registrar<SqrtImplBackward_cpu> registrarSqrtImplBackward_cpu_Float64(
-        {DataType::Float64, DataType::Float64}, Aidge::SqrtImpl_cpu_backward_kernel<double, double>);
-}  // namespace
-}  // namespace Aidge
-
-#endif /* AIDGE_CPU_OPERATOR_SQRTIMPL_BACKWARD_KERNEL_H_ */
diff --git a/include/aidge/backend/cpu/operator/SqrtImpl_forward_kernels.hpp b/include/aidge/backend/cpu/operator/SqrtImpl_forward_kernels.hpp
deleted file mode 100644
index 886b978c2345ce555d229d684ba83f952be9e00e..0000000000000000000000000000000000000000
--- a/include/aidge/backend/cpu/operator/SqrtImpl_forward_kernels.hpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/********************************************************************************
- * Copyright (c) 2023 CEA-List
- *
- * This program and the accompanying materials are made available under the
- * terms of the Eclipse Public License 2.0 which is available at
- * http://www.eclipse.org/legal/epl-2.0.
- *
- * SPDX-License-Identifier: EPL-2.0
- *
- ********************************************************************************/
-
-#ifndef AIDGE_CPU_OPERATOR_SQRTIMPL_FORWARD_KERNEL_H_
-#define AIDGE_CPU_OPERATOR_SQRTIMPL_FORWARD_KERNEL_H_
-
-#include <cmath>    // std::sqrt
-#include <cstddef>  // std::size_t
-
-#include "aidge/utils/Registrar.hpp"
-
-#include "aidge/backend/cpu/operator/SqrtImpl.hpp"
-
-namespace Aidge {
-template <class I, class O>
-void SqrtImpl_cpu_forward_kernel(const std::size_t inputLenght,
-                                     const void* input_,
-                                     void* output_) {
-
-    const I* input = static_cast<const I*>(input_);
-    O* output = static_cast<O*>(output_);
-
-    for (std::size_t i = 0; i < inputLenght; ++i) {
-        output[i] = static_cast<O>(std::sqrt(static_cast<float>(input[i])));
-    }
-}
-
-namespace {
-static Registrar<SqrtImplForward_cpu> registrarSqrtImplForward_cpu_Float32(
-        {DataType::Float32, DataType::Float32}, Aidge::SqrtImpl_cpu_forward_kernel<float, float>);
-static Registrar<SqrtImplForward_cpu> registrarSqrtImplForward_cpu_Int32(
-        {DataType::Int32, DataType::Int32}, Aidge::SqrtImpl_cpu_forward_kernel<int, int>);
-static Registrar<SqrtImplForward_cpu> registrarSqrtImplForward_cpu_Float64(
-        {DataType::Float64, DataType::Float64}, Aidge::SqrtImpl_cpu_forward_kernel<double, double>);
-}  // namespace
-}  // namespace Aidge
-
-#endif /* AIDGE_CPU_OPERATOR_SQRTIMPL_FORWARD_KERNEL_H_ */
diff --git a/include/aidge/backend/cpu/operator/SqrtImpl_kernels.hpp b/include/aidge/backend/cpu/operator/SqrtImpl_kernels.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..0464119cad60742bc58c79da984b30776bc7932f
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/SqrtImpl_kernels.hpp
@@ -0,0 +1,60 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_SQRTIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_SQRTIMPL_KERNELS_H_
+
+#include <cmath>    // std::sqrt
+#include <cstddef>  // std::size_t
+
+#include "aidge/utils/Registrar.hpp"
+
+#include "aidge/backend/cpu/operator/SqrtImpl.hpp"
+
+namespace Aidge {
+template <class I, class O>
+void SqrtImpl_cpu_forward_kernel(const std::size_t inputLength,
+                                     const void* input_,
+                                     void* output_) {
+
+    const I* input = static_cast<const I*>(input_);
+    O* output = static_cast<O*>(output_);
+
+    for (std::size_t i = 0; i < inputLength; ++i) {
+        output[i] = static_cast<O>(std::sqrt(static_cast<float>(input[i])));
+    }
+}
+
+template <class I, class O>
+void SqrtImpl_cpu_backward_kernel(const std::size_t inputLength,
+                                     const void* input_,
+                                     void* output_) {
+
+    const I* input = static_cast<const I*>(input_);
+    O* output = static_cast<O*>(output_);
+
+    for (std::size_t i = 0; i < inputLength; ++i) {
+        output[i] = static_cast<O>(0.5/(std::sqrt(static_cast<float>(input[i]))));
+    }
+}
+
+REGISTRAR(SqrtImpl_cpu,
+    {DataType::Float32},
+    {ProdConso::inPlaceModel, Aidge::SqrtImpl_cpu_forward_kernel<float, float>, Aidge::SqrtImpl_cpu_backward_kernel<float, float>});
+REGISTRAR(SqrtImpl_cpu,
+    {DataType::Float64},
+    {ProdConso::inPlaceModel, Aidge::SqrtImpl_cpu_forward_kernel<double, double>, Aidge::SqrtImpl_cpu_backward_kernel<double, double>});
+REGISTRAR(SqrtImpl_cpu,
+    {DataType::Int32},
+    {ProdConso::inPlaceModel, Aidge::SqrtImpl_cpu_forward_kernel<int32_t, int32_t>, Aidge::SqrtImpl_cpu_backward_kernel<int32_t, int32_t>});
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_SQRTIMPL_KERNELS_H_ */
diff --git a/include/aidge/backend/cpu/operator/SubImpl.hpp b/include/aidge/backend/cpu/operator/SubImpl.hpp
index 15c028ae6289f39e0b6e6fd74e51e138b1f2675c..2bb22bda74edf7db09404fd5613b6714ddcdf513 100644
--- a/include/aidge/backend/cpu/operator/SubImpl.hpp
+++ b/include/aidge/backend/cpu/operator/SubImpl.hpp
@@ -12,7 +12,7 @@
 #ifndef AIDGE_CPU_OPERATOR_SUBIMPL_H_
 #define AIDGE_CPU_OPERATOR_SUBIMPL_H_
 
-#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
 #include "aidge/operator/Sub.hpp"
 #include "aidge/utils/Registrar.hpp"
 #include "aidge/utils/Types.h"
@@ -21,31 +21,12 @@
 #include <vector>
 
 namespace Aidge {
-// class Sub_Op;
+// Operator implementation entry point for the backend
+using SubImpl_cpu = OperatorImpl_cpu<Sub_Op,
+    void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*,void*)>;
 
-// compute kernel registry for forward and backward
-class SubImplForward_cpu
-    : public Registrable<SubImplForward_cpu, std::tuple<DataType, DataType, DataType>, void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*,void*)> {
-};
-class SubImplBackward_cpu
-    : public Registrable<SubImplBackward_cpu, std::tuple<DataType, DataType, DataType>, void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*, void*)> {
-};
-
-class SubImpl_cpu : public OperatorImpl {
-public:
-    SubImpl_cpu(const Sub_Op& op) : OperatorImpl(op, "cpu") {}
-
-    static std::unique_ptr<SubImpl_cpu> create(const Sub_Op& op) {
-        return std::make_unique<SubImpl_cpu>(op);
-    }
-
-    Elts_t getNbRequiredProtected(const IOIndex_t inputIdx) const override final;
-    void forward() override;
-};
-
-namespace {
-static Registrar<Sub_Op> registrarSubImpl_cpu("cpu", Aidge::SubImpl_cpu::create);
-}
+// Implementation entry point registration to Operator
+REGISTRAR(Sub_Op, "cpu", Aidge::SubImpl_cpu::create);
 }  // namespace Aidge
 
 #endif /* AIDGE_CPU_OPERATOR_SUBIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/SubImpl_forward_kernels.hpp b/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp
similarity index 62%
rename from include/aidge/backend/cpu/operator/SubImpl_forward_kernels.hpp
rename to include/aidge/backend/cpu/operator/SubImpl_kernels.hpp
index 10e6f58bb44b63f2d8712dc0aa64e0660f3356b2..0486ed2105b23e95f9cdfcda578e14900fcb2c8e 100644
--- a/include/aidge/backend/cpu/operator/SubImpl_forward_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp
@@ -9,8 +9,8 @@
  *
  ********************************************************************************/
 
-#ifndef AIDGE_CPU_OPERATOR_SUBIMPL_FORWARD_KERNEL_H_
-#define AIDGE_CPU_OPERATOR_SUBIMPL_FORWARD_KERNEL_H_
+#ifndef AIDGE_CPU_OPERATOR_SUBIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_SUBIMPL_KERNELS_H_
 
 #include "aidge/utils/Registrar.hpp"
 
@@ -49,20 +49,19 @@ void SubImpl_cpu_forward_kernel(const std::vector<std::size_t>& input1Dims,
 	}
 }
 
-namespace {
-static Registrar<SubImplForward_cpu> registrarSubImplForward_cpu_Float32(
-        {DataType::Float32, DataType::Float32, DataType::Float32},
-        Aidge::SubImpl_cpu_forward_kernel<float, float, float>);
-static Registrar<SubImplForward_cpu> registrarSubImplForward_cpu_Float64(
-        {DataType::Float64, DataType::Float64, DataType::Float64},
-        Aidge::SubImpl_cpu_forward_kernel<double, double, double>);
-static Registrar<SubImplForward_cpu> registrarSubImplForward_cpu_Int32(
-        {DataType::Int32, DataType::Int32, DataType::Int32},
-        Aidge::SubImpl_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t>);
-static Registrar<SubImplForward_cpu> registrarSubImplForward_cpu_Int64(
-        {DataType::Int64, DataType::Int64, DataType::Int64},
-        Aidge::SubImpl_cpu_forward_kernel<std::int64_t, std::int64_t, std::int64_t>);
-}  // namespace
+// Kernels registration to implementation entry point
+REGISTRAR(SubImpl_cpu,
+    {DataType::Float32},
+    {ProdConso::inPlaceModel, Aidge::SubImpl_cpu_forward_kernel<float, float, float>, nullptr});
+REGISTRAR(SubImpl_cpu,
+    {DataType::Float64},
+    {ProdConso::inPlaceModel, Aidge::SubImpl_cpu_forward_kernel<double, double, double>, nullptr});
+REGISTRAR(SubImpl_cpu,
+    {DataType::Int32},
+    {ProdConso::inPlaceModel, Aidge::SubImpl_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t>, nullptr});
+REGISTRAR(SubImpl_cpu,
+    {DataType::Int64},
+    {ProdConso::inPlaceModel, Aidge::SubImpl_cpu_forward_kernel<std::int64_t, std::int64_t, std::int64_t>, nullptr});
 }  // namespace Aidge
 
-#endif /* AIDGE_CPU_OPERATOR_SUBIMPL_FORWARD_KERNEL_H_ */
+#endif /* AIDGE_CPU_OPERATOR_SUBIMPL_KERNELS_H_ */
diff --git a/include/aidge/backend/cpu/operator/TanhImpl.hpp b/include/aidge/backend/cpu/operator/TanhImpl.hpp
index 0bf851e77d94c160c0362301df33d682347daf0c..b1c2217bd29805eca2cf7b7906316756b75a74e0 100644
--- a/include/aidge/backend/cpu/operator/TanhImpl.hpp
+++ b/include/aidge/backend/cpu/operator/TanhImpl.hpp
@@ -12,7 +12,7 @@
 #ifndef AIDGE_CPU_OPERATOR_TANHIMPL_H_
 #define AIDGE_CPU_OPERATOR_TANHIMPL_H_
 
-#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
 #include "aidge/operator/Tanh.hpp"
 #include "aidge/utils/Registrar.hpp"
 #include "aidge/utils/Types.h"
@@ -21,34 +21,13 @@
 #include <vector>
 
 namespace Aidge {
-// class Tanh_Op;
+// Operator implementation entry point for the backend
+using TanhImpl_cpu = OperatorImpl_cpu<Tanh_Op,
+    void(const std::size_t, const void*, void*),
+    void(const std::size_t, const void*, const void*, void*)>;
 
-// compute kernel registry for forward and backward
-class TanhImplForward_cpu
-    : public Registrable<TanhImplForward_cpu, std::tuple<DataType, DataType>, void(const std::size_t, const void*, void*)> {
-};
-class TanhImplBackward_cpu
-    : public Registrable<TanhImplBackward_cpu, std::tuple<DataType, DataType, DataType>, void(const std::size_t, const void*, const void*, void*)> {
-};
-
-class TanhImpl_cpu : public OperatorImpl {
-public:
-    TanhImpl_cpu(const Tanh_Op& op) : OperatorImpl(op, "cpu") {}
-
-    static std::unique_ptr<TanhImpl_cpu> create(const Tanh_Op& op) {
-        return std::make_unique<TanhImpl_cpu>(op);
-    }
-
-    Elts_t getNbRequiredProtected(const IOIndex_t inputIdx) const override final;
-	
-    void forward() override final;
-
-    void backward() override final;
-};
-
-namespace {
-static Registrar<Tanh_Op> registrarTanhImpl_cpu("cpu", Aidge::TanhImpl_cpu::create);
-}
+// Implementation entry point registration to Operator
+REGISTRAR(Tanh_Op, "cpu", Aidge::TanhImpl_cpu::create);
 }  // namespace Aidge
 
 #endif /* AIDGE_CPU_OPERATOR_TANHIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/TanhImpl_backward_kernels.hpp b/include/aidge/backend/cpu/operator/TanhImpl_kernels.hpp
similarity index 51%
rename from include/aidge/backend/cpu/operator/TanhImpl_backward_kernels.hpp
rename to include/aidge/backend/cpu/operator/TanhImpl_kernels.hpp
index 3a13c2cad21c35822fc6248590550e4716ee046d..fdcac210484b11f2220dcc2a6813efed503d1913 100644
--- a/include/aidge/backend/cpu/operator/TanhImpl_backward_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/TanhImpl_kernels.hpp
@@ -9,15 +9,28 @@
  *
  ********************************************************************************/
 
-#ifndef AIDGE_CPU_OPERATOR_TANHIMPL_BACKWARD_KERNEL_H_
-#define AIDGE_CPU_OPERATOR_TANHIMPL_BACKWARD_KERNEL_H_
+#ifndef AIDGE_CPU_OPERATOR_TANHIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_TANHIMPL_KERNELS_H_
 
-#include <cstddef>  // std::size_t
+#include "aidge/utils/Registrar.hpp"
 
 #include "aidge/backend/cpu/operator/TanhImpl.hpp"
-#include "aidge/utils/Registrar.hpp"
 
 namespace Aidge {
+template <class I, class O>
+void TanhImpl_cpu_forward_kernel(std::size_t inputLength,
+                                     const void* input_,
+                                     void* output_) {
+
+    const I* input = static_cast<const I*>(input_);
+    O* output = static_cast<O*>(output_);
+
+//#pragma omp parallel for if (inputLength > 1024)
+    for (std::size_t i = 0; i < inputLength; ++i) {
+        output[i] = std::tanh(input[i]);
+    }
+}
+
 template <class O, class GI, class GO>
 void TanhImpl_cpu_backward_kernel(const std::size_t inputLenght,
                                   const void* output_, const void* grad_output_,
@@ -30,14 +43,13 @@ void TanhImpl_cpu_backward_kernel(const std::size_t inputLenght,
     }
 }
 
-namespace {
-static Registrar<TanhImplBackward_cpu> registrarTanhImplBackward_cpu_Float32(
-    {DataType::Float32, DataType::Float32, DataType::Float32},
-    Aidge::TanhImpl_cpu_backward_kernel<float, float, float>);
-static Registrar<TanhImplBackward_cpu> registrarTanhImplBackward_cpu_Float64(
-    {DataType::Float64, DataType::Float64, DataType::Float64},
-    Aidge::TanhImpl_cpu_backward_kernel<double, double, double>);
-}  // namespace
+// Kernels registration to implementation entry point
+REGISTRAR(TanhImpl_cpu,
+    {DataType::Float32},
+    {ProdConso::inPlaceModel, Aidge::TanhImpl_cpu_forward_kernel<float, float>, Aidge::TanhImpl_cpu_backward_kernel<float, float, float>});
+REGISTRAR(TanhImpl_cpu,
+    {DataType::Float64},
+    {ProdConso::inPlaceModel, Aidge::TanhImpl_cpu_forward_kernel<double, double>, Aidge::TanhImpl_cpu_backward_kernel<double, double, double>});
 }  // namespace Aidge
 
-#endif /* AIDGE_CPU_OPERATOR_TANHIMPL_BACKWARD_KERNEL_H_ */
+#endif /* AIDGE_CPU_OPERATOR_TANHIMPL_KERNELS_H_ */
diff --git a/project_name.txt b/project_name.txt
deleted file mode 100644
index f8a086fc063978638db5a0fcfe1dc2e5c9d0c1b7..0000000000000000000000000000000000000000
--- a/project_name.txt
+++ /dev/null
@@ -1 +0,0 @@
-aidge_backend_cpu
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000000000000000000000000000000000000..9dbdbede6083ea2ededd5a861449a2dfbea6f40e
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,605 @@
+[project]
+name = "aidge_backend_cpu"
+description="CPU implementation of operators of the AIDGE framework"
+dependencies = [
+    "numpy",
+]
+requires-python = ">= 3.7"
+readme = "README.md"
+license = { file = "LICENSE" }
+classifiers = [ 
+    "Development Status :: 2 - Pre-Alpha",
+    "Programming Language :: Python :: 3"
+    ]
+dynamic = ["version"] # defined in tool.setuptools_scm
+
+[build-system]
+requires = [
+    "setuptools>=64",
+    "setuptools_scm[toml]==7.1.0",
+    "cmake>=3.18.4.post1"
+]
+build-backend = "setuptools.build_meta"
+
+#####################################################
+# SETUPTOOLS
+[tool.setuptools]
+[tool.setuptools.packages.find]
+where = ["."]  # list of folders that contain the packages (["."] by default)
+include = ["aidge_backend_cpu*"]  # package names should match these glob patterns (["*"] by default)
+exclude = ["aidge_backend_cpu.unit_tests*"]  # exclude packages matching these glob patterns (empty by default)
+namespaces = false  # to disable scanning PEP 420 namespaces (true by default)
+# SETUPTOOLS_SCM
+[tool.setuptools_scm]
+write_to = "aidge_backend_cpu/_version.py"
+
+#####################################################
+# CIBUILDWHEEL
+[tool.cibuildwheel]
+build-frontend = "build"
+test-requires = "pytest"
+test-command = "pytest {project}/aidge_backend_cpu/unit_tests"
+# uncomment to run cibuildwheel locally on selected distros
+#build=[
+#     "cp38-manylinux_x86_64",
+#     "cp39-manylinux_x86_64",
+#     "cp310-manylinux_x86_64"
+#     "cp38-win_amd64",
+#]
+### AIDGE DEPENDENCIES DECLARATION
+[tool.cibuildwheel.environment]
+AIDGE_DEPENDENCIES = "aidge_core" # format => "dep_1 dep_2 ... dep_n"
+AIDGE_INSTALL="/AIDGE_INSTALL_CIBUILDWHEEL"
+[tool.cibuildwheel.linux]
+before-build = [
+    "bash .gitlab/ci/cibuildwheel_build_deps_before_build_wheel.sh /host"
+]
+before-test = [
+    "bash .gitlab/ci/cibuildwheel_build_deps_before_build_wheel.sh /host"
+]
+[tool.cibuildwheel.windows]
+before-build = [
+    "powershell -File .\\.gitlab\\ci\\cibuildwheel_build_deps_before_build_wheel.ps1"
+]
+before-test = [
+    "powershell -File .\\.gitlab\\ci\\cibuildwheel_build_deps_before_build_wheel.ps1"
+]
+
+
+#####################################################
+# PYLINT
+[tool.pylint.main]
+# Analyse import fallback blocks. This can be used to support both Python 2 and 3
+# compatible code, which means that the block might have code that exists only in
+# one or another interpreter, leading to false positives when analysed.
+# analyse-fallback-blocks =
+
+# Clear in-memory caches upon conclusion of linting. Useful if running pylint in
+# a server-like mode.
+# clear-cache-post-run =
+
+# Always return a 0 (non-error) status code, even if lint errors are found. This
+# is primarily useful in continuous integration scripts.
+# exit-zero =
+
+# A comma-separated list of package or module names from where C extensions may
+# be loaded. Extensions are loading into the active Python interpreter and may
+# run arbitrary code.
+extension-pkg-allow-list = ["aidge_core", "aidge_backend_cpu", "torch", "tensorflow"]
+
+# A comma-separated list of package or module names from where C extensions may
+# be loaded. Extensions are loading into the active Python interpreter and may
+# run arbitrary code. (This is an alternative name to extension-pkg-allow-list
+# for backward compatibility.)
+# extension-pkg-whitelist =
+
+# Return non-zero exit code if any of these messages/categories are detected,
+# even if score is above --fail-under value. Syntax same as enable. Messages
+# specified are enabled, while categories only check already-enabled messages.
+# fail-on =
+
+# Specify a score threshold under which the program will exit with error.
+# fail-under =
+
+# Interpret the stdin as a python script, whose filename needs to be passed as
+# the module_or_package argument.
+# from-stdin =
+
+# Files or directories to be skipped. They should be base names, not paths.
+ignore = ["CVS"]
+
+# Add files or directories matching the regular expressions patterns to the
+# ignore-list. The regex matches against paths and can be in Posix or Windows
+# format. Because '\\' represents the directory delimiter on Windows systems, it
+# can't be used as an escape character.
+# ignore-paths =
+
+# Files or directories matching the regular expression patterns are skipped. The
+# regex matches against base names, not paths. The default value ignores Emacs
+# file locks
+# ignore-patterns =
+
+# List of module names for which member attributes should not be checked (useful
+# for modules/projects where namespaces are manipulated during runtime and thus
+# existing member attributes cannot be deduced by static analysis). It supports
+# qualified module names, as well as Unix pattern matching.
+ignored-modules = ["aidge_core", "aidge_backend_cpu"]
+
+# Python code to execute, usually for sys.path manipulation such as
+# pygtk.require().
+# init-hook =
+
+# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the
+# number of processors available to use, and will cap the count on Windows to
+# avoid hangs.
+jobs = 1
+# Control the amount of potential inferred values when inferring a single object.
+# This can help the performance when dealing with large functions or complex,
+# nested conditions.
+limit-inference-results = 100
+
+# List of plugins (as comma separated values of python module names) to load,
+# usually to register additional checkers.
+# load-plugins =
+
+# Pickle collected data for later comparisons.
+persistent = true
+
+# Minimum Python version to use for version dependent checks. Will default to the
+# version used to run pylint.
+py-version = "3.11"
+
+# Discover python modules and packages in the file system subtree.
+# recursive =
+
+# Add paths to the list of the source roots. Supports globbing patterns. The
+# source root is an absolute path or a path relative to the current working
+# directory used to determine a package namespace for modules located under the
+# source root.
+# source-roots =
+
+# When enabled, pylint would attempt to guess common misconfiguration and emit
+# user-friendly hints instead of false-positive error messages.
+suggestion-mode = true
+
+# Allow loading of arbitrary C extensions. Extensions are imported into the
+# active Python interpreter and may run arbitrary code.
+# unsafe-load-any-extension =
+
+[tool.pylint.basic]
+# Naming style matching correct argument names.
+argument-naming-style = "snake_case"
+
+# Regular expression matching correct argument names. Overrides argument-naming-
+# style. If left empty, argument names will be checked with the set naming style.
+# argument-rgx =
+
+# Naming style matching correct attribute names.
+attr-naming-style = "snake_case"
+
+# Regular expression matching correct attribute names. Overrides attr-naming-
+# style. If left empty, attribute names will be checked with the set naming
+# style.
+# attr-rgx =
+
+# Bad variable names which should always be refused, separated by a comma.
+bad-names = ["foo", "bar", "baz", "toto", "tutu", "tata"]
+
+# Bad variable names regexes, separated by a comma. If names match any regex,
+# they will always be refused
+# bad-names-rgxs =
+
+# Naming style matching correct class attribute names.
+class-attribute-naming-style = "any"
+
+# Regular expression matching correct class attribute names. Overrides class-
+# attribute-naming-style. If left empty, class attribute names will be checked
+# with the set naming style.
+# class-attribute-rgx =
+
+# Naming style matching correct class constant names.
+class-const-naming-style = "UPPER_CASE"
+
+# Regular expression matching correct class constant names. Overrides class-
+# const-naming-style. If left empty, class constant names will be checked with
+# the set naming style.
+# class-const-rgx =
+
+# Naming style matching correct class names.
+class-naming-style = "PascalCase"
+
+# Regular expression matching correct class names. Overrides class-naming-style.
+# If left empty, class names will be checked with the set naming style.
+# class-rgx =
+
+# Naming style matching correct constant names.
+const-naming-style = "UPPER_CASE"
+
+# Regular expression matching correct constant names. Overrides const-naming-
+# style. If left empty, constant names will be checked with the set naming style.
+# const-rgx =
+
+# Minimum line length for functions/classes that require docstrings, shorter ones
+# are exempt.
+docstring-min-length = -1
+
+# Naming style matching correct function names.
+function-naming-style = "snake_case"
+
+# Regular expression matching correct function names. Overrides function-naming-
+# style. If left empty, function names will be checked with the set naming style.
+# function-rgx =
+
+# Good variable names which should always be accepted, separated by a comma.
+good-names = ["i", "j", "k", "ex", "Run", "_"]
+
+# Good variable names regexes, separated by a comma. If names match any regex,
+# they will always be accepted
+# good-names-rgxs =
+
+# Include a hint for the correct naming format with invalid-name.
+# include-naming-hint =
+
+# Naming style matching correct inline iteration names.
+inlinevar-naming-style = "any"
+
+# Regular expression matching correct inline iteration names. Overrides
+# inlinevar-naming-style. If left empty, inline iteration names will be checked
+# with the set naming style.
+# inlinevar-rgx =
+
+# Naming style matching correct method names.
+method-naming-style = "snake_case"
+
+# Regular expression matching correct method names. Overrides method-naming-
+# style. If left empty, method names will be checked with the set naming style.
+# method-rgx =
+
+# Naming style matching correct module names.
+module-naming-style = "snake_case"
+
+# Regular expression matching correct module names. Overrides module-naming-
+# style. If left empty, module names will be checked with the set naming style.
+# module-rgx =
+
+# Colon-delimited sets of names that determine each other's naming style when the
+# name regexes allow several styles.
+# name-group =
+
+# Regular expression which should only match function or class names that do not
+# require a docstring.
+no-docstring-rgx = "^_"
+
+# List of decorators that produce properties, such as abc.abstractproperty. Add
+# to this list to register other decorators that produce valid properties. These
+# decorators are taken in consideration only for invalid-name.
+property-classes = ["abc.abstractproperty"]
+
+# Regular expression matching correct type alias names. If left empty, type alias
+# names will be checked with the set naming style.
+# typealias-rgx =
+
+# Regular expression matching correct type variable names. If left empty, type
+# variable names will be checked with the set naming style.
+# typevar-rgx =
+
+# Naming style matching correct variable names.
+variable-naming-style = "snake_case"
+
+# Regular expression matching correct variable names. Overrides variable-naming-
+# style. If left empty, variable names will be checked with the set naming style.
+# variable-rgx =
+
+[tool.pylint.classes]
+# Warn about protected attribute access inside special methods
+# check-protected-access-in-special-methods =
+
+# List of method names used to declare (i.e. assign) instance attributes.
+defining-attr-methods = ["__init__", "__new__", "setUp", "__post_init__"]
+
+# List of member names, which should be excluded from the protected access
+# warning.
+exclude-protected = ["_asdict", "_fields", "_replace", "_source", "_make"]
+
+# List of valid names for the first argument in a class method.
+valid-classmethod-first-arg = ["cls"]
+
+# List of valid names for the first argument in a metaclass class method.
+valid-metaclass-classmethod-first-arg = ["cls"]
+
+[tool.pylint.design]
+# List of regular expressions of class ancestor names to ignore when counting
+# public methods (see R0903)
+# exclude-too-few-public-methods =
+
+# List of qualified class names to ignore when counting class parents (see R0901)
+# ignored-parents =
+
+# Maximum number of arguments for function / method.
+max-args = 5
+
+# Maximum number of attributes for a class (see R0902).
+max-attributes = 7
+
+# Maximum number of boolean expressions in an if statement (see R0916).
+max-bool-expr = 5
+
+# Maximum number of branch for function / method body.
+max-branches = 12
+
+# Maximum number of locals for function / method body.
+max-locals = 15
+
+# Maximum number of parents for a class (see R0901).
+max-parents = 7
+
+# Maximum number of public methods for a class (see R0904).
+max-public-methods = 20
+
+# Maximum number of return / yield for function / method body.
+max-returns = 6
+
+# Maximum number of statements in function / method body.
+max-statements = 50
+
+# Minimum number of public methods for a class (see R0903).
+min-public-methods = 2
+
+[tool.pylint.exceptions]
+# Exceptions that will emit a warning when caught.
+overgeneral-exceptions = ["builtins.BaseException", "builtins.Exception"]
+
+[tool.pylint.format]
+# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
+# expected-line-ending-format =
+
+# Regexp for a line that is allowed to be longer than the limit.
+ignore-long-lines = "^\\s*(# )?<?https?://\\S+>?$"
+
+# Number of spaces of indent required inside a hanging or continued line.
+indent-after-paren = 4
+
+# String used as indentation unit. This is usually "    " (4 spaces) or "\t" (1
+# tab).
+indent-string = "    "
+
+# Maximum number of characters on a single line.
+max-line-length = 200
+
+# Maximum number of lines in a module.
+max-module-lines = 1000
+
+# Allow the body of a class to be on the same line as the declaration if body
+# contains single statement.
+# single-line-class-stmt =
+
+# Allow the body of an if to be on the same line as the test if there is no else.
+# single-line-if-stmt =
+
+[tool.pylint.imports]
+# List of modules that can be imported at any level, not just the top level one.
+# allow-any-import-level =
+
+# Allow explicit reexports by alias from a package __init__.
+# allow-reexport-from-package =
+
+# Allow wildcard imports from modules that define __all__.
+# allow-wildcard-with-all =
+
+# Deprecated modules which should not be used, separated by a comma.
+# deprecated-modules =
+
+# Output a graph (.gv or any supported image format) of external dependencies to
+# the given file (report RP0402 must not be disabled).
+# ext-import-graph =
+
+# Output a graph (.gv or any supported image format) of all (i.e. internal and
+# external) dependencies to the given file (report RP0402 must not be disabled).
+# import-graph =
+
+# Output a graph (.gv or any supported image format) of internal dependencies to
+# the given file (report RP0402 must not be disabled).
+# int-import-graph =
+
+# Force import order to recognize a module as part of the standard compatibility
+# libraries.
+# known-standard-library =
+
+# Force import order to recognize a module as part of a third party library.
+known-third-party = ["enchant"]
+
+# Couples of modules and preferred modules, separated by a comma.
+# preferred-modules =
+
+[tool.pylint.logging]
+# The type of string formatting that logging methods do. `old` means using %
+# formatting, `new` is for `{}` formatting.
+logging-format-style = "old"
+
+# Logging modules to check that the string format arguments are in logging
+# function parameter format.
+logging-modules = ["logging"]
+
+[tool.pylint."messages control"]
+# Only show warnings with the listed confidence levels. Leave empty to show all.
+# Valid levels: HIGH, CONTROL_FLOW, INFERENCE, INFERENCE_FAILURE, UNDEFINED.
+confidence = ["HIGH", "CONTROL_FLOW", "INFERENCE", "INFERENCE_FAILURE", "UNDEFINED"]
+# Disable the message, report, category or checker with the given id(s). You can
+# either give multiple identifiers separated by comma (,) or put this option
+# multiple times (only on the command line, not in the configuration file where
+# it should appear only once). You can also use "--disable=all" to disable
+# everything first and then re-enable specific checks. For example, if you want
+# to run only the similarities checker, you can use "--disable=all
+# --enable=similarities". If you want to run only the classes checker, but have
+# no Warning level messages displayed, use "--disable=all --enable=classes
+# --disable=W".
+disable = ["raw-checker-failed", "bad-inline-option", "locally-disabled", "file-ignored", "suppressed-message", "useless-suppression", "deprecated-pragma", "use-symbolic-message-instead", "use-implicit-booleaness-not-comparison-to-string", "use-implicit-booleaness-not-comparison-to-zero", "too-many-locals", "missing-class-docstring", "missing-function-docstring", "too-many-arguments", "protected-access", "too-many-branches", "too-many-ancestors", "wrong-import-order", "wrong-import-position"]
+# Enable the message, report, category or checker with the given id(s). You can
+# either give multiple identifier separated by comma (,) or put this option
+# multiple time (only on the command line, not in the configuration file where it
+# should appear only once). See also the "--disable" option for examples.
+enable = ["c-extension-no-member"]
+[tool.pylint.method_args]
+# List of qualified names (i.e., library.method) which require a timeout
+# parameter e.g. 'requests.api.get,requests.api.post'
+timeout-methods = ["requests.api.delete", "requests.api.get", "requests.api.head", "requests.api.options", "requests.api.patch", "requests.api.post", "requests.api.put", "requests.api.request"]
+
+[tool.pylint.miscellaneous]
+# List of note tags to take in consideration, separated by a comma.
+notes = ["FIXME", "XXX", "TODO"]
+
+# Regular expression of note tags to take in consideration.
+# notes-rgx =
+
+[tool.pylint.refactoring]
+# Maximum number of nested blocks for function / method body
+max-nested-blocks = 5
+
+# Complete name of functions that never returns. When checking for inconsistent-
+# return-statements if a never returning function is called then it will be
+# considered as an explicit return statement and no message will be printed.
+never-returning-functions = ["sys.exit", "argparse.parse_error"]
+
+# Let 'consider-using-join' be raised when the separator to join on would be non-
+# empty (resulting in expected fixes of the type: ``"- " + " - ".join(items)``)
+suggest-join-with-non-empty-separator = true
+
+[tool.pylint.reports]
+# Python expression which should return a score less than or equal to 10. You
+# have access to the variables 'fatal', 'error', 'warning', 'refactor',
+# 'convention', and 'info' which contain the number of messages in each category,
+# as well as 'statement' which is the total number of statements analyzed. This
+# score is used by the global evaluation report (RP0004).
+evaluation = "10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)"
+
+# Template used to display messages. This is a python new-style format string
+# used to format the message information. See doc for all details.
+# msg-template =
+
+# Set the output format. Available formats are: text, parseable, colorized, json2
+# (improved json format), json (old json format) and msvs (visual studio). You
+# can also give a reporter class, e.g. mypackage.mymodule.MyReporterClass.
+# output-format =
+
+# Tells whether to display a full report or only the messages.
+# reports =
+
+# Activate the evaluation score.
+score = true
+
+[tool.pylint.similarities]
+# Comments are removed from the similarity computation
+ignore-comments = true
+
+# Docstrings are removed from the similarity computation
+ignore-docstrings = true
+
+# Imports are removed from the similarity computation
+# ignore-imports =
+
+# Signatures are removed from the similarity computation
+# ignore-signatures =
+
+# Minimum lines number of a similarity.
+min-similarity-lines = 4
+
+[tool.pylint.spelling]
+# Limits count of emitted suggestions for spelling mistakes.
+max-spelling-suggestions = 4
+
+# Spelling dictionary name. No available dictionaries : You need to install both
+# the python package and the system dependency for enchant to work.
+# spelling-dict =
+
+# List of comma separated words that should be considered directives if they
+# appear at the beginning of a comment and should not be checked.
+spelling-ignore-comment-directives = "fmt: on,fmt: off,noqa:,noqa,nosec,isort:skip,mypy:"
+
+# List of comma separated words that should not be checked.
+# spelling-ignore-words =
+
+# A path to a file that contains the private dictionary; one word per line.
+# spelling-private-dict-file =
+
+# Tells whether to store unknown words to the private dictionary (see the
+# --spelling-private-dict-file option) instead of raising a message.
+# spelling-store-unknown-words =
+
+[tool.pylint.typecheck]
+# List of decorators that produce context managers, such as
+# contextlib.contextmanager. Add to this list to register other decorators that
+# produce valid context managers.
+contextmanager-decorators = ["contextlib.contextmanager"]
+
+# List of members which are set dynamically and missed by pylint inference
+# system, and so shouldn't trigger E1101 when accessed. Python regular
+# expressions are accepted.
+# generated-members =
+
+# Tells whether missing members accessed in mixin class should be ignored. A
+# class is considered mixin if its name matches the mixin-class-rgx option.
+# Tells whether to warn about missing members when the owner of the attribute is
+# inferred to be None.
+ignore-none = true
+# This flag controls whether pylint should warn about no-member and similar
+# checks whenever an opaque object is returned when inferring. The inference can
+# return multiple potential results while evaluating a Python object, but some
+# branches might not be evaluated, which results in partial inference. In that
+# case, it might be useful to still emit no-member and other checks for the rest
+# of the inferred objects.
+ignore-on-opaque-inference = true
+
+# List of symbolic message names to ignore for Mixin members.
+ignored-checks-for-mixins = ["no-member", "not-async-context-manager", "not-context-manager", "attribute-defined-outside-init"]
+
+# List of class names for which member attributes should not be checked (useful
+# for classes with dynamically set attributes). This supports the use of
+# qualified names.
+ignored-classes = ["optparse.Values", "thread._local", "_thread._local", "aidge.global_variables", "aidge.cells.abstract_cell.Trainable", "torch", "tensorflow"]
+
+# Show a hint with possible names when a member name was not found. The aspect of
+# finding the hint is based on edit distance.
+missing-member-hint = true
+
+# The minimum edit distance a name should have in order to be considered a
+# similar match for a missing member name.
+missing-member-hint-distance = 1
+
+# The total number of similar names that should be taken in consideration when
+# showing a hint for a missing member.
+missing-member-max-choices = 1
+
+# Regex pattern to define which classes are considered mixins.
+mixin-class-rgx = ".*[Mm]ixin"
+
+# List of decorators that change the signature of a decorated function.
+# signature-mutators =
+
+[tool.pylint.variables]
+# List of additional names supposed to be defined in builtins. Remember that you
+# should avoid defining new builtins when possible.
+# additional-builtins =
+
+# Tells whether unused global variables should be treated as a violation.
+allow-global-unused-variables = true
+
+# List of names allowed to shadow builtins
+# allowed-redefined-builtins =
+
+# List of strings which can identify a callback function by name. A callback name
+# must start or end with one of those strings.
+callbacks = ["cb_", "_cb"]
+
+# A regular expression matching the name of dummy variables (i.e. expected to not
+# be used).
+dummy-variables-rgx = "_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_"
+
+# Argument names that match this expression will be ignored.
+ignored-argument-names = "_.*|^ignored_|^unused_"
+
+# Tells whether we should check for unused import in __init__ files.
+# init-import =
+
+# List of qualified module names which can have objects that can redefine
+# builtins.
+redefining-builtins-modules = ["six.moves", "past.builtins", "future.builtins", "builtins", "io"]
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index 24ce15ab7ead32f98c7ac3edcd34bb2010ff4326..0000000000000000000000000000000000000000
--- a/requirements.txt
+++ /dev/null
@@ -1 +0,0 @@
-numpy
diff --git a/setup.py b/setup.py
index 80500f3165dd87eb7b6dd73c78b89806cc8a874a..22cbd9732c8b9e1099c3e322032e8377f6d4506b 100644
--- a/setup.py
+++ b/setup.py
@@ -1,51 +1,39 @@
-#!/usr/bin/env python3
-""" Aidge
-
-#TODO To change
-POC of the next framework named Aidge
-"""
-
-DOCLINES = (__doc__ or '').split("\n")
-
+#! /usr/bin/env python3
 import sys
 import os
 
-# Python supported version checks
-if sys.version_info[:2] < (3, 7):
-    raise RuntimeError("Python version >= 3.7 required.")
-
-
-CLASSIFIERS = """\
-Development Status :: 2 - Pre-Alpha
-"""
-
 import shutil
 import pathlib
-import subprocess
 import multiprocessing
 
 from math import ceil
 
 from setuptools import setup, Extension
-from setuptools import find_packages
 from setuptools.command.build_ext import build_ext
 
-def get_project_name() -> str:
-    return open(pathlib.Path().absolute() / "project_name.txt", "r").read()
 
-def get_project_version() -> str:
-    aidge_root = pathlib.Path().absolute()
-    version = open(aidge_root / "version.txt", "r").read().strip()
-    return version
+PROJECT_NAME = "aidge_backend_cpu"
 
+SETUP_DIR = pathlib.Path(__file__).parent
 
-class CMakeExtension(Extension):
+class AidgeBuildExtension(Extension):
     def __init__(self, name):
         super().__init__(name, sources=[])
 
-class CMakeBuild(build_ext):
+
+class AidgePkgBuild(build_ext):
+    def __init__(self, dist, *args, **kwargs):
+        super().__init__(dist, *args, **kwargs)
+        # Detect editable_mode for old versions of setuptools
+        if not hasattr(self, "editable_mode"):
+            if hasattr(dist, "commands"):
+                self.editable_mode = "develop" in dist.commands
+            else:
+                self.editable_mode = False
 
     def run(self):
+        ####################################
+        # BUILD PACKAGE
         # This lists the number of processors available on the machine
         # The compilation will use half of them
         max_jobs = str(ceil(multiprocessing.cpu_count() / 2))
@@ -60,55 +48,54 @@ class CMakeBuild(build_ext):
         if not build_lib.exists():
             build_lib.mkdir(parents=True, exist_ok=True)
 
-        os.chdir(str(build_temp))
+        package_prefix = build_lib if not self.editable_mode else SETUP_DIR
+        pybind_install_prefix = (package_prefix / PROJECT_NAME).absolute()
 
-        # Impose to use the executable of the python
-        # used to launch setup.py to setup PythonInterp
-        param_py = "-DPYTHON_EXECUTABLE=" + sys.executable
+        os.chdir(str(build_temp))
 
-        compile_type = 'Debug'
-        install_path = os.path.join(sys.prefix, "lib", "libAidge")  if "AIDGE_INSTALL" not in os.environ else os.environ["AIDGE_INSTALL"]
+        compile_type = os.environ.get("AIDGE_PYTHON_BUILD_TYPE", "Release")
+        install_path = (
+            os.path.join(sys.prefix, "lib", "libAidge")
+            if "AIDGE_INSTALL" not in os.environ
+            else os.environ["AIDGE_INSTALL"]
+        )
+        build_gen = os.environ.get("AIDGE_BUILD_GEN", "")
+        build_gen_opts = (
+            ["-G", build_gen]
+            if build_gen
+            else []
+        )
+        test_onoff = os.environ.get("AIDGE_BUILD_TEST", "OFF")
+
+        self.spawn(
+            [
+                "cmake",
+                *build_gen_opts,
+                str(cwd),
+                f"-DTEST={test_onoff}",
+                f"-DCMAKE_INSTALL_PREFIX:PATH={install_path}",
+                f"-DCMAKE_BUILD_TYPE={compile_type}",
+                "-DPYBIND=ON",
+                f"-DPYBIND_INSTALL_PREFIX:PATH={pybind_install_prefix}",
+                "-DCMAKE_EXPORT_COMPILE_COMMANDS=ON",
+                "-DCOVERAGE=OFF",
+            ]
+        )
 
-        self.spawn(['cmake', str(cwd), param_py, '-DTEST=OFF', f'-DCMAKE_INSTALL_PREFIX:PATH={install_path}', f'-DCMAKE_BUILD_TYPE={compile_type}'])
         if not self.dry_run:
-            self.spawn(['cmake', '--build', '.', '--config', compile_type, '-j', max_jobs])
-            self.spawn(['cmake', '--install', '.', '--config', compile_type])
+            self.spawn(
+                ["cmake", "--build", ".", "--config", compile_type, "-j", max_jobs]
+            )
+            self.spawn(["cmake", "--install", ".", "--config", compile_type])
         os.chdir(str(cwd))
 
-        aidge_package = build_lib / (get_project_name())
-
-        # Get "aidge core" package
-        # ext_lib = build_temp
-        print(build_temp.absolute())
-        # Copy all shared object files from build_temp/lib to aidge_package
-        for root, _, files in os.walk(build_temp.absolute()):
-            for file in files:
-                if (file.endswith('.so') or file.endswith('.pyd')) and (root != str(aidge_package.absolute())):
-                    currentFile=os.path.join(root, file)
-                    shutil.copy(currentFile, str(aidge_package.absolute()))
-
-        # Copy version.txt in aidge_package
-        os.chdir(os.path.dirname(__file__))
-        shutil.copy("version.txt", str(aidge_package.absolute()))
-
-
-if __name__ == '__main__':
 
+if __name__ == "__main__":
     setup(
-        name=get_project_name(),
-        version=get_project_version(),
-        python_requires='>=3.7',
-        description=DOCLINES[0],
-        long_description_content_type="text/markdown",
-        long_description="\n".join(DOCLINES[2:]),
-        classifiers=[c for c in CLASSIFIERS.split('\n') if c],
-        packages=find_packages(where="."),
         include_package_data=True,
-        ext_modules=[CMakeExtension(get_project_name())],
+        ext_modules=[AidgeBuildExtension(PROJECT_NAME)],
         cmdclass={
-            'build_ext': CMakeBuild,
+            "build_ext": AidgePkgBuild,
         },
-        install_requires=['aidge_core'],
         zip_safe=False,
-
     )
diff --git a/src/operator/AbsImpl.cpp b/src/operator/AbsImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..130d6cf7a64e1e75b8ef128974101a477f802caf
--- /dev/null
+++ b/src/operator/AbsImpl.cpp
@@ -0,0 +1,40 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include "aidge/backend/cpu/operator/AbsImpl.hpp"
+
+#include <memory>
+#include <vector>
+
+#include "aidge/backend/cpu/operator/AbsImpl_kernels.hpp"
+#include "aidge/data/Tensor.hpp"
+#include "aidge/operator/Abs.hpp"
+#include "aidge/utils/Types.h"
+
+template <>
+void Aidge::AbsImpl_cpu::forward() {
+    const Abs_Op& op = static_cast<const Abs_Op&>(mOp);
+
+    // Find the correct kernel type
+    const auto impl = Registrar<AbsImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+
+    // Call kernel
+    impl.forward(
+        op.getInput(0)->size(),
+        op.getInput(0)->getImpl()->rawPtr(),
+        op.getOutput(0)->getImpl()->rawPtr()
+    );
+}
+
+template <>
+void Aidge::AbsImpl_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Abs_Op on backend cpu");
+}
diff --git a/src/operator/AddImpl.cpp b/src/operator/AddImpl.cpp
index d6d75a608e4da7d8b9ed8a28912ff2eb1751e042..457a0b17e531fac35ff873f9eedca7bbbe82d459 100644
--- a/src/operator/AddImpl.cpp
+++ b/src/operator/AddImpl.cpp
@@ -16,69 +16,57 @@
 #include <vector>
 
 #include "aidge/backend/cpu/data/GetCPUPtr.h"
-#include "aidge/backend/cpu/operator/AddImpl_forward_kernels.hpp"
+#include "aidge/backend/cpu/operator/AddImpl_kernels.hpp"
 #include "aidge/data/Data.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/utils/Types.h"
 #include "aidge/utils/ErrorHandling.hpp"
 
-Aidge::Elts_t  Aidge::AddImpl_cpu::getNbRequiredProtected(const Aidge::IOIndex_t /*inputIdx*/) const {
-    // this implementation can be in-place
-    return Elts_t::DataElts(0);
-}
-
+template <>
 void  Aidge::AddImpl_cpu::forward() {
-    const auto& opTensor = static_cast<const OperatorTensor&>(mOp);
-    AIDGE_ASSERT(opTensor.getInput(0)->hasImpl(), "cannot run Add forward because the 0-th input has no implementation.");
-    assert(opTensor.getInput(0) && "missing input in Add operator");
-    DataType datatypeFirstInput = opTensor.getInput(0)->dataType();
-    for (IOIndex_t i = 1; i < opTensor.nbInputs(); ++i) {
-        AIDGE_ASSERT(opTensor.getInput(i)->hasImpl(), "cannot run Add forward because the {}-th input has no implementation.", i);
-        assert(opTensor.getInput(i) && "missing input in Add operator");
-        assert(opTensor.getInput(i)->dataType() == datatypeFirstInput);
+    const Add_Op& op = static_cast<const Add_Op&>(mOp);
+    // Check inputs
+    AIDGE_ASSERT(op.getInput(0), "missing input in Add operator");
+    AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run Add forward because the 0-th input has no implementation.");
+    DataType datatypeFirstInput = op.getInput(0)->dataType();
+    for (IOIndex_t i = 1; i < op.nbInputs(); ++i) {
+        AIDGE_ASSERT(op.getInput(i), "missing input in Add operator");
+        AIDGE_ASSERT(op.getInput(i)->hasImpl(), "cannot run Add forward because the {}-th input has no implementation.", i);
+        AIDGE_ASSERT(op.getInput(i)->dataType() == datatypeFirstInput, "Cannot add inputs with two different data types.");
     }
 
     // Find the correct kernel type
-    const auto outputDataType = opTensor.getOutput(0)->dataType();
-    const Registrar<AddImplForward_cpu>::registrar_key registrarKey = {
-        datatypeFirstInput,
-        outputDataType};
-
-    Registrar<AddImplForward_cpu>::registrar_type kernelFunc;
-    if (Registrar<AddImplForward_cpu>::exists(registrarKey)) {
-        // One exists with the right inputs/output types
-        kernelFunc = Registrar<AddImplForward_cpu>::create(registrarKey);
-    }
-    else {
-        // Otherwise, fallback to the kernel with all types matching output type
-        kernelFunc = Registrar<AddImplForward_cpu>::create({
-            outputDataType, outputDataType});
-    }
+    const auto impl = Registrar<AddImpl_cpu>::create(getBestMatch(getRequiredSpec()));
 
     // Convert input data (no overhead if not needed!)
     // TODO: right now, if needed, memory will be allocated/deallocated at each
     // call to forward(). We might put the following shared_ptr as members of
     // this class to avoid that.
-    const std::size_t nbDims = opTensor.getOutput(0)->nbDims();
+    const std::size_t nbDims = op.getOutput(0)->nbDims();
     std::vector<std::vector<std::size_t>> inputsDims;
     std::vector<const void*> opInputs;
-    std::vector<std::shared_ptr<Tensor>> inputsFallback(opTensor.nbInputs());
-    for (IOIndex_t i = 0; i < opTensor.nbInputs(); ++i) {
+    std::vector<std::shared_ptr<Tensor>> inputsFallback(op.nbInputs());
+    for (IOIndex_t i = 0; i < op.nbInputs(); ++i) {
         std::vector<std::size_t> inputDims(nbDims, 1);
-        auto dims = opTensor.getInput(i)->dims();
+        auto dims = op.getInput(i)->dims();
 		for(std::size_t j=dims.size()-1; j+1>0; --j)
 		{
 			std::size_t idx = nbDims - (dims.size()-j);
 			inputDims[idx] = dims[j];
 		}
         inputsDims.push_back(inputDims);
-        const auto& input = opTensor.getInput(i)->refCastFrom(inputsFallback[i], *opTensor.getOutput(0));
+        const auto& input = op.getInput(i)->refCastFrom(inputsFallback[i], *op.getOutput(0));
         opInputs.push_back(input.getImpl()->rawPtr());
     }
 
-    kernelFunc(opInputs,
+    impl.forward(opInputs,
                inputsDims,
-               opTensor.getOutput(0)->size(),
-               opTensor.getOutput(0)->dims(),
-               getCPUPtr(opTensor.getRawOutput(0)));
+               op.getOutput(0)->size(),
+               op.getOutput(0)->dims(),
+               getCPUPtr(op.getRawOutput(0)));
+}
+
+template <>
+void Aidge::AddImpl_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Add_Op on backend cpu");
 }
diff --git a/src/operator/AndImpl.cpp b/src/operator/AndImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2e0f59769ad86f6e4143ab59d089706e34792244
--- /dev/null
+++ b/src/operator/AndImpl.cpp
@@ -0,0 +1,49 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <cassert>
+#include <chrono>  // std::chrono::milliseconds
+#include <numeric> // std::accumulate
+#include <thread>  // std::this_thread::sleep_for
+#include <vector>
+
+#include "aidge/operator/And.hpp"
+#include "aidge/utils/Types.h"
+#include "aidge/backend/cpu/data/Broadcasting.hpp"
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+
+#include "aidge/backend/cpu/operator/AndImpl.hpp"
+#include "aidge/backend/cpu/operator/AndImpl_kernels.hpp"
+
+template <>
+void Aidge::AndImpl_cpu::forward() {
+    const std::vector<std::size_t> inputDims0 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
+                                                                   std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims());
+    const std::vector<std::size_t> inputDims1 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
+                                                                   std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dims());
+
+
+    // Find the correct kernel type
+    const auto impl = Registrar<AndImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+
+    // Call kernel
+    impl.forward(inputDims0,
+        inputDims1,
+        std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
+        getCPUPtr(mOp.getRawInput(0)),
+        getCPUPtr(mOp.getRawInput(1)),
+        getCPUPtr(mOp.getRawOutput(0)));
+}
+
+template <>
+void Aidge::AndImpl_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for And_Op on backend cpu");
+}
diff --git a/src/operator/ArgMaxImpl.cpp b/src/operator/ArgMaxImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b8fb85a7cd86a788cda69307d5ed8f363619f9f0
--- /dev/null
+++ b/src/operator/ArgMaxImpl.cpp
@@ -0,0 +1,39 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include "aidge/backend/cpu/operator/ArgMaxImpl.hpp"
+
+#include <memory>
+#include <vector>
+
+#include "aidge/utils/Types.h"
+#include "aidge/operator/ArgMax.hpp"
+#include "aidge/backend/cpu/operator/ArgMaxImpl_kernels.hpp"
+
+template <>
+void Aidge::ArgMaxImpl_cpu::forward() {
+    const ArgMax_Op& op_ = dynamic_cast<const ArgMax_Op&>(mOp);
+
+    // Find the correct kernel type
+    const auto impl = Registrar<ArgMaxImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+
+    // Call kernel
+    impl.forward(op_.axis(),
+                op_.selectLastIndex(),
+                op_.getInput(0)->dims(),
+                op_.getInput(0)->getImpl()->rawPtr(),
+                op_.getOutput(0)->getImpl()->rawPtr());
+}
+
+template <>
+void Aidge::ArgMaxImpl_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for ArgMax_Op on backend cpu");
+}
diff --git a/src/operator/AvgPoolingImpl.cpp b/src/operator/AvgPoolingImpl.cpp
index feaa7e67a8d0bc726462aed99e557493d3b8d0c6..01a5e8cf1772161f5cf98d3a8bd52f43ac7a1d0d 100644
--- a/src/operator/AvgPoolingImpl.cpp
+++ b/src/operator/AvgPoolingImpl.cpp
@@ -16,29 +16,29 @@
 #include <vector>
 
 #include "aidge/backend/cpu/data/GetCPUPtr.h"
-#include "aidge/backend/cpu/operator/AvgPoolingImpl_forward_kernels.hpp"
+#include "aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/AvgPooling.hpp"
 #include "aidge/utils/Types.h"
 
-Aidge::Elts_t Aidge::AvgPoolingImpl2D_cpu::getNbRequiredProtected(IOIndex_t /*inputIdx*/) const {
-    // this implementation can be in-place
-    return Elts_t::DataElts(0);
-}
-
+template <>
 void Aidge::AvgPoolingImpl2D_cpu::forward() {
     const auto& op_ = dynamic_cast<const AvgPooling_Op<2>&>(mOp);
     assert(op_.getInput(0) && "missing input #0");
 
     // Find the correct kernel type
-    auto kernelFunc = Registrar<AvgPoolingImpl2DForward_cpu>::create(
-        {op_.getInput(0)->dataType(),
-         op_.getOutput(0)->dataType()});
+    const auto impl = Registrar<AvgPoolingImpl2D_cpu>::create(getBestMatch(getRequiredSpec()));
 
     // Call kernel
-    kernelFunc(op_.strideDims(),
+    impl.forward(op_.strideDims(),
                op_.kernelDims(),
                op_.getInput(0)->template dims<4>(),
                getCPUPtr(op_.getInput(0)),
                getCPUPtr(op_.getOutput(0)));
 }
+
+template <>
+void Aidge::AvgPoolingImpl2D_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for AvgPooling_Op<2> on backend cpu");
+}
+
diff --git a/src/operator/BatchNormImpl.cpp b/src/operator/BatchNormImpl.cpp
index 3046eea9bd241732daf39cce1783b5ee50de01c7..9f1d986e63f14e6038c80054e5e3bc631ec24224 100644
--- a/src/operator/BatchNormImpl.cpp
+++ b/src/operator/BatchNormImpl.cpp
@@ -19,13 +19,9 @@
 #include "aidge/backend/cpu/data/GetCPUPtr.h"
 #include "aidge/operator/BatchNorm.hpp"
 
-#include "aidge/backend/cpu/operator/BatchNormImpl_forward_kernels.hpp"
-
-Aidge::Elts_t Aidge::BatchNormImpl2D_cpu::getNbRequiredProtected(IOIndex_t /*inputIdx*/) const {
-    // this implementation can be in-place
-    return Elts_t::DataElts(0);
-}
+#include "aidge/backend/cpu/operator/BatchNormImpl_kernels.hpp"
 
+template <>
 void Aidge::BatchNormImpl2D_cpu::forward() {
     const auto& op_ = dynamic_cast<const BatchNorm_Op<2>&>(mOp);
     AIDGE_ASSERT(op_.getInput(0), "missing input #0 for BatchNorm Operator");
@@ -35,14 +31,12 @@ void Aidge::BatchNormImpl2D_cpu::forward() {
     AIDGE_ASSERT(op_.getInput(4), "missing input #4 for BatchNorm Operator");
 
     AIDGE_ASSERT(op_.getOutput(0)->nbDims() == 4, "");
+
     // Find the correct kernel type
-    auto kernelFunc =
-            Registrar<BatchNormImpl2DForward_cpu>::create({op_.getInput(0)->dataType(),
-                                                           op_.getInput(1)->dataType(),
-                                                           op_.getOutput(0)->dataType()});
+    const auto impl = Registrar<BatchNormImpl2D_cpu>::create(getBestMatch(getRequiredSpec()));
 
     // Call kernel
-    kernelFunc(op_.epsilon(),
+    impl.forward(op_.epsilon(),
             op_.momentum(),
             op_.getInput(0)->template dims<4>(),
             getCPUPtr(op_.getRawInput(0)),
@@ -53,3 +47,8 @@ void Aidge::BatchNormImpl2D_cpu::forward() {
             getCPUPtr(op_.getRawOutput(0)),
             true);
 }
+
+template <>
+void Aidge::BatchNormImpl2D_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for BatchNorm_Op<2> on backend cpu");
+}
diff --git a/src/operator/BitShiftImpl.cpp b/src/operator/BitShiftImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1e0f79fd29fd140f0b41c64d245b9b240da80028
--- /dev/null
+++ b/src/operator/BitShiftImpl.cpp
@@ -0,0 +1,57 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <cassert>
+#include <chrono>  // std::chrono::milliseconds
+#include <numeric>
+#include <thread>  // std::this_thread::sleep_for
+#include <vector>
+
+
+#include "aidge/utils/Types.h"
+#include "aidge/backend/cpu/data/Broadcasting.hpp"
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+
+#include "aidge/backend/cpu/operator/BitShiftImpl.hpp"
+#include "aidge/backend/cpu/operator/BitShiftImpl_kernels.hpp"
+
+template<>
+void Aidge::BitShiftImpl_cpu::forward() {
+
+    const auto& op_ = dynamic_cast<const BitShift_Op&>(mOp);
+
+
+    const auto impl = Registrar<BitShiftImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+
+
+    const std::vector<std::size_t> inputDims0 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
+                                                                   std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims());
+    const std::vector<std::size_t> inputDims1 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
+                                                                   std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dims());
+
+    BitShift_Op::BitShiftDirection direction = op_.direction();
+
+    // Call kernel
+    impl.forward(
+        direction,
+        inputDims0,
+        inputDims1,
+        std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
+        getCPUPtr(mOp.getRawInput(0)),
+        getCPUPtr(mOp.getRawInput(1)),
+        getCPUPtr(mOp.getRawOutput(0)));
+        
+}
+
+template <>
+void Aidge::BitShiftImpl_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for BitShift_Op on backend cpu");
+}
diff --git a/src/operator/ConstantOfShapeImpl.cpp b/src/operator/ConstantOfShapeImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..16e4b762ba04e5f01bfccf965f6de3650fa2e734
--- /dev/null
+++ b/src/operator/ConstantOfShapeImpl.cpp
@@ -0,0 +1,44 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include "aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp"
+
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "aidge/backend/cpu/operator/ConstantOfShapeImpl_kernels.hpp"
+#include "aidge/data/Data.hpp"
+#include "aidge/data/Tensor.hpp"
+#include "aidge/operator/ConstantOfShape.hpp"
+#include "aidge/utils/ErrorHandling.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+template <>
+void Aidge::ConstantOfShapeImpl_cpu::forward() {
+  const ConstantOfShape_Op &op_ = static_cast<const ConstantOfShape_Op &>(mOp);
+  // Check if input is provided
+  AIDGE_ASSERT(op_.getInput(0), "{} : Missing input 0", __func__);
+
+    // Find the correct kernel type
+    const auto impl = Registrar<ConstantOfShapeImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+
+    // Call kernel
+    impl.forward(op_.getOutput(0)->dims(),
+             op_.value(), 
+             op_.getOutput(0)->getImpl()->rawPtr());
+}
+
+template <>
+void Aidge::ConstantOfShapeImpl_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for ConstantOfShape_Op on backend cpu");
+}
diff --git a/src/operator/ConvDepthWiseImpl.cpp b/src/operator/ConvDepthWiseImpl.cpp
index 591e8a0637d1e52c75193ac1750a210a08815ccc..d86bba8d1abf348eb25e2d9c69d04b5c33a8a176 100644
--- a/src/operator/ConvDepthWiseImpl.cpp
+++ b/src/operator/ConvDepthWiseImpl.cpp
@@ -15,18 +15,13 @@
 #include <vector>
 
 #include "aidge/backend/cpu/data/GetCPUPtr.h"
-#include "aidge/backend/cpu/operator/ConvDepthWiseImpl_forward_kernels.hpp"
+#include "aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/ConvDepthWise.hpp"
 #include "aidge/utils/Log.hpp"
 #include "aidge/utils/Types.h"
 
-
-Aidge::Elts_t Aidge::ConvDepthWiseImpl1D_cpu::getNbRequiredProtected(Aidge::IOIndex_t /*inputIdx*/) const {
-    // this implementation can be in-place
-    return Elts_t::DataElts(0);
-}
-
+template <>
 void Aidge::ConvDepthWiseImpl1D_cpu::forward() {
     const auto& op_ = dynamic_cast<const ConvDepthWise_Op<1>&>(mOp);
 
@@ -36,23 +31,7 @@ void Aidge::ConvDepthWiseImpl1D_cpu::forward() {
     AIDGE_ASSERT((op_.getInput(0)->nbDims() == 3), "support for 4-dimensions tensors only");
 
     // Find the correct kernel type
-    const auto outputDataType = op_.getOutput(0)->dataType();
-    const Registrar<ConvDepthWiseImpl1DForward_cpu>::registrar_key registrarKey = {
-        op_.getInput(0)->dataType(),
-        op_.getInput(1)->dataType(),
-        ((op_.getInput(2)) ? op_.getInput(2)->dataType() : op_.getInput(1)->dataType()),
-        outputDataType};
-
-    Registrar<ConvDepthWiseImpl1DForward_cpu>::registrar_type kernelFunc;
-    if (Registrar<ConvDepthWiseImpl1DForward_cpu>::exists(registrarKey)) {
-        // One exists with the right inputs/output types
-        kernelFunc = Registrar<ConvDepthWiseImpl1DForward_cpu>::create(registrarKey);
-    }
-    else {
-        // Otherwise, fallback to the kernel with all types matching output type
-        kernelFunc = Registrar<ConvDepthWiseImpl1DForward_cpu>::create({
-            outputDataType, outputDataType, outputDataType, outputDataType});
-    }
+    const auto impl = Registrar<ConvDepthWiseImpl1D_cpu>::create(getBestMatch(getRequiredSpec()));
 
     // Convert input data (no overhead if not needed!)
     // TODO: right now, if needed, memory will be allocated/deallocated at each
@@ -64,7 +43,7 @@ void Aidge::ConvDepthWiseImpl1D_cpu::forward() {
     const auto& input2 = (op_.getInput(2)) ? op_.getInput(2)->refCastFrom(input2Fallback, *op_.getOutput(0)) : Tensor();
 
     // Call kernel
-    kernelFunc(op_.strideDims(),
+    impl.forward(op_.strideDims(),
                 op_.dilationDims(),
                 op_.kernelDims(), // Conv attributes
                op_.getInput(0)->template dims<3>(), // input dimensions
@@ -75,11 +54,12 @@ void Aidge::ConvDepthWiseImpl1D_cpu::forward() {
             );
 }
 
-Aidge::Elts_t Aidge::ConvDepthWiseImpl2D_cpu::getNbRequiredProtected(IOIndex_t /*inputIdx*/) const {
-    // this implementation can be in-place
-    return Elts_t::DataElts(0);
+template <>
+void Aidge::ConvDepthWiseImpl1D_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for ConvDepthWise_Op<1> on backend cpu");
 }
 
+template <>
 void Aidge::ConvDepthWiseImpl2D_cpu::forward() {
     const auto& op_ = dynamic_cast<const ConvDepthWise_Op<2>&>(mOp);
 
@@ -90,11 +70,7 @@ void Aidge::ConvDepthWiseImpl2D_cpu::forward() {
     AIDGE_ASSERT((op_.getInput(0)->nbDims() == 4), "support for 4-dimensions tensors only");
 
     // Find the correct kernel type
-    auto kernelFunc = Registrar<ConvDepthWiseImpl2DForward_cpu>::create(
-        {op_.getInput(0)->dataType(),
-        op_.getInput(1)->dataType(),
-        op_.getInput(2)->dataType(),
-        op_.getOutput(0)->dataType()});
+    const auto impl = Registrar<ConvDepthWiseImpl2D_cpu>::create(getBestMatch(getRequiredSpec()));
 
         // Convert input data (no overhead if not needed!)
     // TODO: right now, if needed, memory will be allocated/deallocated at each
@@ -106,7 +82,7 @@ void Aidge::ConvDepthWiseImpl2D_cpu::forward() {
     const auto& input2 = op_.getInput(2) ? op_.getInput(2)->refCastFrom(input2Fallback, *op_.getOutput(0)) : Tensor();
 
     // Call kernel
-    kernelFunc(op_.strideDims(),
+    impl.forward(op_.strideDims(),
             op_.dilationDims(),
             op_.kernelDims(),
             op_.getInput(0)->template dims<4>(),
@@ -115,3 +91,8 @@ void Aidge::ConvDepthWiseImpl2D_cpu::forward() {
             op_.getInput(2) ?  input2.getImpl()->rawPtr() : nullptr,
             getCPUPtr(op_.getRawOutput(0)));
 }
+
+template <>
+void Aidge::ConvDepthWiseImpl2D_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for ConvDepthWise_Op<2> on backend cpu");
+}
diff --git a/src/operator/ConvImpl.cpp b/src/operator/ConvImpl.cpp
index 0be31befe2019d70b628db878443f14b1d622f1c..fdfe19fbf4bf3e71c86aa28b966cfb21a1b5ba40 100644
--- a/src/operator/ConvImpl.cpp
+++ b/src/operator/ConvImpl.cpp
@@ -10,6 +10,7 @@
  ********************************************************************************/
 
 #include "aidge/backend/cpu/operator/ConvImpl.hpp"
+#include "aidge/backend/cpu/operator/ConvImpl_kernels.hpp"
 
 #include <cassert>
 #include <chrono>  // std::chrono::milliseconds
@@ -18,40 +19,19 @@
 #include <vector>
 
 #include "aidge/backend/cpu/data/GetCPUPtr.h"
-#include "aidge/backend/cpu/operator/ConvImpl_forward_kernels.hpp"
 #include "aidge/operator/Conv.hpp"
 #include "aidge/utils/Types.h"
 
-Aidge::Elts_t Aidge::ConvImpl1D_cpu::getNbRequiredProtected(IOIndex_t /*inputIdx*/) const {
-    // this implementation can be in-place
-    return Elts_t::DataElts(0);
-}
-
+template <>
 void Aidge::ConvImpl1D_cpu::forward() {
     const auto& op_ = static_cast<const Conv_Op<1>&>(mOp);
 
     // FIXME: uncomment the following code once memory handling will work
-AIDGE_ASSERT(op_.getInput(0), "missing input #0 in Conv Operator.");
+    AIDGE_ASSERT(op_.getInput(0), "missing input #0 in Conv Operator.");
     AIDGE_ASSERT(op_.getInput(1), "missing input #1 in Conv Operator.");
 
     // Find the correct kernel type
-    const auto outputDataType = op_.getOutput(0)->dataType();
-    const Registrar<ConvImpl1DForward_cpu>::registrar_key registrarKey = {
-        op_.getInput(0)->dataType(),
-        op_.getInput(1)->dataType(),
-        (op_.getInput(2) ? op_.getInput(2)->dataType() : op_.getInput(1)->dataType()),
-        outputDataType};
-
-    Registrar<ConvImpl1DForward_cpu>::registrar_type kernelFunc;
-    if (Registrar<ConvImpl1DForward_cpu>::exists(registrarKey)) {
-        // One exists with the right inputs/output types
-        kernelFunc = Registrar<ConvImpl1DForward_cpu>::create(registrarKey);
-    }
-    else {
-        // Otherwise, fallback to the kernel with all types matching output type
-        kernelFunc = Registrar<ConvImpl1DForward_cpu>::create({
-            outputDataType, outputDataType, outputDataType, outputDataType});
-    }
+    const auto impl = Registrar<ConvImpl1D_cpu>::create(getBestMatch(getRequiredSpec()));
 
     // Convert input data (no overhead if not needed!)
     // TODO: right now, if needed, memory will be allocated/deallocated at each
@@ -63,11 +43,11 @@ AIDGE_ASSERT(op_.getInput(0), "missing input #0 in Conv Operator.");
     const auto& input2 = (op_.getInput(2)) ? op_.getInput(2)->refCastFrom(input2Fallback, *op_.getOutput(0)) : Tensor();
 
     // Call kernel
-    kernelFunc(op_.strideDims(),
+    impl.forward(op_.strideDims(),
             op_.dilationDims(),
             op_.kernelDims(),
             op_.getInput(0)->template dims<3>(), // input dimensions
-            dynamic_cast<const Conv_Op<2>&>(mOp).outChannels(), // outChannels
+            dynamic_cast<const Conv_Op<1>&>(mOp).outChannels(), // outChannels
             input0.getImpl()->rawPtr(), // input
             input1.getImpl()->rawPtr(), // weight
             op_.getInput(2) ? input2.getImpl()->rawPtr() : nullptr, // bias
@@ -75,11 +55,12 @@ AIDGE_ASSERT(op_.getInput(0), "missing input #0 in Conv Operator.");
             );
 }
 
-Aidge::Elts_t Aidge::ConvImpl2D_cpu::getNbRequiredProtected(IOIndex_t /*inputIdx*/) const {
-    // this implementation can be in-place
-    return Elts_t::DataElts(0);
+template <>
+void Aidge::ConvImpl1D_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Conv_Op<1> on backend cpu");
 }
 
+template <>
 void Aidge::ConvImpl2D_cpu::forward() {
     const auto& op_ = dynamic_cast<const Conv_Op<2>&>(mOp);
 
@@ -88,23 +69,7 @@ void Aidge::ConvImpl2D_cpu::forward() {
     AIDGE_ASSERT(op_.getInput(1), "missing input #1 in Conv Operator.");
 
     // Find the correct kernel type
-    const auto outputDataType = op_.getOutput(0)->dataType();
-    const Registrar<ConvImpl2DForward_cpu>::registrar_key registrarKey = {
-        op_.getInput(0)->dataType(),
-        op_.getInput(1)->dataType(),
-        (op_.getInput(2) ? op_.getInput(2)->dataType() : op_.getInput(1)->dataType()),
-        outputDataType};
-
-    Registrar<ConvImpl2DForward_cpu>::registrar_type kernelFunc;
-    if (Registrar<ConvImpl2DForward_cpu>::exists(registrarKey)) {
-        // One exists with the right inputs/output types
-        kernelFunc = Registrar<ConvImpl2DForward_cpu>::create(registrarKey);
-    }
-    else {
-        // Otherwise, fallback to the kernel with all types matching output type
-        kernelFunc = Registrar<ConvImpl2DForward_cpu>::create({
-            outputDataType, outputDataType, outputDataType, outputDataType});
-    }
+    const auto impl = Registrar<ConvImpl2D_cpu>::create(getBestMatch(getRequiredSpec()));
 
     // Convert input data (no overhead if not needed!)
     // TODO: right now, if needed, memory will be allocated/deallocated at each
@@ -116,7 +81,7 @@ void Aidge::ConvImpl2D_cpu::forward() {
     const auto& input2 = (op_.getInput(2)) ? op_.getInput(2)->refCastFrom(input2Fallback, *op_.getOutput(0)) : Tensor();
 
     // Call kernel
-    kernelFunc(op_.strideDims(),
+    impl.forward(op_.strideDims(),
             op_.dilationDims(),
             op_.kernelDims(),
             op_.getInput(0)->template dims<4>(), // input dimensions
@@ -127,3 +92,8 @@ void Aidge::ConvImpl2D_cpu::forward() {
             getCPUPtr(mOp.getRawOutput(0)) // output
             );
 }
+
+template <>
+void Aidge::ConvImpl2D_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Conv_Op<2> on backend cpu");
+}
diff --git a/src/operator/DivImpl.cpp b/src/operator/DivImpl.cpp
index cfd74be45b29852c89e4a27035ce2d38fc7266cc..135b32b5005a961e55910e758f9b7102ca51b63c 100644
--- a/src/operator/DivImpl.cpp
+++ b/src/operator/DivImpl.cpp
@@ -15,15 +15,11 @@
 #include "aidge/backend/cpu/data/Broadcasting.hpp"
 #include "aidge/backend/cpu/data/GetCPUPtr.h"
 #include "aidge/backend/cpu/operator/DivImpl.hpp"
-#include "aidge/backend/cpu/operator/DivImpl_forward_kernels.hpp"
+#include "aidge/backend/cpu/operator/DivImpl_kernels.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/utils/Types.h"
 
-Aidge::Elts_t Aidge::DivImpl_cpu::getNbRequiredProtected(const Aidge::IOIndex_t /*inputIdx*/) const {
-    // this implementation can be in-place
-    return Elts_t::DataElts(0);
-}
-
+template <>
 void Aidge::DivImpl_cpu::forward() {
     // Find the correct kernel type
     // auto kernelFunc = Registrar<DivImplForward_cpu>::create({
@@ -60,10 +56,7 @@ void Aidge::DivImpl_cpu::forward() {
     const auto& opTensor = static_cast<const Div_Op&>(mOp);
 
     // Find the correct kernel type
-    auto kernelFunc = Registrar<DivImplForward_cpu>::create({
-        opTensor.getInput(0)->dataType(),
-        opTensor.getInput(1)->dataType(),
-        opTensor.getOutput(0)->dataType()});
+    const auto impl = Registrar<DivImpl_cpu>::create(getBestMatch(getRequiredSpec()));
 
     // Compute compatible input dimensions
     std::vector<std::size_t>        dims0   = opTensor.getInput(0)->dims();
@@ -73,7 +66,7 @@ void Aidge::DivImpl_cpu::forward() {
     // special case for equal dimensions, the kernel is called with the entire arrays at once
     if (dims0 == dims1) {
         const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
-        kernelFunc(input0_contiguous_size, input0_contiguous_size, input0_contiguous_size,
+        impl.forward(input0_contiguous_size, input0_contiguous_size, input0_contiguous_size,
                     getCPUPtr(mOp.getRawInput(0)),
                     getCPUPtr(mOp.getRawInput(1)),
                     getCPUPtr(mOp.getRawOutput(0)));
@@ -139,7 +132,7 @@ void Aidge::DivImpl_cpu::forward() {
     std::size_t dim = contiguousIdx - 1;
     const std::size_t nbStacks = std::accumulate(outDims.cbegin(), outDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>());
     for (std::size_t stack = 0; stack < nbStacks;) {
-        kernelFunc(input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
+        impl.forward(input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
                     getCPUPtr(mOp.getRawInput(0), offsetIn0*input0_contiguous_size),
                     getCPUPtr(mOp.getRawInput(1), offsetIn1*input1_contiguous_size),
                     getCPUPtr(mOp.getRawOutput(0), offsetOut*output_contiguous_size));
@@ -156,3 +149,8 @@ void Aidge::DivImpl_cpu::forward() {
         }
     }
 }
+
+template <>
+void Aidge::DivImpl_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Div_Op on backend cpu");
+}
diff --git a/src/operator/ErfImpl.cpp b/src/operator/ErfImpl.cpp
index ace098468c05b80c4116e6f85d00b5fabaf754cd..42c6ce878abe227f74d7df4a9bf31ebc4c63eb88 100644
--- a/src/operator/ErfImpl.cpp
+++ b/src/operator/ErfImpl.cpp
@@ -14,29 +14,27 @@
 #include <memory>
 #include <vector>
 
-#include "aidge/backend/cpu/operator/ErfImpl_forward_kernels.hpp"
+#include "aidge/backend/cpu/operator/ErfImpl_kernels.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/Erf.hpp"
 #include "aidge/utils/Types.h"
 
-Aidge::Elts_t Aidge::ErfImpl_cpu::getNbRequiredProtected(const Aidge::IOIndex_t /*inputIdx*/) const {
-    // this implementation can be in-place
-    return Elts_t::DataElts(0);
-}
-
+template <>
 void Aidge::ErfImpl_cpu::forward() {
     const Erf_Op& op = static_cast<const Erf_Op&>(mOp);
 
     // Find the correct kernel type
-    auto kernelFunc = Registrar<ErfImplForward_cpu>::create({
-                            op.getInput(0)->dataType(),
-                            op.getOutput(0)->dataType()
-                        });
+    const auto impl = Registrar<ErfImpl_cpu>::create(getBestMatch(getRequiredSpec()));
 
     // Call kernel
-    kernelFunc(
+    impl.forward(
         op.getInput(0)->size(),
         op.getInput(0)->getImpl()->rawPtr(),
         op.getOutput(0)->getImpl()->rawPtr()
     );
 }
+
+template <>
+void Aidge::ErfImpl_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Erf_Op on backend cpu");
+}
diff --git a/src/operator/FCImpl.cpp b/src/operator/FCImpl.cpp
index f7eebb7b21512fb3b388b6927409fba9a1d92b34..359452712f94be078122266089cc1da89baf50d5 100644
--- a/src/operator/FCImpl.cpp
+++ b/src/operator/FCImpl.cpp
@@ -17,37 +17,20 @@
 #include <tuple>
 
 #include "aidge/backend/cpu/data/GetCPUPtr.h"
-#include "aidge/backend/cpu/operator/FCImpl_backward_kernels.hpp"
-#include "aidge/backend/cpu/operator/FCImpl_forward_kernels.hpp"
+#include "aidge/backend/cpu/operator/FCImpl_kernels.hpp"
 #include "aidge/operator/FC.hpp"
 #include "aidge/utils/ErrorHandling.hpp"
 #include "aidge/utils/Types.h"
 
 
+template <>
 void Aidge::FCImpl_cpu::forward()
 {
     const FC_Op& op_ = dynamic_cast<const FC_Op&>(mOp);
     AIDGE_ASSERT(op_.getInput(0), "missing input #0");
     AIDGE_ASSERT(op_.getInput(1), "missing input #1");
 
-    // Find the correct kernel type
-    const auto outputDataType = op_.getOutput(0)->dataType();
-    const Registrar<FCImplForward_cpu>::registrar_key registrarKey = {
-        op_.getInput(0)->dataType(),
-        op_.getInput(1)->dataType(),
-        ((op_.getInput(2)) ? op_.getInput(2)->dataType() : op_.getInput(1)->dataType()),
-        outputDataType};
-
-    Registrar<FCImplForward_cpu>::registrar_type kernelFunc;
-    if (Registrar<FCImplForward_cpu>::exists(registrarKey)) {
-        // One exists with the right inputs/output types
-        kernelFunc = Registrar<FCImplForward_cpu>::create(registrarKey);
-    }
-    else {
-        // Otherwise, fallback to the kernel with all types matching output type
-        kernelFunc = Registrar<FCImplForward_cpu>::create({
-            outputDataType, outputDataType, outputDataType, outputDataType});
-    }
+    const auto impl = Registrar<FCImpl_cpu>::create(getBestMatch(getRequiredSpec()));
 
     // Convert input data (no overhead if not needed!)
     // TODO: right now, if needed, memory will be allocated/deallocated at each
@@ -60,7 +43,7 @@ void Aidge::FCImpl_cpu::forward()
 
     // Call kernel
     const auto batchSize = (input0.dims().size() > 1) ? input0.dims()[0] : 1;
-    kernelFunc(batchSize,
+    impl.forward(batchSize,
         input1.dims()[1], // nb input features
         input1.dims()[0], // nb output features
         input0.getImpl()->rawPtr(),
@@ -69,6 +52,7 @@ void Aidge::FCImpl_cpu::forward()
         getCPUPtr(mOp.getRawOutput(0)));
 }
 
+template <>
 void Aidge::FCImpl_cpu::backward()
 {
     const FC_Op& op_ = dynamic_cast<const FC_Op&>(mOp);
@@ -77,23 +61,7 @@ void Aidge::FCImpl_cpu::backward()
     AIDGE_ASSERT(op_.getInput(0)->grad(), "missing input #0 gradient");
     AIDGE_ASSERT(op_.getInput(1)->grad(), "missing input #1 gradient");
 
-    // Find the correct kernel type
-    const Registrar<FCImplBackward_cpu>::registrar_key registrarKey = {
-        fc_grad->dataType(),
-        op_.getInput(1)->grad()->dataType(),
-        (op_.getInput(2)) ? op_.getInput(2)->grad()->dataType() : op_.getInput(1)->grad()->dataType(),
-        op_.getInput(0)->grad()->dataType()};
-
-    Registrar<FCImplBackward_cpu>::registrar_type kernelFunc;
-    if (Registrar<FCImplBackward_cpu>::exists(registrarKey)) {
-        // One exists with the right inputs/output types
-        kernelFunc = Registrar<FCImplBackward_cpu>::create(registrarKey);
-    }
-    else {
-        // Otherwise, fallback to the kernel with all types matching output type
-        kernelFunc = Registrar<FCImplBackward_cpu>::create({
-            fc_grad->dataType(), fc_grad->dataType(), fc_grad->dataType(), fc_grad->dataType()});
-    }
+    const auto impl = Registrar<FCImpl_cpu>::create(getBestMatch(getRequiredSpec()));
 
     // Convert input data (no overhead if not needed!)
     // TODO: right now, if needed, memory will be allocated/deallocated at each
@@ -106,7 +74,7 @@ void Aidge::FCImpl_cpu::backward()
 
     // Call kernel
     const auto batchSize = (input0grad.dims().size() > 1) ? input0grad.dims()[0] : 1;
-    kernelFunc(batchSize,
+    impl.backward(batchSize,
         input1grad.dims()[1], // nb input features
         input1grad.dims()[0], // nb output features
         getCPUPtr(fc_grad),
diff --git a/src/operator/FoldImpl.cpp b/src/operator/FoldImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..10f3d7b50bac9a1fbfc403609bdccb67a79cceac
--- /dev/null
+++ b/src/operator/FoldImpl.cpp
@@ -0,0 +1,46 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <cassert>
+#include <chrono>  // std::chrono::milliseconds
+#include <numeric> // std::accumulate
+#include <thread>  // std::this_thread::sleep_for
+#include <vector>
+
+#include "aidge/utils/Types.h"
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+#include "aidge/operator/Conv.hpp"
+
+#include "aidge/backend/cpu/operator/FoldImpl.hpp"
+#include "aidge/backend/cpu/operator/FoldImpl_kernels.hpp"
+
+template <>
+void Aidge::FoldImpl2D_cpu::forward() {
+    const auto& op_ = static_cast<const Fold_Op<2>&>(mOp);
+    assert(std::static_pointer_cast<Tensor>(mOp.getRawInput(0)) && "missing input #0");
+
+    // Find the correct kernel type
+    const auto impl = Registrar<FoldImpl2D_cpu>::create(getBestMatch(getRequiredSpec()));
+
+    // Call kernel
+    impl.forward(op_.outputDims(),
+                op_.strideDims(),
+                op_.dilationDims(),
+                op_.kernelDims(),
+                std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims(),
+                getCPUPtr(mOp.getRawInput(0)),
+                getCPUPtr(mOp.getRawOutput(0)));
+}
+
+template <>
+void Aidge::FoldImpl2D_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Fold_Op<2> on backend cpu");
+}
diff --git a/src/operator/GlobalAveragePoolingImpl.cpp b/src/operator/GlobalAveragePoolingImpl.cpp
index f7280360a4486fe5db6c4dfdd4c492bbe6ba302b..c53f92e199aee30d55ddafe39b5ef121979acbf7 100644
--- a/src/operator/GlobalAveragePoolingImpl.cpp
+++ b/src/operator/GlobalAveragePoolingImpl.cpp
@@ -15,7 +15,7 @@
 #include <memory>
 #include <vector>
 
-#include "aidge/backend/cpu/operator/GlobalAveragePoolingImpl_forward_kernels.hpp"
+#include "aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp"
 #include "aidge/data/Data.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/GlobalAveragePooling.hpp"
@@ -24,18 +24,23 @@
 #include "aidge/utils/Types.h"
 
 
+template <>
 void Aidge::GlobalAveragePoolingImpl_cpu::forward()
 {
     const GlobalAveragePooling_Op& op_ = static_cast<const GlobalAveragePooling_Op&>(mOp);
     // Check if input is provided
     AIDGE_ASSERT(op_.getInput(0), "missing input 0");
 
-    // Create the forward kernal with the wanted types
-    auto kernelFunc = Registrar<GlobalAveragePoolingImplForward_cpu>::create({op_.getInput(0)->dataType(),
-                                                                              op_.getOutput(0)->dataType()});
+    // Find the correct kernel type
+    const auto impl = Registrar<GlobalAveragePoolingImpl_cpu>::create(getBestMatch(getRequiredSpec()));
 
     // Call kernel
-    kernelFunc(op_.getInput(0)->dims(),
+    impl.forward(op_.getInput(0)->dims(),
                op_.getInput(0)->getImpl()->rawPtr(),
                op_.getOutput(0)->getImpl()->rawPtr());
-}
\ No newline at end of file
+}
+
+template <>
+void Aidge::GlobalAveragePoolingImpl_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for GlobalAveragePooling_Op on backend cpu");
+}
diff --git a/src/operator/GridSampleImpl.cpp b/src/operator/GridSampleImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5b87390fc3de21d5d406d893e4827e80cce06c35
--- /dev/null
+++ b/src/operator/GridSampleImpl.cpp
@@ -0,0 +1,48 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include "aidge/backend/cpu/operator/GridSampleImpl.hpp"
+
+#include <functional>
+#include <vector>
+
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+#include "aidge/backend/cpu/operator/GridSampleImpl_kernels.hpp"
+#include "aidge/operator/GridSample.hpp"
+#include "aidge/utils/Types.h"
+
+template <>
+void Aidge::GridSampleImpl_cpu::forward() {
+    const auto& op_ = static_cast<const GridSample_Op&>(mOp);
+
+    // Find the correct kernel type
+    const auto impl = Registrar<GridSampleImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+
+    // Convert input data (no overhead if not needed!)
+    // TODO: right now, if needed, memory will be allocated/deallocated at each
+    // call to forward(). We might put the following shared_ptr as members of
+    // this class to avoid that.
+    std::shared_ptr<Tensor> input0Fallback, input1Fallback;
+    const auto& input0 = std::make_shared<Tensor>(op_.getInput(0)->refCastFrom(input0Fallback, *op_.getOutput(0)));
+    const auto& input1 = std::make_shared<Tensor>(op_.getInput(1)->refCastFrom(input1Fallback, *op_.getOutput(0)));
+
+    // Call kernel
+    impl.forward(op_,
+            input0, // input
+            input1, // grid
+            op_.getOutput(0) // output
+            );
+}
+
+template <>
+void Aidge::GridSampleImpl_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for GridSample_Op on backend cpu");
+}
diff --git a/src/operator/LeakyReLUImpl.cpp b/src/operator/LeakyReLUImpl.cpp
index 9d4f2a7edcdf263751ec1d9cea10cd4d60055610..6c0802dd967d2a20b34a2f1ca91fc0640c063c83 100644
--- a/src/operator/LeakyReLUImpl.cpp
+++ b/src/operator/LeakyReLUImpl.cpp
@@ -14,20 +14,14 @@
 #include <vector>
 
 #include "aidge/backend/cpu/data/GetCPUPtr.h"
-#include "aidge/backend/cpu/operator/LeakyReLUImpl_forward_kernels.hpp"
-#include "aidge/backend/cpu/operator/LeakyReLUImpl_backward_kernels.hpp"
+#include "aidge/backend/cpu/operator/LeakyReLUImpl_kernels.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/LeakyReLU.hpp"
 #include "aidge/utils/Log.hpp"
 #include "aidge/utils/Types.h"
 #include "aidge/utils/Registrar.hpp"
 
-
-Aidge::Elts_t Aidge::LeakyReLUImpl_cpu::getNbRequiredProtected(const Aidge::IOIndex_t /*inputIdx*/) const {
-    // this implementation can be in-place
-    return Elts_t::DataElts(0);
-}
-
+template <>
 void Aidge::LeakyReLUImpl_cpu::forward() {
     const LeakyReLU_Op& op_ = dynamic_cast<const LeakyReLU_Op&>(mOp);
 
@@ -36,17 +30,16 @@ void Aidge::LeakyReLUImpl_cpu::forward() {
     AIDGE_ASSERT(in0, "missing input #0");
 
     // Find the correct kernel type
-    auto kernelFunc = Registrar<LeakyReLUImplForward_cpu>::create({
-        in0->dataType(),
-        out0->dataType()});
+    const auto impl = Registrar<LeakyReLUImpl_cpu>::create(getBestMatch(getRequiredSpec()));
 
     // Call kernel
-    kernelFunc(op_.negativeSlope(),
+    impl.forward(op_.negativeSlope(),
         in0->size(),
         getCPUPtr(mOp.getRawInput(0)),
         getCPUPtr(mOp.getRawOutput(0)));
 }
 
+template <>
 void Aidge::LeakyReLUImpl_cpu::backward() {
     // reversing in and out Data for backprop
     const LeakyReLU_Op& op_ = dynamic_cast<const LeakyReLU_Op&>(mOp);
@@ -55,12 +48,10 @@ void Aidge::LeakyReLUImpl_cpu::backward() {
     AIDGE_ASSERT(in0, "missing input #0");
 
     // Find the correct kernel type
-    auto kernelFunc = Registrar<LeakyReLUImplForward_cpu>::create({
-        in0->dataType(),
-        out0->dataType()});
+    const auto impl = Registrar<LeakyReLUImpl_cpu>::create(getBestMatch(getRequiredSpec()));
 
     // Call kernel
-    kernelFunc(op_.negativeSlope(),
+    impl.backward(op_.negativeSlope(),
         in0->size(),
         getCPUPtr(in0),
         getCPUPtr(out0));
diff --git a/src/operator/LnImpl.cpp b/src/operator/LnImpl.cpp
index 12885a944be46a977463e900af4047319bb1c8b2..79df733963ea8826439530d3adccde6affc9dfa8 100644
--- a/src/operator/LnImpl.cpp
+++ b/src/operator/LnImpl.cpp
@@ -20,14 +20,9 @@
 #include "aidge/backend/cpu/data/GetCPUPtr.h"
 
 #include "aidge/backend/cpu/operator/LnImpl.hpp"
-#include "aidge/backend/cpu/operator/LnImpl_forward_kernels.hpp"
-#include "aidge/backend/cpu/operator/LnImpl_backward_kernels.hpp"
-
-Aidge::Elts_t Aidge::LnImpl_cpu::getNbRequiredProtected(const Aidge::IOIndex_t /*inputIdx*/) const {
-    // this implementation can be in-place
-    return Elts_t::DataElts(0);
-}
+#include "aidge/backend/cpu/operator/LnImpl_kernels.hpp"
 
+template <>
 void Aidge::LnImpl_cpu::forward() {
     const Ln_Op& op_ = static_cast<const Ln_Op&>(mOp);
 	std::shared_ptr<Tensor> in0 = op_.getInput(0);
@@ -35,16 +30,15 @@ void Aidge::LnImpl_cpu::forward() {
     AIDGE_ASSERT(in0, "missing input #0");
 
     // Find the correct kernel type
-    auto kernelFunc = Registrar<LnImplForward_cpu>::create({
-        in0->dataType(),
-	    out0->dataType()});
+    const auto impl = Registrar<LnImpl_cpu>::create(getBestMatch(getRequiredSpec()));
 
     // Call kernel
-    kernelFunc(in0->size(),
+    impl.forward(in0->size(),
         getCPUPtr(mOp.getRawInput(0)),
         getCPUPtr(mOp.getRawOutput(0)));
 }
 
+template <>
 void Aidge::LnImpl_cpu::backward() {
     const Ln_Op& op_ = dynamic_cast<const Ln_Op&>(mOp);
 	std::shared_ptr<Tensor> in0  = op_.getInput(0);
@@ -54,12 +48,8 @@ void Aidge::LnImpl_cpu::backward() {
     AIDGE_ASSERT(out0, "missing output #0 for current {} operator", op_.type());
 
     // Find the correct kernel type
-    auto kernelFunc = Registrar<LnImplBackward_cpu>::create({
-        in0->dataType(),
-	    gra_int0->dataType(),
-        gra_out0->dataType()        
-    });
+    const auto impl = Registrar<LnImpl_cpu>::create(getBestMatch(getRequiredSpec()));
 
     // Call kernel
-    kernelFunc(gra_int0->size(), getCPUPtr(in0), getCPUPtr(gra_out0), getCPUPtr(gra_int0));
+    impl.backward(gra_int0->size(), getCPUPtr(in0), getCPUPtr(gra_out0), getCPUPtr(gra_int0));
 }
diff --git a/src/operator/MatMulImpl.cpp b/src/operator/MatMulImpl.cpp
index e716726886225f703e7cf482d0bfcfb9ec733948..ccd3265ed230e4f9cdc5ad85785a6473d9f131f0 100644
--- a/src/operator/MatMulImpl.cpp
+++ b/src/operator/MatMulImpl.cpp
@@ -19,17 +19,16 @@
 #include "aidge/utils/Types.h"
 
 #include "aidge/backend/cpu/operator/MatMulImpl.hpp"
-#include "aidge/backend/cpu/operator/MatMulImpl_forward_kernels.hpp"
+#include "aidge/backend/cpu/operator/MatMulImpl_kernels.hpp"
 
+template <>
 void Aidge::MatMulImpl_cpu::forward()
 {
     assert(std::static_pointer_cast<Tensor>(mOp.getRawInput(0)) && "missing input #0");
     assert(std::static_pointer_cast<Tensor>(mOp.getRawInput(1)) && "missing input #1");
 
     // Find the correct kernel type
-    auto kernelFunc = Registrar<MatMulImplForward_cpu>::create(
-        {std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dataType(),
-         std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()});
+    const auto impl = Registrar<MatMulImpl_cpu>::create(getBestMatch(getRequiredSpec()));
 
     // Compute compatible input dimensions
     std::vector<std::size_t> dims0 = static_cast<const MatMul_Op&>(mOp).getInput(0)->dims();
@@ -91,7 +90,7 @@ void Aidge::MatMulImpl_cpu::forward()
     const std::size_t matrix1Size = k*m;
     const std::size_t matrixOutSize = n*m;
     for (std::size_t stack = 0; stack < nbMatrices;) {
-        kernelFunc(n, k, m,
+        impl.forward(n, k, m,
                     getCPUPtr(mOp.getRawInput(0), offsetIn0*matrix0Size),
                     getCPUPtr(mOp.getRawInput(1), offsetIn1*matrix1Size),
                     getCPUPtr(mOp.getRawOutput(0), offsetOut*matrixOutSize));
@@ -126,3 +125,8 @@ void Aidge::MatMulImpl_cpu::forward()
 //         getCPUPtr(mOp.getRawInput(1)),
 //         getCPUPtr(mOp.getRawOutput(0)));
 // }
+
+template <>
+void Aidge::MatMulImpl_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for MatMul_Op on backend cpu");
+}
diff --git a/src/operator/MaxPoolingImpl.cpp b/src/operator/MaxPoolingImpl.cpp
index 2e6d67abbdd6776a1f75449a0f4562143cbaae87..90075a397be3f082ef95fd4df074c99d926fd385 100644
--- a/src/operator/MaxPoolingImpl.cpp
+++ b/src/operator/MaxPoolingImpl.cpp
@@ -14,32 +14,29 @@
 #include <vector>
 
 #include "aidge/backend/cpu/data/GetCPUPtr.h"
-#include "aidge/backend/cpu/operator/MaxPoolingImpl_forward_kernels.hpp"
+#include "aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp"
 #include "aidge/operator/MaxPooling.hpp"
 #include "aidge/utils/Log.hpp"
 #include "aidge/utils/Types.h"
 
-
-Aidge::Elts_t Aidge::MaxPoolingImpl2D_cpu::getNbRequiredProtected(IOIndex_t /*inputIdx*/) const {
-    // this implementation can be in-place
-    return Elts_t::DataElts(0);
-}
-
+template <>
 void Aidge::MaxPoolingImpl2D_cpu::forward() {
     const auto& op_ = dynamic_cast<const MaxPooling_Op<2>&>(mOp);
     AIDGE_ASSERT(op_.getInput(0), "missing input #0 in MaxPooling Operator.");
 
     // Find the correct kernel type
-    auto kernelFunc = Registrar<MaxPoolingImpl2DForward_cpu>::create({
-        op_.getInput(0)->dataType(),
-        op_.getOutput(0)->dataType()
-    });
+    const auto impl = Registrar<MaxPoolingImpl2D_cpu>::create(getBestMatch(getRequiredSpec()));
 
     // Call kernel
-    kernelFunc(op_.strideDims(),
+    impl.forward(op_.strideDims(),
                 op_.kernelDims(),
                 op_.ceilMode(),
                 op_.getInput(0)->template dims<4>(),
                 getCPUPtr(mOp.getRawInput(0)),
                 getCPUPtr(mOp.getRawOutput(0)));
 }
+
+template <>
+void Aidge::MaxPoolingImpl2D_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for MaxPooling_Op<2> on backend cpu");
+}
diff --git a/src/operator/MulImpl.cpp b/src/operator/MulImpl.cpp
index d7feb9b76e25a0e874b3682cdc5b3e53bf8e9228..ea5e3d3ab8ac24934a0cb6f9042858fa094700af 100644
--- a/src/operator/MulImpl.cpp
+++ b/src/operator/MulImpl.cpp
@@ -21,30 +21,49 @@
 #include "aidge/backend/cpu/data/GetCPUPtr.h"
 
 #include "aidge/backend/cpu/operator/MulImpl.hpp"
-#include "aidge/backend/cpu/operator/MulImpl_forward_kernels.hpp"
-
-Aidge::Elts_t Aidge::MulImpl_cpu::getNbRequiredProtected(const Aidge::IOIndex_t /*inputIdx*/) const {
-    // this implementation can be in-place
-    return Elts_t::DataElts(0);
-}
+#include "aidge/backend/cpu/operator/MulImpl_kernels.hpp"
 
+template <>
 void Aidge::MulImpl_cpu::forward() {
-    // Find the correct kernel type
-    auto kernelFunc = Registrar<MulImplForward_cpu>::create({
-        std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dataType(),
-        std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dataType(),
-        std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()});
-
     const std::vector<std::size_t> inputDims0 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
                                                                    std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims());
     const std::vector<std::size_t> inputDims1 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
                                                                    std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dims());
 
+    // Find the correct kernel type
+    const auto impl = Registrar<MulImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+
     // Call kernel
-    kernelFunc(inputDims0,
+    impl.forward(inputDims0,
         inputDims1,
         std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
         getCPUPtr(mOp.getRawInput(0)),
         getCPUPtr(mOp.getRawInput(1)),
         getCPUPtr(mOp.getRawOutput(0)));
 }
+
+template <>
+void Aidge::MulImpl_cpu::backward() {
+    const Mul_Op& op_ = dynamic_cast<const Mul_Op&>(mOp);
+    
+    auto in0 = op_.getInput(0);
+    auto in1 = op_.getInput(1);
+    auto in0grad = op_.getInput(0)->grad();
+    auto in1grad = op_.getInput(1)->grad();
+    auto out0grad = op_.getOutput(0)->grad();
+
+    // Find the correct kernel type
+    const auto impl = Registrar<MulImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+
+    // Call kernel
+    impl.backward(/* input0Length */ in0grad->size(), 
+               /* input1Length */ in1grad->size(),
+               /* grad0Length  */ out0grad->size(),
+               /* input0Dims   */ in0->dims(),
+               /* input1Dims   */ in1->dims(),
+               getCPUPtr(in0), 
+               getCPUPtr(in1), 
+               getCPUPtr(out0grad), 
+               getCPUPtr(in0grad), 
+               getCPUPtr(in1grad));
+}
diff --git a/src/operator/PadImpl.cpp b/src/operator/PadImpl.cpp
index b4b52d6be855b6a1f8c0a71a6a9169ee9690f34c..cdae21f8ed2757128f6a36b661b0897a4ba65f89 100644
--- a/src/operator/PadImpl.cpp
+++ b/src/operator/PadImpl.cpp
@@ -16,9 +16,9 @@
 #include "aidge/operator/Conv.hpp"
 
 #include "aidge/backend/cpu/operator/PadImpl.hpp"
-#include "aidge/backend/cpu/operator/PadImpl_forward_kernels.hpp"
+#include "aidge/backend/cpu/operator/PadImpl_kernels.hpp"
 
-Aidge::Elts_t Aidge::PadImpl1D_cpu::getNbRequiredProtected(Aidge::IOIndex_t inputIdx) const {
+Aidge::Elts_t Aidge::Pad_ProdConso_cpu::getNbRequiredProtected(Aidge::IOIndex_t inputIdx) const {
     AIDGE_ASSERT(inputIdx == 0, "input index out of range."
         "{} Operator has only one input", mOp.type());
     (void) inputIdx;
@@ -31,17 +31,16 @@ Aidge::Elts_t Aidge::PadImpl1D_cpu::getNbRequiredProtected(Aidge::IOIndex_t inpu
     return Elts_t::DataElts(outputSize - inputSize);
 }
 
+template <>
 void Aidge::PadImpl1D_cpu::forward() {
     const auto& op_ = dynamic_cast<const Pad_Op<1>&>(mOp);
     AIDGE_ASSERT(op_.getInput(0), "missing input #0 in Pad Operator.");
 
     // Find the correct kernel type
-    auto kernelFunc = Registrar<PadImpl1DForward_cpu>::create({
-        op_.getInput(0)->dataType(),
-        op_.getOutput(0)->dataType()});
+    const auto impl = Registrar<PadImpl1D_cpu>::create(getBestMatch(getRequiredSpec()));
 
     // Call kernel
-     kernelFunc(op_.beginEndBorders(),
+    impl.forward(op_.beginEndBorders(),
                 op_.borderType(),
                 op_.borderValue(),
                 op_.getInput(0)->template dims<3>(),
@@ -49,32 +48,29 @@ void Aidge::PadImpl1D_cpu::forward() {
                 getCPUPtr(mOp.getRawOutput(0)));
 }
 
-Aidge::Elts_t Aidge::PadImpl2D_cpu::getNbRequiredProtected(Aidge::IOIndex_t inputIdx) const {
-    AIDGE_ASSERT(inputIdx == 0, "input index out of range."
-        "{} Operator has only one input", mOp.type());
-    (void) inputIdx;
-
-    // Padding cannot be in-place!
-    // We must ensure that we do not override data that has not been consummed yet.
-    const auto inputSize = std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->size();
-    const auto outputSize = std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->size();
-    return Elts_t::DataElts(outputSize - inputSize);
+template <>
+void Aidge::PadImpl1D_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Pad_Op<1> on backend cpu");
 }
 
+template <>
 void Aidge::PadImpl2D_cpu::forward() {
     const auto& op_ = dynamic_cast<const Pad_Op<2>&>(mOp);
     AIDGE_ASSERT(op_.getInput(0), "missing input #0 in Pad Operator.");
 
     // Find the correct kernel type
-    auto kernelFunc = Registrar<PadImpl2DForward_cpu>::create({
-        op_.getInput(0)->dataType(),
-        op_.getOutput(0)->dataType()});
+    const auto impl = Registrar<PadImpl2D_cpu>::create(getBestMatch(getRequiredSpec()));
 
     // Call kernel
-    kernelFunc(op_.beginEndBorders(),
+    impl.forward(op_.beginEndBorders(),
                 op_.borderType(),
                 op_.borderValue(),
                 op_.getInput(0)->template dims<4>(),
                 getCPUPtr(mOp.getRawInput(0)),
                 getCPUPtr(mOp.getRawOutput(0)));
 }
+
+template <>
+void Aidge::PadImpl2D_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Pad_Op<2> on backend cpu");
+}
diff --git a/src/operator/PowImpl.cpp b/src/operator/PowImpl.cpp
index 811d13804cffdd2477fc830f1779b0fb6271eb0b..74a7be71e176ba8e1cb8851050e575d6aa7465df 100644
--- a/src/operator/PowImpl.cpp
+++ b/src/operator/PowImpl.cpp
@@ -21,27 +21,20 @@
 #include "aidge/backend/cpu/data/GetCPUPtr.h"
 
 #include "aidge/backend/cpu/operator/PowImpl.hpp"
-#include "aidge/backend/cpu/operator/PowImpl_forward_kernels.hpp"
-
-Aidge::Elts_t Aidge::PowImpl_cpu::getNbRequiredProtected(const Aidge::IOIndex_t /*inputIdx*/) const {
-    // this implementation can be in-place
-    return Elts_t::DataElts(0);
-}
+#include "aidge/backend/cpu/operator/PowImpl_kernels.hpp"
 
+template <>
 void Aidge::PowImpl_cpu::forward() {
-    // Find the correct kernel type
-    auto kernelFunc = Registrar<PowImplForward_cpu>::create({
-        std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dataType(),
-        std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dataType(),
-        std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()});
-
     const std::vector<std::size_t> inputDims0 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
                                                                    std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims());
     const std::vector<std::size_t> inputDims1 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
                                                                    std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dims());
 
+    // Find the correct kernel type
+    const auto impl = Registrar<PowImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+
     // Call kernel
-    kernelFunc(inputDims0,
+    impl.forward(inputDims0,
         inputDims1,
         std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
         getCPUPtr(mOp.getRawInput(0)),
@@ -49,24 +42,31 @@ void Aidge::PowImpl_cpu::forward() {
         getCPUPtr(mOp.getRawOutput(0)));
 }
 
+template <>
 void Aidge::PowImpl_cpu::backward() {
-    // Find the correct kernel type
     const Pow_Op& op_ = dynamic_cast<const Pow_Op&>(mOp);
-    auto kernelFunc = Registrar<PowImplForward_cpu>::create({
-        op_.getOutput(0)->grad()->dataType(),
-        op_.getInput(0)->grad()->dataType(),
-        op_.getInput(1)->grad()->dataType()});
 
-    const std::vector<std::size_t> input0gradDims = getBroadcastedDims(op_.getInput(0)->grad()->dims(),
-                                                                   op_.getOutput(0)->grad()->dims());
-    const std::vector<std::size_t> input1gradDims = getBroadcastedDims(op_.getInput(1)->grad()->dims(),
-                                                                   op_.getOutput(0)->grad()->dims());
+    auto in0 = op_.getInput(0);
+    auto in1 = op_.getInput(1);
+    auto in0grad = op_.getInput(0)->grad();
+    auto in1grad = op_.getInput(1)->grad();
+    auto out0grad = op_.getOutput(0)->grad();
+
+    const std::vector<std::size_t> input0gradDims = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->grad()->dims(),
+                                                                       std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->grad()->dims());
+    const std::vector<std::size_t> input1gradDims = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->grad()->dims(),
+                                                                       std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->grad()->dims());
+
+    // Find the correct kernel type
+    const auto impl = Registrar<PowImpl_cpu>::create(getBestMatch(getRequiredSpec()));
 
     // Call kernel
-    kernelFunc(op_.getOutput(0)->grad()->dims(),
-               input0gradDims,
-               input1gradDims,
-               getCPUPtr(mOp.getRawOutput(0)),
-               getCPUPtr(mOp.getRawInput(0)),
-               getCPUPtr(mOp.getRawInput(1)));
+    impl.backward(input0gradDims,
+                input1gradDims,
+                out0grad->dims(),
+                getCPUPtr(in0),
+                getCPUPtr(in1),
+                getCPUPtr(out0grad),
+                getCPUPtr(in0grad),
+                getCPUPtr(in1grad));
 }
\ No newline at end of file
diff --git a/src/operator/ReLUImpl.cpp b/src/operator/ReLUImpl.cpp
index 4a0fb9f5d929e2ce731a21b5553e1b9257a32daa..832f91aad347fc081439ec487d06b14b0e2fe8da 100644
--- a/src/operator/ReLUImpl.cpp
+++ b/src/operator/ReLUImpl.cpp
@@ -19,14 +19,9 @@
 #include "aidge/utils/ErrorHandling.hpp"
 
 #include "aidge/backend/cpu/operator/ReLUImpl.hpp"
-#include "aidge/backend/cpu/operator/ReLUImpl_forward_kernels.hpp"
-#include "aidge/backend/cpu/operator/ReLUImpl_backward_kernels.hpp"
-
-Aidge::Elts_t Aidge::ReLUImpl_cpu::getNbRequiredProtected(const Aidge::IOIndex_t /*inputIdx*/) const {
-    // this implementation can be in-place
-    return Elts_t::DataElts(0);
-}
+#include "aidge/backend/cpu/operator/ReLUImpl_kernels.hpp"
 
+template <>
 void Aidge::ReLUImpl_cpu::forward() {
 	const ReLU_Op& op_ = dynamic_cast<const ReLU_Op&>(mOp);
     std::shared_ptr<Tensor> in0 = op_.getInput(0);
@@ -34,16 +29,15 @@ void Aidge::ReLUImpl_cpu::forward() {
     AIDGE_ASSERT(in0, "missing input #0");
 
     // Find the correct kernel type
-    auto kernelFunc = Registrar<ReLUImplForward_cpu>::create({
-        in0->dataType(),
-	    out0->dataType()});
+    const auto impl = Registrar<ReLUImpl_cpu>::create(getBestMatch(getRequiredSpec()));
 
     // Call kernel
-    kernelFunc(in0->size(),
+    impl.forward(in0->size(),
         getCPUPtr(mOp.getRawInput(0)),
         getCPUPtr(mOp.getRawOutput(0)));
 }
 
+template <>
 void Aidge::ReLUImpl_cpu::backward() {
     const ReLU_Op& op_ = dynamic_cast<const ReLU_Op&>(mOp);
     std::shared_ptr<Tensor> in0  = op_.getInput(0);
@@ -53,12 +47,8 @@ void Aidge::ReLUImpl_cpu::backward() {
     AIDGE_ASSERT(out0, "missing output #0 for current {} operator", op_.type());
 
     // Find the correct kernel type
-    auto kernelFunc = Registrar<ReLUImplBackward_cpu>::create({
-	in0->dataType(),
-        gra_int0->dataType(),
-	gra_out0->dataType()
-    });
+    const auto impl = Registrar<ReLUImpl_cpu>::create(getBestMatch(getRequiredSpec()));
 
     // Call kernel
-    kernelFunc(gra_int0->size(), getCPUPtr(in0), getCPUPtr(gra_out0), getCPUPtr(gra_int0));
+    impl.backward(gra_int0->size(), getCPUPtr(in0), getCPUPtr(gra_out0), getCPUPtr(gra_int0));
 }
diff --git a/src/operator/ReduceMeanImpl.cpp b/src/operator/ReduceMeanImpl.cpp
index b4cd8ffa9b46aaa1c1d7a2eca947ed0254947fef..622672569372ff4e9f135e36255095f4246d5920 100644
--- a/src/operator/ReduceMeanImpl.cpp
+++ b/src/operator/ReduceMeanImpl.cpp
@@ -16,23 +16,29 @@
 
 #include "aidge/utils/Types.h"
 #include "aidge/operator/ReduceMean.hpp"
-#include "aidge/backend/cpu/operator/ReduceMeanImpl_forward_kernels.hpp"
+#include "aidge/backend/cpu/operator/ReduceMeanImpl_kernels.hpp"
 
+template <>
 void Aidge::ReduceMeanImpl_cpu::forward() {
     const ReduceMean_Op& op_ = dynamic_cast<const ReduceMean_Op&>(mOp);
+
     // Find the correct kernel type
-    auto kernelFunc = Registrar<ReduceMeanImplForward_cpu>::create({
-        op_.getInput(0)->dataType(),
-        op_.getOutput(0)->dataType()});
+    const auto impl = Registrar<ReduceMeanImpl_cpu>::create(getBestMatch(getRequiredSpec()));
 
     // Call kernel
-    kernelFunc(op_.axes(),
+    impl.forward(op_.axes(),
                 op_.keepDims(),
                 op_.getInput(0)->dims(),
                 op_.getInput(0)->getImpl()->rawPtr(),
                 op_.getOutput(0)->getImpl()->rawPtr());
 }
 
+template <>
+void Aidge::ReduceMeanImpl_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for ReduceMean_Op on backend cpu");
+}
+
+
 // void Aidge::ReduceMeanImpl1D_cpu::forward() {
 
 //     // Find the correct kernel type
diff --git a/src/operator/ReduceSumImpl.cpp b/src/operator/ReduceSumImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..aad0801835a74ecefb046f3dc64729ae1f8bd8bb
--- /dev/null
+++ b/src/operator/ReduceSumImpl.cpp
@@ -0,0 +1,39 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include "aidge/backend/cpu/operator/ReduceSumImpl.hpp"
+
+#include <memory>
+#include <vector>
+
+#include "aidge/utils/Types.h"
+#include "aidge/operator/ReduceSum.hpp"
+#include "aidge/backend/cpu/operator/ReduceSumImpl_kernels.hpp"
+
+template <>
+void Aidge::ReduceSumImpl_cpu::forward() {
+    const ReduceSum_Op& op_ = dynamic_cast<const ReduceSum_Op&>(mOp);
+
+    // Find the correct kernel type
+    const auto impl = Registrar<ReduceSumImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+
+    // Call kernel
+    impl.forward(op_.axes(),
+                op_.keepDims(),
+                op_.getInput(0)->dims(),
+                op_.getInput(0)->getImpl()->rawPtr(),
+                op_.getOutput(0)->getImpl()->rawPtr());
+}
+
+template <>
+void Aidge::ReduceSumImpl_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for ReduceSum_Op on backend cpu");
+}
diff --git a/src/operator/ScalingImpl.cpp b/src/operator/ScalingImpl.cpp
index db4670836e702f536243aadec36c5ba85b2344c8..1e7a408f267c5eb2d60d188f0ed2ba0394222561 100644
--- a/src/operator/ScalingImpl.cpp
+++ b/src/operator/ScalingImpl.cpp
@@ -17,29 +17,28 @@
 #include "aidge/operator/Scaling.hpp"
 
 #include "aidge/backend/cpu/operator/ScalingImpl.hpp"
-#include "aidge/backend/cpu/operator/ScalingImpl_forward_kernels.hpp"
+#include "aidge/backend/cpu/operator/ScalingImpl_kernels.hpp"
 #include "aidge/utils/Types.h"
 #include "aidge/backend/cpu/data/GetCPUPtr.h"
 
-Aidge::Elts_t Aidge::ScalingImpl_cpu::getNbRequiredProtected(const Aidge::IOIndex_t /*inputIdx*/) const {
-    // this implementation can be in-place
-    return Elts_t::DataElts(0);
-}
-
+template <>
 void Aidge::ScalingImpl_cpu::forward() {
     const auto& op_ = dynamic_cast<const Scaling_Op&>(mOp);
     AIDGE_ASSERT(op_.getInput(0), "missing input #0 in Scaling Operator.");
 
     // Find the correct kernel type
-    auto kernelFunc = Registrar<ScalingImplForward_cpu>::create({
-        op_.getInput(0)->dataType(),
-        op_.getOutput(0)->dataType()});
+    const auto impl = Registrar<ScalingImpl_cpu>::create(getBestMatch(getRequiredSpec()));
 
     // Call kernel
-    kernelFunc(op_.scalingFactor(),
+    impl.forward(op_.scalingFactor(),
             op_.quantizedNbBits(),
             op_.isOutputUnsigned(),
             op_.getInput(0)->size(),
             getCPUPtr(mOp.getRawInput(0)),
             getCPUPtr(mOp.getRawOutput(0)));
 }
+
+template <>
+void Aidge::ScalingImpl_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Scaling_Op on backend cpu");
+}
diff --git a/src/operator/SigmoidImpl.cpp b/src/operator/SigmoidImpl.cpp
index ad69935c02e392d7aa1c9601acb827c5baf8970f..cdcbac85df3a38fea9b7100324e0618949262fc9 100644
--- a/src/operator/SigmoidImpl.cpp
+++ b/src/operator/SigmoidImpl.cpp
@@ -20,14 +20,9 @@
 #include "aidge/backend/cpu/data/GetCPUPtr.h"
 
 #include "aidge/backend/cpu/operator/SigmoidImpl.hpp"
-#include "aidge/backend/cpu/operator/SigmoidImpl_forward_kernels.hpp"
-#include "aidge/backend/cpu/operator/SigmoidImpl_backward_kernels.hpp"
-
-Aidge::Elts_t Aidge::SigmoidImpl_cpu::getNbRequiredProtected(const Aidge::IOIndex_t /*inputIdx*/) const {
-    // this implementation can be in-place
-    return Elts_t::DataElts(0);
-}
+#include "aidge/backend/cpu/operator/SigmoidImpl_kernels.hpp"
 
+template <>
 void Aidge::SigmoidImpl_cpu::forward() {
 	const Sigmoid_Op& op_ = dynamic_cast<const Sigmoid_Op&>(mOp);
     std::shared_ptr<Tensor> in0 = op_.getInput(0);
@@ -35,16 +30,15 @@ void Aidge::SigmoidImpl_cpu::forward() {
     AIDGE_ASSERT(in0, "missing input #0");
 
     // Find the correct kernel type
-    auto kernelFunc = Registrar<SigmoidImplForward_cpu>::create({
-        in0->dataType(),
-	    out0->dataType()});
+    const auto impl = Registrar<SigmoidImpl_cpu>::create(getBestMatch(getRequiredSpec()));
 
     // Call kernel
-    kernelFunc(in0->size(),
+    impl.forward(in0->size(),
         getCPUPtr(mOp.getRawInput(0)),
         getCPUPtr(mOp.getRawOutput(0)));
 }
 
+template <>
 void Aidge::SigmoidImpl_cpu::backward() {
     const Sigmoid_Op& op_ = dynamic_cast<const Sigmoid_Op&>(mOp);
     std::shared_ptr<Tensor> out0  = op_.getOutput(0);
@@ -53,12 +47,8 @@ void Aidge::SigmoidImpl_cpu::backward() {
     AIDGE_ASSERT(out0, "missing output #0 for current {} operator", op_.type());
 
     // Find the correct kernel type
-    auto kernelFunc = Registrar<SigmoidImplBackward_cpu>::create({
-        out0->dataType(),
-	gra_int0->dataType(),
-        gra_out0->dataType()        
-    });
+    const auto impl = Registrar<SigmoidImpl_cpu>::create(getBestMatch(getRequiredSpec()));
 
     // Call kernel
-    kernelFunc(gra_int0->size(), getCPUPtr(out0), getCPUPtr(gra_out0), getCPUPtr(gra_int0));
+    impl.backward(gra_int0->size(), getCPUPtr(out0), getCPUPtr(gra_out0), getCPUPtr(gra_int0));
 }
diff --git a/src/operator/SliceImpl.cpp b/src/operator/SliceImpl.cpp
index 8ffe4dcdd97b58758885b013d0c1770bd98a83ba..945c1bc752feb8e6a194b1aff99b26f01a6a0e69 100644
--- a/src/operator/SliceImpl.cpp
+++ b/src/operator/SliceImpl.cpp
@@ -14,27 +14,21 @@
 #include <vector>
 
 #include "aidge/backend/cpu/data/GetCPUPtr.h"
-#include "aidge/backend/cpu/operator/SliceImpl_forward_kernels.hpp"
+#include "aidge/backend/cpu/operator/SliceImpl_kernels.hpp"
 #include "aidge/operator/Slice.hpp"
 #include "aidge/utils/Log.hpp"
 #include "aidge/utils/Types.h"
 
-Aidge::Elts_t Aidge::SliceImpl_cpu::getNbRequiredProtected(const Aidge::IOIndex_t /*inputIdx*/) const {
-    // this implementation can be in-place
-    return Elts_t::DataElts(0);
-}
-
+template <>
 void Aidge::SliceImpl_cpu::forward() {
     const auto& op_ = dynamic_cast<const Slice_Op&>(mOp);
     AIDGE_ASSERT(op_.getInput(0), "missing input #0 in Slice Operator.");
 
     // Find the correct kernel type
-    auto kernelFunc = Registrar<SliceImplForward_cpu>::create({
-        op_.getInput(0)->dataType(),
-        op_.getOutput(0)->dataType()});
+    const auto impl = Registrar<SliceImpl_cpu>::create(getBestMatch(getRequiredSpec()));
 
     // Call kernel
-    kernelFunc(op_.starts(),
+    impl.forward(op_.starts(),
             op_.ends(),
             op_.axes(),
             op_.steps(),
@@ -42,3 +36,8 @@ void Aidge::SliceImpl_cpu::forward() {
             getCPUPtr(mOp.getRawInput(0)),
             getCPUPtr(mOp.getRawOutput(0)));
 }
+
+template <>
+void Aidge::SliceImpl_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Slice_Op on backend cpu");
+}
diff --git a/src/operator/SoftmaxImpl.cpp b/src/operator/SoftmaxImpl.cpp
index 5bc3699e2146e36a63b4a1602ca1cb86e3ff1e2f..8b6933f22f3673476f4a9f1e261fbcdc09857300 100644
--- a/src/operator/SoftmaxImpl.cpp
+++ b/src/operator/SoftmaxImpl.cpp
@@ -20,27 +20,25 @@
 #include "aidge/backend/cpu/data/GetCPUPtr.h"
 
 #include "aidge/backend/cpu/operator/SoftmaxImpl.hpp"
-#include "aidge/backend/cpu/operator/SoftmaxImpl_forward_kernels.hpp"
-
-Aidge::Elts_t Aidge::SoftmaxImpl_cpu::getNbRequiredProtected(const Aidge::IOIndex_t /*inputIdx*/) const {
-    // this implementation can be in-place
-    return Elts_t::DataElts(0);
-}
+#include "aidge/backend/cpu/operator/SoftmaxImpl_kernels.hpp"
 
+template <>
 void Aidge::SoftmaxImpl_cpu::forward() {
     const auto& op_ = dynamic_cast<const Softmax_Op&>(mOp);
     AIDGE_ASSERT(!op_.getInput(0)->empty(), "Softmax input empty");
+    std::int32_t axis = (op_.axis() >= 0) ? op_.axis() : op_.getInput(0)->nbDims() + op_.axis();
 
     // Find the correct kernel type
-    auto kernelFunc = Registrar<SoftmaxImplForward_cpu>::create({
-        op_.getInput(0)->dataType(),
-        op_.getOutput(0)->dataType()});
-
-    std::int32_t axis = (op_.axis() >= 0) ? op_.axis() : op_.getInput(0)->nbDims() + op_.axis();
+    const auto impl = Registrar<SoftmaxImpl_cpu>::create(getBestMatch(getRequiredSpec()));
 
     // Call kernel
-    kernelFunc(static_cast<std::size_t>(axis), // axisIdx
+    impl.forward(static_cast<std::size_t>(axis), // axisIdx
                std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims(),
                std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->getImpl()->rawPtr(),
                std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->rawPtr());
 }
+
+template <>
+void Aidge::SoftmaxImpl_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Softmax_Op on backend cpu");
+}
diff --git a/src/operator/SqrtImpl.cpp b/src/operator/SqrtImpl.cpp
index edb8858fc4ac07fa5725d24688b22d64134afb0e..25bdb42fd5140ef4f64d704fc3a5ccf237f17f81 100644
--- a/src/operator/SqrtImpl.cpp
+++ b/src/operator/SqrtImpl.cpp
@@ -19,30 +19,24 @@
 #include "aidge/utils/Types.h"
 
 #include "aidge/backend/cpu/operator/SqrtImpl.hpp"
-#include "aidge/backend/cpu/operator/SqrtImpl_forward_kernels.hpp"
-#include "aidge/backend/cpu/operator/SqrtImpl_backward_kernels.hpp"
-
-Aidge::Elts_t Aidge::SqrtImpl_cpu::getNbRequiredProtected(const Aidge::IOIndex_t /*inputIdx*/) const {
-    // this implementation can be in-place
-    return Elts_t::DataElts(0);
-}
+#include "aidge/backend/cpu/operator/SqrtImpl_kernels.hpp"
 
+template <>
 void Aidge::SqrtImpl_cpu::forward() {
     std::shared_ptr<Tensor> in0 = std::static_pointer_cast<Tensor>(mOp.getRawInput(0));
     std::shared_ptr<Tensor> out0 = std::static_pointer_cast<Tensor>(mOp.getRawOutput(0));
     AIDGE_ASSERT(in0, "missing input #0");
 
     // Find the correct kernel type
-    auto kernelFunc = Registrar<SqrtImplForward_cpu>::create({
-        in0->dataType(),
-        out0->dataType()});
+    const auto impl = Registrar<SqrtImpl_cpu>::create(getBestMatch(getRequiredSpec()));
 
     // Call kernel
-    kernelFunc(in0->size(),
+    impl.forward(in0->size(),
         getCPUPtr(mOp.getRawInput(0)),
         getCPUPtr(mOp.getRawOutput(0)));
 }
 
+template <>
 void Aidge::SqrtImpl_cpu::backward() {
     // reversing in and out Data for backprop
     const Sqrt_Op& op_ = dynamic_cast<const Sqrt_Op&>(mOp);
@@ -51,12 +45,10 @@ void Aidge::SqrtImpl_cpu::backward() {
     AIDGE_ASSERT(out0grad, "missing output #0");
 
     // Find the correct kernel type
-    auto kernelFunc = Registrar<SqrtImplForward_cpu>::create({
-        out0grad->dataType(),
-        in0grad->dataType()});
+    const auto impl = Registrar<SqrtImpl_cpu>::create(getBestMatch(getRequiredSpec()));
 
     // Call kernel
-    kernelFunc(out0grad->size(),
+    impl.backward(out0grad->size(),
         getCPUPtr(out0grad),
         getCPUPtr(in0grad));
 }
\ No newline at end of file
diff --git a/src/operator/SubImpl.cpp b/src/operator/SubImpl.cpp
index ffddb59ee3373c4a0a6c2653747744a43fd471d9..d43771b967889183801cb93418c967ce9d9c8453 100644
--- a/src/operator/SubImpl.cpp
+++ b/src/operator/SubImpl.cpp
@@ -21,31 +21,28 @@
 #include "aidge/backend/cpu/data/GetCPUPtr.h"
 
 #include "aidge/backend/cpu/operator/SubImpl.hpp"
-#include "aidge/backend/cpu/operator/SubImpl_forward_kernels.hpp"
-
-Aidge::Elts_t Aidge::SubImpl_cpu::getNbRequiredProtected(const Aidge::IOIndex_t /*inputIdx*/) const {
-    // this implementation can be in-place
-    return Elts_t::DataElts(0);
-}
+#include "aidge/backend/cpu/operator/SubImpl_kernels.hpp"
 
+template <>
 void Aidge::SubImpl_cpu::forward() {
-
-    // Find the correct kernel type
-    auto kernelFunc = Registrar<SubImplForward_cpu>::create({
-        std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dataType(),
-        std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dataType(),
-        std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()});
-
     const std::vector<std::size_t> inputDims0 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
                                                                    std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims());
     const std::vector<std::size_t> inputDims1 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
                                                                    std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dims());
 
+    // Find the correct kernel type
+    const auto impl = Registrar<SubImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+
     // Call kernel
-    kernelFunc(inputDims0,
+    impl.forward(inputDims0,
         inputDims1,
         std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
         getCPUPtr(mOp.getRawInput(0)),
         getCPUPtr(mOp.getRawInput(1)),
         getCPUPtr(mOp.getRawOutput(0)));
 }
+
+template <>
+void Aidge::SubImpl_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Sub_Op on backend cpu");
+}
diff --git a/src/operator/TanhImpl.cpp b/src/operator/TanhImpl.cpp
index a2469ed9b83679c0edf8d0a761abf9d3d046db6e..ed8dce08b9f710c9e5830b2c72ffef71013edb6e 100644
--- a/src/operator/TanhImpl.cpp
+++ b/src/operator/TanhImpl.cpp
@@ -20,14 +20,9 @@
 #include "aidge/backend/cpu/data/GetCPUPtr.h"
 
 #include "aidge/backend/cpu/operator/TanhImpl.hpp"
-#include "aidge/backend/cpu/operator/TanhImpl_forward_kernels.hpp"
-#include "aidge/backend/cpu/operator/TanhImpl_backward_kernels.hpp"
-
-Aidge::Elts_t Aidge::TanhImpl_cpu::getNbRequiredProtected(const Aidge::IOIndex_t /*inputIdx*/) const {
-    // this implementation can be in-place
-    return Elts_t::DataElts(0);
-}
+#include "aidge/backend/cpu/operator/TanhImpl_kernels.hpp"
 
+template <>
 void Aidge::TanhImpl_cpu::forward() {
 	const Tanh_Op& op_ = dynamic_cast<const Tanh_Op&>(mOp);
     std::shared_ptr<Tensor> in0 = op_.getInput(0);
@@ -35,16 +30,15 @@ void Aidge::TanhImpl_cpu::forward() {
     AIDGE_ASSERT(in0, "missing input #0");
 
     // Find the correct kernel type
-    auto kernelFunc = Registrar<TanhImplForward_cpu>::create({
-        in0->dataType(),
-	    out0->dataType()});
+    const auto impl = Registrar<TanhImpl_cpu>::create(getBestMatch(getRequiredSpec()));
 
     // Call kernel
-    kernelFunc(in0->size(),
+    impl.forward(in0->size(),
         getCPUPtr(mOp.getRawInput(0)),
         getCPUPtr(mOp.getRawOutput(0)));
 }
 
+template <>
 void Aidge::TanhImpl_cpu::backward() {
     const Tanh_Op& op_ = dynamic_cast<const Tanh_Op&>(mOp);
     std::shared_ptr<Tensor> out0  = op_.getOutput(0);
@@ -53,13 +47,9 @@ void Aidge::TanhImpl_cpu::backward() {
     AIDGE_ASSERT(out0, "missing output #0 for current {} operator", op_.type());
 
     // Find the correct kernel type
-    auto kernelFunc = Registrar<TanhImplBackward_cpu>::create({
-        out0->dataType(),
-	gra_int0->dataType(),
-        gra_out0->dataType()        
-    });
+    const auto impl = Registrar<TanhImpl_cpu>::create(getBestMatch(getRequiredSpec()));
 
     // Call kernel
-    kernelFunc(gra_int0->size(), getCPUPtr(out0), getCPUPtr(gra_out0), getCPUPtr(gra_int0));
+    impl.backward(gra_int0->size(), getCPUPtr(out0), getCPUPtr(gra_out0), getCPUPtr(gra_int0));
 }
 
diff --git a/unit_tests/CMakeLists.txt b/unit_tests/CMakeLists.txt
index 671cdd5ac1262ab61b35a70a234236aff4a3cc15..8178df93beb96a3a7538dae8d9a706380c06ecf8 100644
--- a/unit_tests/CMakeLists.txt
+++ b/unit_tests/CMakeLists.txt
@@ -12,7 +12,7 @@ file(GLOB_RECURSE src_files "*.cpp")
 
 add_executable(tests${module_name} ${src_files})
 
-target_link_libraries(tests${module_name} PUBLIC ${module_name})
+target_link_libraries(tests${module_name} PRIVATE ${module_name})
 
 target_link_libraries(tests${module_name} PRIVATE Catch2::Catch2WithMain)
 
diff --git a/unit_tests/operator/Test_AndImpl.cpp b/unit_tests/operator/Test_AndImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..053bb3ea4ed913bd388f3ae049c4d6402ad58d59
--- /dev/null
+++ b/unit_tests/operator/Test_AndImpl.cpp
@@ -0,0 +1,205 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <catch2/catch_test_macros.hpp>
+#include <random>    // std::random_device, std::mt19937, std::uniform_real_distribution
+
+#include "aidge/data/Tensor.hpp"
+#include "aidge/operator/And.hpp"
+
+#include "aidge/backend/cpu.hpp"
+
+using namespace Aidge;
+
+TEST_CASE("[cpu/operator] And(forward)", "[And][CPU]") {
+    SECTION("ForwardDims")
+    {
+        constexpr std::uint16_t NBTRIALS = 10;
+        // Create a random number generator
+        std::random_device rd;
+        std::mt19937 gen(rd());
+        std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // Random float distribution between 0 and 1
+        std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10));
+        std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5));
+        std::uniform_int_distribution<int> boolDist(0,1);
+
+        SECTION("Same dimensions") {
+            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+                DimSize_t nbDims = nbDimsDist(gen);
+                std::vector<DimSize_t> dims(nbDims);
+                for (std::size_t i = 0; i < nbDims; i++) {
+                    dims[i] = dimSizeDist(gen);
+                }
+
+                std::shared_ptr<Tensor> myInput1 = std::make_shared<Tensor>(dims);
+                myInput1->setBackend("cpu");
+                myInput1->setDataType(DataType::Float32);
+                myInput1->zeros();
+                std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(dims);
+                myInput2->setBackend("cpu");
+                myInput2->setDataType(DataType::Float32);
+                myInput2->zeros();
+                std::shared_ptr<Node> myAnd = And();
+                auto op = std::static_pointer_cast<OperatorTensor>(myAnd -> getOperator());
+                op->associateInput(0,myInput1);
+                op->associateInput(1,myInput2);
+                op->setDataType(DataType::Float32);
+                op->setBackend("cpu");
+                op->forwardDims();
+
+                const auto outputDims = op->getOutput(0)->dims();
+                REQUIRE(outputDims == dims);
+            }
+        }
+        SECTION("Broadcasting") {
+            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+                DimSize_t nbDims = nbDimsDist(gen);
+                std::vector<DimSize_t> dims1(nbDims, 1);
+                std::vector<DimSize_t> dims2(nbDims, 1);
+                std::vector<DimSize_t> expectedOutDims;
+                for (std::size_t i = 0; i < nbDims; i++) {
+                    DimSize_t dim = dimSizeDist(gen);
+                    if (boolDist(gen)) {
+                        dims1[i] = dim;
+                    }
+                    if (boolDist(gen)) {
+                        dims2[i] = dim;
+                    }
+                    expectedOutDims.push_back(std::max(dims1[i],dims2[i]));
+                }
+
+
+                std::shared_ptr<Tensor> myInput1 = std::make_shared<Tensor>(dims1);
+                myInput1->setBackend("cpu");
+                myInput1->setDataType(DataType::Float32);
+                myInput1->zeros();
+                std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(dims2);
+                myInput2->setBackend("cpu");
+                myInput2->setDataType(DataType::Float32);
+                myInput2->zeros();
+                std::shared_ptr<Node> myAnd = And();
+                auto op = std::static_pointer_cast<OperatorTensor>(myAnd -> getOperator());
+                op->associateInput(0,myInput1);
+                op->associateInput(1,myInput2);
+                op->setDataType(DataType::Float32);
+                op->setBackend("cpu");
+
+                op->forwardDims();
+
+                const auto outputDims = op->getOutput(0)->dims();
+                REQUIRE(outputDims == expectedOutDims);
+            }
+        }
+    }
+    SECTION("Same size inputs") {
+        std::shared_ptr<Tensor> input1 = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
+        {                                       //
+            {                                   //
+                {{20, 15},{31, 11},{22, 49}},   //
+                {{41, 10},{24, 51},{27, 52}},   //
+                {{26, 53},{27, 54},{28, 55}}    //
+            },                                  //
+            {                                   //
+                {{29, 56},{30, 57},{31, 58}},   //
+                {{32, 59},{33, 60},{34, 61}},   //
+                {{35, 62},{36, 63},{37, 64}}    //
+            },                                  //
+            {                                   //
+                {{38, 65},{39, 66},{40, 67}},   //
+                {{41, 68},{42, 69},{43, 70}},   //
+                {{44, 71},{45, 72},{46, 73}}    //
+            }                                   //
+        }                                       //
+    });                                         //
+        std::shared_ptr<Tensor> input2 = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
+            {                                       //
+                {                                   //
+                    {{20, 47},{21, 48},{22, 49}},   //
+                    {{23, 50},{24, 51},{25, 52}},   //
+                    {{17, 53},{27, 26},{14, 33}}    //
+                },                                  //
+                {                                   //
+                    {{29, 56},{30, 57},{31, 58}},   //
+                    {{72, 44},{33, 20},{27, 55}},   //
+                    {{35, 24},{25, 63},{28, 64}}    //
+                },                                  //
+                {                                   //
+                    {{32, 65},{39, 66},{40, 70}},   //
+                    {{41, 53},{42, 60},{34, 70}},   //
+                    {{44, 71},{30, 12},{46, 73}}    //
+                }                                   //
+            }                                       //
+        });                                         //
+        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
+            {
+                {
+                    {{1, 0},{0, 0},{1, 1}},
+                    {{0, 0},{1, 1},{0, 1}},
+                    {{0, 1},{1, 0},{0, 0}}
+                },
+                {
+                    {{1, 1},{1, 1},{1, 1}},
+                    {{0, 0},{1, 0},{0, 0}},
+                    {{1, 0},{0, 1},{0, 1}}
+                },
+                {
+                    {{0, 1},{1, 1},{1, 0}},
+                    {{1, 0},{1, 0},{0, 1}},
+                    {{1, 1},{0, 0},{1, 1}}
+                }
+            }
+        });
+
+        std::shared_ptr<Node> myAnd = And();
+        auto op = std::static_pointer_cast<OperatorTensor>(myAnd -> getOperator());
+        op->associateInput(0, input1);
+        op->associateInput(1, input2);
+        op->setBackend("cpu");
+        op->setDataType(DataType::Int32);
+        myAnd->forward();
+
+        REQUIRE(*(op->getOutput(0)) == *expectedOutput);
+    }
+
+    SECTION("Broadcasting") {
+        std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array4D<int,1,3,3,2> {
+        {                                       //
+            {                                   //
+                {{10, 20},{22, 23},{20, 20}},   //
+                {{10, 15},{10, 29},{20, 20}},   //
+                {{26, 25},{33, 20},{10, 20}}    //
+            }                                   //
+        }                                       //
+        });                                     //
+
+        std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<int,2> {{10, 20}});  
+        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,1,3,3,2> {
+            {                                   //
+                {                               //
+                    {{ 1, 1},{ 0, 0},{ 0, 1}},  //
+                    {{ 1, 0},{ 1, 0},{ 0, 1}},  //
+                    {{ 0, 0},{ 0, 1},{ 1, 1}}   //
+                }                               //
+            }                                   //
+        });                                     //
+
+        std::shared_ptr<Node> myAnd = And();
+        auto op = std::static_pointer_cast<OperatorTensor>(myAnd -> getOperator());
+        op->associateInput(0, input_1);
+        op->associateInput(1, input_2);
+        op->setDataType(DataType::Int32);
+        op->setBackend("cpu");
+        myAnd->forward();
+        op->getOutput(0)->print();
+        expectedOutput->print();
+        REQUIRE(*op->getOutput(0) == *expectedOutput);
+    }
+}
\ No newline at end of file
diff --git a/unit_tests/operator/Test_ArgMaxImpl.cpp b/unit_tests/operator/Test_ArgMaxImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9915d90423e976db1bdd2a694a2cfd7beb380cee
--- /dev/null
+++ b/unit_tests/operator/Test_ArgMaxImpl.cpp
@@ -0,0 +1,227 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <catch2/catch_test_macros.hpp>
+#include <memory>
+#include <numeric>   // std::accumulate
+#include <random>    // std::random_device, std::mt19937, std::uniform_real_distribution
+
+#include "aidge/data/Tensor.hpp"
+#include "aidge/operator/ArgMax.hpp"
+#include "aidge/operator/Conv.hpp"
+
+#include "aidge/backend/cpu.hpp"
+#include "aidge/utils/TensorUtils.hpp"
+
+using namespace Aidge;
+
+TEST_CASE("[cpu/operator] ArgMax(forward)", "[ArgMax][CPU]") {
+    SECTION("ForwardDims")
+    {
+        constexpr std::uint16_t NBTRIALS = 10;
+        // Create a random number generator
+        std::random_device rd;
+        std::mt19937 gen(rd());
+        std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // Random float distribution between 0 and 1
+        std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10));
+        std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5));
+        std::uniform_int_distribution<int> boolDist(0,1);
+
+        SECTION("KeepDims") {
+            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+                DimSize_t nbDims = nbDimsDist(gen);
+                std::vector<DimSize_t> dims(nbDims);
+                std::vector<DimSize_t> expectedOutDims(nbDims);
+                std::uniform_int_distribution<std::int32_t> axisDist(std::int32_t(0), std::int32_t(nbDims-1));
+                std::int32_t axis = axisDist(gen);
+                for (std::size_t i = 0; i < nbDims; i++) {
+                    dims[i] = dimSizeDist(gen);
+                    if (i == axis) {
+                        expectedOutDims[i] = 1;
+                    }
+                    else {
+                        expectedOutDims[i] = dims[i];
+                    }
+                }
+
+                std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(dims);
+                myInput->setBackend("cpu");
+                myInput->setDataType(DataType::Float32);
+                myInput->zeros();
+                std::shared_ptr<Node> myArgMax = ArgMax(axis);
+                auto op = std::static_pointer_cast<OperatorTensor>(myArgMax -> getOperator());
+                op->associateInput(0,myInput);
+                op->setDataType(DataType::Float32);
+                op->setBackend("cpu");
+                op->forwardDims();
+
+                const auto outputDims = op->getOutput(0)->dims();
+                REQUIRE(outputDims == expectedOutDims);
+            }
+        }
+        SECTION("Not KeepDims") {
+            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+                DimSize_t nbDims = nbDimsDist(gen);
+                std::vector<DimSize_t> dims(nbDims);
+                std::vector<DimSize_t> expectedOutDims;
+                std::uniform_int_distribution<std::int32_t> axisDist(std::int32_t(0), std::int32_t(nbDims-1));
+                std::int32_t axis = axisDist(gen);
+                for (std::size_t i = 0; i < nbDims; i++) {
+                    dims[i] = dimSizeDist(gen);
+                    if(i != axis) {
+                        expectedOutDims.push_back(dims[i]);
+                    }
+                }
+                if(expectedOutDims.empty()) {
+                    expectedOutDims.push_back(1);
+                }
+
+                std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(dims);
+                myInput->setBackend("cpu");
+                myInput->setDataType(DataType::Float32);
+                std::shared_ptr<Node> myArgMax = ArgMax(axis, false);
+                auto op = std::static_pointer_cast<OperatorTensor>(myArgMax -> getOperator());
+                op->associateInput(0,myInput);
+                op->setDataType(DataType::Float32);
+                op->setBackend("cpu");
+
+                op->forwardDims();
+
+                const auto outputDims = op->getOutput(0)->dims();
+                REQUIRE(outputDims == expectedOutDims);
+            }
+        }
+    }
+    SECTION("3D Tensor") {
+            std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,2,3,4> {
+                {
+                    {
+                        { 1.0, 2.0, 3.0, 4.0},
+                        { 8.0, 0.0, 17.0, 1.0},
+                        { 5.0, 10.0, 6.0, 0.0}
+                    },
+                    {
+                        { 7.0, 1.0, 9.0, 4.0},
+                        { 0.0, 8.0, 4.0, 2.0},
+                        { 9.0, 2.0, 0.0, 5.0}
+                    }
+                }
+            });
+        SECTION("Axis 2") {
+
+            Tensor myOutput = Tensor(Array3D<float,2,3, 1> {
+               { 
+                    { 
+                        {3.0},
+                        {2.0},
+                        {1.0}
+                    },
+                    {
+                        {2.0},
+                        {1.0},
+                        {0.0}
+                    }
+               }
+            });
+
+            std::shared_ptr<Node> myArgMax = ArgMax(2);
+            auto op = std::static_pointer_cast<OperatorTensor>(myArgMax -> getOperator());
+            op->associateInput(0,myInput);
+            op->setDataType(DataType::Float32);
+            op->setBackend("cpu");
+            myArgMax->forward();
+
+            REQUIRE(*(op->getOutput(0)) == myOutput);
+        }
+        SECTION("Axis 2 with keep_dims false") {
+
+            Tensor myOutput = Tensor(Array2D<float,2,3> {
+               { 
+                    { 3.0, 2.0, 1.0 },
+                    { 2.0, 1.0, 0.0 }
+               }
+            });
+
+            std::shared_ptr<Node> myArgMax = ArgMax(2,0);
+            auto op = std::static_pointer_cast<OperatorTensor>(myArgMax -> getOperator());
+            op->associateInput(0,myInput);
+            op->setDataType(DataType::Float32);
+            op->setBackend("cpu");
+            myArgMax->forward();
+
+            REQUIRE(*(op->getOutput(0)) == myOutput);
+        }
+        SECTION("Axis 1") {
+            Tensor myOutput = Tensor(Array3D<float,2,1,4> {
+                {
+                    {
+                        { 1.0, 2.0, 1.0, 0.0 }
+                    },
+                    {
+                        { 2.0, 1.0, 0.0, 2.0 }
+                    }
+                }
+            });
+
+            std::shared_ptr<Node> myArgMax = ArgMax(1);
+            auto op = std::static_pointer_cast<OperatorTensor>(myArgMax -> getOperator());
+            op->associateInput(0,myInput);
+            op->setDataType(DataType::Float32);
+            op->setBackend("cpu");
+            myArgMax->forward();
+
+            REQUIRE(*(op->getOutput(0)) == myOutput);
+        }
+        SECTION("Axis 0") {
+            Tensor myOutput = Tensor(Array3D<float,1,3,4> {
+                {
+                    {
+                        { 1.0, 0.0, 1.0, 0.0 },
+                        { 0.0, 1.0, 0.0, 1.0 },
+                        { 1.0, 0.0, 0.0, 1.0 }
+                    }
+                }
+            });
+
+            std::shared_ptr<Node> myArgMax = ArgMax(0);
+            auto op = std::static_pointer_cast<OperatorTensor>(myArgMax -> getOperator());
+            op->associateInput(0,myInput);
+            op->setDataType(DataType::Float32);
+            op->setBackend("cpu");
+            std::cout << " ...............  "<< std::endl;
+            myArgMax->forward();
+            op->getOutput(0)->print();
+            std::cout <<"------"<<std::endl;
+            myOutput.print();
+
+            REQUIRE(*(op->getOutput(0)) == myOutput);
+        }
+    }
+    SECTION("Select_Last_Index") {
+        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array1D<float,10> {
+            {
+                1.0, 5.0, 9.0, 0.0, 6.0, 2.0, 9.0, 4.0, 3.0, 9.0
+            }
+        });
+        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array1D<float,1> {{9}});
+
+        std::shared_ptr<Node> myArgMax = ArgMax(0, 1, 1);
+        auto op = std::static_pointer_cast<OperatorTensor>(myArgMax -> getOperator());
+        op->associateInput(0,myInput);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cpu");
+        myArgMax->forward();
+        op->getOutput(0)->print();
+
+        REQUIRE(*(op->getOutput(0)) == *myOutput);
+
+    }
+}
\ No newline at end of file
diff --git a/unit_tests/operator/Test_BitShift.cpp b/unit_tests/operator/Test_BitShift.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a52990bc7991a325ce151cf6634b0d5a831992c8
--- /dev/null
+++ b/unit_tests/operator/Test_BitShift.cpp
@@ -0,0 +1,245 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <catch2/catch_test_macros.hpp>
+#include <cstddef>   // std::size_t
+#include <cstdint>   // std::uint16_t
+#include <chrono>
+#include <iostream>
+#include <memory>
+#include <numeric>   
+#include <random>    // std::random_device, std::mt19937, std::uniform_real_distribution
+#include <iomanip>
+#include "aidge/data/Tensor.hpp"
+#include "aidge/operator/BitShift.hpp"
+#include "aidge/utils/TensorUtils.hpp"
+
+namespace Aidge {
+
+TEST_CASE("[cpu/operator] BitShift_TEST", "[BitShift][CPU]") {
+    constexpr std::uint16_t NBTRIALS = 15;
+    // Create a random number generator
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_int_distribution<int> valueDist(-15, 15); 
+    std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(5));
+    std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(3));
+    std::uniform_int_distribution<int> boolDist(0,1);
+
+    BitShift_Op::BitShiftDirection direction = BitShift_Op::BitShiftDirection::left;
+
+    if(valueDist(gen) % 2 == 0)
+    {
+        direction = BitShift_Op::BitShiftDirection::right;
+    }
+
+    // Create BitShift Operator
+    std::shared_ptr<Node> myBitShift = BitShift(direction);
+    auto op = std::static_pointer_cast<OperatorTensor>(myBitShift-> getOperator());
+    op->setDataType(DataType::Int32);
+    op->setBackend("cpu");
+
+    // Create 2 input Tensors
+    std::shared_ptr<Tensor> T0 = std::make_shared<Tensor>();
+    op->associateInput(0,T0);
+    T0->setDataType(DataType::Int32);
+    T0->setBackend("cpu");
+    std::shared_ptr<Tensor> T1 = std::make_shared<Tensor>();
+    op -> associateInput(1,T1);
+    T1->setDataType(DataType::Int32);
+    T1->setBackend("cpu");
+
+    // Create results Tensor
+    std::shared_ptr<Tensor> Tres = std::make_shared<Tensor>();
+    Tres->setDataType(DataType::Int32);
+    Tres->setBackend("cpu");
+
+    // To measure execution time of 'BitShift_Op::forward()' member function call
+    std::chrono::time_point<std::chrono::system_clock> start;
+
+    std::chrono::time_point<std::chrono::system_clock> end;
+    std::chrono::duration<double, std::micro> duration{};
+
+    SECTION("BitShiftImpl_cpu::forward()") {
+        SECTION("Test Forward Kernel with same dimensions") {
+            std::size_t number_of_operation = 0;
+
+            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+                // generate 2 random Tensors
+                const std::size_t nbDims = nbDimsDist(gen);
+                std::vector<std::size_t> dims;
+                for (std::size_t i = 0; i < nbDims; ++i) {
+                    dims.push_back(dimSizeDist(gen));
+                }
+                const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+                number_of_operation += nb_elements;
+
+                // without broadcasting
+                int* array0 = new int[nb_elements];
+                int* array1 = new int[nb_elements];
+                int* result = new int[nb_elements];
+
+                for (std::size_t i = 0; i < nb_elements; ++i) {
+                    array0[i] = valueDist(gen);
+                    array1[i] = std::abs(valueDist(gen)); // bitshift is impossible with negative value
+                    if(direction == BitShift_Op::BitShiftDirection::left)
+                    {
+                        result[i] = array0[i] << array1[i];
+                    }
+                    else
+                    {
+                        result[i] = array0[i] >> array1[i];
+                    }
+                }
+
+                // input0
+                T0->resize(dims);
+                T0 -> getImpl() -> setRawPtr(array0, nb_elements);
+
+                // input1
+                T1->resize(dims);
+                T1 -> getImpl() -> setRawPtr(array1, nb_elements);
+
+                // results
+                Tres->resize(dims);
+                Tres -> getImpl() -> setRawPtr(result, nb_elements);
+
+                op->forwardDims();
+                start = std::chrono::system_clock::now();
+                myBitShift->forward();
+                end = std::chrono::system_clock::now();
+                duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+
+                bool is_eq = approxEq<int>(*(op->getOutput(0)), *Tres);
+
+                auto Output = *(op->getOutput(0));
+                auto prt = Output.getImpl()->rawPtr();
+
+                REQUIRE(is_eq);
+
+                delete[] array0;
+                delete[] array1;
+                delete[] result;
+
+
+            }
+            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
+            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+        }
+        SECTION("Test BitShift kernels with Broadcasting") {
+            std::size_t number_of_operation = 0;
+
+            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+                // generate 2 random Tensors
+                // handle dimensions, replace some dimensions with '1' to get broadcasting
+                constexpr std::size_t nbDims = 4;
+                std::vector<std::size_t> dims;
+                for (std::size_t i = 0; i < nbDims; ++i) {
+                    dims.push_back(dimSizeDist(gen));
+                }
+                std::vector<std::size_t> dims0 = dims;
+                std::vector<std::size_t> dims1 = dims;
+                std::vector<std::size_t> dimsOut = dims;
+                for (std::size_t i = 0; i < nbDims; ++i) {
+                    if (boolDist(gen)) {
+                        dims0[i] = 1;
+                    }
+                    if (boolDist(gen)) {
+                        dims1[i] = 1;
+                    }
+                    dimsOut[i] = (dims0[i] == 1) ? dims1[i] : dims0[i];
+                }
+
+                // create arrays and fill them with random values
+                int* array0 = new int[dims0[0]*dims0[1]*dims0[2]*dims0[3]];
+                int* array1 = new int[dims1[0]*dims1[1]*dims1[2]*dims1[3]];
+                int* result = new int[dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]];
+
+                for (std::size_t i = 0; i < dims0[0]*dims0[1]*dims0[2]*dims0[3]; ++i) {
+                    array0[i] = valueDist(gen);
+                }
+                for (std::size_t i = 0; i < dims1[0]*dims1[1]*dims1[2]*dims1[3]; ++i) {
+                    array1[i] = std::abs(valueDist(gen));
+                }
+
+                //True result with broadcast
+                const std::size_t strides0[nbDims] = {dims0[1]*dims0[2]*dims0[3], dims0[2]*dims0[3], dims0[3], 1};
+                const std::size_t strides1[nbDims] = {dims1[1]*dims1[2]*dims1[3], dims1[2]*dims1[3], dims1[3], 1};
+                for (std::size_t a = 0; a < dimsOut[0]; ++a) {
+                    for (std::size_t b = 0; b < dimsOut[1]; ++b) {
+                        const std::size_t idx0_0 = strides0[0] * ((dims0[0] > 1) ? a : 0)
+                                                    + strides0[1] * ((dims0[1] > 1) ? b : 0);
+                        const std::size_t idx1_0 = strides1[0] * ((dims1[0] > 1) ? a : 0)
+                                                    + strides1[1] * ((dims1[1] > 1) ? b : 0);
+                        for (std::size_t c = 0; c < dimsOut[2]; ++c) {
+                            const std::size_t idx_out = dimsOut[3] * (c + dimsOut[2] * (b + dimsOut[1] * a));
+                            for (std::size_t d = 0; d < dimsOut[3]; ++d) {
+                                std::size_t idx0 = idx0_0
+                                                    + strides0[2] * ((dims0[2] > 1) ? c : 0)
+                                                    + ((dims0[3] > 1) ? d : 0);
+                                std::size_t idx1 = idx1_0
+                                                    + strides1[2] * ((dims1[2] > 1) ? c : 0)
+                                                    + ((dims1[3] > 1) ? d : 0);
+                                if(direction == BitShift_Op::BitShiftDirection::left)
+                                {
+                                    result[idx_out + d] = array0[idx0] << array1[idx1];
+                                }
+                                else
+                                {
+                                    result[idx_out + d] = array0[idx0] >> array1[idx1];                               
+                                }
+                            }
+                        }
+                    }
+                }
+
+                // conversion to Aidge::Tensors
+                // input0
+                T0->resize(dims0);
+                T0 -> getImpl() -> setRawPtr(array0, dims0[0]*dims0[1]*dims0[2]*dims0[3]);
+
+                // input1
+                T1->resize(dims1);
+                T1 -> getImpl() -> setRawPtr(array1, dims1[0]*dims1[1]*dims1[2]*dims1[3]);
+
+                // results
+                Tres->resize(dimsOut);
+                Tres -> getImpl() -> setRawPtr(result, dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]);
+
+                // compute result
+                op->forwardDims();
+                start = std::chrono::system_clock::now();
+                myBitShift->forward();
+                end = std::chrono::system_clock::now();
+                duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+
+                // comparison between truth and computed result
+                bool equiv = (approxEq<int>(*(op->getOutput(0)), *Tres));
+                if(equiv == false)
+                {
+                    std::cout << "Problem\n";
+                }
+                REQUIRE(equiv);
+
+                delete[] array0;
+                delete[] array1;
+                delete[] result;
+
+                const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>());
+                number_of_operation += nb_elements;
+            }
+            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
+            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+        }
+
+    }
+}
+} // namespace Aidge
\ No newline at end of file
diff --git a/unit_tests/operator/Test_ConstantOfShapeImpl.cpp b/unit_tests/operator/Test_ConstantOfShapeImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..42505d385fde7e72e09531f1607287ffc6978f75
--- /dev/null
+++ b/unit_tests/operator/Test_ConstantOfShapeImpl.cpp
@@ -0,0 +1,120 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <algorithm>
+#include <chrono>
+#include <cmath>
+#include <cstddef> // std::size_t
+#include <cstdint> // std::uint16_t
+#include <iostream>
+#include <memory>
+#include <numeric> // std::accumulate
+#include <ostream>
+#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution
+
+#include "catch2/internal/catch_compiler_capabilities.hpp"
+#include "catch2/internal/catch_enforce.hpp"
+#include <catch2/catch_test_macros.hpp>
+#include <catch2/generators/catch_generators_random.hpp>
+
+#include "aidge/data/Tensor.hpp"
+#include "aidge/operator/ConstantOfShape.hpp"
+#include "aidge/utils/TensorUtils.hpp"
+#include <aidge/data/Data.hpp>
+#include <aidge/data/half.hpp>
+#include <aidge/filler/Filler.hpp>
+#include <aidge/operator/OperatorTensor.hpp>
+#include <aidge/operator/Reshape.hpp>
+#include <aidge/utils/TensorUtils.hpp>
+#include <aidge/utils/Types.h>
+
+namespace Aidge {
+TEST_CASE("[cpu/operator] ConstantOfShape", "[ConstantOfShape][CPU]") {
+  constexpr std::uint16_t NBTRIALS = 10;
+  // Seed the RNG from Catch2's per-run seed so failures are reproducible
+  auto random_seed = Catch::Generators::Detail::getSeed;
+  std::mt19937 gen(random_seed());
+  std::uniform_real_distribution<float> valueDist(
+      0.1f, 1.1f); // random floats in [0.1, 1.1) -- NOTE(review): appears unused in this test
+  std::uniform_int_distribution<DimSize_t> input_tensor_size_dist(
+      std::size_t(1), std::size_t(10)); // size of the 1-D shape input, i.e. the output rank
+  std::uniform_int_distribution<int64_t> input_tensor_values_dist(
+      std::size_t(1), std::size_t(7)); // each output dimension lies in [1, 7]
+  std::uniform_real_distribution<double> operator_attr_value_dist(-100., 100.); // operator "value" attribute
+
+  ///////////////////////////////////////////////
+  // SETUP FUNCTIONS: lambdas building random inputs, operators and expected outputs
+  auto generate_input_tensor =
+      [&gen, &input_tensor_size_dist,
+       &input_tensor_values_dist]() -> std::shared_ptr<Tensor> {
+    std::vector<DimSize_t> input_dims;
+    input_dims.push_back(input_tensor_size_dist(gen));
+
+    auto result = std::make_shared<Tensor>(input_dims);
+    result->setDataType(DataType::Int64);
+    result->setBackend("cpu");
+    for (DimSize_t i = 0; i < result->size(); ++i) {
+      result->set<int64_t>(i, input_tensor_values_dist(gen));
+    }
+    return result;
+  };
+
+  auto generate_random_operator =
+      [&gen,
+       &operator_attr_value_dist]() -> std::shared_ptr<ConstantOfShape_Op> {
+    auto node = ConstantOfShape(Tensor(operator_attr_value_dist(gen)));
+    auto op = std::static_pointer_cast<ConstantOfShape_Op>(node->getOperator());
+    op->setDataType(DataType::Float64);
+    op->setBackend("cpu");
+    return op;
+  };
+
+  auto generate_output_tensor = [](std::shared_ptr<Tensor> input_tensor,
+                                   std::shared_ptr<ConstantOfShape_Op> op) {
+    std::vector<DimSize_t> output_dims;
+    output_dims.reserve(input_tensor->size());
+    for (DimSize_t i = 0; i < input_tensor->size(); ++i) {
+      output_dims.push_back(input_tensor->get<int64_t>(i));
+    }
+    auto result = std::make_shared<Tensor>(output_dims);
+    result->setDataType(op->value().dataType());
+    result->setBackend("cpu");
+    constantFiller(result, op->value().get<double>(0)); // expected output: every element equals the "value" attribute
+    return result;
+  };
+
+  /////////////////////////////////////
+  // BENCHMARKING -- NOTE(review): these accumulators are never updated in the section below
+  std::chrono::time_point<std::chrono::system_clock> start;
+  std::chrono::time_point<std::chrono::system_clock> end;
+  std::chrono::duration<double, std::micro> duration{};
+  int number_of_operation{0};
+
+  SECTION("ConstantOfShapeImpl_cpu::forward()") {
+    for (int i = 0; i < NBTRIALS; ++i) {
+      auto input_T = generate_input_tensor();
+      std::shared_ptr<ConstantOfShape_Op> op = generate_random_operator();
+      auto output_T = generate_output_tensor(input_T, op);
+      op->associateInput(0, input_T);
+
+      REQUIRE(op->forwardDims(true)); // true: output dims depend on the input *values* (data dependency)
+      REQUIRE_NOTHROW(op->forward());
+
+      CHECK(output_T->nbDims() == op->getOutput(0)->nbDims());
+      for (DimIdx_t i = 0; i < output_T->nbDims(); ++i) { // note: shadows the outer loop index 'i'
+        CHECK(output_T->dims().at(i) == op->getOutput(0)->dims().at(i));
+      }
+      CHECK(approxEq<double>(*output_T, *op->getOutput(0)));
+    }
+  }
+}
+} // namespace Aidge
+
diff --git a/unit_tests/operator/Test_FoldImpl.cpp b/unit_tests/operator/Test_FoldImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6832f5a42d796d9261495794e0758ce1b6df0346
--- /dev/null
+++ b/unit_tests/operator/Test_FoldImpl.cpp
@@ -0,0 +1,178 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <catch2/catch_test_macros.hpp>
+#include <cstdlib>
+#include <memory>
+
+#include "aidge/data/Tensor.hpp"
+#include "aidge/graph/GraphView.hpp"
+#include "aidge/scheduler/SequentialScheduler.hpp"
+#include "aidge/operator/Fold.hpp"
+#include "aidge/operator/Unfold.hpp"
+#include "aidge/operator/MatMul.hpp"
+#include "aidge/operator/Reshape.hpp"
+
+#include "aidge/backend/cpu.hpp"
+
+using namespace Aidge;
+
+TEST_CASE("[cpu/operator] Fold(forward)", "[Fold][CPU]") {
+    std::shared_ptr<Node> myUnfold = Unfold({3,3}, "myunfold"); // im2col: extract 3x3 patches
+    std::shared_ptr<Node> myReshape = Reshape({4, 27}, "myreshape"); // flatten [4,3,3,3] weights to [4, 27]
+    std::shared_ptr<Node> myMatMul = MatMul("mymatmul");
+    std::shared_ptr<Node> myFold = Fold({3,3}, {1,1}, "myfold"); // col2im: reassemble the spatial output
+    myUnfold->addChild(myMatMul, 0, 1);
+    myReshape->addChild(myMatMul, 0, 0);
+    myMatMul->addChild(myFold, 0, 0); // pipeline Fold(weights x Unfold(input)) should match a 3x3 convolution
+
+    std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array4D<int,4,3,3,3> {
+        {
+            {
+                {{  0,   1,   2},
+                {  3,   4,   5},
+                {  6,   7,   8}},
+                {{  9,  10,  11},
+                { 12,  13,  14},
+                { 15,  16,  17}},
+                {{ 18,  19,  20},
+                { 21,  22,  23},
+                { 24,  25,  26}}
+            },
+            {
+                {{ 27,  28,  29},
+                { 30,  31,  32},
+                { 33,  34,  35}},
+                {{ 36,  37,  38},
+                { 39,  40,  41},
+                { 42,  43,  44}},
+                {{ 45,  46,  47},
+                { 48,  49,  50},
+                { 51,  52,  53}}
+            },
+            {
+                {{ 54,  55,  56},
+                { 57,  58,  59},
+                { 60,  61,  62}},
+                {{ 63,  64,  65},
+                { 66,  67,  68},
+                { 69,  70,  71}},
+                {{ 72,  73,  74},
+                { 75,  76,  77},
+                { 78,  79,  80}}
+            },
+            {
+                {{ 81,  82,  83},
+                { 84,  85,  86},
+                { 87,  88,  89}},
+                {{ 90,  91,  92},
+                { 93,  94,  95},
+                { 96,  97,  98}},
+                {{ 99, 100, 101},
+                {102, 103, 104},
+                {105, 106, 107}}
+            }
+        }
+    });
+    std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<int,2,3,5,5> { //NCHW
+        {
+            {
+                {{  0,   1,   2,   3,   4},
+                {  5,   6,   7,   8,   9},
+                { 10,  11,  12,  13,  14},
+                { 15,  16,  17,  18,  19},
+                { 20,  21,  22,  23,  24}},
+
+                {{ 25,  26,  27,  28,  29},
+                { 30,  31,  32,  33,  34},
+                { 35,  36,  37,  38,  39},
+                { 40,  41,  42,  43,  44},
+                { 45,  46,  47,  48,  49}},
+
+                {{ 50,  51,  52,  53,  54},
+                { 55,  56,  57,  58,  59},
+                { 60,  61,  62,  63,  64},
+                { 65,  66,  67,  68,  69},
+                { 70,  71,  72,  73,  74}}
+            },
+            {
+                {{ 75,  76,  77,  78,  79},
+                { 80,  81,  82,  83,  84},
+                { 85,  86,  87,  88,  89},
+                { 90,  91,  92,  93,  94},
+                { 95,  96,  97,  98,  99}},
+
+                {{100, 101, 102, 103, 104},
+                {105, 106, 107, 108, 109},
+                {110, 111, 112, 113, 114},
+                {115, 116, 117, 118, 119},
+                {120, 121, 122, 123, 124}},
+
+                {{125, 126, 127, 128, 129},
+                {130, 131, 132, 133, 134},
+                {135, 136, 137, 138, 139},
+                {140, 141, 142, 143, 144},
+                {145, 146, 147, 148, 149}}
+            }
+        }
+    });
+    std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<int,2,4,3,3> { // expected conv result: 5x5 input, 3x3 kernel, no padding -> 3x3
+        {
+            {
+                {{ 15219, 15570, 15921},
+                { 16974, 17325, 17676},
+                { 18729, 19080, 19431}},
+                {{ 37818, 38898, 39978},
+                { 43218, 44298, 45378},
+                { 48618, 49698, 50778}},
+                {{ 60417, 62226, 64035},
+                { 69462, 71271, 73080},
+                { 78507, 80316, 82125}},
+                {{ 83016, 85554, 88092},
+                { 95706, 98244, 100782},
+                { 108396, 110934, 113472}}
+            },
+            {
+                {{ 41544, 41895, 42246},
+                { 43299, 43650, 44001},
+                { 45054, 45405, 45756}},
+                {{ 118818, 119898, 120978},
+                { 124218, 125298, 126378},
+                { 129618, 130698, 131778}},
+                {{ 196092, 197901, 199710},
+                { 205137, 206946, 208755},
+                { 214182, 215991, 217800}},
+                {{ 273366, 275904, 278442},
+                { 286056, 288594, 291132},
+                { 298746, 301284, 303822}}
+            }
+        }
+    });
+
+    auto opUnfold = std::static_pointer_cast<OperatorTensor>(myUnfold -> getOperator());
+    auto opReshape = std::static_pointer_cast<OperatorTensor>(myReshape -> getOperator());
+    auto opMatMul = std::static_pointer_cast<OperatorTensor>(myMatMul -> getOperator());
+    auto opFold = std::static_pointer_cast<OperatorTensor>(myFold -> getOperator());
+    opUnfold->associateInput(0,myInput);
+    opReshape->associateInput(0,myWeights);
+
+    auto g = getConnectedGraphView(myMatMul); // gathers all four connected nodes into one graph
+    g->setDataType(DataType::Int32);
+    g->setBackend("cpu");
+
+    g->forwardDims();
+    g->save("unfold_matmul_fold");
+
+    SequentialScheduler scheduler(g);
+    scheduler.forward();
+    //opFold->getOutput(0)->print();
+    REQUIRE(*(opFold->getOutput(0)) == *myOutput); // exact integer comparison
+}
\ No newline at end of file
diff --git a/unit_tests/operator/Test_MulImpl.cpp b/unit_tests/operator/Test_MulImpl.cpp
index 9d592d31e1999f63fb0ebe3f5ad9d19e85c8645c..3378861d0d3d7e74e7867c2765a0b09069fa8caf 100644
--- a/unit_tests/operator/Test_MulImpl.cpp
+++ b/unit_tests/operator/Test_MulImpl.cpp
@@ -24,6 +24,337 @@
 
 namespace Aidge {
 
+    TEST_CASE("[CPU/Operator] Mul Backward", "[Mul][CPU][Backward]")
+    {
+        std::shared_ptr<Node> myMul = Mul();
+        auto op = std::static_pointer_cast<OperatorTensor>(myMul->getOperator());
+        op->setDataType(DataType::Float32);
+        op->setBackend("cpu");
+
+        SECTION("Case 1: 2D and 1D tensors") {
+            const auto T0 = std::make_shared<Tensor>(Array2D<float,2,3>(
+                {
+                    {
+                        {1,2,3},{4,5,6}
+                    }
+                }
+            ));
+
+            const auto T1 = std::make_shared<Tensor>(Array1D<float,3>(
+                {0.1,0.2,0.3}
+            ));
+
+            T0->setDataType(DataType::Float32);
+            T0->setBackend("cpu");
+            T1->setDataType(DataType::Float32);
+            T1->setBackend("cpu");
+
+            op->getOutput(0)->setGrad(std::make_shared<Tensor>(Array2D<float,2,3>({{{1.0,1.0,1.0},{1.0,1.0,1.0}}}))); // upstream grad dL/dY = ones
+
+            op->associateInput(0,T0);
+            op->associateInput(1,T1);
+            op->forwardDims();
+
+            myMul->forward();
+            myMul->backward();
+
+            auto T0Grad = std::make_shared<Tensor>(Array2D<float, 2,3>({{{0.1,0.2,0.3},{0.1, 0.2, 0.3}}})); // dL/dA = dL/dY * B, with B broadcast across rows
+            auto T1Grad = std::make_shared<Tensor>(Array1D<float, 3>({5,7,9})); // dL/dB = column sums of A: {1+4, 2+5, 3+6}
+
+            REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *T0Grad));
+            REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *T1Grad));
+        }
+
+        SECTION("Case 2: 3D and 1D tensors") {
+            const auto T0 = std::make_shared<Tensor>(Array3D<float,2,2,3>(
+                {
+                    {
+                        {
+                            {1.0, 2.0, 3.0},
+                            {4.0, 5.0, 6.0}
+                        },
+                        {
+                            {7.0, 8.0, 9.0},
+                            {10.0, 11.0, 12.0}
+                        }
+                    }
+                }
+            ));
+
+            const auto T1 = std::make_shared<Tensor>(Array1D<float, 3>({0.3,0.2,0.1}));
+
+            const auto newGrad = std::make_shared<Tensor>(Array3D<float,2,2,3>(
+                    {
+                        {
+                            {
+                                {1, 1, 1},
+                                {1, 1, 1}
+                            },
+                            {
+                                {1, 1, 1},
+                                {1, 1, 1}
+                            }
+                        }
+                    }
+                ));
+
+            const auto expectedGrad0 = std::make_shared<Tensor>(Array3D<float,2,2,3>(
+                {
+                    {
+                        {
+                            {0.3, 0.2, 0.1},
+                            {0.3, 0.2, 0.1}
+                        },
+                        {
+                            {0.3, 0.2, 0.1},
+                            {0.3, 0.2, 0.1}
+                        }
+                    }
+                }
+            ));
+
+            const auto expectedGrad1 = std::make_shared<Tensor>(Array1D<float,3>(
+                {22.0, 26.0, 30.0} // sums of T0 over the broadcast axes: {1+4+7+10, 2+5+8+11, 3+6+9+12}
+            ));
+
+            for(auto T: {T0, T1, newGrad, expectedGrad0, expectedGrad1})
+            {
+                    T->setBackend("cpu") ;
+                    T->setDataType(DataType::Float32);
+            }
+
+            op->associateInput(0, T0);
+            op->associateInput(1, T1);
+            op->getOutput(0)->setGrad(newGrad);
+            op->forwardDims();
+
+            myMul->backward();
+
+            REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0));
+            REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedGrad1));
+        }
+
+        SECTION("Case 3: 4D and 2D tensors") {
+            const auto T0 = std::make_shared<Tensor>(Array4D<float,2, 2, 3, 3>(
+                {
+                    {
+                        {
+                            {
+                                {1.0, 2.0, 3.0},
+                                {4.0, 5.0, 6.0},
+                                {7.0, 8.0, 9.0}
+                            },
+                            {
+                                {10.0, 11.0, 12.0},
+                                {13.0, 14.0, 15.0},
+                                {16.0, 17.0, 18.0}
+                            }
+                        },
+                        {
+                            {
+                                {19.0, 20.0, 21.0},
+                                {22.0, 23.0, 24.0},
+                                {25.0, 26.0, 27.0}
+                            },
+                            {
+                                {28.0, 29.0, 30.0},
+                                {31.0, 32.0, 33.0},
+                                {34.0, 35.0, 36.0}
+                            }
+                        }
+                    }
+                }
+            ));
+
+            const auto T1 = std::make_shared<Tensor>(Array2D<float, 3,3>(
+                {
+                    {
+                        {0.5,0.3,0.1},
+                        {0.4,0.2,0.6},
+                        {0.7,0.8,0.9}
+                    }
+                }
+            ));
+
+            const auto newGrad = std::make_shared<Tensor>(Array4D<float,2, 2, 3, 3>(
+                {
+                    {
+                        {
+                            {
+                                {1.0, 1.0, 1.0},
+                                {1.0, 1.0, 1.0},
+                                {1.0, 1.0, 1.0}
+                            },
+                            {
+                                {1.0, 1.0, 1.0},
+                                {1.0, 1.0, 1.0},
+                                {1.0, 1.0, 1.0}
+                            }
+                        },
+                        {
+                            {
+                                {1.0, 1.0, 1.0},
+                                {1.0, 1.0, 1.0},
+                                {1.0, 1.0, 1.0}
+                            },
+                            {
+                                {1.0, 1.0, 1.0},
+                                {1.0, 1.0, 1.0},
+                                {1.0, 1.0, 1.0}
+                            }
+                        }
+                    }
+                }
+            ));
+
+            const auto expectedGrad0 = std::make_shared<Tensor>(Array4D<float,2,2,3,3>(
+                {
+                    {
+                        {
+                            {
+                                {0.5, 0.3, 0.1},
+                                {0.4, 0.2, 0.6},
+                                {0.7, 0.8, 0.9}
+                            },
+                            {
+                                {0.5, 0.3, 0.1},
+                                {0.4, 0.2, 0.6},
+                                {0.7, 0.8, 0.9}
+                            }
+                        },
+                        {
+                            {
+                                {0.5, 0.3, 0.1},
+                                {0.4, 0.2, 0.6},
+                                {0.7, 0.8, 0.9}
+                            },
+                            {
+                                {0.5, 0.3, 0.1},
+                                {0.4, 0.2, 0.6},
+                                {0.7, 0.8, 0.9}
+                            }
+                        }
+                    }
+                }
+            ));
+
+            const auto expectedGrad1 = std::make_shared<Tensor>(Array2D<float,3, 3>(
+                {
+                    {
+                        {58.0, 62.0, 66.0}, // sums of T0 over dims 0,1 (the broadcast axes), e.g. 1+10+19+28 = 58
+                        {70.0, 74.0, 78.0},
+                        {82.0, 86.0, 90.0}
+                    }
+                }
+            ));
+
+            for(const auto T: {T0, T1, newGrad, expectedGrad0, expectedGrad1})
+            {
+                    T->setBackend("cpu") ;
+                    T->setDataType(DataType::Float32);
+            }
+
+            op->associateInput(0, T0);
+            op->associateInput(1, T1);
+            op->getOutput(0)->setGrad(newGrad);
+            op->forwardDims();
+
+            myMul->backward();
+
+            REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0));
+            REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedGrad1));
+        }
+
+        SECTION("Case 4: 3D and 2D tensors") {
+            const auto T0 = std::make_shared<Tensor>(Array3D<float, 2, 3, 4>(
+                {
+                    {
+                        {
+                            {1.0, 2.0, 3.0, 4.0},
+                            {5.0, 6.0, 7.0, 8.0},
+                            {9.0, 10.0, 11.0, 12.0},
+                        },
+                        {
+                            {13.0, 14.0, 15.0, 16.0},
+                            {17.0, 18.0, 19.0, 20.0},
+                            {21.0, 22.0, 23.0, 24.0},
+                        }
+                    }
+                }
+            ));
+
+            const auto T1 = std::make_shared<Tensor>(Array2D<float, 3, 4>(
+                {
+                    {
+                        {0.1, 0.2, 0.3, 0.4},
+                        {0.5, 0.6, 0.7, 0.8},
+                        {0.9, 1.0, 1.1, 1.2}
+                    }
+                }
+            ));
+
+            const auto newGrad = std::make_shared<Tensor>(Array3D<float, 2,3,4>(
+                {
+                    {
+                        {
+                            {1.0, 1.0, 1.0, 1.0},
+                            {1.0, 1.0, 1.0, 1.0},
+                            {1.0, 1.0, 1.0, 1.0},
+                        },
+                        {
+                            {1.0, 1.0, 1.0, 1.0},
+                            {1.0, 1.0, 1.0, 1.0},
+                            {1.0, 1.0, 1.0, 1.0},
+                        }
+                    }
+                }
+            ));
+
+            const auto expectedGrad0 = std::make_shared<Tensor>(Array3D<float,2,3,4>(
+                {
+                    {
+                        {
+                            {0.1, 0.2, 0.3, 0.4},
+                            {0.5, 0.6, 0.7, 0.8},
+                            {0.9, 1.0, 1.1, 1.2}
+                        },
+                        {
+                            {0.1, 0.2, 0.3, 0.4},
+                            {0.5, 0.6, 0.7, 0.8},
+                            {0.9, 1.0, 1.1, 1.2}
+                        }
+                    }
+                }
+            ));
+
+            const auto expectedGrad1 = std::make_shared<Tensor>(Array2D<float,3, 4>(
+                {
+                    {
+                        {14.0, 16.0, 18.0, 20.0}, // sums of T0 over dim 0 (the broadcast axis), e.g. 1+13 = 14
+                        {22.0, 24.0, 26.0, 28.0},
+                        {30.0, 32.0, 34.0, 36.0}
+                    }
+                }
+            ));
+
+            for(const auto T: {T0, T1, newGrad, expectedGrad0, expectedGrad1})
+            {
+                T->setBackend("cpu") ;
+                T->setDataType(DataType::Float32);
+            }
+
+            op->associateInput(0, T0);
+            op->associateInput(1, T1);
+            op->getOutput(0)->setGrad(newGrad);
+            op->forwardDims();
+
+            myMul->backward();
+
+            REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0));
+            REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedGrad1));
+        }
+    }
+
 TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
     constexpr std::uint16_t NBTRIALS = 10;
     // Create a random number generator
@@ -31,7 +362,7 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
     std::mt19937 gen(rd());
     std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // Random float distribution between 0 and 1
     std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10));
-    std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5));
+    std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(3));
     std::uniform_int_distribution<int> boolDist(0,1);
 
     // Create MatMul Operator
@@ -60,6 +391,7 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
     std::chrono::time_point<std::chrono::system_clock> end;
     std::chrono::duration<double, std::micro> duration{};
 
+
     SECTION("MulImpl_cpu::forward()") {
         SECTION("Scalar / Scalar") {
 
@@ -68,16 +400,20 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
 
         }
         SECTION("+1-D Tensor / +1-D Tensor - same dimensions") {
+
             std::size_t number_of_operation = 0;
 
             for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+
                 // generate 2 random Tensors
-                const std::size_t nbDims = nbDimsDist(gen);
-                std::vector<std::size_t> dims;
+                const auto nbDims = nbDimsDist(gen);
+                auto dims = std::vector<std::size_t>{};
+
                 for (std::size_t i = 0; i < nbDims; ++i) {
                     dims.push_back(dimSizeDist(gen));
                 }
-                const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+
+                const auto nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
                 number_of_operation += nb_elements;
 
                 // without broadcasting
@@ -114,67 +450,101 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
                 delete[] array0;
                 delete[] array1;
                 delete[] result;
-
-                // with broadcasting
             }
             std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
             std::cout << "total time: " << duration.count() << "μs" << std::endl;
         }
 
+
         SECTION("+1-D Tensor / +1-D Tensor - broadcasting") {
             std::size_t number_of_operation = 0;
 
             for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+
                 // generate 2 random Tensors
                 // handle dimensions, replace some dimensions with '1' to get broadcasting
+
                 constexpr std::size_t nbDims = 4;
-                std::vector<std::size_t> dims;
-                for (std::size_t i = 0; i < nbDims; ++i) {
-                    dims.push_back(dimSizeDist(gen));
+                std::vector<std::size_t> dimensions;
+
+                for (std::size_t i = 0; i < nbDims; ++i)
+                {
+                    dimensions.push_back(dimSizeDist(gen));
                 }
-                std::vector<std::size_t> dims0 = dims;
-                std::vector<std::size_t> dims1 = dims;
-                std::vector<std::size_t> dimsOut = dims;
-                for (std::size_t i = 0; i < nbDims; ++i) {
-                    if (boolDist(gen)) {
+
+                auto dims0 = dimensions;
+                auto dims1 = dimensions;
+                auto dimsOut = dimensions;
+
+                for (std::size_t i = 0; i < nbDims; ++i)
+                {
+                    if (boolDist(gen))
+                    {
                         dims0[i] = 1;
                     }
-                    if (boolDist(gen)) {
+
+                    if (boolDist(gen))
+                    {
                         dims1[i] = 1;
                     }
+
                     dimsOut[i] = (dims0[i] == 1) ? dims1[i] : dims0[i];
                 }
 
+                for(auto dim : dims0)
+                {
+                    Log::info("Dimension of input 0 : {}", dim);
+                }
+
+                for(auto dim : dims1)
+                {
+                    Log::info("Dimension of input 1 : {}", dim);
+                }
+
                 // create arrays and fill them with random values
                 float* array0 = new float[dims0[0]*dims0[1]*dims0[2]*dims0[3]];
                 float* array1 = new float[dims1[0]*dims1[1]*dims1[2]*dims1[3]];
                 float* result = new float[dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]];
 
-                for (std::size_t i = 0; i < dims0[0]*dims0[1]*dims0[2]*dims0[3]; ++i) {
+
+                for (std::size_t i = 0; i < dims0[0]*dims0[1]*dims0[2]*dims0[3]; ++i)
+                {
                     array0[i] = valueDist(gen);
                 }
-                for (std::size_t i = 0; i < dims1[0]*dims1[1]*dims1[2]*dims1[3]; ++i) {
+
+                for (std::size_t i = 0; i < dims1[0]*dims1[1]*dims1[2]*dims1[3]; ++i)
+                {
                     array1[i] = valueDist(gen);
                 }
 
                 // compute true result
                 const std::size_t strides0[nbDims] = {dims0[1]*dims0[2]*dims0[3], dims0[2]*dims0[3], dims0[3], 1};
                 const std::size_t strides1[nbDims] = {dims1[1]*dims1[2]*dims1[3], dims1[2]*dims1[3], dims1[3], 1};
-                for (std::size_t a = 0; a < dimsOut[0]; ++a) {
-                    for (std::size_t b = 0; b < dimsOut[1]; ++b) {
+
+                for (std::size_t a = 0; a < dimsOut[0]; ++a)
+                {
+                    for (std::size_t b = 0; b < dimsOut[1]; ++b)
+                    {
                         const std::size_t idx0_0 = strides0[0] * ((dims0[0] > 1) ? a : 0)
                                                     + strides0[1] * ((dims0[1] > 1) ? b : 0);
+
                         const std::size_t idx1_0 = strides1[0] * ((dims1[0] > 1) ? a : 0)
                                                     + strides1[1] * ((dims1[1] > 1) ? b : 0);
-                        for (std::size_t c = 0; c < dimsOut[2]; ++c) {
+
+                        for (std::size_t c = 0; c < dimsOut[2]; ++c)
+                        {
                             const std::size_t idx_out = dimsOut[3] * (c + dimsOut[2] * (b + dimsOut[1] * a));
-                            for (std::size_t d = 0; d < dimsOut[3]; ++d) {
+
+                            for (std::size_t d = 0; d < dimsOut[3]; ++d)
+                            {
                                 std::size_t idx0 = idx0_0
                                                     + strides0[2] * ((dims0[2] > 1) ? c : 0)
                                                     + ((dims0[3] > 1) ? d : 0);
+
                                 std::size_t idx1 = idx1_0
                                                     + strides1[2] * ((dims1[2] > 1) ? c : 0)
                                                     + ((dims1[3] > 1) ? d : 0);
+
                                 result[idx_out + d] = array0[idx0] * array1[idx1];
                                 // std::cout << "(" << idx0 << ", " << idx1 << ") -> " << array0[idx0] << " * " << array1[idx1] << " -> " << idx_out + d << std::endl;
                             }
diff --git a/unit_tests/operator/Test_PadImpl.cpp b/unit_tests/operator/Test_PadImpl.cpp
index cdd3a5f979085f3782776ce69ddd92c0d53150c4..75233c0b97fc6f9812020d0e3d3c695d8cd388f0 100644
--- a/unit_tests/operator/Test_PadImpl.cpp
+++ b/unit_tests/operator/Test_PadImpl.cpp
@@ -134,7 +134,7 @@ TEST_CASE("[cpu/operator] Pad(forward)", "[Pad][CPU]") {
     SECTION("Asymmetric Pad") {
         const int pv = 0; // pad value
 
-        std::shared_ptr<Node> myPad = Pad<2>({1, 0, 0, 1}, "mypad", PadBorderType::Constant, static_cast<double>(pv));
+        std::shared_ptr<Node> myPad = Pad<2>({0, 1, 1, 0}, "mypad", PadBorderType::Constant, static_cast<double>(pv));
         auto op = std::static_pointer_cast<OperatorTensor>(myPad -> getOperator());
         std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<int,2,3,5,5> { //NCHW
             {
diff --git a/unit_tests/operator/Test_PowImpl.cpp b/unit_tests/operator/Test_PowImpl.cpp
index 3b85defb37ff76439b658faa84c3c7457a152d2f..cb5d8872c9c7242bb4aa4efca388d53b578417f9 100644
--- a/unit_tests/operator/Test_PowImpl.cpp
+++ b/unit_tests/operator/Test_PowImpl.cpp
@@ -313,5 +313,171 @@ TEST_CASE("[cpu/operator] Pow", "[Pow][CPU]") {
             std::cout << "total time: " << duration.count() << "μs" << std::endl;
         }
     }
+
+
+    SECTION("PowImpl_cpu::backward()") {
+        SECTION("3D Tensors") {
+            const auto input0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>(
+                {
+                    {
+                        {
+                            {2.0, 3.0},
+                            {4.0, 5.0}
+                        },
+                        {
+                            {6.0, 7.0},
+                            {8.0, 9.0}
+                        }
+                    }
+                }
+            ));
+            const auto input1 = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>(
+                {
+                    {
+                        {
+                            {1.0, 2.0},
+                            {3.0, 2.0}
+                        },
+                        {
+                            {2.0, 3.0},
+                            {1.0, 0.5}
+                        }
+                    }
+                }
+            ));
+            const auto gradOut = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>(
+                {
+                    {
+                        {
+                            {0.5, 1.0},
+                            {1.5, 2.0}
+                        },
+                        {
+                            {2.5, 3.0},
+                            {3.5, 4.0}
+                        }
+                    }
+                }
+            ));
+            const auto expectedGrad0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>(
+                {
+                    {
+                        {
+                            {0.50000000,   6.00000000},
+                            {72.00000000,  20.00000000}
+                        },
+                        {
+                            {30.00000000, 441.00000000},
+                            {3.50000000,   0.66666669}
+                        }
+                    }
+                }
+            ));
+            const auto expectedGrad1 = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>(
+                {
+                    {
+                        {
+                            {  0.693147182, 9.88751030},
+                            {1.33084259e+02, 8.04718933e+01}
+                        },
+                        {
+                            {1.61258362e+02, 2.00234143e+03},
+                            {5.82243652e+01, 2.63666954e+01}
+                        }
+                    }
+                }
+            ));
+            for(const auto T: {input0, input1, gradOut, expectedGrad0, expectedGrad1})
+            {
+                    T->setBackend("cpu") ;
+                    T->setDataType(DataType::Float32);
+            }
+            std::shared_ptr<Node> powOp = Pow();
+            auto opr = std::static_pointer_cast<OperatorTensor>(powOp-> getOperator());
+            opr->setDataType(DataType::Float32);
+            opr->setBackend("cpu");
+            opr->associateInput(0, input0);
+            opr->associateInput(1, input1);
+            opr->getOutput(0)->setGrad(gradOut);
+            opr->forward();
+
+            powOp->backward();
+            REQUIRE(approxEq<float>(*(opr->getInput(0)->grad()), *expectedGrad0));
+            REQUIRE(approxEq<float>(*(opr->getInput(1)->grad()), *expectedGrad1));
+        }
+        SECTION("Broadcasting") {
+            const auto input0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
+                {
+                    {
+                        {
+                            {1.0, 2.0, 3.0},
+                            {4.0, 5.0, 6.0}
+                        },
+                        {
+                            {1.5, 2.5, 3.5},
+                            {4.5, 5.5, 6.5}
+                        }
+                    }
+                }
+            ));
+            const auto input1 = std::make_shared<Tensor>(Array1D<float, 3>(
+                {
+                    {0.1, 0.2, 0.3}
+                }
+            ));
+
+            const auto gradOut = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
+                {
+                    {
+                        {
+                            {1.0, 2.0, 3.0},
+                            {4.0, 5.0, 6.0}
+                        },
+                        {
+                            {6.0, 5.0, 4.0},
+                            {3.0, 2.0, 1.0}
+                        }
+                    }
+                }
+            ));
+            const auto expectedGrad0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
+                {
+                    {
+                        {
+                            {0.10000000, 0.22973967, 0.41711676},
+                            {0.11486985, 0.27594593, 0.51353097}
+                        },
+                        {
+                            {0.41655189, 0.48044977, 0.49926791},
+                            {0.07748720, 0.10227509, 0.08092485}
+                        }
+                    }
+                }
+            ));
+            const auto expectedGrad1 = std::make_shared<Tensor>(Array1D<float, 3>(
+                {
+                    {14.14779854, 22.99299049, 33.56402588}
+                }
+            ));
+
+            for(const auto T: {input0, input1, gradOut, expectedGrad0, expectedGrad1})
+            {
+                    T->setBackend("cpu") ;
+                    T->setDataType(DataType::Float32);
+            }
+            std::shared_ptr<Node> powOp = Pow();
+            auto opr = std::static_pointer_cast<OperatorTensor>(powOp-> getOperator());
+            opr->setDataType(DataType::Float32);
+            opr->setBackend("cpu");
+            opr->associateInput(0, input0);
+            opr->associateInput(1, input1);
+            opr->getOutput(0)->setGrad(gradOut);
+            powOp->forward();
+
+            powOp->backward();
+            REQUIRE(approxEq<float>(*(opr->getInput(0)->grad()), *expectedGrad0));
+            REQUIRE(approxEq<float>(*(opr->getInput(1)->grad()), *expectedGrad1));
+        }
+    }
 }
 } // namespace Aidge
diff --git a/unit_tests/operator/Test_ReduceMeanImpl.cpp b/unit_tests/operator/Test_ReduceMeanImpl.cpp
index 0269622740b5a0282a093d509d4b565f7acc3e76..dd647c7ba3f90fe7f3554aae7133e97ffa9c99ba 100644
--- a/unit_tests/operator/Test_ReduceMeanImpl.cpp
+++ b/unit_tests/operator/Test_ReduceMeanImpl.cpp
@@ -11,6 +11,8 @@
 
 #include <catch2/catch_test_macros.hpp>
 #include <memory>
+#include <numeric>   // std::accumulate
+#include <random>    // std::random_device, std::mt19937, std::uniform_real_distribution
 
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/ReduceMean.hpp"
@@ -22,6 +24,129 @@
 using namespace Aidge;
 
 TEST_CASE("[cpu/operator] ReduceMean(forward)", "[ReduceMean][CPU]") {
+    SECTION("ForwardDims")
+    {
+        constexpr std::uint16_t NBTRIALS = 10;
+        // Create a random number generator
+        std::random_device rd;
+        std::mt19937 gen(rd());
+        std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // Random float distribution between 0 and 1
+        std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10));
+        std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5));
+        std::uniform_int_distribution<int> boolDist(0,1);
+
+        SECTION("KeepDims") {
+            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+                DimSize_t nbDims = nbDimsDist(gen);
+                std::vector<DimSize_t> dims(nbDims);
+                std::vector<DimSize_t> expectedOutDims(nbDims);
+                std::vector<std::int32_t> axes;
+                for (std::size_t i = 0; i < nbDims; i++) {
+                    dims[i] = dimSizeDist(gen);
+                    expectedOutDims[i] = dims[i];
+                    if(boolDist(gen)) {
+                        axes.push_back(i);
+                        expectedOutDims[i] = 1;
+                    }
+                }
+                if (axes.empty()) { // Default behaviour if no axes are provided is to reduce all dimensions
+                   std::fill(expectedOutDims.begin(), expectedOutDims.end(), 1);
+                }
+
+                std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(dims);
+                myInput->setBackend("cpu");
+                myInput->setDataType(DataType::Float32);
+                myInput->zeros();
+                std::shared_ptr<Node> myReduceMean = ReduceMean(axes, true);
+                auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator());
+                op->associateInput(0,myInput);
+                op->setDataType(DataType::Float32);
+                op->setBackend("cpu");
+                op->forwardDims();
+
+                const auto outputDims = op->getOutput(0)->dims();
+                REQUIRE(outputDims == expectedOutDims);
+            }
+        }
+        SECTION("Not KeepDims") {
+            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+                DimSize_t nbDims = nbDimsDist(gen);
+                std::vector<DimSize_t> dims(nbDims);
+                std::vector<DimSize_t> expectedOutDims;
+                std::vector<std::int32_t> axes;
+                for (std::size_t i = 0; i < nbDims; i++) {
+                    dims[i] = dimSizeDist(gen);
+                    if(boolDist(gen)) {
+                        axes.push_back(i);
+                    }
+                    else {
+                        expectedOutDims.push_back(dims[i]);
+                    }
+                }
+                if (axes.empty() || expectedOutDims.empty()) { // Default behaviour if no axes are provided is to reduce all dimensions
+                   expectedOutDims = std::vector<DimSize_t>{1};
+                }
+
+                std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(dims);
+                myInput->setBackend("cpu");
+                myInput->setDataType(DataType::Float32);
+                std::shared_ptr<Node> myReduceMean = ReduceMean(axes, false);
+                auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator());
+                op->associateInput(0,myInput);
+                op->setDataType(DataType::Float32);
+                op->setBackend("cpu");
+
+                op->forwardDims();
+
+                const auto outputDims = op->getOutput(0)->dims();
+                REQUIRE(outputDims == expectedOutDims);
+            }
+        }
+        SECTION("NoopWithEmptyAxes") {
+            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+                DimSize_t nbDims = nbDimsDist(gen);
+                std::vector<DimSize_t> dims(nbDims);
+                for (std::size_t i = 0; i < nbDims; i++) {
+                    dims[i] = dimSizeDist(gen);
+                }
+                std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(dims);
+                myInput->setBackend("cpu");
+                myInput->setDataType(DataType::Float32);
+                std::shared_ptr<Node> myReduceMean = ReduceMean(std::vector<int32_t>{}, false, true);
+                auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator());
+                op->associateInput(0,myInput);
+                op->setDataType(DataType::Float32);
+                op->setBackend("cpu");
+
+                op->forwardDims();
+
+                const auto outputDims = op->getOutput(0)->dims();
+                REQUIRE(outputDims == dims);
+            }
+        }
+        SECTION("Not NoopWithEmptyAxes") {
+            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+                DimSize_t nbDims = nbDimsDist(gen);
+                std::vector<DimSize_t> dims(nbDims);
+                for (std::size_t i = 0; i < nbDims; i++) {
+                    dims[i] = dimSizeDist(gen);
+                }
+                std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(dims);
+                myInput->setBackend("cpu");
+                myInput->setDataType(DataType::Float32);
+                std::shared_ptr<Node> myReduceMean = ReduceMean({}, false, false);
+                auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator());
+                op->associateInput(0,myInput);
+                op->setDataType(DataType::Float32);
+                op->setBackend("cpu");
+
+                op->forwardDims();
+
+                REQUIRE(op->getOutput(0)->nbDims() == 1);
+                REQUIRE(op->getOutput(0)->size() == 1);
+            }
+        }
+    }
     SECTION("KeepDims") {
         SECTION("test 1") {
             std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> {
@@ -157,7 +282,7 @@ TEST_CASE("[cpu/operator] ReduceMean(forward)", "[ReduceMean][CPU]") {
                 {18.25}
             });
 
-            std::shared_ptr<Node> myReduceMean = ReduceMean({0, 1, 2}, 0);
+            std::shared_ptr<Node> myReduceMean = ReduceMean({}, 0);
             auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator());
             op->associateInput(0,myInput);
             op->setDataType(DataType::Float32);
@@ -179,15 +304,42 @@ TEST_CASE("[cpu/operator] ReduceMean(forward)", "[ReduceMean][CPU]") {
                 {0.1293547f}
             });
 
-            std::shared_ptr<Node> myReduceMean = ReduceMean({0, 1}, 0);
+            std::shared_ptr<Node> myReduceMean = ReduceMean({}, 0);
             auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator());
             op->associateInput(0,myInput);
             op->setDataType(DataType::Float32);
             op->setBackend("cpu");
             myReduceMean->forward();
-            op->getOutput(0)->print();
-            // approxEq<float>(*(op->getOutput(0)), *myOutput);
+
             REQUIRE(approxEq<float>(*(op->getOutput(0)), *myOutput));
         }
+        SECTION("noop_with_empty_axes") {
+            std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> {
+                {
+                    {
+                        { 5.0, 1.0 },
+                        { 20.0, 2.0 }
+                    },
+                    {
+                        { 30.0, 1.0 },
+                        { 40.0, 2.0 }
+                    },
+                    {
+                        { 55.0, 1.0 },
+                        { 60.0, 2.0 }
+                    }
+                }
+            });
+
+            std::shared_ptr<Node> myReduceMean = ReduceMean({}, 0, 1);
+            auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator());
+            op->associateInput(0,myInput);
+            op->setDataType(DataType::Float32);
+            op->setBackend("cpu");
+            myReduceMean->forward();
+            op->getOutput(0)->print();
+
+            REQUIRE(*(op->getOutput(0)) == *myInput);
+        }
     }
 }
\ No newline at end of file
diff --git a/unit_tests/operator/Test_ReduceSumImpl.cpp b/unit_tests/operator/Test_ReduceSumImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..49569d1f65ff6c51f9681632b16375605ab326e7
--- /dev/null
+++ b/unit_tests/operator/Test_ReduceSumImpl.cpp
@@ -0,0 +1,345 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <catch2/catch_test_macros.hpp>
+#include <memory>
+#include <numeric>   // std::accumulate
+#include <random>    // std::random_device, std::mt19937, std::uniform_real_distribution
+
+#include "aidge/data/Tensor.hpp"
+#include "aidge/operator/ReduceSum.hpp"
+#include "aidge/operator/Conv.hpp"
+
+#include "aidge/backend/cpu.hpp"
+#include "aidge/utils/TensorUtils.hpp"
+
+using namespace Aidge;
+
+TEST_CASE("[cpu/operator] ReduceSum(forward)", "[ReduceSum][CPU]") {
+    SECTION("ForwardDims")
+    {
+        constexpr std::uint16_t NBTRIALS = 10;
+        // Create a random number generator
+        std::random_device rd;
+        std::mt19937 gen(rd());
+        std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // Random float distribution between 0 and 1
+        std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10));
+        std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5));
+        std::uniform_int_distribution<int> boolDist(0,1);
+
+        SECTION("KeepDims") {
+            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+                DimSize_t nbDims = nbDimsDist(gen);
+                std::vector<DimSize_t> dims(nbDims);
+                std::vector<DimSize_t> expectedOutDims(nbDims);
+                std::vector<std::int32_t> axes;
+                for (std::size_t i = 0; i < nbDims; i++) {
+                    dims[i] = dimSizeDist(gen);
+                    expectedOutDims[i] = dims[i];
+                    if(boolDist(gen)) {
+                        axes.push_back(i);
+                        expectedOutDims[i] = 1;
+                    }
+                }
+                if (axes.empty()) { // Default behaviour if no axes are provided is to reduce all dimensions
+                   std::fill(expectedOutDims.begin(), expectedOutDims.end(), 1);
+                }
+
+                std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(dims);
+                myInput->setBackend("cpu");
+                myInput->setDataType(DataType::Float32);
+                myInput->zeros();
+                std::shared_ptr<Node> myReduceSum = ReduceSum(axes, true);
+                auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator());
+                op->associateInput(0,myInput);
+                op->setDataType(DataType::Float32);
+                op->setBackend("cpu");
+                op->forwardDims();
+
+                const auto outputDims = op->getOutput(0)->dims();
+                REQUIRE(outputDims == expectedOutDims);
+            }
+        }
+        SECTION("Not KeepDims") {
+            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+                DimSize_t nbDims = nbDimsDist(gen);
+                std::vector<DimSize_t> dims(nbDims);
+                std::vector<DimSize_t> expectedOutDims;
+                std::vector<std::int32_t> axes;
+                for (std::size_t i = 0; i < nbDims; i++) {
+                    dims[i] = dimSizeDist(gen);
+                    if(boolDist(gen)) {
+                        axes.push_back(i);
+                    }
+                    else {
+                        expectedOutDims.push_back(dims[i]);
+                    }
+                }
+                if (axes.empty() || expectedOutDims.empty()) { // Default behaviour if no axes are provided is to reduce all dimensions
+                   expectedOutDims = std::vector<DimSize_t>{1};
+                }
+
+                std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(dims);
+                myInput->setBackend("cpu");
+                myInput->setDataType(DataType::Float32);
+                std::shared_ptr<Node> myReduceSum = ReduceSum(axes, false);
+                auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator());
+                op->associateInput(0,myInput);
+                op->setDataType(DataType::Float32);
+                op->setBackend("cpu");
+
+                op->forwardDims();
+
+                const auto outputDims = op->getOutput(0)->dims();
+                REQUIRE(outputDims == expectedOutDims);
+            }
+        }
+        SECTION("NoopWithEmptyAxes") {
+            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+                DimSize_t nbDims = nbDimsDist(gen);
+                std::vector<DimSize_t> dims(nbDims);
+                for (std::size_t i = 0; i < nbDims; i++) {
+                    dims[i] = dimSizeDist(gen);
+                }
+                std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(dims);
+                myInput->setBackend("cpu");
+                myInput->setDataType(DataType::Float32);
+                std::shared_ptr<Node> myReduceSum = ReduceSum(std::vector<int32_t>{}, false, true);
+                auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator());
+                op->associateInput(0,myInput);
+                op->setDataType(DataType::Float32);
+                op->setBackend("cpu");
+
+                op->forwardDims();
+
+                const auto outputDims = op->getOutput(0)->dims();
+                REQUIRE(outputDims == dims);
+            }
+        }
+        SECTION("Not NoopWithEmptyAxes") {
+            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+                DimSize_t nbDims = nbDimsDist(gen);
+                std::vector<DimSize_t> dims(nbDims);
+                for (std::size_t i = 0; i < nbDims; i++) {
+                    dims[i] = dimSizeDist(gen);
+                }
+                std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(dims);
+                myInput->setBackend("cpu");
+                myInput->setDataType(DataType::Float32);
+                std::shared_ptr<Node> myReduceSum = ReduceSum({}, false, false);
+                auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator());
+                op->associateInput(0,myInput);
+                op->setDataType(DataType::Float32);
+                op->setBackend("cpu");
+
+                op->forwardDims();
+
+                REQUIRE(op->getOutput(0)->nbDims() == 1);
+                REQUIRE(op->getOutput(0)->size() == 1);
+            }
+        }
+    }
+    SECTION("KeepDims") {
+        SECTION("test 1") {
+            std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> {
+                {
+                    {
+                        { 5.0, 1.0 },
+                        { 20.0, 2.0 }
+                    },
+                    {
+                        { 30.0, 1.0 },
+                        { 40.0, 2.0 }
+                    },
+                    {
+                        { 55.0, 1.0 },
+                        { 60.0, 2.0 }
+                    }
+                }
+            });
+            Tensor myOutput = Tensor(Array3D<float,3,1,2> {
+                {
+
+                    {{ 25.0, 3.0 }},
+                    {{ 70.0, 3.0 }},
+                    {{ 115.0, 3.0 }}
+                }
+            });
+
+            std::shared_ptr<Node> myReduceSum = ReduceSum({1}, 1);
+            auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator());
+            op->associateInput(0,myInput);
+            op->setDataType(DataType::Float32);
+            op->setBackend("cpu");
+            myReduceSum->forward();
+            op->getOutput(0)->print();
+
+            REQUIRE(*(op->getOutput(0)) == myOutput);
+        }
+        SECTION("test 2") {
+            std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,3,2> {
+                {
+                    {
+                        { 0.0, 0.0 },
+                        { 1.0, 1.0 },
+                        { 2.0, 2.0 }
+                    },
+                    {
+                        { 3.0, 3.0 },
+                        { 4.0, 4.0 },
+                        { 5.0, 5.0 }
+                    },
+                    {
+                        { 6.0, 6.0 },
+                        { 7.0, 7.0 },
+                        { 8.0, 8.0 }
+                    }
+                }
+            });
+            Tensor myOutput = Tensor(Array3D<float,3,1,1> {
+                {
+
+                    {{ 6.0 }},
+                    {{ 24.0 }},
+                    {{ 42.0 }}
+                }
+            });
+
+            std::shared_ptr<Node> myReduceSum = ReduceSum({1, 2}, 1);
+            auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator());
+            op->associateInput(0,myInput);
+            op->setDataType(DataType::Float32);
+            op->setBackend("cpu");
+            myReduceSum->forward();
+            myOutput.print();
+            op->getOutput(0)->print();
+            REQUIRE(*(op->getOutput(0)) == myOutput);
+        }
+    }
+    SECTION("not_KeepDims") {
+        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> {
+            {
+                {
+                    { 5.0, 1.0 },
+                    { 20.0, 2.0 }
+                },
+                {
+                    { 30.0, 1.0 },
+                    { 40.0, 2.0 }
+                },
+                {
+                    { 55.0, 1.0 },
+                    { 60.0, 2.0 }
+                }
+            }
+        });
+        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float,3,2> {
+            {
+                { 25.0, 3.0 },
+                { 70.0, 3.0 },
+                { 115.0, 3.0 }
+            }
+        });
+
+        std::shared_ptr<Node> myReduceSum = ReduceSum({1}, 0);
+        auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator());
+        op->associateInput(0,myInput);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cpu");
+        myReduceSum->forward();
+        op->getOutput(0)->print();
+
+        REQUIRE(*(op->getOutput(0)) == *myOutput);
+
+    }
+    SECTION("all_axes") {
+        SECTION("1") {
+            std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> {
+                {
+                    {
+                        { 5.0, 1.0 },
+                        { 20.0, 2.0 }
+                    },
+                    {
+                        { 30.0, 1.0 },
+                        { 40.0, 2.0 }
+                    },
+                    {
+                        { 55.0, 1.0 },
+                        { 60.0, 2.0 }
+                    }
+                }
+            });
+            std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array1D<float,1> {
+                {219.0}
+            });
+
+            std::shared_ptr<Node> myReduceSum = ReduceSum({}, 0);
+            auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator());
+            op->associateInput(0,myInput);
+            op->setDataType(DataType::Float32);
+            op->setBackend("cpu");
+            myReduceSum->forward();
+            op->getOutput(0)->print();
+
+            REQUIRE(*(op->getOutput(0)) == *myOutput);
+        }
+        SECTION("2") {
+            std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array2D<float,5,4> {
+               {{ 0.004232f, 0.105120f, 0.045124f, 0.009205f},
+                { 0.000766f, 0.272162f, 0.503560f, 0.044163f},
+                { 0.049755f, 0.000305f, 0.143634f, 0.013253f},
+                { 0.096258f, 0.311231f, 0.358143f, 0.000452f},
+                { 0.468617f, 0.015693f, 0.145316f, 0.000105f}}
+            });
+            std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array1D<float,1> {
+                {2.587094f}
+            });
+
+            std::shared_ptr<Node> myReduceSum = ReduceSum({0, 1}, 0);
+            auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator());
+            op->associateInput(0,myInput);
+            op->setDataType(DataType::Float32);
+            op->setBackend("cpu");
+            myReduceSum->forward();
+            op->getOutput(0)->print();
+            REQUIRE(approxEq<float>(*(op->getOutput(0)), *myOutput));
+        }
+        SECTION("noop_with_empty_axes") {
+            std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> {
+                {
+                    {
+                        { 5.0, 1.0 },
+                        { 20.0, 2.0 }
+                    },
+                    {
+                        { 30.0, 1.0 },
+                        { 40.0, 2.0 }
+                    },
+                    {
+                        { 55.0, 1.0 },
+                        { 60.0, 2.0 }
+                    }
+                }
+            });
+
+            std::shared_ptr<Node> myReduceSum = ReduceSum({}, 0, 1);
+            auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator());
+            op->associateInput(0,myInput);
+            op->setDataType(DataType::Float32);
+            op->setBackend("cpu");
+            myReduceSum->forward();
+            op->getOutput(0)->print();
+
+            REQUIRE(*(op->getOutput(0)) == *myInput);
+        }
+    }
+}
\ No newline at end of file
diff --git a/unit_tests/recipies/Test_ConvToMatMul.cpp b/unit_tests/recipies/Test_ConvToMatMul.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..05c5eef83394ba8c965dfabae2bcd8c2b4502c79
--- /dev/null
+++ b/unit_tests/recipies/Test_ConvToMatMul.cpp
@@ -0,0 +1,76 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <catch2/catch_test_macros.hpp>
+
+#include "aidge/recipes/Recipes.hpp"
+#include "aidge/operator/Conv.hpp"
+#include "aidge/operator/Producer.hpp"
+#include "aidge/scheduler/SequentialScheduler.hpp"
+#include "aidge/filler/Filler.hpp"
+#include "aidge/graph/OpArgs.hpp"
+#include <cstddef>
+
+using namespace Aidge;
+
+TEST_CASE("[ConvToMatMul] conv") {
+    auto conv1 = Conv(3, 4, {3, 3}, "conv1");
+    auto conv2 = Conv(4, 7, {3, 3}, "conv2", {1, 1}, {1, 1}, true);
+    auto conv3 = Conv(7, 10, {1, 1}, "conv3", {2, 2});
+
+    auto g1 = Sequential({
+        Producer({2, 3, 13, 24}, "dataProvider"),
+        conv1,
+        conv2,
+        conv3
+    });
+
+    g1->setBackend("cpu");
+    g1->forwardDims();
+
+    // Random initialization of input and weights
+    uniformFiller<float>(std::static_pointer_cast<OperatorTensor>(conv1->getOperator())->getInput(0), -10.0, 10.0);
+    uniformFiller<float>(std::static_pointer_cast<OperatorTensor>(conv1->getOperator())->getInput(1), -10.0, 10.0);
+    uniformFiller<float>(std::static_pointer_cast<OperatorTensor>(conv1->getOperator())->getInput(2), -10.0, 10.0);
+    uniformFiller<float>(std::static_pointer_cast<OperatorTensor>(conv2->getOperator())->getInput(1), -10.0, 10.0);
+    uniformFiller<float>(std::static_pointer_cast<OperatorTensor>(conv3->getOperator())->getInput(1), -10.0, 10.0);
+    uniformFiller<float>(std::static_pointer_cast<OperatorTensor>(conv3->getOperator())->getInput(2), -10.0, 10.0);
+
+    auto s1 = SequentialScheduler(g1);
+    s1.forward();
+
+    g1->save("convToMatMul_before");
+
+    auto g2 = g1->clone();
+    g2->forwardDims();
+    REQUIRE(convToMatMul(g2) == 3);
+    
+    g2->setBackend("cpu");
+
+    auto s2 = SequentialScheduler(g2);
+    s2.forward();
+
+    g2->save("convToMatMul_after");
+
+    auto g1OutOp = std::static_pointer_cast<OperatorTensor>((*g1->outputNodes().cbegin())->getOperator());
+    auto g2OutOp = std::static_pointer_cast<OperatorTensor>((*g1->outputNodes().cbegin())->getOperator());
+    REQUIRE(*(g1OutOp->getOutput(0)) == *(g2OutOp->getOutput(0)));
+
+    // Simplify the graph: freeze parameters to allow reshaping of the Producers
+    for (auto node : g2->getNodes()) {
+        if (node->type() == Producer_Op::Type && node->name() != "dataProvider") {
+            std::static_pointer_cast<Producer_Op>(node->getOperator())->constant() = true;
+        }
+    }
+
+    constantFolding(g2);
+    g2->save("convToMatMul_after_folding");
+}
diff --git a/version.txt b/version.txt
index 7179039691ce07a214e7a815893fee97a97b1422..0d91a54c7d439e84e3dd17d3594f1b2b6737f430 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-0.2.3
+0.3.0