Compare revisions

Maxence Naud · Christophe Guillon · Olivier BICHLER · Olivier BICHLER · Olivier BICHLER · Grégoire Kubler
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
-cmake_minimum_required(VERSION 3.15)
+cmake_minimum_required(VERSION 3.18)
 set(CXX_STANDARD 14)

 file(STRINGS "${CMAKE_SOURCE_DIR}/version.txt" version)
@@ -24,6 +24,7 @@ add_definitions(-DGIT_COMMIT_HASH="${GIT_COMMIT_HASH}")

 # Note : project name is ${CMAKE_PROJECT_NAME} and python module name is also ${CMAKE_PROJECT_NAME}
 set(module_name _${CMAKE_PROJECT_NAME}) # target name
+set(pybind_module_name ${CMAKE_PROJECT_NAME}) # name of submodule for python bindings

 ##############################################
 # Define options
@@ -69,16 +70,12 @@ set_property(TARGET ${module_name} PROPERTY POSITION_INDEPENDENT_CODE ON)

 # PYTHON BINDING
 if (PYBIND)
-    # Handles Python + pybind11 headers dependencies
-    include(PybindModuleCreation)
-    generate_python_binding(${CMAKE_PROJECT_NAME} ${module_name})
+    # Python binding lib is by default installed in <prefix>/python_packages/<package>/
+    # When installed from python, setup.py should set it to the python package dir
+    set(PYBIND_INSTALL_PREFIX python_packages/${pybind_module_name} CACHE PATH "Python package install prefix")

-    target_link_libraries(${module_name}
-        PUBLIC
-            pybind11::pybind11
-        PRIVATE
-            Python::Module
-        )
+    include(PybindModuleCreation)
+    generate_python_binding(${pybind_module_name} ${module_name})
 endif()

 if( ${ENABLE_ASAN} )
@@ -102,7 +99,6 @@ target_include_directories(${module_name}
        ${CMAKE_CURRENT_SOURCE_DIR}/src
 )

-target_link_libraries(${module_name} PUBLIC fmt::fmt)
 target_compile_features(${module_name} PRIVATE cxx_std_14)

 target_compile_options(${module_name} PRIVATE
@@ -128,6 +124,12 @@ install(TARGETS ${module_name} EXPORT ${CMAKE_PROJECT_NAME}-targets
 )
 install(DIRECTORY include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})

+# Install the Python binding target (${pybind_module_name}) into
+# PYBIND_INSTALL_PREFIX, the cache path defined in the "PYTHON BINDING" section.
+# Only done when the PYBIND option is enabled.
+if (PYBIND)
+    install(TARGETS ${pybind_module_name}
+        DESTINATION ${PYBIND_INSTALL_PREFIX}
+    )
+endif()
+
 #Export the targets to a script
 install(EXPORT ${CMAKE_PROJECT_NAME}-targets
 FILE "${CMAKE_PROJECT_NAME}-targets.cmake"
@@ -159,15 +161,16 @@ install(FILES
 ## Exporting from the build tree
 message(STATUS "Exporting created targets to use them in another build")
 export(EXPORT ${CMAKE_PROJECT_NAME}-targets
-    FILE "${CMAKE_CURRENT_BINARY_DIR}/${project}-targets.cmake")
+    FILE "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_PROJECT_NAME}-targets.cmake")


 ##############################################
 ## Add test
 if(TEST)
-    if(PYBIND)
-        message(FATAL_ERROR "PYBIND and TEST are both enabled. But cannot compile with catch_2.\nChoose between pybind and Catch2 for compilation.")
+    if (AIDGE_REQUIRES_PYTHON AND NOT AIDGE_PYTHON_HAS_EMBED)
+        message(WARNING "Skipping compilation of tests: missing Python embedded interpreter")
+    else()
+        enable_testing()
+        add_subdirectory(unit_tests)
    endif()
-    enable_testing()
-    add_subdirectory(unit_tests)
 endif()
--- a/README.md
+++ b/README.md
@@ -23,9 +23,23 @@ Those operators can be used on any machine with an Linux OS.
 pip install . -v
 ```
 > **TIPS :** Use environment variables to change compilation options :
-> - `AIDGE_INSTALL` : to set the installation folder. Defaults to /usr/local/lib. :warning: This path must be identical to aidge_core install path.
-> - `AIDGE_PYTHON_BUILD_TYPE` : to set the compilation mode to **Debug** or **Release** 
-> - `AIDGE_BUILD_GEN` : to set the build backend with 
+> - `AIDGE_INSTALL` : to set the installation folder. Defaults to `<python_prefix>/lib/libAidge`. :warning: This path must be identical to aidge_core install path.
+> - `AIDGE_PYTHON_BUILD_TYPE` : to set the compilation mode to **Debug** or **Release** or "" (for default flags). Defaults to **Release**.
+> - `AIDGE_BUILD_GEN` : to set the build backend (for development mode) or "" for the cmake default. Defaults to "".
+
+## Pip installation for development
+
+To set up using pip in development (or editable) mode, pass the `--no-build-isolation -e` options to pip.
+
+For instance, run the following commands in your Python environment for a typical setup:
+``` bash
+export AIDGE_PYTHON_BUILD_TYPE=         # default flags (no debug info but fastest build time)
+export AIDGE_PYTHON_BUILD_TYPE=Debug    # or, if you really need to debug the C++ code
+pip install -U pip setuptools setuptools_scm[toml] cmake   # Pre-install build requirements (refer to the pyproject.toml [build-system] section)
+pip install -v --no-build-isolation -e .
+```
+
+Refer to `aidge_core/README.md` for more details on development build options.

 ### Standard C++ Compilation


--- a/aidge_backend_cpu-config.cmake.in
+++ b/aidge_backend_cpu-config.cmake.in
+@PACKAGE_INIT@
+
+# Resolve the aidge_core dependency before the exported targets are loaded.
+# find_dependency() forwards the REQUIRED/QUIET settings of the enclosing
+# find_package(aidge_backend_cpu ...) call.
+include(CMakeFindDependencyMacro)
+find_dependency(aidge_core)
+
 include(${CMAKE_CURRENT_LIST_DIR}/aidge_backend_cpu-config-version.cmake)

 include(${CMAKE_CURRENT_LIST_DIR}/aidge_backend_cpu-targets.cmake)
--- a/cmake/PybindModuleCreation.cmake
+++ b/cmake/PybindModuleCreation.cmake
 function(generate_python_binding pybind_module_name target_to_bind) 
-    add_definitions(-DPYBIND)
+
+    find_package(Python COMPONENTS Interpreter Development.Module)
+
    Include(FetchContent)

    set(PYBIND_VERSION v2.10.4)
-    set(PYBIND11_FINDPYTHON ON)
    message(STATUS "Retrieving pybind ${PYBIND_VERSION} from git")

    FetchContent_Declare(
@@ -12,14 +13,12 @@ function(generate_python_binding pybind_module_name target_to_bind)
        GIT_TAG        ${PYBIND_VERSION} # or a later release
    )

-    # Use the New FindPython mode, recommanded. Requires CMake 3.15+
-    find_package(Python COMPONENTS Interpreter Development.Module)
    FetchContent_MakeAvailable(PyBind11)

    message(STATUS "Creating binding for module ${pybind_module_name}")
    file(GLOB_RECURSE pybind_src_files "python_binding/*.cpp")

    pybind11_add_module(${pybind_module_name} MODULE ${pybind_src_files} "NO_EXTRAS") # NO EXTRA recquired for pip install
-    target_include_directories(${pybind_module_name} PUBLIC "python_binding")
-    target_link_libraries(${pybind_module_name} PUBLIC ${target_to_bind})
+    target_include_directories(${pybind_module_name} PRIVATE "python_binding")
+    target_link_libraries(${pybind_module_name} PRIVATE ${target_to_bind})
 endfunction()
--- a/include/aidge/backend/cpu.hpp
+++ b/include/aidge/backend/cpu.hpp
@@ -19,6 +19,7 @@
 #include "aidge/backend/cpu/operator/AvgPoolingImpl.hpp"
 #include "aidge/backend/cpu/operator/MaxPoolingImpl.hpp"
 #include "aidge/backend/cpu/operator/BatchNormImpl.hpp"
+#include "aidge/backend/cpu/operator/BitShiftImpl.hpp"
 #include "aidge/backend/cpu/operator/ConvDepthWiseImpl.hpp"
 #include "aidge/backend/cpu/operator/ConvImpl.hpp"
 #include "aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp"

--- a/include/aidge/backend/cpu/operator/BitShiftImpl.hpp
+++ b/include/aidge/backend/cpu/operator/BitShiftImpl.hpp
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_BITSHIFTIMPL_H_
+#define AIDGE_CPU_OPERATOR_BITSHIFTIMPL_H_
+
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
+#include "aidge/operator/BitShift.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+#include <memory>
+#include <vector>
+
+namespace Aidge {
+// Operator implementation entry point for the backend
+// The function type below is the forward kernel signature. Its arguments, in
+// order, follow BitShiftImpl_cpu_forward_kernel (BitShiftImpl_kernels.hpp):
+//   - shift direction,
+//   - dimensions of the first input, of the second input and of the output,
+//   - read-only buffers of the first and second inputs,
+//   - writable output buffer.
+// No backward kernel signature is provided here.
+using BitShiftImpl_cpu = OperatorImpl_cpu<BitShift_Op,
+    void(const BitShift_Op::BitShiftDirection,
+    const std::vector<std::size_t>&, 
+    const std::vector<std::size_t>&, 
+    const std::vector<std::size_t>&, 
+    const void*, 
+    const void*,
+    void*)>;
+    
+    // Implementation entry point registration to Operator
+    // Registers BitShiftImpl_cpu::create under the "cpu" key for BitShift_Op.
+    REGISTRAR(BitShift_Op,"cpu",Aidge::BitShiftImpl_cpu::create);
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_BITSHIFTIMPL_H_ */
--- a/include/aidge/backend/cpu/operator/BitShiftImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/BitShiftImpl_kernels.hpp
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_BITSHIFTIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_BITSHIFTIMPL_KERNELS_H_
+
+#include <cstddef>      // std::size_t
+#include <cstdint>      // std::int32_t, std::int64_t
+#include <functional>   // std::multiplies
+#include <numeric>      // std::accumulate
+#include <type_traits>  // std::make_unsigned
+#include <vector>
+
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/operator/BitShift.hpp"
+
+#include "aidge/backend/cpu/data/Broadcasting.hpp"
+#include "aidge/backend/cpu/operator/BitShiftImpl.hpp"
+
+
+
+namespace Aidge {
+/**
+ * @brief Element-wise bit shift of the first input by the second input.
+ *
+ * For every output element, the matching element of each input is located with
+ * getMultiDimIndices() / getFlattenedIndex(), so an input whose dimensions
+ * differ from the output (broadcasting) is supported.
+ *
+ * @tparam I1 Element type of the first input (value to shift).
+ * @tparam I2 Element type of the second input (shift amount).
+ * @tparam O  Element type of the output.
+ * @param direction  Shift direction; `right` selects `>>`, anything else `<<`.
+ * @param input1Dims Dimensions used to index the first input.
+ * @param input2Dims Dimensions used to index the second input.
+ * @param outputDims Dimensions of the output; their product is the number of
+ *                   elements written.
+ * @param input1_    Buffer of the first input, read as `const I1*`.
+ * @param input2_    Buffer of the second input, read as `const I2*`.
+ * @param output_    Buffer of the output, written as `O*`.
+ *
+ * @note A shift amount that is negative or not smaller than the bit width of
+ *       the promoted first operand is undefined behaviour in C++; callers must
+ *       provide valid shift amounts.
+ */
+template <class I1, class I2, class O>
+void BitShiftImpl_cpu_forward_kernel(
+                                const BitShift_Op::BitShiftDirection direction,
+                                const std::vector<std::size_t>& input1Dims,
+                                const std::vector<std::size_t>& input2Dims,
+                                const std::vector<std::size_t>& outputDims,
+                                const void* input1_,
+                                const void* input2_,
+                                void* output_
+                                ) {
+    // Type of the left operand after integral promotion, and its unsigned
+    // counterpart. Before C++20, `<<` on a negative signed value is undefined
+    // behaviour, so left shifts are performed on the unsigned type and the bit
+    // pattern is converted back afterwards.
+    using Promoted = decltype(+I1());
+    using UPromoted = typename std::make_unsigned<Promoted>::type;
+
+    const I1* input_1 = static_cast<const I1*>(input1_);
+    const I2* input_2 = static_cast<const I2*>(input2_);
+    O* output = static_cast<O*>(output_);
+
+    const std::size_t totalElements = std::accumulate(outputDims.begin(), outputDims.end(), std::size_t(1), std::multiplies<std::size_t>());
+
+    for (std::size_t oIndex = 0; oIndex < totalElements; ++oIndex)
+    {
+        // Map the flat output index to the element of each input
+        std::vector<std::size_t> indexes = getMultiDimIndices(outputDims, oIndex);
+        std::size_t idx1 = getFlattenedIndex(input1Dims, indexes);
+        std::size_t idx2 = getFlattenedIndex(input2Dims, indexes);
+        if(direction == BitShift_Op::BitShiftDirection::right)
+        {
+                output[oIndex] = input_1[idx1] >> input_2[idx2];
+        }
+        else
+        {
+                // Well-defined modular shift; same bit pattern as the signed shift
+                const UPromoted shifted = static_cast<UPromoted>(input_1[idx1]) << input_2[idx2];
+                output[oIndex] = static_cast<O>(static_cast<Promoted>(shifted));
+        }
+    }
+}
+
+// Kernels registration to implementation entry point.
+// One entry per supported data type (Int32, Int64); each provides the forward
+// kernel instantiated with the same type for both inputs and the output, and
+// nullptr in place of a backward kernel.
+REGISTRAR(BitShiftImpl_cpu,
+{DataType::Int32},
+{ProdConso::inPlaceModel,Aidge::BitShiftImpl_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t>,nullptr});
+REGISTRAR(BitShiftImpl_cpu,
+{DataType::Int64},
+{ProdConso::inPlaceModel,Aidge::BitShiftImpl_cpu_forward_kernel<std::int64_t, std::int64_t, std::int64_t>,nullptr});
+
+
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_BITSHIFTIMPL_KERNELS_H_ */
\ No newline at end of file
--- a/include/aidge/backend/cpu/operator/MatMulImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/MatMulImpl_kernels.hpp
@@ -18,19 +18,19 @@ namespace Aidge {

 template <class I, class O>
 void MatMulImpl_cpu_forward_kernel(const std::size_t n, const std::size_t k, const std::size_t m,
-                                    const void* input1_, const void* input2_, void* output_) {
+                                    const void* input1_, const void* input2_, void* __restrict output_) {
    // FIXME: missing MatMul parameters as arguments
    const I* input1 = static_cast<const I*>(input1_);
    const I* input2 = static_cast<const I*>(input2_);
-    O* output = static_cast<O*>(output_);
+    O* __restrict output = static_cast<O* __restrict>(output_);
+
+    std::memset(output, O(0), n * m * sizeof(O));

    for (std::size_t i = 0; i < n; ++i) {
-        for (std::size_t j = 0; j < m; ++j) {
-            O sum = O(0);
-            for (std::size_t l = 0; l < k; ++l) {
-                sum += static_cast<O>(input1[i*k + l] * input2[l*m + j]);
+        for (std::size_t l = 0; l < k; ++l) {
+            for (std::size_t j = 0; j < m; ++j) {
+                output[i*m + j] += static_cast<O>(input1[i*k + l] * input2[l*m + j]);
            }
-            output[i*m + j] = sum;
        }
    }
 }

--- a/include/aidge/backend/cpu/operator/PowImpl.hpp
+++ b/include/aidge/backend/cpu/operator/PowImpl.hpp
@@ -24,7 +24,8 @@ namespace Aidge {
 // Operator implementation entry point for the backend
 using PowImpl_cpu = OperatorImpl_cpu<Pow_Op,
    void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*,void*),
-    void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*, void*)>;
+    void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*, const void*, void*, void*)>;
+

 // Implementation entry point registration to Operator
 REGISTRAR(Pow_Op, "cpu", Aidge::PowImpl_cpu::create);

--- a/include/aidge/backend/cpu/operator/PowImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/PowImpl_kernels.hpp
@@ -31,14 +31,10 @@ void PowImpl_cpu_forward_kernel(const std::vector<std::size_t>& input1Dims,
    const I2* input_2 = static_cast<const I2*>(input2_);
    O* output = static_cast<O*>(output_);

-    size_t totalElements = 1;
-    for (size_t dimSize : outputDims) {
-        totalElements *= dimSize;
-    }
-
+    std::size_t totalElements = std::accumulate(outputDims.cbegin(), outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
 	for (std::size_t oIndex = 0; oIndex < totalElements; ++oIndex) 
 	{
-		std::vector<size_t> indexes = getMultiDimIndices(outputDims, oIndex);
+		std::vector<std::size_t> indexes = getMultiDimIndices(outputDims, oIndex);

 		std::size_t idx1 = getFlattenedIndex(input1Dims, indexes);
 		std::size_t idx2 = getFlattenedIndex(input2Dims, indexes);
@@ -47,16 +43,53 @@ void PowImpl_cpu_forward_kernel(const std::vector<std::size_t>& input1Dims,
 	}
 }

+/**
+ * @brief Backward kernel of Pow: gradients of `input0 ^ input1` w.r.t. both inputs.
+ *
+ * Both input gradients are first reset to zero, then every output-gradient
+ * element is accumulated into the input elements it maps to. Several output
+ * elements may map to the same input element, hence the `+=`.
+ *
+ * @tparam I1 Element type of input 0 (base) and of its gradient.
+ * @tparam I2 Element type of input 1 (exponent) and of its gradient.
+ * @tparam O  Element type of the output gradient.
+ * @param input0Dims      Dimensions used to index input 0 and its gradient.
+ * @param input1Dims      Dimensions used to index input 1 and its gradient.
+ * @param outputDims      Dimensions of the output gradient.
+ * @param input0_         Buffer of input 0, read as `const I1*`.
+ * @param input1_         Buffer of input 1, read as `const I2*`.
+ * @param gradOutput_     Buffer of the output gradient, read as `const O*`.
+ * @param gradientInput0_ Buffer of the gradient of input 0, written as `I1*`.
+ * @param gradientInput1_ Buffer of the gradient of input 1, written as `I2*`.
+ *
+ * NOTE(review): for integer element types each accumulated term is computed in
+ * floating point and converted back on `+=`, and `std::log` is evaluated on the
+ * raw base value (including zero or negative bases) - confirm this is intended.
+ */
+template <class I1, class I2, class O>
+void PowImpl_cpu_backward_kernel(const std::vector<std::size_t>& input0Dims,
+                                const std::vector<std::size_t>& input1Dims,
+                                const std::vector<std::size_t>& outputDims,
+                                const void* input0_,
+                                const void* input1_,
+                                const void* gradOutput_,
+                                void* gradientInput0_,
+                                void* gradientInput1_) {
+    // Number of elements described by a list of dimensions
+    const auto nbElements = [](const std::vector<std::size_t>& dims) {
+        return std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+    };
+
+    const I1* base = static_cast<const I1*>(input0_);
+    const I2* exponent = static_cast<const I2*>(input1_);
+    const O* upstream = static_cast<const O*>(gradOutput_);
+    I1* baseGrad = static_cast<I1*>(gradientInput0_);
+    I2* exponentGrad = static_cast<I2*>(gradientInput1_);
+
+    // Reset both input gradients before accumulating
+    std::fill(baseGrad, baseGrad + nbElements(input0Dims), I1(0));
+    std::fill(exponentGrad, exponentGrad + nbElements(input1Dims), I2(0));
+
+    const std::size_t outputSize = nbElements(outputDims);
+    for (std::size_t outIdx = 0; outIdx < outputSize; ++outIdx) {
+        // Locate the element of each input matching this output element
+        // (the two indices differ from outIdx when an input is broadcast)
+        std::vector<std::size_t> coords = getMultiDimIndices(outputDims, outIdx);
+        const std::size_t baseIdx = getFlattenedIndex(input0Dims, coords);
+        const std::size_t expIdx = getFlattenedIndex(input1Dims, coords);
+
+        // d(out)/d(input0) = input1 * input0^(input1 - 1)
+        baseGrad[baseIdx] += upstream[outIdx] * exponent[expIdx] * std::pow(base[baseIdx], exponent[expIdx] - 1);
+
+        // d(out)/d(input1) = input0^input1 * ln(input0)
+        exponentGrad[expIdx] += upstream[outIdx] * std::pow(base[baseIdx], exponent[expIdx]) * std::log(base[baseIdx]);
+    }
+}
+
 // Kernels registration to implementation entry point
 REGISTRAR(PowImpl_cpu,
    {DataType::Float32},
-    {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<float, float, float>, nullptr});
+    {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<float, float, float>, Aidge::PowImpl_cpu_backward_kernel<float, float, float>});
 REGISTRAR(PowImpl_cpu,
    {DataType::Float64},
-    {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<double, double, double>, nullptr});
+    {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<double, double, double>, Aidge::PowImpl_cpu_backward_kernel<double, double, double>});
 REGISTRAR(PowImpl_cpu,
    {DataType::Int32},
-    {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<int32_t, int32_t, int32_t>, nullptr});
+    {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<int32_t, int32_t, int32_t>, Aidge::PowImpl_cpu_backward_kernel<int32_t, int32_t, int32_t>});
 }  // namespace Aidge

 #endif /* AIDGE_CPU_OPERATOR_POWIMPL_KERNELS_H_ */
--- a/project_name.txt
+++ b/project_name.txt
-aidge_backend_cpu
\ No newline at end of file
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,8 +17,7 @@ dynamic = ["version"] # defined in tool.setuptools_scm
 requires = [
    "setuptools>=64",
    "setuptools_scm[toml]==7.1.0",
-    "cmake>=3.15.3.post1",
-    "toml"
+    "cmake>=3.18.4.post1"
 ]
 build-backend = "setuptools.build_meta"


--- a/requirements.txt
+++ b/requirements.txt
-numpy
--- a/setup.py
+++ b/setup.py
@@ -8,17 +8,13 @@ import multiprocessing

 from math import ceil

-import toml
-
 from setuptools import setup, Extension
 from setuptools.command.build_ext import build_ext


-def get_project_name() -> str:
-    with open(pathlib.Path().absolute() / "pyproject.toml", "r") as file:
-        project_toml = toml.load(file)
-        return project_toml["project"]["name"]
+PROJECT_NAME = "aidge_backend_cpu"

+SETUP_DIR = pathlib.Path(__file__).parent

 class AidgeBuildExtension(Extension):
    def __init__(self, name):
@@ -26,6 +22,15 @@ class AidgeBuildExtension(Extension):


 class AidgePkgBuild(build_ext):
+    def __init__(self, dist, *args, **kwargs):
+        """Initialise the command and make sure ``editable_mode`` is defined.
+
+        When the parent class does not provide ``editable_mode`` (old versions
+        of setuptools), it is derived from the distribution: True only if
+        ``dist`` has a ``commands`` attribute that contains "develop".
+        """
+        super().__init__(dist, *args, **kwargs)
+        if hasattr(self, "editable_mode"):
+            return
+        # A distribution without a `commands` attribute counts as not editable
+        self.editable_mode = "develop" in getattr(dist, "commands", ())
+
    def run(self):
        ####################################
        # BUILD PACKAGE
@@ -43,36 +48,35 @@ class AidgePkgBuild(build_ext):
        if not build_lib.exists():
            build_lib.mkdir(parents=True, exist_ok=True)

-        os.chdir(str(build_temp))
+        package_prefix = build_lib if not self.editable_mode else SETUP_DIR
+        pybind_install_prefix = (package_prefix / PROJECT_NAME).absolute()

-        compile_type = (
-            "Release"
-            if "AIDGE_PYTHON_BUILD_TYPE" not in os.environ
-            else os.environ["AIDGE_PYTHON_BUILD_TYPE"]
-        )
+        os.chdir(str(build_temp))

+        compile_type = os.environ.get("AIDGE_PYTHON_BUILD_TYPE", "Release")
        install_path = (
            os.path.join(sys.prefix, "lib", "libAidge")
            if "AIDGE_INSTALL" not in os.environ
            else os.environ["AIDGE_INSTALL"]
        )
-
-        # using ninja as default build system to build faster and with the same compiler as on windows
-        build_gen = (
-            ["-G", os.environ["AIDGE_BUILD_GEN"]]
-            if "AIDGE_BUILD_GEN" in os.environ
+        build_gen = os.environ.get("AIDGE_BUILD_GEN", "")
+        build_gen_opts = (
+            ["-G", build_gen]
+            if build_gen
            else []
        )
+        test_onoff = os.environ.get("AIDGE_BUILD_TEST", "OFF")
        
        self.spawn(
            [
                "cmake",
-                *build_gen,
+                *build_gen_opts,
                str(cwd),
-                "-DTEST=OFF",
+                f"-DTEST={test_onoff}",
                f"-DCMAKE_INSTALL_PREFIX:PATH={install_path}",
                f"-DCMAKE_BUILD_TYPE={compile_type}",
                "-DPYBIND=ON",
+                f"-DPYBIND_INSTALL_PREFIX:PATH={pybind_install_prefix}",
                "-DCMAKE_EXPORT_COMPILE_COMMANDS=ON",
                "-DCOVERAGE=OFF",
            ]
@@ -85,25 +89,11 @@ class AidgePkgBuild(build_ext):
            self.spawn(["cmake", "--install", ".", "--config", compile_type])
        os.chdir(str(cwd))

-        aidge_package = build_lib / (get_project_name())
-
-        # Get "aidge core" package
-        # ext_lib = build_temp
-        print(build_temp.absolute())
-        # Copy all shared object files from build_temp/lib to aidge_package
-        for root, _, files in os.walk(build_temp.absolute()):
-            for file in files:
-                if (file.endswith(".so") or file.endswith(".pyd")) and (
-                    root != str(aidge_package.absolute())
-                ):
-                    currentFile = os.path.join(root, file)
-                    shutil.copy(currentFile, str(aidge_package.absolute()))
-

 if __name__ == "__main__":
    setup(
        include_package_data=True,
-        ext_modules=[AidgeBuildExtension(get_project_name())],
+        ext_modules=[AidgeBuildExtension(PROJECT_NAME)],
        cmdclass={
            "build_ext": AidgePkgBuild,
        },

--- a/src/operator/BitShiftImpl.cpp
+++ b/src/operator/BitShiftImpl.cpp
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <cassert>
+#include <chrono>  // std::chrono::milliseconds
+#include <numeric>
+#include <thread>  // std::this_thread::sleep_for
+#include <vector>
+
+
+#include "aidge/utils/Types.h"
+#include "aidge/backend/cpu/data/Broadcasting.hpp"
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+
+#include "aidge/backend/cpu/operator/BitShiftImpl.hpp"
+#include "aidge/backend/cpu/operator/BitShiftImpl_kernels.hpp"
+
+// Forward pass of BitShift on the cpu backend: selects the registered kernel,
+// computes the dimensions used to index each input against the output, and
+// runs the kernel on the raw CPU buffers.
+template<>
+void Aidge::BitShiftImpl_cpu::forward() {
+    const auto& bitShiftOp = dynamic_cast<const BitShift_Op&>(mOp);
+
+    // Find the correct kernel type
+    const auto impl = Registrar<BitShiftImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+
+    // Dimensions of the output, and of each input once aligned with the output
+    const auto& outputDims = std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims();
+    const std::vector<std::size_t> inputDims0 =
+        getBroadcastedDims(outputDims, std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims());
+    const std::vector<std::size_t> inputDims1 =
+        getBroadcastedDims(outputDims, std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dims());
+
+    const BitShift_Op::BitShiftDirection direction = bitShiftOp.direction();
+
+    // Call kernel
+    impl.forward(direction,
+                 inputDims0,
+                 inputDims1,
+                 outputDims,
+                 getCPUPtr(mOp.getRawInput(0)),
+                 getCPUPtr(mOp.getRawInput(1)),
+                 getCPUPtr(mOp.getRawOutput(0)));
+}
+
+// Backward pass of BitShift on the cpu backend: not implemented. Any call
+// reports a std::runtime_error through AIDGE_THROW_OR_ABORT.
+template <>
+void Aidge::BitShiftImpl_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for BitShift_Op on backend cpu");
+}
\ No newline at end of file
--- a/src/operator/PowImpl.cpp
+++ b/src/operator/PowImpl.cpp
@@ -44,21 +44,29 @@ void Aidge::PowImpl_cpu::forward() {

 template <>
 void Aidge::PowImpl_cpu::backward() {
-    // Find the correct kernel type
    const Pow_Op& op_ = dynamic_cast<const Pow_Op&>(mOp);
-    const std::vector<std::size_t> input0gradDims = getBroadcastedDims(op_.getInput(0)->grad()->dims(),
-                                                                   op_.getOutput(0)->grad()->dims());
-    const std::vector<std::size_t> input1gradDims = getBroadcastedDims(op_.getInput(1)->grad()->dims(),
-                                                                   op_.getOutput(0)->grad()->dims());
+
+    // Tensors handed to the kernel: both inputs, their gradients (written by
+    // the kernel) and the gradient of the output (read by the kernel).
+    auto in0 = op_.getInput(0);
+    auto in1 = op_.getInput(1);
+    auto in0grad = in0->grad();
+    auto in1grad = in1->grad();
+    auto out0grad = op_.getOutput(0)->grad();
+
+    // Dimensions of each input gradient once aligned with the output gradient,
+    // taken from the same tensors that are passed to the kernel below.
+    const std::vector<std::size_t> input0gradDims = getBroadcastedDims(out0grad->dims(), in0grad->dims());
+    const std::vector<std::size_t> input1gradDims = getBroadcastedDims(out0grad->dims(), in1grad->dims());

    // Find the correct kernel type
    const auto impl = Registrar<PowImpl_cpu>::create(getBestMatch(getRequiredSpec()));

    // Call kernel
-    impl.backward(op_.getOutput(0)->grad()->dims(),
-               input0gradDims,
-               input1gradDims,
-               getCPUPtr(mOp.getRawOutput(0)),
-               getCPUPtr(mOp.getRawInput(0)),
-               getCPUPtr(mOp.getRawInput(1)));
+    impl.backward(input0gradDims,
+                input1gradDims,
+                out0grad->dims(),
+                getCPUPtr(in0),
+                getCPUPtr(in1),
+                getCPUPtr(out0grad),
+                getCPUPtr(in0grad),
+                getCPUPtr(in1grad));
 }
\ No newline at end of file
--- a/unit_tests/CMakeLists.txt
+++ b/unit_tests/CMakeLists.txt
@@ -12,7 +12,7 @@ file(GLOB_RECURSE src_files "*.cpp")

 add_executable(tests${module_name} ${src_files})

-target_link_libraries(tests${module_name} PUBLIC ${module_name})
+target_link_libraries(tests${module_name} PRIVATE ${module_name})

 target_link_libraries(tests${module_name} PRIVATE Catch2::Catch2WithMain)


--- a/unit_tests/operator/Test_BitShift.cpp
+++ b/unit_tests/operator/Test_BitShift.cpp
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <catch2/catch_test_macros.hpp>
+#include <cstddef>   // std::size_t
+#include <cstdint>   // std::uint16_t
+#include <chrono>
+#include <iostream>
+#include <memory>
+#include <numeric>   
+#include <random>    // std::random_device, std::mt19937, std::uniform_real_distribution
+#include <iomanip>
+#include "aidge/data/Tensor.hpp"
+#include "aidge/operator/BitShift.hpp"
+#include "aidge/utils/TensorUtils.hpp"
+
+namespace Aidge {
+
+TEST_CASE("[cpu/operator] BitShift_TEST", "[BitShift][CPU]") {
+    constexpr std::uint16_t NBTRIALS = 15;
+    // Create a random number generator
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_int_distribution<int> valueDist(-15, 15); 
+    std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(5));
+    std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(3));
+    std::uniform_int_distribution<int> boolDist(0,1);
+
+    BitShift_Op::BitShiftDirection direction = BitShift_Op::BitShiftDirection::left;
+
+    if(valueDist(gen) % 2 == 0)
+    {
+        direction = BitShift_Op::BitShiftDirection::right;
+    }
+
+    // Create BitShift Operator
+    std::shared_ptr<Node> myBitShift = BitShift(direction);
+    auto op = std::static_pointer_cast<OperatorTensor>(myBitShift-> getOperator());
+    op->setDataType(DataType::Int32);
+    op->setBackend("cpu");
+
+    // Create 2 input Tensors
+    std::shared_ptr<Tensor> T0 = std::make_shared<Tensor>();
+    op->associateInput(0,T0);
+    T0->setDataType(DataType::Int32);
+    T0->setBackend("cpu");
+    std::shared_ptr<Tensor> T1 = std::make_shared<Tensor>();
+    op -> associateInput(1,T1);
+    T1->setDataType(DataType::Int32);
+    T1->setBackend("cpu");
+
+    // Create results Tensor
+    std::shared_ptr<Tensor> Tres = std::make_shared<Tensor>();
+    Tres->setDataType(DataType::Int32);
+    Tres->setBackend("cpu");
+
+    // To measure execution time of 'BitShift_Op::forward()' member function call
+    std::chrono::time_point<std::chrono::system_clock> start;
+
+    std::chrono::time_point<std::chrono::system_clock> end;
+    std::chrono::duration<double, std::micro> duration{};
+
+    SECTION("BitShiftImpl_cpu::forward()") {
+        SECTION("Test Forward Kernel with same dimensions") {
+            std::size_t number_of_operation = 0;
+
+            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+                // generate 2 random Tensors
+                const std::size_t nbDims = nbDimsDist(gen);
+                std::vector<std::size_t> dims;
+                for (std::size_t i = 0; i < nbDims; ++i) {
+                    dims.push_back(dimSizeDist(gen));
+                }
+                const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+                number_of_operation += nb_elements;
+
+                // without broadcasting
+                int* array0 = new int[nb_elements];
+                int* array1 = new int[nb_elements];
+                int* result = new int[nb_elements];
+
+                for (std::size_t i = 0; i < nb_elements; ++i) {
+                    array0[i] = valueDist(gen);
+                    array1[i] = std::abs(valueDist(gen)); // bitshift is impossible with negative value
+                    if(direction == BitShift_Op::BitShiftDirection::left)
+                    {
+                        result[i] = array0[i] << array1[i];
+                    }
+                    else
+                    {
+                        result[i] = array0[i] >> array1[i];
+                    }
+                }
+
+                // input0
+                T0->resize(dims);
+                T0 -> getImpl() -> setRawPtr(array0, nb_elements);
+
+                // input1
+                T1->resize(dims);
+                T1 -> getImpl() -> setRawPtr(array1, nb_elements);
+
+                // results
+                Tres->resize(dims);
+                Tres -> getImpl() -> setRawPtr(result, nb_elements);
+
+                op->forwardDims();
+                start = std::chrono::system_clock::now();
+                myBitShift->forward();
+                end = std::chrono::system_clock::now();
+                duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+
+                bool is_eq = approxEq<int>(*(op->getOutput(0)), *Tres);
+
+                auto Output = *(op->getOutput(0));
+                auto prt = Output.getImpl()->rawPtr();
+
+                REQUIRE(is_eq);
+
+                delete[] array0;
+                delete[] array1;
+                delete[] result;
+
+
+            }
+            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
+            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+        }
+        SECTION("Test BitShift kernels with Broadcasting") {
+            // Checks the BitShift forward kernel against a reference result computed
+            // below with explicit broadcasting, over NBTRIALS random pairs of 4D shapes.
+            std::size_t number_of_operation = 0;
+
+            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+                // generate 2 random Tensors
+                // handle dimensions, replace some dimensions with '1' to get broadcasting
+                constexpr std::size_t nbDims = 4;
+                std::vector<std::size_t> dims;
+                for (std::size_t i = 0; i < nbDims; ++i) {
+                    dims.push_back(dimSizeDist(gen));
+                }
+                std::vector<std::size_t> dims0 = dims;
+                std::vector<std::size_t> dims1 = dims;
+                std::vector<std::size_t> dimsOut = dims;
+                for (std::size_t i = 0; i < nbDims; ++i) {
+                    if (boolDist(gen)) {
+                        dims0[i] = 1;
+                    }
+                    if (boolDist(gen)) {
+                        dims1[i] = 1;
+                    }
+                    // output keeps the non-1 extent when exactly one input was collapsed
+                    dimsOut[i] = (dims0[i] == 1) ? dims1[i] : dims0[i];
+                }
+
+                // element counts, computed once and reused for allocation, filling and setRawPtr()
+                const std::size_t nbElts0 = dims0[0]*dims0[1]*dims0[2]*dims0[3];
+                const std::size_t nbElts1 = dims1[0]*dims1[1]*dims1[2]*dims1[3];
+                const std::size_t nbEltsOut = dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3];
+
+                // create arrays and fill them with random values
+                // std::vector owns the buffers so they are released on every exit path,
+                // including an early exit caused by a failing REQUIRE below.
+                std::vector<int> array0(nbElts0);
+                std::vector<int> array1(nbElts1);
+                std::vector<int> result(nbEltsOut);
+
+                for (std::size_t i = 0; i < nbElts0; ++i) {
+                    array0[i] = valueDist(gen);
+                }
+                for (std::size_t i = 0; i < nbElts1; ++i) {
+                    // shift amounts must not be negative
+                    array1[i] = std::abs(valueDist(gen));
+                }
+                // NOTE(review): valueDist is defined outside this section - confirm that the
+                // shift amounts stay below the bit width of int and that the operands are
+                // valid for '<<' / '>>', otherwise the reference values below are not well defined.
+
+                //True result with broadcast
+                const std::size_t strides0[nbDims] = {dims0[1]*dims0[2]*dims0[3], dims0[2]*dims0[3], dims0[3], 1};
+                const std::size_t strides1[nbDims] = {dims1[1]*dims1[2]*dims1[3], dims1[2]*dims1[3], dims1[3], 1};
+                for (std::size_t a = 0; a < dimsOut[0]; ++a) {
+                    for (std::size_t b = 0; b < dimsOut[1]; ++b) {
+                        // a dimension of extent 1 always reads index 0 (broadcast)
+                        const std::size_t idx0_0 = strides0[0] * ((dims0[0] > 1) ? a : 0)
+                                                    + strides0[1] * ((dims0[1] > 1) ? b : 0);
+                        const std::size_t idx1_0 = strides1[0] * ((dims1[0] > 1) ? a : 0)
+                                                    + strides1[1] * ((dims1[1] > 1) ? b : 0);
+                        for (std::size_t c = 0; c < dimsOut[2]; ++c) {
+                            const std::size_t idx_out = dimsOut[3] * (c + dimsOut[2] * (b + dimsOut[1] * a));
+                            for (std::size_t d = 0; d < dimsOut[3]; ++d) {
+                                const std::size_t idx0 = idx0_0
+                                                    + strides0[2] * ((dims0[2] > 1) ? c : 0)
+                                                    + ((dims0[3] > 1) ? d : 0);
+                                const std::size_t idx1 = idx1_0
+                                                    + strides1[2] * ((dims1[2] > 1) ? c : 0)
+                                                    + ((dims1[3] > 1) ? d : 0);
+                                if(direction == BitShift_Op::BitShiftDirection::left)
+                                {
+                                    result[idx_out + d] = array0[idx0] << array1[idx1];
+                                }
+                                else
+                                {
+                                    result[idx_out + d] = array0[idx0] >> array1[idx1];
+                                }
+                            }
+                        }
+                    }
+                }
+
+                // conversion to Aidge::Tensors
+                // the tensors only receive raw pointers into the vectors above
+                // input0
+                T0->resize(dims0);
+                T0 -> getImpl() -> setRawPtr(array0.data(), nbElts0);
+
+                // input1
+                T1->resize(dims1);
+                T1 -> getImpl() -> setRawPtr(array1.data(), nbElts1);
+
+                // results
+                Tres->resize(dimsOut);
+                Tres -> getImpl() -> setRawPtr(result.data(), nbEltsOut);
+
+                // compute result (only forward() is timed)
+                op->forwardDims();
+                start = std::chrono::system_clock::now();
+                myBitShift->forward();
+                end = std::chrono::system_clock::now();
+                duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+
+                // comparison between truth and computed result
+                const bool equiv = approxEq<int>(*(op->getOutput(0)), *Tres);
+                if(!equiv)
+                {
+                    std::cout << "Problem\n";
+                }
+                REQUIRE(equiv);
+
+                number_of_operation += nbEltsOut;
+            }
+            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
+            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+        }
+
+}
+} // namespace Aidge
+}
\ No newline at end of file
--- a/unit_tests/operator/Test_PowImpl.cpp
+++ b/unit_tests/operator/Test_PowImpl.cpp
@@ -313,5 +313,171 @@ TEST_CASE("[cpu/operator] Pow", "[Pow][CPU]") {
            std::cout << "total time: " << duration.count() << "μs" << std::endl;
        }
    }
+
+
+    SECTION("PowImpl_cpu::backward()") {
+        SECTION("3D Tensors") {
+            // Backward pass of Pow on two 2x2x2 operands of identical shape (no broadcasting).
+            // Element-wise: out = base ^ exponent.
+            const auto base = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>({{
+                {{2.0, 3.0}, {4.0, 5.0}},
+                {{6.0, 7.0}, {8.0, 9.0}}
+            }}));
+            const auto exponent = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>({{
+                {{1.0, 2.0}, {3.0, 2.0}},
+                {{2.0, 3.0}, {1.0, 0.5}}
+            }}));
+            // gradient injected on the operator output
+            const auto outputGrad = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>({{
+                {{0.5, 1.0}, {1.5, 2.0}},
+                {{2.5, 3.0}, {3.5, 4.0}}
+            }}));
+            // expected gradient w.r.t. base: outputGrad * exponent * base^(exponent - 1)
+            const auto wantBaseGrad = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>({{
+                {{0.50000000, 6.00000000}, {72.00000000, 20.00000000}},
+                {{30.00000000, 441.00000000}, {3.50000000, 0.66666669}}
+            }}));
+            // expected gradient w.r.t. exponent: outputGrad * base^exponent * ln(base)
+            const auto wantExponentGrad = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>({{
+                {{0.693147182, 9.88751030}, {1.33084259e+02, 8.04718933e+01}},
+                {{1.61258362e+02, 2.00234143e+03}, {5.82243652e+01, 2.63666954e+01}}
+            }}));
+
+            // every tensor involved lives on the cpu backend as Float32
+            for (const auto& tensor : {base, exponent, outputGrad, wantBaseGrad, wantExponentGrad}) {
+                tensor->setBackend("cpu");
+                tensor->setDataType(DataType::Float32);
+            }
+
+            // build the Pow node and wire inputs and output gradient onto its operator
+            std::shared_ptr<Node> powNode = Pow();
+            auto powOperator = std::static_pointer_cast<OperatorTensor>(powNode->getOperator());
+            powOperator->setDataType(DataType::Float32);
+            powOperator->setBackend("cpu");
+            powOperator->associateInput(0, base);
+            powOperator->associateInput(1, exponent);
+            powOperator->getOutput(0)->setGrad(outputGrad);
+
+            // forward first, then backward fills the gradients of both inputs
+            powOperator->forward();
+            powNode->backward();
+
+            REQUIRE(approxEq<float>(*(powOperator->getInput(0)->grad()), *wantBaseGrad));
+            REQUIRE(approxEq<float>(*(powOperator->getInput(1)->grad()), *wantExponentGrad));
+        }
+        SECTION("Broadcasting") {
+            // Backward pass of Pow where a 1D exponent (3 values) is broadcast
+            // against a 2x2x3 base. Element-wise: out = base ^ exponent.
+            const auto base = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>({{
+                {{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}},
+                {{1.5, 2.5, 3.5}, {4.5, 5.5, 6.5}}
+            }}));
+            const auto exponent = std::make_shared<Tensor>(Array1D<float, 3>({{0.1, 0.2, 0.3}}));
+
+            // gradient injected on the operator output
+            const auto outputGrad = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>({{
+                {{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}},
+                {{6.0, 5.0, 4.0}, {3.0, 2.0, 1.0}}
+            }}));
+            // expected gradient w.r.t. base: outputGrad * exponent * base^(exponent - 1)
+            const auto wantBaseGrad = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>({{
+                {{0.10000000, 0.22973967, 0.41711676}, {0.11486985, 0.27594593, 0.51353097}},
+                {{0.41655189, 0.48044977, 0.49926791}, {0.07748720, 0.10227509, 0.08092485}}
+            }}));
+            // expected gradient w.r.t. exponent: outputGrad * base^exponent * ln(base),
+            // summed over the two broadcast dimensions so it keeps the exponent's shape
+            const auto wantExponentGrad = std::make_shared<Tensor>(Array1D<float, 3>(
+                {{14.14779854, 22.99299049, 33.56402588}}
+            ));
+
+            // every tensor involved lives on the cpu backend as Float32
+            for (const auto& tensor : {base, exponent, outputGrad, wantBaseGrad, wantExponentGrad}) {
+                tensor->setBackend("cpu");
+                tensor->setDataType(DataType::Float32);
+            }
+
+            // build the Pow node and wire inputs and output gradient onto its operator
+            std::shared_ptr<Node> powNode = Pow();
+            auto powOperator = std::static_pointer_cast<OperatorTensor>(powNode->getOperator());
+            powOperator->setDataType(DataType::Float32);
+            powOperator->setBackend("cpu");
+            powOperator->associateInput(0, base);
+            powOperator->associateInput(1, exponent);
+            powOperator->getOutput(0)->setGrad(outputGrad);
+
+            // forward first, then backward fills the gradients of both inputs
+            powNode->forward();
+            powNode->backward();
+
+            REQUIRE(approxEq<float>(*(powOperator->getInput(0)->grad()), *wantBaseGrad));
+            REQUIRE(approxEq<float>(*(powOperator->getInput(1)->grad()), *wantExponentGrad));
+        }
+    }
 }
 } // namespace Aidge
--- a/version.txt
+++ b/version.txt
-0.2.3
+0.3.0
No results found