diff --git a/CMakeLists.txt b/CMakeLists.txt index eef0e63bf398cffb2c15b3af56ec0bf02d6590a9..6f5a8cbfcf7076256beef5e636d8c594b9df27e7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,6 +24,7 @@ add_definitions(-DGIT_COMMIT_HASH="${GIT_COMMIT_HASH}") # Note : project name is ${CMAKE_PROJECT_NAME} and python module name is also ${CMAKE_PROJECT_NAME} set(module_name _${CMAKE_PROJECT_NAME}) # target name +set(pybind_module_name ${CMAKE_PROJECT_NAME}) # name of submodule for python bindings ############################################## # Define options @@ -69,9 +70,12 @@ set_property(TARGET ${module_name} PROPERTY POSITION_INDEPENDENT_CODE ON) # PYTHON BINDING if (PYBIND) - # Handles Python + pybind11 headers dependencies + # Python binding lib is by default installed in <prefix>/python_packages/<package>/ + # When installed from python, setup.py should set it to the python package dir + set(PYBIND_INSTALL_PREFIX python_packages/${pybind_module_name} CACHE PATH "Python package install prefix") + include(PybindModuleCreation) - generate_python_binding(${CMAKE_PROJECT_NAME} ${module_name}) + generate_python_binding(${pybind_module_name} ${module_name}) target_link_libraries(${module_name} PUBLIC @@ -128,6 +132,12 @@ install(TARGETS ${module_name} EXPORT ${CMAKE_PROJECT_NAME}-targets ) install(DIRECTORY include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) +if (PYBIND) + install(TARGETS ${pybind_module_name} + DESTINATION ${PYBIND_INSTALL_PREFIX} + ) +endif() + #Export the targets to a script install(EXPORT ${CMAKE_PROJECT_NAME}-targets FILE "${CMAKE_PROJECT_NAME}-targets.cmake" diff --git a/README.md b/README.md index 2e309083fb677f82b4adb1e9d5a3b7923bb32c47..96283603759f03415b7dc1b99f3905550427f633 100644 --- a/README.md +++ b/README.md @@ -23,33 +23,35 @@ Those operators can be used on any machine with an Linux OS. pip install . -v ``` > **TIPS :** Use environment variables to change compilation options : -> - `AIDGE_INSTALL` : to set the installation folder. 
Defaults to /usr/local/lib. :warning: This path must be identical to aidge_core install path. -> - `AIDGE_PYTHON_BUILD_TYPE` : to set the compilation mode to **Debug** or **Release** -> - `AIDGE_BUILD_GEN` : to set the build backend with +> - `AIDGE_INSTALL` : to set the installation folder. Defaults to `<python_prefix>/lib/libAidge`. :warning: This path must be identical to aidge_core install path. +> - `AIDGE_PYTHON_BUILD_TYPE` : to set the compilation mode to **Debug** or **Release** or "" (for default flags). Defaults to **Release**. +> - `AIDGE_BUILD_GEN` : to set the build backend (for development mode) or "" for the cmake default. Defaults to "". -### Standard C++ Compilation +## Pip installation for development -You will need to compile first the Core library before compiling the CPU one. -The makefile is designed to do it for you. +To set up with pip in development (or editable) mode, use the `--no-build-isolation -e` options to pip. -To only compile the CPU library, run -``` -make cpu_only +For instance, run the following commands in your Python environment for a typical setup : +``` bash +export AIDGE_PYTHON_BUILD_TYPE= # default flags (no debug info but fastest build time) +export AIDGE_PYTHON_BUILD_TYPE=Debug # or if one really needs to debug the C++ code +pip install -U pip setuptools setuptools_scm[toml] cmake # Pre-install build requirements (refer to the pyproject.toml [build-system] section) +pip install -v --no-build-isolation -e . ``` -To compile the CPU library + the associated unitary tests, run -``` -make cpu_tests -``` +Refer to `aidge_core/README.md` for more details on development build options. -To compile the CPU library with the python binding, run -``` -make cpu_with_pybind -``` -Important: this command can also be run with `make`. +### Standard C++ Compilation +You will need to compile and install the [Core Library](https://gitlab.eclipse.org/eclipse/aidge/aidge_core) before compiling the CPU one. 
-To compile the CPU library with the python binding + the associated unitary tests, run -``` -make cpu_with_pybind_tests +Once this has been done, you'll need to run CMake with the +`CMAKE_INSTALL_PREFIX:PATH` flag, in order to indicate to CMake where +`aidge_core` has been installed : +```sh +cmake -DCMAKE_INSTALL_PREFIX:PATH=$(path_to_install_folder) $(CMAKE PARAMETERS) $(project_root) + +make all ``` + +More detailed information is available in the [Aidge User Guide](https://eclipse.dev/aidge/source/GetStarted/install.html) diff --git a/include/aidge/backend/cpu.hpp b/include/aidge/backend/cpu.hpp index 2e758de85b612e1159c211dad1b1cf2047037bca..b45aa1cb4151d8d6c5268d4a94da97bb25a89a40 100644 --- a/include/aidge/backend/cpu.hpp +++ b/include/aidge/backend/cpu.hpp @@ -22,6 +22,7 @@ #include "aidge/backend/cpu/operator/BitShiftImpl.hpp" #include "aidge/backend/cpu/operator/ConvDepthWiseImpl.hpp" #include "aidge/backend/cpu/operator/ConvImpl.hpp" +#include "aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp" #include "aidge/backend/cpu/operator/DivImpl.hpp" #include "aidge/backend/cpu/operator/ErfImpl.hpp" #include "aidge/backend/cpu/operator/FCImpl.hpp" diff --git a/include/aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp b/include/aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..80efb31d5cc9b99a6678d9010903d99245a3cee6 --- /dev/null +++ b/include/aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp @@ -0,0 +1,52 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_CONSTANTOFSHAPEIMPL_H_
+#define AIDGE_CPU_OPERATOR_CONSTANTOFSHAPEIMPL_H_
+
+#include <cstddef>
+#include <memory>
+#include <vector>
+
+#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/operator/ConstantOfShape.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+namespace Aidge {
+
+// Registry of CPU forward kernels; entries are registered under a std::tuple<DataType>.
+class ConstantOfShapeImplForward_cpu
+    : public Registrable<
+          ConstantOfShapeImplForward_cpu, std::tuple<DataType>,
+          void(const std::vector<DimSize_t>, const Tensor&, void *)> {};
+
+class ConstantOfShapeImpl_cpu : public OperatorImpl { // "cpu" implementation of ConstantOfShape_Op
+public:
+  ConstantOfShapeImpl_cpu(const ConstantOfShape_Op &op)
+      : OperatorImpl(op, "cpu") {}
+
+  static std::unique_ptr<ConstantOfShapeImpl_cpu>
+  create(const ConstantOfShape_Op &op) { // factory: takes the operator, exactly like the constructor
+    return std::make_unique<ConstantOfShapeImpl_cpu>(op);
+  }
+
+  void forward() override;
+};
+
+namespace {
+static Registrar<ConstantOfShape_Op> registrarConstantOfShapeImpl_cpu(
+    "cpu", Aidge::ConstantOfShapeImpl_cpu::create);
+}
+} // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_CONSTANTOFSHAPEIMPL_H_ */
+
diff --git a/include/aidge/backend/cpu/operator/ConstantOfShapeImpl_forward_kernels.hpp b/include/aidge/backend/cpu/operator/ConstantOfShapeImpl_forward_kernels.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..59a3475ec0f8f33cbeca76688a5f2e395d268ad7
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/ConstantOfShapeImpl_forward_kernels.hpp
@@ -0,0 +1,78 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_CONSTANTOFSHAPEIMPL_FORWARD_KERNEL_H_
+#define AIDGE_CPU_OPERATOR_CONSTANTOFSHAPEIMPL_FORWARD_KERNEL_H_
+
+#include <aidge/data/Tensor.hpp>
+#include <aidge/data/half.hpp>
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <functional> // std::multiplies
+#include <numeric>    // std::accumulate
+#include <vector>
+
+#include "aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp"
+#include "aidge/data/Data.hpp"
+#include "aidge/utils/ErrorHandling.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+namespace Aidge {
+/**
+ * @brief Fill the output buffer with a single constant.
+ * @tparam O Element type of the output buffer and of the constant.
+ * @param output_dims Output dimensions; their product is the number of elements written.
+ * @param value Tensor whose first host element is used as the fill constant.
+ * @param output_ Destination buffer, accessed as an array of O.
+ */
+template <class O>
+void ConstantOfShapeimpl_cpu_forward_kernel(
+    const std::vector<DimSize_t> output_dims, const Tensor &value, void *output_) {
+  O *output = static_cast<O *>(output_);
+  const O val = *static_cast<const O *>(value.getImpl()->hostPtr());
+  // Seed with DimSize_t(1): an int seed makes std::accumulate fold the product in int.
+  const std::size_t output_size = std::accumulate(
+      output_dims.begin(), output_dims.end(), DimSize_t(1), std::multiplies<DimSize_t>());
+  std::fill_n(output, output_size, val);
+}
+
+// Then we add the Registrar declaration for different input/output types
+namespace {
+static Registrar<ConstantOfShapeImplForward_cpu>
+    registrarConstantOfShapeImplForward_cpu_Float16(
+        {DataType::Float16},
+        Aidge::ConstantOfShapeimpl_cpu_forward_kernel<half_float::half>);
+static Registrar<ConstantOfShapeImplForward_cpu>
+    registrarConstantOfShapeImplForward_cpu_Float32(
+        {DataType::Float32},
+        Aidge::ConstantOfShapeimpl_cpu_forward_kernel<float>);
+static Registrar<ConstantOfShapeImplForward_cpu>
+    registrarConstantOfShapeImplForward_cpu_Float64(
+        {DataType::Float64},
+        Aidge::ConstantOfShapeimpl_cpu_forward_kernel<double>);
+static Registrar<ConstantOfShapeImplForward_cpu>
+    registrarConstantOfShapeImplForward_cpu_Int16(
+        {DataType::Int16},
+        Aidge::ConstantOfShapeimpl_cpu_forward_kernel<std::int16_t>);
+static Registrar<ConstantOfShapeImplForward_cpu>
+    registrarConstantOfShapeImplForward_cpu_Int32(
+        {DataType::Int32},
+        Aidge::ConstantOfShapeimpl_cpu_forward_kernel<std::int32_t>);
+static Registrar<ConstantOfShapeImplForward_cpu>
+    registrarConstantOfShapeImplForward_cpu_Int64(
+        {DataType::Int64}, Aidge::ConstantOfShapeimpl_cpu_forward_kernel<std::int64_t>);
+} // namespace
+} // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_CONSTANTOFSHAPEIMPL_FORWARD_KERNEL_H_ */
+
diff --git a/include/aidge/backend/cpu/operator/GridSampleImpl.hpp b/include/aidge/backend/cpu/operator/GridSampleImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..a166cb36a601a9a8c7f957b6b65c9b54c47c4e8e
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/GridSampleImpl.hpp
@@ -0,0 +1,65 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_GRIDSAMPLEIMPL_H_
+#define AIDGE_CPU_OPERATOR_GRIDSAMPLEIMPL_H_
+
+#include <array>
+#include <memory>
+#include <tuple>
+#include <vector>
+
+#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/operator/GridSample.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+
+namespace Aidge {
+
+// Forward compute-kernel registries, one for 1D and one for 2D sampling (no backward registry is declared here).
+class GridSampleImpl1DForward_cpu
+    : public Registrable<GridSampleImpl1DForward_cpu,
+                         std::tuple<DataType, DataType>,
+                         void(const GridSample_Op&,
+                              const std::shared_ptr<Tensor>&,
+                              const std::shared_ptr<Tensor>&,
+                              const std::shared_ptr<Tensor>&)> {};
+
+class GridSampleImpl2DForward_cpu
+    : public Registrable<GridSampleImpl2DForward_cpu,
+                         std::tuple<DataType, DataType>,
+                         void(const GridSample_Op&,
+                              const std::shared_ptr<Tensor>&,
+                              const std::shared_ptr<Tensor>&,
+                              const std::shared_ptr<Tensor>&)> {};
+
+class GridSampleImpl_cpu : public OperatorImpl { // "cpu" implementation of GridSample_Op
+ public:
+    GridSampleImpl_cpu(const GridSample_Op& op) : OperatorImpl(op, "cpu") {}
+
+    static std::unique_ptr<GridSampleImpl_cpu> create(const GridSample_Op &op) { // factory handed to the Registrar below
+        return std::make_unique<GridSampleImpl_cpu>(op);
+    }
+
+ public:
+    Elts_t getNbRequiredProtected(const IOIndex_t inputIdx) const override final;
+    void forward() override;
+};
+
+namespace {
+// add cpu backend to GridSample_Op implementation registry
+static Registrar<GridSample_Op> registrarGridSampleImpl_cpu("cpu", Aidge::GridSampleImpl_cpu::create);
+}  // namespace
+
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_GRIDSAMPLEIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/GridSampleImpl_forward_kernels.hpp b/include/aidge/backend/cpu/operator/GridSampleImpl_forward_kernels.hpp
new file mode 100644
index 
0000000000000000000000000000000000000000..87b6634e467c30c2737afea31a28083d78d00588
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/GridSampleImpl_forward_kernels.hpp
@@ -0,0 +1,478 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_GRIDSAMPLEIMPL_FORWARD_KERNEL_H_
+#define AIDGE_CPU_OPERATOR_GRIDSAMPLEIMPL_FORWARD_KERNEL_H_
+
+#include <algorithm>  // std::max, std::min
+#include <cmath>      // std::fabs, std::truncf, std::nearbyintf, std::floor
+#include <cstddef>    // std::size_t
+#include <cstdint>    // std::int64_t
+
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+#include "aidge/backend/cpu/operator/GridSampleImpl.hpp"
+#include "aidge/data/half.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+static inline bool in_bound(float coord, float lower_bound, float upper_bound) noexcept {
+    return (coord >= lower_bound) && (coord < upper_bound); // half-open [lower, upper): index 0 is a valid position
+}
+// Affine map of coord from [-1, 1] to [new_lower_bound, new_upper_bound].
+static inline float unnormalized_coord(float coord, float new_lower_bound, float new_upper_bound) noexcept {
+    return (coord + 1) / 2 * (new_upper_bound - new_lower_bound) + new_lower_bound;
+}
+
+// unused
+// static float normalized_coord(float coord, float prev_lower_bound, float prev_upper_bound) noexcept {
+//     return (coord + prev_lower_bound) / (prev_upper_bound-prev_lower_bound) * 2 - 1;
+// }
+// Map a normalized coordinate to pixel space: [0, size-1] if align_corners, else [-0.5, size-0.5].
+static inline float unnormalize_grid_sample_coord(float coord, std::size_t size, bool align_corners) noexcept {
+    return align_corners ? unnormalized_coord(coord, 0.0f, static_cast<float>(size) - 1.0f)
+                         : unnormalized_coord(coord, -0.5f, static_cast<float>(size) - 0.5f);
+}
+
+// unused
+// static float normalize_grid_sample_coord(float coord, std::size_t size, bool align_corners) noexcept {
+//     return align_corners ? normalized_coord(coord, 0.0f, static_cast<float>(size) - 1.0f)
+//                          : normalized_coord(coord, -0.5f, static_cast<float>(size) - 0.5f);
+// }
+// Fold a normalized coordinate lying outside [-1, 1] back into it (Border: clamp, Reflection: mirror); other modes: unchanged.
+static inline float update_normalized_coord_with_padding(float coord, Aidge::GridSample_Op::PaddingMode padding_mode) {
+    if (!in_bound(coord, -1.0f, 1.0f)) {
+        if (padding_mode == Aidge::GridSample_Op::PaddingMode::Border) {
+            coord = std::min(std::max(-1.0f, coord), 1.0f);
+        }
+        else if (padding_mode == Aidge::GridSample_Op::PaddingMode::Reflection) {
+            float abs_coord = std::fabs(coord);
+            float int_coord = std::truncf(abs_coord);
+            std::int32_t nb_refl = static_cast<std::int32_t>((int_coord - 1) / 2); // index of the mirror used for folding
+            float res = ((nb_refl + 1)*2) - abs_coord;
+            coord = (coord > 0) ? (nb_refl % 2 == 0 ? res : -res)
+                                : (nb_refl % 2 == 0 ? -res : res);
+        }
+    }
+    return coord;
+}
+// Same folding for an integer index against [0, size-1]; other padding modes return coord unchanged.
+static inline std::int64_t update_unnormalized_coord_with_padding(std::int64_t coord, std::int64_t size, Aidge::GridSample_Op::PaddingMode padding_mode) {
+    if (!in_bound(coord, 0, size)) {
+        // out of bound. switch padding mode
+        if (padding_mode == Aidge::GridSample_Op::PaddingMode::Border) {
+            coord = std::min(std::max(std::int64_t(0), coord), size-std::int64_t(1));
+        } else if (padding_mode == Aidge::GridSample_Op::PaddingMode::Reflection) {
+            const std::int64_t span = std::max(size - 1, std::int64_t(1)); // size == 1 would otherwise divide by zero
+            const std::int64_t quotient = coord / span;
+            const std::int64_t remainer = std::abs(coord - quotient*span);
+            coord = (quotient % 2 == 0) ? remainer : size - 1 - remainer;
+        }
+    }
+    return coord;
+}
+
+namespace Aidge {
+/**
+ * @brief Forward kernel for 1D GridSample on CPU backend.
+ * @tparam I Input data type.
+ * @tparam O Output data type.
+ * @param op GridSample operator; its mode(), paddingMode() and alignCorners() are read.
+ * @param in0 Input tensor; dims 0, 1 and 2 are read as N, C and in_H.
+ * @param in1 Grid tensor, read through a float pointer; dim 1 is the number of sampled positions.
+ * @param out Output tensor, addressed with its strides 0 (N), 1 (C) and 2 (H).
+ * @note Neighbours falling outside the input contribute 0.
+ */
+template <class I, class O>
+void GridSampleImpl1D_cpu_forward_kernel(const GridSample_Op& op,
+                            const std::shared_ptr<Tensor>& in0,
+                            const std::shared_ptr<Tensor>& in1,
+                            const std::shared_ptr<Tensor>& out)
+{
+    const I* const input = static_cast<const I * const>(in0->getImpl()->rawPtr());
+    const I* input_ptr = input;
+    float* const grid = static_cast<float* const>(in1->getImpl()->rawPtr()); // NOTE(review): grid is read as float whatever I is - confirm
+    float* grid_ptr = grid;
+    O* const output = static_cast<O* const>(out->getImpl()->rawPtr());
+    O* output_ptr = output;
+
+    const std::size_t N = in0->dim(0);
+    const std::size_t C = in0->dim(1);
+    const std::size_t in_H = in0->dim(2);
+    const std::size_t grid_H = in1->dim(1);
+
+    const std::size_t in_N_s = in0->stride(0);
+    const std::size_t in_C_s = in0->stride(1);
+    const std::size_t in_H_s = in0->stride(2);
+    const std::size_t grid_N_s = in1->stride(0);
+    const std::size_t grid_H_s = in1->stride(1);
+    const std::size_t out_N_s = out->stride(0);
+    const std::size_t out_C_s = out->stride(1);
+    const std::size_t out_H_s = out->stride(2);
+
+    float* grid_ptr_N = grid;
+    const I* input_ptr_N = input;
+    O* output_ptr_N = output;
+    for (std::size_t n = 0; n < N; ++n) {
+        grid_ptr = grid_ptr_N;
+        for (std::size_t grid_x = 0; grid_x < grid_H; ++grid_x) {
+            output_ptr = output_ptr_N + grid_x*out_H_s;
+            /*
+            * change grid_x coord to match padding_mode
+            * Change range from [-1, 1] to [0, H-1] or [-0.5, H-0.5] according to align_corners
+            * Handle computation of interpolation
+            *   any value outside bounds is considered 0
+            *   if nearest:
+            *   else if linear:
+            *   else if cubic:
+            *   else : nothing
+            */
+            float x = *grid_ptr;
+            x = update_normalized_coord_with_padding(x, op.paddingMode());
+            x = unnormalize_grid_sample_coord(x, in_H, op.alignCorners());
+            if (op.mode() == GridSample_Op::Mode::Nearest) {
+                const std::int64_t x_rounded = std::nearbyintf(x);
+
+                if (in_bound(x_rounded, 0, in_H)) {
+                    input_ptr = input_ptr_N + x_rounded*in_H_s;
+                    for (std::size_t c = 0; c < C; ++c) {
+                        *output_ptr = *input_ptr;
+                        input_ptr += in_C_s;
+                        output_ptr += out_C_s;
+                    }
+                } else {
+                    for (std::size_t c = 0; c < C; ++c) {
+                        *output_ptr = O(0);
+                        output_ptr += out_C_s;
+                    }
+                }
+            } else if (op.mode() == GridSample_Op::Mode::Linear) {
+                const std::int64_t x_inf = update_unnormalized_coord_with_padding(static_cast<std::int64_t>(std::floor(x)), in_H, op.paddingMode());
+                const std::int64_t x_sup = update_unnormalized_coord_with_padding(x_inf + 1, in_H, op.paddingMode());
+
+                const I* input_ptr_NC = input_ptr_N;
+                for (std::size_t c = 0; c < C; ++c) {
+                    const I f_inf = in_bound(x_inf, 0, in_H) ?
+                        input_ptr_NC[static_cast<std::size_t>(x_inf)*in_H_s] : I(0);
+                    const I f_sup = in_bound(x_sup, 0, in_H) ?
+                        input_ptr_NC[static_cast<std::size_t>(x_sup)*in_H_s] : I(0);
+
+                    *output_ptr = static_cast<O>(static_cast<I>(x_sup - x)*f_inf // each sample is weighted by the distance to the other one
+                                + static_cast<I>(x - x_inf)*f_sup);
+
+                    input_ptr_NC += in_C_s;
+                    output_ptr += out_C_s;
+                }
+            } else if (op.mode() == GridSample_Op::Mode::Cubic) {
+                const std::int64_t x_inf = update_unnormalized_coord_with_padding(static_cast<std::int64_t>(std::floor(x)), in_H, op.paddingMode());
+                const std::int64_t x_sup = update_unnormalized_coord_with_padding(x_inf + 1, in_H, op.paddingMode());
+                const std::int64_t x_inf_inf = update_unnormalized_coord_with_padding(x_inf - 1, in_H, op.paddingMode());
+                const std::int64_t x_sup_sup = update_unnormalized_coord_with_padding(x_sup + 1, in_H, op.paddingMode());
+
+                const I x1 = static_cast<I>(x - static_cast<float>(x_inf)); // offset of x from its lower neighbour
+                const I x2 = x1 * x1;
+                const I x3 = x1 * x2;
+
+                const I* input_ptr_NC = input_ptr_N;
+                for (std::size_t c = 0; c < C; ++c) {
+                    const I f_inf_inf = in_bound(x_inf_inf, 0, in_H) ? input_ptr_NC[x_inf_inf*in_H_s] : I(0);
+                    const I f_inf = in_bound(x_inf, 0, in_H) ? input_ptr_NC[x_inf*in_H_s] : I(0);
+                    const I f_sup = in_bound(x_sup, 0, in_H) ? input_ptr_NC[x_sup*in_H_s] : I(0);
+                    const I f_sup_sup = in_bound(x_sup_sup, 0, in_H) ? input_ptr_NC[x_sup_sup*in_H_s] : I(0);
+
+                    const I m_inf = (f_sup - f_inf_inf) / I(2); // central-difference slope at x_inf
+                    const I m_sup = (f_sup_sup - f_inf) / I(2); // central-difference slope at x_sup
+
+                    *output_ptr = f_inf
+                        + x1 * m_inf
+                        + x2 * (3 * (f_sup - f_inf) - 2 * m_inf - m_sup)
+                        + x3 * (2*(f_inf - f_sup) + m_inf + m_sup);
+
+                    input_ptr_NC += in_C_s;
+                    output_ptr += out_C_s;
+                }
+            }
+
+            grid_ptr += grid_H_s;
+        }
+
+        input_ptr_N += in_N_s;
+        grid_ptr_N += grid_N_s;
+        output_ptr_N += out_N_s;
+    }
+}
+
+namespace {
+static Registrar<GridSampleImpl1DForward_cpu> registrarGridSampleImpl1DForward_cpu_Float32(
+        {DataType::Float32, DataType::Float32},
+        Aidge::GridSampleImpl1D_cpu_forward_kernel<float, float>);
+static Registrar<GridSampleImpl1DForward_cpu> registrarGridSampleImpl1DForward_cpu_Float16(
+        {DataType::Float16, DataType::Float16},
+        Aidge::GridSampleImpl1D_cpu_forward_kernel<half_float::half, half_float::half>);
+static Registrar<GridSampleImpl1DForward_cpu> registrarGridSampleImpl1DForward_cpu_Int32(
+        {DataType::Int32, DataType::Int32},
+        Aidge::GridSampleImpl1D_cpu_forward_kernel<int, int>);
+static Registrar<GridSampleImpl1DForward_cpu> registrarGridSampleImpl1DForward_cpu_Float64(
+        {DataType::Float64, DataType::Float64},
+        Aidge::GridSampleImpl1D_cpu_forward_kernel<double, double>);
+
+
+/**
+ * @brief Forward kernel for 2D GridSample on CPU backend.
+ * @tparam I Input data type.
+ * @tparam O Output data type.
+ * @param op GridSample operator; its mode(), paddingMode() and alignCorners() are read.
+ * @param in0 Input tensor; dims 0 to 3 are read as N, C, in_H and in_W.
+ * @param in1 Grid tensor; dims 1 and 2 are the sampled height and width, x and y sit one stride(3) apart.
+ * @param out Output tensor, addressed with its strides 0 (N), 1 (C), 2 (H) and 3 (W).
+ * @note Neighbours falling outside the input contribute 0.
+ */
+template <class I, class O>
+void GridSampleImpl2D_cpu_forward_kernel(const GridSample_Op& op,
+                            const std::shared_ptr<Tensor>& in0,
+                            const std::shared_ptr<Tensor>& in1,
+                            const std::shared_ptr<Tensor>& out)
+{
+    const I* input = static_cast<const I *>(in0->getImpl()->rawPtr());
+    const I* input_ptr = input;
+    float* const grid = static_cast<float* const>(in1->getImpl()->rawPtr()); // the grid is the second input (in1), not in0
+    float* grid_ptr = grid;
+    O* const output = static_cast<O* const>(out->getImpl()->rawPtr());
+
+    const std::size_t N = in0->dim(0);
+    const std::size_t C = in0->dim(1);
+    const std::size_t in_H = in0->dim(2);
+    const std::size_t in_W = in0->dim(3);
+    const std::size_t grid_H = in1->dim(1);
+    const std::size_t grid_W = in1->dim(2);
+
+    const std::size_t in_N_s = in0->stride(0);
+    const std::size_t in_C_s = in0->stride(1);
+    const std::size_t in_H_s = in0->stride(2);
+    const std::size_t in_W_s = in0->stride(3);
+    const std::size_t grid_N_s = in1->stride(0);
+    const std::size_t grid_H_s = in1->stride(1);
+    const std::size_t grid_W_s = in1->stride(2);
+    const std::size_t grid_Coord_s = in1->stride(3);
+    const std::size_t out_N_s = out->stride(0);
+    const std::size_t out_C_s = out->stride(1);
+    const std::size_t out_H_s = out->stride(2);
+    const std::size_t out_W_s = out->stride(3);
+
+
+    float* grid_ptr_N = grid;
+    const I* input_ptr_N = input;
+    O* output_ptr_N = output;
+    for (std::size_t n = 0; n < N; ++n) {
+        for (std::size_t grid_y = 0; grid_y < grid_H; ++grid_y) {
+            for (std::size_t grid_x = 0; grid_x < grid_W; ++grid_x) {
+                O* output_ptr = output_ptr_N + grid_y*out_H_s + grid_x*out_W_s; // row offset from grid_y, column offset from grid_x
+                grid_ptr = grid_ptr_N + grid_y*grid_H_s + grid_x*grid_W_s;
+                /*
+                * change grid_x coord to match padding_mode
+                * Change range from [-1, 1] to [0, H-1] or [-0.5, H-0.5] according to align_corners
+                * Handle computation of interpolation
+                *   any value outside bounds is considered 0
+                *   if nearest:
+                *   else if linear:
+                *   else if cubic:
+                *   else : nothing
+                */
+                float x = *grid_ptr;
+                float y = grid_ptr[grid_Coord_s];
+                x = update_normalized_coord_with_padding(x, op.paddingMode());
+                x = unnormalize_grid_sample_coord(x, in_W, op.alignCorners());
+                y = update_normalized_coord_with_padding(y, op.paddingMode());
+                y = unnormalize_grid_sample_coord(y, in_H, op.alignCorners());
+                if (op.mode() == GridSample_Op::Mode::Nearest) {
+                    const std::int64_t x_rounded = std::nearbyintf(x);
+                    const std::int64_t y_rounded = std::nearbyintf(y);
+
+                    if (in_bound(x_rounded, 0, in_W) && in_bound(y_rounded, 0, in_H)) {
+                        input_ptr = input_ptr_N + y_rounded*in_H_s + x_rounded*in_W_s;
+                        for (std::size_t c = 0; c < C; ++c) {
+                            *output_ptr = *input_ptr;
+                            input_ptr += in_C_s;
+                            output_ptr += out_C_s;
+                        }
+                    } else {
+                        for (std::size_t c = 0; c < C; ++c) {
+                            *output_ptr = O(0);
+                            output_ptr += out_C_s;
+                        }
+                    }
+                } else if (op.mode() == GridSample_Op::Mode::Linear) {
+                    const std::int64_t x_r = update_unnormalized_coord_with_padding(static_cast<std::int64_t>(std::floor(x)), in_W, op.paddingMode()); // column floor(x)
+                    const std::int64_t x_l = update_unnormalized_coord_with_padding(x_r + 1, in_W, op.paddingMode()); // column floor(x) + 1
+
+                    const std::int64_t y_t = update_unnormalized_coord_with_padding(static_cast<std::int64_t>(std::floor(y)), in_H, op.paddingMode()); // top: row floor(y)
+                    const std::int64_t y_b = update_unnormalized_coord_with_padding(y_t + 1, in_H, op.paddingMode()); // bottom: row floor(y) + 1
+
+                    const I* input_ptr_NC = input_ptr_N;
+                    for (std::size_t c = 0; c < C; ++c) {
+
+                        const I f_tr = (in_bound(x_r, 0, in_W) && in_bound(y_t, 0, in_H)) ?
+                            input_ptr_NC[static_cast<std::size_t>(y_t)*in_H_s
+                                         + static_cast<std::size_t>(x_r)*in_W_s]
+                            : I(0);
+                        const I f_tl = (in_bound(x_l, 0, in_W) && in_bound(y_t, 0, in_H)) ?
+                            input_ptr_NC[static_cast<std::size_t>(y_t)*in_H_s
+                                         + static_cast<std::size_t>(x_l)*in_W_s]
+                            : I(0);
+                        const I f_br = (in_bound(x_r, 0, in_W) && in_bound(y_b, 0, in_H)) ?
+                            input_ptr_NC[static_cast<std::size_t>(y_b)*in_H_s
+                                         + static_cast<std::size_t>(x_r)*in_W_s]
+                            : I(0);
+                        const I f_bl = (in_bound(x_l, 0, in_W) && in_bound(y_b, 0, in_H)) ?
+                            input_ptr_NC[static_cast<std::size_t>(y_b)*in_H_s
+                                         + static_cast<std::size_t>(x_l)*in_W_s]
+                            : I(0);
+
+                        // weighted sum of the 4 corners: each weight is the area spanned with the opposite corner (weights sum to 1)
+                        const I w_tr = static_cast<I>((static_cast<float>(y_b) - y)*(static_cast<float>(x_l) - x));
+                        const I w_tl = static_cast<I>((static_cast<float>(y_b) - y)*(x - static_cast<float>(x_r)));
+                        const I w_br = static_cast<I>((y - static_cast<float>(y_t))*(static_cast<float>(x_l) - x));
+                        const I w_bl = static_cast<I>((y - static_cast<float>(y_t))*(x - static_cast<float>(x_r)));
+
+                        *output_ptr = static_cast<O>(w_tr*f_tr + w_tl*f_tl + w_br*f_br + w_bl*f_bl);
+
+                        input_ptr_NC += in_C_s;
+                        output_ptr += out_C_s;
+                    }
+                } else if (op.mode() == GridSample_Op::Mode::Cubic) {
+                    /*
+                    *  .. .. .. .. .. ..
+                    *  .. 00 01 02 03 ..
+                    *  .. 10 11 12 13 ..
+                    *  .. 20 21 22 23 ..
+                    *  .. 30 31 32 33 ..
+                    *  .. .. .. .. .. ..
+                    */
+                    const std::int64_t x_1 = update_unnormalized_coord_with_padding(static_cast<std::int64_t>(std::floor(x)), in_W, op.paddingMode());
+                    const std::int64_t x_0 = update_unnormalized_coord_with_padding(x_1 - 1, in_W, op.paddingMode());
+                    const std::int64_t x_2 = update_unnormalized_coord_with_padding(x_1 + 1, in_W, op.paddingMode());
+                    const std::int64_t x_3 = update_unnormalized_coord_with_padding(x_1 + 2, in_W, op.paddingMode());
+
+                    const std::int64_t y_1 = update_unnormalized_coord_with_padding(static_cast<std::int64_t>(std::floor(y)), in_H, op.paddingMode());
+                    const std::int64_t y_0 = update_unnormalized_coord_with_padding(y_1 - 1, in_H, op.paddingMode());
+                    const std::int64_t y_2 = update_unnormalized_coord_with_padding(y_1 + 1, in_H, op.paddingMode());
+                    const std::int64_t y_3 = update_unnormalized_coord_with_padding(y_1 + 2, in_H, op.paddingMode());
+
+                    const I* input_ptr_NC = input_ptr_N;
+
+                    for (std::size_t c = 0; c < C; ++c) {
+                        const I f_00 = in_bound(x_0, 0, in_W) && in_bound(y_0, 0, in_H) ?
+                                        input_ptr_NC[x_0*in_W_s + y_0*in_H_s] : I(0);
+                        const I f_01 = in_bound(x_0, 0, in_W) && in_bound(y_1, 0, in_H) ?
+                                        input_ptr_NC[x_0*in_W_s + y_1*in_H_s] : I(0);
+                        const I f_02 = in_bound(x_0, 0, in_W) && in_bound(y_2, 0, in_H) ?
+                                        input_ptr_NC[x_0*in_W_s + y_2*in_H_s] : I(0);
+                        const I f_03 = in_bound(x_0, 0, in_W) && in_bound(y_3, 0, in_H) ?
+                                        input_ptr_NC[x_0*in_W_s + y_3*in_H_s] : I(0);
+                        const I f_10 = in_bound(x_1, 0, in_W) && in_bound(y_0, 0, in_H) ?
+                                        input_ptr_NC[x_1*in_W_s + y_0*in_H_s] : I(0);
+                        const I f_20 = in_bound(x_2, 0, in_W) && in_bound(y_0, 0, in_H) ?
+                                        input_ptr_NC[x_2*in_W_s + y_0*in_H_s] : I(0);
+                        const I f_30 = in_bound(x_3, 0, in_W) && in_bound(y_0, 0, in_H) ?
+                                        input_ptr_NC[x_3*in_W_s + y_0*in_H_s] : I(0);
+                        const I f_11 = in_bound(x_1, 0, in_W) && in_bound(y_1, 0, in_H) ?
+                                        input_ptr_NC[x_1*in_W_s + y_1*in_H_s] : I(0);
+                        const I f_12 = in_bound(x_1, 0, in_W) && in_bound(y_2, 0, in_H) ?
+                                        input_ptr_NC[x_1*in_W_s + y_2*in_H_s] : I(0);
+                        const I f_13 = in_bound(x_1, 0, in_W) && in_bound(y_3, 0, in_H) ?
+                                        input_ptr_NC[x_1*in_W_s + y_3*in_H_s] : I(0);
+                        const I f_21 = in_bound(x_2, 0, in_W) && in_bound(y_1, 0, in_H) ?
+                                        input_ptr_NC[x_2*in_W_s + y_1*in_H_s] : I(0);
+                        const I f_22 = in_bound(x_2, 0, in_W) && in_bound(y_2, 0, in_H) ?
+                                        input_ptr_NC[x_2*in_W_s + y_2*in_H_s] : I(0);
+                        const I f_23 = in_bound(x_2, 0, in_W) && in_bound(y_3, 0, in_H) ?
+                                        input_ptr_NC[x_2*in_W_s + y_3*in_H_s] : I(0);
+                        const I f_31 = in_bound(x_3, 0, in_W) && in_bound(y_1, 0, in_H) ?
+                                        input_ptr_NC[x_3*in_W_s + y_1*in_H_s] : I(0);
+                        const I f_32 = in_bound(x_3, 0, in_W) && in_bound(y_2, 0, in_H) ?
+                                        input_ptr_NC[x_3*in_W_s + y_2*in_H_s] : I(0);
+                        const I f_33 = in_bound(x_3, 0, in_W) && in_bound(y_3, 0, in_H) ?
+                                        input_ptr_NC[x_3*in_W_s + y_3*in_H_s] : I(0);
+
+                        const I mx_11 = (f_21 - f_01) / I(2);
+                        const I mx_12 = (f_22 - f_02) / I(2);
+                        const I mx_21 = (f_31 - f_11) / I(2);
+                        const I mx_22 = (f_32 - f_12) / I(2);
+
+                        const I my_11 = (f_12 - f_10) / I(2);
+                        const I my_12 = (f_13 - f_11) / I(2);
+                        const I my_21 = (f_22 - f_20) / I(2);
+                        const I my_22 = (f_23 - f_21) / I(2);
+
+                        const I mxy_11 = (f_22 - f_20 - f_02 + // mixed central difference: (++) - (+-) - (-+) + (--)
+                                            f_00) / I(4);
+                        const I mxy_12 = (f_23 - f_21 - f_03 +
+                                            f_01) / I(4);
+                        const I mxy_21 = (f_32 - f_30 - f_12 +
+                                            f_10) / I(4);
+                        const I mxy_22 = (f_33 - f_31 - f_13 +
+                                            f_11) / I(4);
+
+                        const I a_00 = f_11;
+                        const I a_10 = mx_11;
+                        const I a_20 = I(3)*(f_21 - f_11) - I(2)*mx_11 - mx_21;
+                        const I a_30 = I(2)*(f_11 - f_21) + mx_11 + mx_21;
+                        const I a_01 = my_11;
+                        const I a_11 = mxy_11;
+                        const I a_21 = I(3)*(my_21 - my_11) - I(2)*mxy_11 - mxy_21;
+                        const I a_31 = I(2)*(my_11 - my_21) + mxy_11 + mxy_21;
+                        const I a_02 = I(3)*(f_12 - f_11) - I(2)*my_11 - my_12;
+                        const I a_12 = I(3)*(mx_12 - mx_11) - I(2)*mxy_11 - mxy_12;
+                        const I a_22 = I(9)*(f_11 + f_22 - f_21 - f_12) + I(3)*(I(2)*(mx_11 - mx_12 + my_11 - my_21) + mx_21 - mx_22 + my_12 - my_22) + mxy_22 + I(2)*(mxy_12 + mxy_21 + I(2)*mxy_11);
+                        const I a_32 = - mxy_12 - mxy_22 + I(2)*(my_22 - my_12 - mxy_11 - mxy_21 + I(2)*(my_21 - my_11) + I(3)*(f_21 + f_12 - f_11 - f_22)) + I(3)*(mx_12 + mx_22 - mx_11 - mx_21);
+                        const I a_03 = I(2)*(f_11 - f_12) + my_11 + my_12;
+                        const I a_13 = I(2)*(mx_11 - mx_12) + mxy_11 + mxy_12;
+                        const I a_23 = - mxy_21 - mxy_22 + I(2)*(-mx_21 + mx_22 - mxy_11 - mxy_12 + I(2)*(mx_12 - mx_11) + I(3)*(f_12 + f_21 - f_11 - f_22)) + I(3)*(my_21 + my_22 - my_11 - my_12);
+                        const I a_33 = mxy_11 + mxy_21 + mxy_12 + mxy_22 + I(2)*(mx_11 + mx_21 - mx_12 - mx_22 + my_11 - my_21 + my_12 - my_22 + I(2)*(f_11 - f_21 - f_12 + f_22)); // NOTE(review): a_22..a_33 not re-derived - confirm
+                        const float tx = x - static_cast<float>(x_1); // offset inside the cell, as in the 1D cubic kernel
+                        const float ty = y - static_cast<float>(y_1);
+                        const I x2 = static_cast<I>(tx*tx);
+                        const I x3 = static_cast<I>(tx*tx*tx);
+                        const I y2 = static_cast<I>(ty*ty);
+                        const I y3 = static_cast<I>(ty*ty*ty);
+                        *output_ptr = static_cast<O>(
+                            a_00 + a_10*tx + a_20*x2 + a_30*x3
+                            + a_01*ty + a_11*tx*ty + a_21*x2*ty + a_31*x3*ty
+                            + a_02*y2 + a_12*tx*y2 + a_22*x2*y2 + a_32*x3*y2
+                            + a_03*y3 + a_13*tx*y3 + a_23*x2*y3 + a_33*x3*y3);
+
+                        input_ptr_NC += in_C_s;
+                        output_ptr += out_C_s;
+                    }
+                }
+            }
+        }
+
+        input_ptr_N += in_N_s;
+        grid_ptr_N += grid_N_s;
+        output_ptr_N += out_N_s;
+    }
+}
+
+static Registrar<GridSampleImpl2DForward_cpu> registrarGridSampleImpl2DForward_cpu_Float32(
+        {DataType::Float32, DataType::Float32},
+        Aidge::GridSampleImpl2D_cpu_forward_kernel<float, float>);
+static Registrar<GridSampleImpl2DForward_cpu> registrarGridSampleImpl2DForward_cpu_Float16(
+        {DataType::Float16, DataType::Float16},
+        Aidge::GridSampleImpl2D_cpu_forward_kernel<half_float::half, half_float::half>);
+static Registrar<GridSampleImpl2DForward_cpu> registrarGridSampleImpl2DForward_cpu_Int32(
+        {DataType::Int32, DataType::Int32},
+        Aidge::GridSampleImpl2D_cpu_forward_kernel<int, int>);
+static Registrar<GridSampleImpl2DForward_cpu> registrarGridSampleImpl2DForward_cpu_Float64(
+        {DataType::Float64, DataType::Float64},
+        Aidge::GridSampleImpl2D_cpu_forward_kernel<double, double>);
+}  // 
namespace + + + +} // namespace Aidge + +#endif /* AIDGE_CPU_OPERATOR_CONVIMPL_FORWARD_KERNEL_H_ */ diff --git a/include/aidge/backend/cpu/operator/MulImpl.hpp b/include/aidge/backend/cpu/operator/MulImpl.hpp index 2d42194c417bd7d57c00f4325a4585cf59d95b24..008edf176594a326e464a242f9f31d7b936a6940 100644 --- a/include/aidge/backend/cpu/operator/MulImpl.hpp +++ b/include/aidge/backend/cpu/operator/MulImpl.hpp @@ -25,11 +25,25 @@ namespace Aidge { // compute kernel registry for forward and backward class MulImplForward_cpu - : public Registrable<MulImplForward_cpu, std::tuple<DataType, DataType, DataType>, void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*,void*)> { -}; + : public Registrable<MulImplForward_cpu, std::tuple<DataType, DataType, DataType>, void(const std::vector<std::size_t>&, + const std::vector<std::size_t>&, + const std::vector<std::size_t>&, + const void*, + const void*, + void*)> {}; + class MulImplBackward_cpu - : public Registrable<MulImplBackward_cpu, std::tuple<DataType, DataType, DataType>, void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*, void*)> { -}; + : public Registrable<MulImplBackward_cpu, std::tuple<DataType, DataType, DataType>, void(const std::size_t, + const std::size_t, + const std::size_t, + const std::vector<std::size_t>, + const std::vector<std::size_t>, + const void*, + const void*, + const void*, + void*, + void*)> {}; + class MulImpl_cpu : public OperatorImpl { public: @@ -40,7 +54,9 @@ public: } Elts_t getNbRequiredProtected(const IOIndex_t inputIdx) const override final; + void forward() override; + void backward() override; }; namespace { diff --git a/include/aidge/backend/cpu/operator/MulImpl_backward_kernels.hpp b/include/aidge/backend/cpu/operator/MulImpl_backward_kernels.hpp new file mode 100644 index 
0000000000000000000000000000000000000000..db4cf81f0733b476957acc5cc21ad31e9c88ac72 --- /dev/null +++ b/include/aidge/backend/cpu/operator/MulImpl_backward_kernels.hpp @@ -0,0 +1,92 @@ +#ifndef AIDGE_CPU_OPERATOR_MULIMPL_BACKWARD_KERNEL_H_ +#define AIDGE_CPU_OPERATOR_MULIMPL_BACKWARD_KERNEL_H_ + + +#include "aidge/utils/Registrar.hpp" + +#include <cstdint> // std::int32_t, std::int64_t +#include <algorithm> + +#include "aidge/backend/cpu/data/Broadcasting.hpp" +#include "aidge/backend/cpu/operator/MulImpl.hpp" + + +namespace Aidge { + +template <class I1, class I2, class O> +void MulImpl_cpu_backward_kernel(const std::size_t input0Length, + const std::size_t input1Length, + const std::size_t grad0Length, + const std::vector<std::size_t> input0Dims, + const std::vector<std::size_t> input1Dims, + const void* input0_, + const void* input1_, + const void* grad_output_, + void* gradientInput0, + void* gradientInput1) +{ + const auto* input0 = static_cast<const I1*>(input0_); + const auto* input1 = static_cast<const I1*>(input1_); + const auto* grad_output = static_cast<const O*>(grad_output_); + auto* grad_input_0 = static_cast<I1*>(gradientInput0); + auto* grad_input_1 = static_cast<I2*>(gradientInput1); + + + if(input0Dims.size() >= input1Dims.size()) + { + AIDGE_ASSERT(input0Length == grad0Length, "Incorrect dimensions between Mul input and output tensors"); + + for(auto i = 0U; i < input0Length; ++i) + { + const auto indices = getMultiDimIndices(input1Dims, i); + const auto flattenedIndex = getFlattenedIndex(input1Dims, indices); + + grad_input_0[i] = input1[flattenedIndex] * grad_output[i]; + } + + for(std::size_t i = 0 ; i < grad0Length; ++i) + { + const auto indices = getMultiDimIndices(input1Dims, i); + const auto flattenedIndex = getFlattenedIndex(input1Dims, indices); + + grad_input_1[flattenedIndex] += input0[i] * grad_output[i]; + } + + } else { + AIDGE_ASSERT(input1Length == grad0Length, "Incorrect dimensions between Mul input and output tensors"); + + 
for(auto i = 0U; i < input1Length; ++i) + { + const auto indices = getMultiDimIndices(input0Dims, i); + const auto flattenedIndex = getFlattenedIndex(input0Dims, indices); + + grad_input_1[i] = input0[flattenedIndex] * grad_output[i]; + } + + for(std::size_t i = 0 ; i < grad0Length; ++i) + { + const auto indices = getMultiDimIndices(input0Dims, i); + const auto flattenedIndex = getFlattenedIndex(input0Dims, indices); + + grad_input_0[flattenedIndex] += input1[i] * grad_output[i]; + } + } +} + + +namespace { +static Registrar<MulImplBackward_cpu> registrarMulImplBackward_cpu_Float32( + {DataType::Float32, DataType::Float32, DataType::Float32}, + Aidge::MulImpl_cpu_backward_kernel<float, float, float>); +static Registrar<MulImplBackward_cpu> registrarMulImplBackward_cpu_Float64( + {DataType::Float64, DataType::Float64, DataType::Float64}, + Aidge::MulImpl_cpu_backward_kernel<double, double, double>); +static Registrar<MulImplBackward_cpu> registrarMulImplBackward_cpu_Int32( + {DataType::Int32, DataType::Int32, DataType::Int32}, + Aidge::MulImpl_cpu_backward_kernel<std::int32_t, std::int32_t, std::int32_t>); +static Registrar<MulImplBackward_cpu> registrarMulImplBackward_cpu_Int64( + {DataType::Int64, DataType::Int64, DataType::Int64}, + Aidge::MulImpl_cpu_backward_kernel<std::int64_t, std::int64_t, std::int64_t>); +} // namespace +} // namespace Aidge +#endif diff --git a/include/aidge/backend/cpu/operator/PadImpl_forward_kernels.hpp b/include/aidge/backend/cpu/operator/PadImpl_forward_kernels.hpp index 26c873c8fe7f140b09b31d0f1a9d4125acbcf50f..8ca23a4d39600af29065992804d75c42b822ea1b 100644 --- a/include/aidge/backend/cpu/operator/PadImpl_forward_kernels.hpp +++ b/include/aidge/backend/cpu/operator/PadImpl_forward_kernels.hpp @@ -131,7 +131,7 @@ void PadImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 4>& beginEndBorder for (std::uint32_t oy = 0; oy < oySize; ++oy) { for (std::uint32_t ox = 0; ox < oxSize; ++ox) { - const std::size_t oIndexFull = oIndex + 
ox*oySize + oy; + const std::size_t oIndexFull = oIndex + oy*oxSize + ox; O outputValue = static_cast<O>(borderValue); @@ -140,14 +140,14 @@ void PadImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 4>& beginEndBorder std::int32_t iy = static_cast<std::int32_t>(oy) - static_cast<std::int32_t>(beginEndBorders[1]); if (ix >= 0 && ix < static_cast<std::int32_t>(dims[3]) && iy >= 0 && iy < static_cast<std::int32_t>(dims[2])) { - outputValue = input[iIndex + static_cast<std::size_t>(ix)*dims[2] + static_cast<std::size_t>(iy)]; + outputValue = input[iIndex + static_cast<std::size_t>(iy)*dims[3] + static_cast<std::size_t>(ix)]; } } else if (borderType == PadBorderType::Edge) { std::int32_t ix = std::max(0, std::min(static_cast<std::int32_t>(dims[3]) - 1, static_cast<std::int32_t>(ox) - static_cast<std::int32_t>(beginEndBorders[3]))); std::int32_t iy = std::max(0, std::min(static_cast<std::int32_t>(dims[2]) - 1, static_cast<std::int32_t>(oy) - static_cast<std::int32_t>(beginEndBorders[1]))); - outputValue = input[iIndex + static_cast<std::size_t>(ix)*dims[2] + static_cast<std::size_t>(iy)]; + outputValue = input[iIndex + static_cast<std::size_t>(iy)*dims[3] + static_cast<std::size_t>(ix)]; } else if (borderType == PadBorderType::Reflect) { std::int32_t ix = static_cast<std::int32_t>(ox) - static_cast<std::int32_t>(beginEndBorders[3]); @@ -162,13 +162,13 @@ void PadImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 4>& beginEndBorder if (iy >= static_cast<std::int32_t>(dims[2])) iy = static_cast<std::int32_t>(dims[2]) - iy; - outputValue = input[iIndex + static_cast<std::size_t>(ix)*dims[2] + static_cast<std::size_t>(iy)]; + outputValue = input[iIndex + static_cast<std::size_t>(iy)*dims[3] + static_cast<std::size_t>(ix)]; } else if (borderType == PadBorderType::Wrap) { std::int32_t ix = (static_cast<std::int32_t>(dims[3]) + static_cast<std::int32_t>(ox) - static_cast<std::int32_t>(beginEndBorders[3])) % static_cast<std::int32_t>(dims[3]); std::int32_t iy = 
(static_cast<std::int32_t>(dims[2]) + static_cast<std::int32_t>(oy) - static_cast<std::int32_t>(beginEndBorders[1])) % static_cast<std::int32_t>(dims[2]); - outputValue = input[iIndex + static_cast<std::size_t>(ix)*dims[2] + static_cast<std::size_t>(iy)]; + outputValue = input[iIndex + static_cast<std::size_t>(iy)*dims[3] + static_cast<std::size_t>(ix)]; } output[oIndexFull] = outputValue; diff --git a/pyproject.toml b/pyproject.toml index aa43189d3f4f7d3796009c2646175635382796bf..ff7a17363963ca511f26e3f3bbbcafa5e2870dd1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,8 +17,7 @@ dynamic = ["version"] # defined in tool.setuptools_scm requires = [ "setuptools>=64", "setuptools_scm[toml]==7.1.0", - "cmake>=3.15.3.post1", - "toml" + "cmake>=3.15.3.post1" ] build-backend = "setuptools.build_meta" diff --git a/setup.py b/setup.py index 35520fd344c505bf38a60fcd5484c28517b0d2bd..a2b50c38faa1dc832e6ffe60b528f146e4287da3 100644 --- a/setup.py +++ b/setup.py @@ -8,17 +8,13 @@ import multiprocessing from math import ceil -import toml - from setuptools import setup, Extension from setuptools.command.build_ext import build_ext -def get_project_name() -> str: - with open(pathlib.Path().absolute() / "pyproject.toml", "r") as file: - project_toml = toml.load(file) - return project_toml["project"]["name"] +PROJECT_NAME = "aidge_backend_cpu" +SETUP_DIR = pathlib.Path(__file__).parent class AidgeBuildExtension(Extension): def __init__(self, name): @@ -26,6 +22,15 @@ class AidgeBuildExtension(Extension): class AidgePkgBuild(build_ext): + def __init__(self, dist, *args, **kwargs): + super().__init__(dist, *args, **kwargs) + # Detect editable_mode for old versions of setuptools + if not hasattr(self, "editable_mode"): + if hasattr(dist, "commands"): + self.editable_mode = "develop" in dist.commands + else: + self.editable_mode = False + def run(self): #################################### # BUILD PACKAGE @@ -43,36 +48,34 @@ class AidgePkgBuild(build_ext): if not 
build_lib.exists(): build_lib.mkdir(parents=True, exist_ok=True) - os.chdir(str(build_temp)) + package_prefix = build_lib if not self.editable_mode else SETUP_DIR + pybind_install_prefix = (package_prefix / PROJECT_NAME).absolute() - compile_type = ( - "Release" - if "AIDGE_PYTHON_BUILD_TYPE" not in os.environ - else os.environ["AIDGE_PYTHON_BUILD_TYPE"] - ) + os.chdir(str(build_temp)) + compile_type = os.environ.get("AIDGE_PYTHON_BUILD_TYPE", "Release") install_path = ( os.path.join(sys.prefix, "lib", "libAidge") if "AIDGE_INSTALL" not in os.environ else os.environ["AIDGE_INSTALL"] ) - - # using ninja as default build system to build faster and with the same compiler as on windows - build_gen = ( - ["-G", os.environ["AIDGE_BUILD_GEN"]] - if "AIDGE_BUILD_GEN" in os.environ + build_gen = os.environ.get("AIDGE_BUILD_GEN", "") + build_gen_opts = ( + ["-G", build_gen] + if build_gen else [] ) self.spawn( [ "cmake", - *build_gen, + *build_gen_opts, str(cwd), "-DTEST=OFF", f"-DCMAKE_INSTALL_PREFIX:PATH={install_path}", f"-DCMAKE_BUILD_TYPE={compile_type}", "-DPYBIND=ON", + f"-DPYBIND_INSTALL_PREFIX:PATH={pybind_install_prefix}", "-DCMAKE_EXPORT_COMPILE_COMMANDS=ON", "-DCOVERAGE=OFF", ] @@ -85,25 +88,11 @@ class AidgePkgBuild(build_ext): self.spawn(["cmake", "--install", ".", "--config", compile_type]) os.chdir(str(cwd)) - aidge_package = build_lib / (get_project_name()) - - # Get "aidge core" package - # ext_lib = build_temp - print(build_temp.absolute()) - # Copy all shared object files from build_temp/lib to aidge_package - for root, _, files in os.walk(build_temp.absolute()): - for file in files: - if (file.endswith(".so") or file.endswith(".pyd")) and ( - root != str(aidge_package.absolute()) - ): - currentFile = os.path.join(root, file) - shutil.copy(currentFile, str(aidge_package.absolute())) - if __name__ == "__main__": setup( include_package_data=True, - ext_modules=[AidgeBuildExtension(get_project_name())], + ext_modules=[AidgeBuildExtension(PROJECT_NAME)], 
cmdclass={ "build_ext": AidgePkgBuild, }, diff --git a/src/operator/ConstantOfShapeImpl.cpp b/src/operator/ConstantOfShapeImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7d727c04d13b7cd8822fc9be4cf62b8a7bf7754f --- /dev/null +++ b/src/operator/ConstantOfShapeImpl.cpp @@ -0,0 +1,40 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include "aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp" + +#include <functional> +#include <memory> +#include <vector> + +#include "aidge/backend/cpu/operator/ConstantOfShapeImpl_forward_kernels.hpp" +#include "aidge/data/Data.hpp" +#include "aidge/data/Tensor.hpp" +#include "aidge/operator/ConstantOfShape.hpp" +#include "aidge/utils/ErrorHandling.hpp" +#include "aidge/utils/Registrar.hpp" +#include "aidge/utils/Types.h" + +void Aidge::ConstantOfShapeImpl_cpu::forward() { + const ConstantOfShape_Op &op_ = static_cast<const ConstantOfShape_Op &>(mOp); + // Check if input is provided + AIDGE_ASSERT(op_.getInput(0), "{} : Missing input 0", __func__); + + // Create the forward kernal with the wanted types + auto kernelFunc = Registrar<ConstantOfShapeImplForward_cpu>::create( + {op_.getOutput(0)->dataType()}); + + // Call kernel + kernelFunc(op_.getOutput(0)->dims(), + op_.value(), + op_.getOutput(0)->getImpl()->rawPtr()); +} + diff --git a/src/operator/GridSampleImpl.cpp b/src/operator/GridSampleImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3f465d4dc9915eb2270f650b5a2f29bcd83377b5 --- /dev/null +++ b/src/operator/GridSampleImpl.cpp @@ -0,0 +1,70 @@ 
+/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include "aidge/backend/cpu/operator/GridSampleImpl.hpp" + +#include <functional> +#include <vector> + +#include "aidge/backend/cpu/data/GetCPUPtr.h" +#include "aidge/backend/cpu/operator/GridSampleImpl_forward_kernels.hpp" +#include "aidge/operator/GridSample.hpp" +#include "aidge/utils/Types.h" + +Aidge::Elts_t Aidge::GridSampleImpl_cpu::getNbRequiredProtected(IOIndex_t /*inputIdx*/) const { + // this implementation can be in-place + return Elts_t::DataElts(0); +} + +void Aidge::GridSampleImpl_cpu::forward() { + const auto& op_ = static_cast<const GridSample_Op&>(mOp); + + // Find the correct kernel type + const auto outputDataType = op_.getOutput(0)->dataType(); + + const Registrar<GridSampleImpl1DForward_cpu>::registrar_key registrarKey = { + op_.getInput(0)->dataType(), + outputDataType}; + + std::function<void(const GridSample_Op&, + const std::shared_ptr<Tensor>&, + const std::shared_ptr<Tensor>&, + const std::shared_ptr<Tensor>&)> kernelFunc; + + const std::size_t nbSpatialFeat = op_.getInput(0)->nbDims(); + switch (nbSpatialFeat) + { + case 1: + kernelFunc = Registrar<GridSampleImpl1DForward_cpu>::create(registrarKey); + break; + case 2: + kernelFunc = Registrar<GridSampleImpl2DForward_cpu>::create(registrarKey); + break; + default: + AIDGE_THROW_OR_ABORT(std::runtime_error, "No CPU {} kernel available for {} dimensions.", op_.type(), nbSpatialFeat); + break; + } + + // Convert input data (no overhead if not needed!) + // TODO: right now, if needed, memory will be allocated/deallocated at each + // call to forward(). 
We might put the following shared_ptr as members of + // this class to avoid that. + std::shared_ptr<Tensor> input0Fallback, input1Fallback; + const auto& input0 = std::make_shared<Tensor>(op_.getInput(0)->refCastFrom(input0Fallback, *op_.getOutput(0))); + const auto& input1 = std::make_shared<Tensor>(op_.getInput(1)->refCastFrom(input1Fallback, *op_.getOutput(0))); + + // Call kernel + kernelFunc(op_, + input0, // input + input1, // grid + op_.getOutput(0) // output + ); +} diff --git a/src/operator/MulImpl.cpp b/src/operator/MulImpl.cpp index d7feb9b76e25a0e874b3682cdc5b3e53bf8e9228..e5fd911cf199edbf98a1ecb343d5904d647d9caa 100644 --- a/src/operator/MulImpl.cpp +++ b/src/operator/MulImpl.cpp @@ -22,6 +22,7 @@ #include "aidge/backend/cpu/operator/MulImpl.hpp" #include "aidge/backend/cpu/operator/MulImpl_forward_kernels.hpp" +#include "aidge/backend/cpu/operator/MulImpl_backward_kernels.hpp" Aidge::Elts_t Aidge::MulImpl_cpu::getNbRequiredProtected(const Aidge::IOIndex_t /*inputIdx*/) const { // this implementation can be in-place @@ -40,6 +41,7 @@ void Aidge::MulImpl_cpu::forward() { const std::vector<std::size_t> inputDims1 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(), std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dims()); + // Call kernel kernelFunc(inputDims0, inputDims1, @@ -48,3 +50,32 @@ void Aidge::MulImpl_cpu::forward() { getCPUPtr(mOp.getRawInput(1)), getCPUPtr(mOp.getRawOutput(0))); } + +void Aidge::MulImpl_cpu::backward() { + + const Mul_Op& op_ = dynamic_cast<const Mul_Op&>(mOp); + + auto in0 = op_.getInput(0); + auto in1 = op_.getInput(1); + auto in0grad = op_.getInput(0)->grad(); + auto in1grad = op_.getInput(1)->grad(); + auto out0grad = op_.getOutput(0)->grad(); + + // Find kernel function + auto kernelFunc = Registrar<MulImplBackward_cpu>::create({ + out0grad->dataType(), + in0grad->dataType(), + in1grad->dataType()}); + + kernelFunc(/* input0Length */ in0grad->size(), + /* input1Length */ 
in1grad->size(), + /* grad0Length */ out0grad->size(), + /* input0Dims */ in0->dims(), + /* input1Dims */ in1->dims(), + getCPUPtr(in0), + getCPUPtr(in1), + getCPUPtr(out0grad), + getCPUPtr(in0grad), + getCPUPtr(in1grad)); +} + diff --git a/unit_tests/operator/Test_ConstantOfShapeImpl.cpp b/unit_tests/operator/Test_ConstantOfShapeImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..42505d385fde7e72e09531f1607287ffc6978f75 --- /dev/null +++ b/unit_tests/operator/Test_ConstantOfShapeImpl.cpp @@ -0,0 +1,120 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <algorithm> +#include <chrono> +#include <cmath> +#include <cstddef> // std::size_t +#include <cstdint> // std::uint16_t +#include <iostream> +#include <memory> +#include <numeric> // std::accumulate +#include <ostream> +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution + +#include "catch2/internal/catch_compiler_capabilities.hpp" +#include "catch2/internal/catch_enforce.hpp" +#include <catch2/catch_test_macros.hpp> +#include <catch2/generators/catch_generators_random.hpp> + +#include "aidge/data/Tensor.hpp" +#include "aidge/operator/ConstantOfShape.hpp" +#include "aidge/utils/TensorUtils.hpp" +#include <aidge/data/Data.hpp> +#include <aidge/data/half.hpp> +#include <aidge/filler/Filler.hpp> +#include <aidge/operator/OperatorTensor.hpp> +#include <aidge/operator/Reshape.hpp> +#include <aidge/utils/TensorUtils.hpp> +#include <aidge/utils/Types.h> + +namespace Aidge { +TEST_CASE("[cpu/operator] ConstantOfShape", "[ConstantOfShape][CPU]") { + constexpr std::uint16_t 
NBTRIALS = 10; + // Create a random number generator + auto random_seed = Catch::Generators::Detail::getSeed; + std::mt19937 gen(random_seed()); + std::uniform_real_distribution<float> valueDist( + 0.1f, 1.1f); // Random float distribution between 0 and 1 + std::uniform_int_distribution<DimSize_t> input_tensor_size_dist( + std::size_t(1), std::size_t(10)); + std::uniform_int_distribution<int64_t> input_tensor_values_dist( + std::size_t(1), std::size_t(7)); + std::uniform_real_distribution<double> operator_attr_value_dist(-100., 100.); + + /////////////////////////////////////////////// + // SETUP FUNCTIONS + auto generate_input_tensor = + [&gen, &input_tensor_size_dist, + &input_tensor_values_dist]() -> std::shared_ptr<Tensor> { + std::vector<DimSize_t> input_dims; + input_dims.push_back(input_tensor_size_dist(gen)); + + auto result = std::make_shared<Tensor>(input_dims); + result->setDataType(DataType::Int64); + result->setBackend("cpu"); + for (DimSize_t i = 0; i < result->size(); ++i) { + result->set<int64_t>(i, input_tensor_values_dist(gen)); + } + return result; + }; + + auto generate_random_operator = + [&gen, + &operator_attr_value_dist]() -> std::shared_ptr<ConstantOfShape_Op> { + auto node = ConstantOfShape(Tensor(operator_attr_value_dist(gen))); + auto op = std::static_pointer_cast<ConstantOfShape_Op>(node->getOperator()); + op->setDataType(DataType::Float64); + op->setBackend("cpu"); + return op; + }; + + auto generate_output_tensor = [](std::shared_ptr<Tensor> input_tensor, + std::shared_ptr<ConstantOfShape_Op> op) { + std::vector<DimSize_t> output_dims; + output_dims.reserve(input_tensor->size()); + for (DimSize_t i = 0; i < input_tensor->size(); ++i) { + output_dims.push_back(input_tensor->get<int64_t>(i)); + } + auto result = std::make_shared<Tensor>(output_dims); + result->setDataType(op->value().dataType()); + result->setBackend("cpu"); + constantFiller(result, op->value().get<double>(0)); + return result; + }; + + 
///////////////////////////////////// + // BENCHMARKING + std::chrono::time_point<std::chrono::system_clock> start; + std::chrono::time_point<std::chrono::system_clock> end; + std::chrono::duration<double, std::micro> duration{}; + int number_of_operation{0}; + + SECTION("ConstantOfShapeImpl_cpu::forward()") { + for (int i = 0; i < NBTRIALS; ++i) { + auto input_T = generate_input_tensor(); + std::shared_ptr<ConstantOfShape_Op> op = generate_random_operator(); + auto output_T = generate_output_tensor(input_T, op); + op->associateInput(0, input_T); + + REQUIRE(op->forwardDims(true)); + REQUIRE_NOTHROW(op->forward()); + + CHECK(output_T->nbDims() == op->getOutput(0)->nbDims()); + for (DimIdx_t i = 0; i < output_T->nbDims(); ++i) { + CHECK(output_T->dims().at(i) == op->getOutput(0)->dims().at(i)); + } + CHECK(approxEq<double>(*output_T, *op->getOutput(0))); + } + } +} +} // namespace Aidge + diff --git a/unit_tests/operator/Test_MulImpl.cpp b/unit_tests/operator/Test_MulImpl.cpp index 9d592d31e1999f63fb0ebe3f5ad9d19e85c8645c..3378861d0d3d7e74e7867c2765a0b09069fa8caf 100644 --- a/unit_tests/operator/Test_MulImpl.cpp +++ b/unit_tests/operator/Test_MulImpl.cpp @@ -24,6 +24,337 @@ namespace Aidge { + TEST_CASE("[CPU/Operator] Mul Backward", "[Mul][CPU][Backward]") + { + std::shared_ptr<Node> myMul = Mul(); + auto op = std::static_pointer_cast<OperatorTensor>(myMul->getOperator()); + op->setDataType(DataType::Float32); + op->setBackend("cpu"); + + SECTION("Case 1: 2D and 1D tensors") { + const auto T0 = std::make_shared<Tensor>(Array2D<float,2,3>( + { + { + {1,2,3},{4,5,6} + } + } + )); + + const auto T1 = std::make_shared<Tensor>(Array1D<float,3>( + {0.1,0.2,0.3} + )); + + T0->setDataType(DataType::Float32); + T0->setBackend("cpu"); + T1->setDataType(DataType::Float32); + T1->setBackend("cpu"); + + op->getOutput(0)->setGrad(std::make_shared<Tensor>(Array2D<float,2,3>({{{1.0,1.0,1.0},{1.0,1.0,1.0}}}))); + + op->associateInput(0,T0); + op->associateInput(1,T1); + 
op->forwardDims(); + + myMul->forward(); + myMul->backward(); + + auto T0Grad = std::make_shared<Tensor>(Array2D<float, 2,3>({{{0.1,0.2,0.3},{0.1, 0.2, 0.3}}})); + auto T1Grad = std::make_shared<Tensor>(Array1D<float, 3>({5,7,9})); + + REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *T0Grad)); + REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *T1Grad)); + } + + SECTION("Case 2: 3D and 1D tensors") { + const auto T0 = std::make_shared<Tensor>(Array3D<float,2,2,3>( + { + { + { + {1.0, 2.0, 3.0}, + {4.0, 5.0, 6.0} + }, + { + {7.0, 8.0, 9.0}, + {10.0, 11.0, 12.0} + } + } + } + )); + + const auto T1 = std::make_shared<Tensor>(Array1D<float, 3>({0.3,0.2,0.1})); + + const auto newGrad = std::make_shared<Tensor>(Array3D<float,2,2,3>( + { + { + { + {1, 1, 1}, + {1, 1, 1} + }, + { + {1, 1, 1}, + {1, 1, 1} + } + } + } + )); + + const auto expectedGrad0 = std::make_shared<Tensor>(Array3D<float,2,2,3>( + { + { + { + {0.3, 0.2, 0.1}, + {0.3, 0.2, 0.1} + }, + { + {0.3, 0.2, 0.1}, + {0.3, 0.2, 0.1} + } + } + } + )); + + const auto expectedGrad1 = std::make_shared<Tensor>(Array1D<float,3>( + {22.0, 26.0, 30.0} + )); + + for(auto T: {T0, T1, newGrad, expectedGrad0, expectedGrad1}) + { + T->setBackend("cpu") ; + T->setDataType(DataType::Float32); + } + + op->associateInput(0, T0); + op->associateInput(1, T1); + op->getOutput(0)->setGrad(newGrad); + op->forwardDims(); + + myMul->backward(); + + REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0)); + REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedGrad1)); + } + + SECTION("Case 3: 4D and 2D tensors") { + const auto T0 = std::make_shared<Tensor>(Array4D<float,2, 2, 3, 3>( + { + { + { + { + {1.0, 2.0, 3.0}, + {4.0, 5.0, 6.0}, + {7.0, 8.0, 9.0} + }, + { + {10.0, 11.0, 12.0}, + {13.0, 14.0, 15.0}, + {16.0, 17.0, 18.0} + } + }, + { + { + {19.0, 20.0, 21.0}, + {22.0, 23.0, 24.0}, + {25.0, 26.0, 27.0} + }, + { + {28.0, 29.0, 30.0}, + {31.0, 32.0, 33.0}, + {34.0, 35.0, 36.0} + } + } + } + } + )); + + const 
auto T1 = std::make_shared<Tensor>(Array2D<float, 3,3>( + { + { + {0.5,0.3,0.1}, + {0.4,0.2,0.6}, + {0.7,0.8,0.9} + } + } + )); + + const auto newGrad = std::make_shared<Tensor>(Array4D<float,2, 2, 3, 3>( + { + { + { + { + {1.0, 1.0, 1.0}, + {1.0, 1.0, 1.0}, + {1.0, 1.0, 1.0} + }, + { + {1.0, 1.0, 1.0}, + {1.0, 1.0, 1.0}, + {1.0, 1.0, 1.0} + } + }, + { + { + {1.0, 1.0, 1.0}, + {1.0, 1.0, 1.0}, + {1.0, 1.0, 1.0} + }, + { + {1.0, 1.0, 1.0}, + {1.0, 1.0, 1.0}, + {1.0, 1.0, 1.0} + } + } + } + } + )); + + const auto expectedGrad0 = std::make_shared<Tensor>(Array4D<float,2,2,3,3>( + { + { + { + { + {0.5, 0.3, 0.1}, + {0.4, 0.2, 0.6}, + {0.7, 0.8, 0.9} + }, + { + {0.5, 0.3, 0.1}, + {0.4, 0.2, 0.6}, + {0.7, 0.8, 0.9} + } + }, + { + { + {0.5, 0.3, 0.1}, + {0.4, 0.2, 0.6}, + {0.7, 0.8, 0.9} + }, + { + {0.5, 0.3, 0.1}, + {0.4, 0.2, 0.6}, + {0.7, 0.8, 0.9} + } + } + } + } + )); + + const auto expectedGrad1 = std::make_shared<Tensor>(Array2D<float,3, 3>( + { + { + {58.0, 62.0, 66.0}, + {70.0, 74.0, 78.0}, + {82.0, 86.0, 90.0} + } + } + )); + + for(const auto T: {T0, T1, newGrad, expectedGrad0, expectedGrad1}) + { + T->setBackend("cpu") ; + T->setDataType(DataType::Float32); + } + + op->associateInput(0, T0); + op->associateInput(1, T1); + op->getOutput(0)->setGrad(newGrad); + op->forwardDims(); + + myMul->backward(); + + REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0)); + REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedGrad1)); + } + + SECTION("Case 4: 3D and 2D tensors") { + const auto T0 = std::make_shared<Tensor>(Array3D<float, 2, 3, 4>( + { + { + { + {1.0, 2.0, 3.0, 4.0}, + {5.0, 6.0, 7.0, 8.0}, + {9.0, 10.0, 11.0, 12.0}, + }, + { + {13.0, 14.0, 15.0, 16.0}, + {17.0, 18.0, 19.0, 20.0}, + {21.0, 22.0, 23.0, 24.0}, + } + } + } + )); + + const auto T1 = std::make_shared<Tensor>(Array2D<float, 3, 4>( + { + { + {0.1, 0.2, 0.3, 0.4}, + {0.5, 0.6, 0.7, 0.8}, + {0.9, 1.0, 1.1, 1.2} + } + } + )); + + const auto newGrad = 
std::make_shared<Tensor>(Array3D<float, 2,3,4>( + { + { + { + {1.0, 1.0, 1.0, 1.0}, + {1.0, 1.0, 1.0, 1.0}, + {1.0, 1.0, 1.0, 1.0}, + }, + { + {1.0, 1.0, 1.0, 1.0}, + {1.0, 1.0, 1.0, 1.0}, + {1.0, 1.0, 1.0, 1.0}, + } + } + } + )); + + const auto expectedGrad0 = std::make_shared<Tensor>(Array3D<float,2,3,4>( + { + { + { + {0.1, 0.2, 0.3, 0.4}, + {0.5, 0.6, 0.7, 0.8}, + {0.9, 1.0, 1.1, 1.2} + }, + { + {0.1, 0.2, 0.3, 0.4}, + {0.5, 0.6, 0.7, 0.8}, + {0.9, 1.0, 1.1, 1.2} + } + } + } + )); + + const auto expectedGrad1 = std::make_shared<Tensor>(Array2D<float,3, 4>( + { + { + {14.0, 16.0, 18.0, 20.0}, + {22.0, 24.0, 26.0, 28.0}, + {30.0, 32.0, 34.0, 36.0} + } + } + )); + + for(const auto T: {T0, T1, newGrad, expectedGrad0, expectedGrad1}) + { + T->setBackend("cpu") ; + T->setDataType(DataType::Float32); + } + + op->associateInput(0, T0); + op->associateInput(1, T1); + op->getOutput(0)->setGrad(newGrad); + op->forwardDims(); + + myMul->backward(); + + REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0)); + REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedGrad1)); + } + } + TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") { constexpr std::uint16_t NBTRIALS = 10; // Create a random number generator @@ -31,7 +362,7 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") { std::mt19937 gen(rd()); std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // Random float distribution between 0 and 1 std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10)); - std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5)); + std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(3)); std::uniform_int_distribution<int> boolDist(0,1); // Create MatMul Operator @@ -60,6 +391,7 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") { std::chrono::time_point<std::chrono::system_clock> end; std::chrono::duration<double, std::micro> duration{}; + SECTION("MulImpl_cpu::forward()") { 
SECTION("Scalar / Scalar") { @@ -68,16 +400,20 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") { } SECTION("+1-D Tensor / +1-D Tensor - same dimensions") { + std::size_t number_of_operation = 0; for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { + // generate 2 random Tensors - const std::size_t nbDims = nbDimsDist(gen); - std::vector<std::size_t> dims; + const auto nbDims = nbDimsDist(gen); + auto dims = std::vector<std::size_t>{}; + for (std::size_t i = 0; i < nbDims; ++i) { dims.push_back(dimSizeDist(gen)); } - const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + + const auto nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); number_of_operation += nb_elements; // without broadcasting @@ -114,67 +450,101 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") { delete[] array0; delete[] array1; delete[] result; - - // with broadcasting } std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl; std::cout << "total time: " << duration.count() << "μs" << std::endl; } + SECTION("+1-D Tensor / +1-D Tensor - broadcasting") { std::size_t number_of_operation = 0; for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { + // generate 2 random Tensors // handle dimensions, replace some dimensions with '1' to get broadcasting + constexpr std::size_t nbDims = 4; - std::vector<std::size_t> dims; - for (std::size_t i = 0; i < nbDims; ++i) { - dims.push_back(dimSizeDist(gen)); + std::vector<std::size_t> dimensions; + + for (std::size_t i = 0; i < nbDims; ++i) + { + dimensions.push_back(dimSizeDist(gen)); } - std::vector<std::size_t> dims0 = dims; - std::vector<std::size_t> dims1 = dims; - std::vector<std::size_t> dimsOut = dims; - for (std::size_t i = 0; i < nbDims; ++i) { - if (boolDist(gen)) { + + auto dims0 = dimensions; + auto dims1 = dimensions; + auto dimsOut = dimensions; + + for 
(std::size_t i = 0; i < nbDims; ++i) + { + if (boolDist(gen)) + { dims0[i] = 1; } - if (boolDist(gen)) { + + if (boolDist(gen)) + { dims1[i] = 1; } + dimsOut[i] = (dims0[i] == 1) ? dims1[i] : dims0[i]; } + for(auto dim : dims0) + { + Log::info("Dimension of input 0 : {}", dim); + } + + for(auto dim : dims1) + { + Log::info("Dimension of input 1 : {}", dim); + } + // create arrays and fill them with random values float* array0 = new float[dims0[0]*dims0[1]*dims0[2]*dims0[3]]; float* array1 = new float[dims1[0]*dims1[1]*dims1[2]*dims1[3]]; float* result = new float[dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]]; - for (std::size_t i = 0; i < dims0[0]*dims0[1]*dims0[2]*dims0[3]; ++i) { + + for (std::size_t i = 0; i < dims0[0]*dims0[1]*dims0[2]*dims0[3]; ++i) + { array0[i] = valueDist(gen); } - for (std::size_t i = 0; i < dims1[0]*dims1[1]*dims1[2]*dims1[3]; ++i) { + + for (std::size_t i = 0; i < dims1[0]*dims1[1]*dims1[2]*dims1[3]; ++i) + { array1[i] = valueDist(gen); } // compute true result const std::size_t strides0[nbDims] = {dims0[1]*dims0[2]*dims0[3], dims0[2]*dims0[3], dims0[3], 1}; const std::size_t strides1[nbDims] = {dims1[1]*dims1[2]*dims1[3], dims1[2]*dims1[3], dims1[3], 1}; - for (std::size_t a = 0; a < dimsOut[0]; ++a) { - for (std::size_t b = 0; b < dimsOut[1]; ++b) { + + for (std::size_t a = 0; a < dimsOut[0]; ++a) + { + for (std::size_t b = 0; b < dimsOut[1]; ++b) + { const std::size_t idx0_0 = strides0[0] * ((dims0[0] > 1) ? a : 0) + strides0[1] * ((dims0[1] > 1) ? b : 0); + const std::size_t idx1_0 = strides1[0] * ((dims1[0] > 1) ? a : 0) + strides1[1] * ((dims1[1] > 1) ? b : 0); - for (std::size_t c = 0; c < dimsOut[2]; ++c) { + + for (std::size_t c = 0; c < dimsOut[2]; ++c) + { const std::size_t idx_out = dimsOut[3] * (c + dimsOut[2] * (b + dimsOut[1] * a)); - for (std::size_t d = 0; d < dimsOut[3]; ++d) { + + for (std::size_t d = 0; d < dimsOut[3]; ++d) + { std::size_t idx0 = idx0_0 + strides0[2] * ((dims0[2] > 1) ? c : 0) + ((dims0[3] > 1) ? 
d : 0); + std::size_t idx1 = idx1_0 + strides1[2] * ((dims1[2] > 1) ? c : 0) + ((dims1[3] > 1) ? d : 0); + result[idx_out + d] = array0[idx0] * array1[idx1]; // std::cout << "(" << idx0 << ", " << idx1 << ") -> " << array0[idx0] << " * " << array1[idx1] << " -> " << idx_out + d << std::endl; } diff --git a/unit_tests/operator/Test_PadImpl.cpp b/unit_tests/operator/Test_PadImpl.cpp index cdd3a5f979085f3782776ce69ddd92c0d53150c4..75233c0b97fc6f9812020d0e3d3c695d8cd388f0 100644 --- a/unit_tests/operator/Test_PadImpl.cpp +++ b/unit_tests/operator/Test_PadImpl.cpp @@ -134,7 +134,7 @@ TEST_CASE("[cpu/operator] Pad(forward)", "[Pad][CPU]") { SECTION("Asymmetric Pad") { const int pv = 0; // pad value - std::shared_ptr<Node> myPad = Pad<2>({1, 0, 0, 1}, "mypad", PadBorderType::Constant, static_cast<double>(pv)); + std::shared_ptr<Node> myPad = Pad<2>({0, 1, 1, 0}, "mypad", PadBorderType::Constant, static_cast<double>(pv)); auto op = std::static_pointer_cast<OperatorTensor>(myPad -> getOperator()); std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<int,2,3,5,5> { //NCHW {