diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 97fcaa704b72922d35ad70feb923633fa194c850..56dc0ef17faa0be88f81cc4b7ed95e4d654a4c38 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -12,19 +12,14 @@ stages: - deploy include: - - project: 'eclipse/aidge/gitlab_shared_files' + - project: 'eclipse/aidge/gitlab_shared_files' ref: 'main' - file: - # choose which jobs to run by including the corresponding files. + file: # choose which jobs to run by including the corresponding files. - '.gitlab/ci/ubuntu_cpp.gitlab-ci.yml' - '.gitlab/ci/ubuntu_python.gitlab-ci.yml' - - '.gitlab/ci/release/cibuildwheel_ubuntu.gitlab-ci.yml' + - '.gitlab/ci/release/cibuildwheel_ubuntu.gitlab-ci.yml' - '.gitlab/ci/windows_cpp.gitlab-ci.yml' - - - '.gitlab/ci/windows_python.gitlab-ci.yml' - - '.gitlab/ci/release/cibuildwheel_windows.gitlab-ci.yml' - - - + - '.gitlab/ci/windows_python.gitlab-ci.yml' + - '.gitlab/ci/release/cibuildwheel_windows.gitlab-ci.yml' diff --git a/CHANGELOG b/CHANGELOG index 9a76d7b11556b434cf9749d625cedea85dc6c5ac..a461371a17b586e8ebc65172282153a6ae8e09e2 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,5 @@ +# Version 0.4.0 (December 6, 2024) + # Version 0.2.2 (May 14, 2024) * Remove implmentation for Operators soly handling memory and format diff --git a/CMakeLists.txt b/CMakeLists.txt index 3574e25cec5977bc2249c7d756041c09650f9b11..e9e191c36d5ad57a9a9dbed378154db6676ec796 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,6 +22,9 @@ execute_process( message(STATUS "Latest git commit: ${GIT_COMMIT_HASH}") add_definitions(-DGIT_COMMIT_HASH="${GIT_COMMIT_HASH}") +# helper for LSP users +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + # Note : project name is ${CMAKE_PROJECT_NAME} and python module name is also ${CMAKE_PROJECT_NAME} set(module_name _${CMAKE_PROJECT_NAME}) # target name set(pybind_module_name ${CMAKE_PROJECT_NAME}) # name of submodule for python bindings diff --git a/aidge_backend_cpu/unit_tests/test_scheduler.py b/aidge_backend_cpu/unit_tests/test_scheduler.py index c37fc54437c02b0bb1c6f09a1c73d5cc538fa4c0..494f34565ffd644971c97e9adfa06709dee9e36d 100644 --- a/aidge_backend_cpu/unit_tests/test_scheduler.py +++ b/aidge_backend_cpu/unit_tests/test_scheduler.py @@ -13,9 +13,10 @@ class test_scheduler(unittest.TestCase): pass def test_relu_forward(self): - values = np.arange(6) - 3 - input_node = aidge_core.Producer(aidge_core.Tensor(values), "Input") + t = aidge_core.Tensor(np.arange(6, dtype=np.int32) - 3) + + input_node = aidge_core.Producer(t) relu = aidge_core.ReLU() input_node.add_child(relu) @@ -34,7 +35,7 @@ class test_scheduler(unittest.TestCase): out_tensor = relu.get_operator().get_output(0) expected_out = [0,0,0,0,1,2] for i in range(len(expected_out)): - self.assertEqual(expected_out[i], out_tensor[i]) + self.assertEqual(expected_out[i], out_tensor[i], f"On idx {i}") def test_sequential_scheduling(self): input_data = np.array([0]).astype(np.float32) @@ -69,7 +70,7 @@ class test_scheduler(unittest.TestCase): aidge_core.Producer(input_tensor, "X"), aidge_core.FC(1, 50, name='0'), aidge_core.parallel([aidge_core.FC(50, 50, name='1'), aidge_core.FC(50, 50, name='3')]), - aidge_core.Add(2, name='2'), + aidge_core.Add(name='2'), ]) EXPECTED_SCHEDULE = [['0', '1', '3', '2'], ['0', '3', '1', '2']] # Both scheduling are valid !
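The test update above reflects two aidge_core API changes visible in this diff: a Tensor is now built from an explicitly typed NumPy array before being wrapped in a Producer, and Add() no longer takes the number of inputs as its first argument. Below is a minimal sketch of the updated construction pattern, using only calls that appear in the test hunks above; graph assembly and scheduling are unchanged and omitted here.

```python
import numpy as np
import aidge_core

# Tensor is created from an explicitly typed NumPy array (int32 here),
# then wrapped in a Producer node.
t = aidge_core.Tensor(np.arange(6, dtype=np.int32) - 3)   # values -3 .. 2
input_node = aidge_core.Producer(t)

relu = aidge_core.ReLU()
input_node.add_child(relu)

# Add() now infers its input count; only the name is passed.
add = aidge_core.Add(name='2')

# After scheduling and a forward pass (not shown here), the test expects
# relu's output to be [0, 0, 0, 0, 1, 2].
```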
diff --git a/include/aidge/backend/cpu.hpp b/include/aidge/backend/cpu.hpp index b45aa1cb4151d8d6c5268d4a94da97bb25a89a40..caa75328e58f6c9581f81368a3981bb79a069d49 100644 --- a/include/aidge/backend/cpu.hpp +++ b/include/aidge/backend/cpu.hpp @@ -15,11 +15,14 @@ #include "aidge/backend/cpu/operator/AbsImpl.hpp" #include "aidge/backend/cpu/operator/AddImpl.hpp" #include "aidge/backend/cpu/operator/AndImpl.hpp" +#include "aidge/backend/cpu/operator/AtanImpl.hpp" + #include "aidge/backend/cpu/operator/ArgMaxImpl.hpp" #include "aidge/backend/cpu/operator/AvgPoolingImpl.hpp" #include "aidge/backend/cpu/operator/MaxPoolingImpl.hpp" #include "aidge/backend/cpu/operator/BatchNormImpl.hpp" #include "aidge/backend/cpu/operator/BitShiftImpl.hpp" +#include "aidge/backend/cpu/operator/ClipImpl.hpp" #include "aidge/backend/cpu/operator/ConvDepthWiseImpl.hpp" #include "aidge/backend/cpu/operator/ConvImpl.hpp" #include "aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp" @@ -28,15 +31,19 @@ #include "aidge/backend/cpu/operator/FCImpl.hpp" #include "aidge/backend/cpu/operator/FoldImpl.hpp" #include "aidge/backend/cpu/operator/GlobalAveragePoolingImpl.hpp" +#include "aidge/backend/cpu/operator/LRNImpl.hpp" #include "aidge/backend/cpu/operator/LeakyReLUImpl.hpp" #include "aidge/backend/cpu/operator/LnImpl.hpp" #include "aidge/backend/cpu/operator/MatMulImpl.hpp" #include "aidge/backend/cpu/operator/MulImpl.hpp" #include "aidge/backend/cpu/operator/PadImpl.hpp" +#include "aidge/backend/cpu/operator/PaddedConvImpl.hpp" #include "aidge/backend/cpu/operator/PowImpl.hpp" #include "aidge/backend/cpu/operator/ReduceMeanImpl.hpp" #include "aidge/backend/cpu/operator/ReduceSumImpl.hpp" +#include "aidge/backend/cpu/operator/ResizeImpl.hpp" #include "aidge/backend/cpu/operator/ReLUImpl.hpp" +#include "aidge/backend/cpu/operator/RoundImpl.hpp" #include "aidge/backend/cpu/operator/ScalingImpl.hpp" #include "aidge/backend/cpu/operator/SigmoidImpl.hpp" #include "aidge/backend/cpu/operator/SqrtImpl.hpp" diff --git a/include/aidge/backend/cpu/data/Interpolation.hpp b/include/aidge/backend/cpu/data/Interpolation.hpp new file mode 100644 index 0000000000000000000000000000000000000000..5909f02a190f4e10cdeb878505fdfea1a17e2d75 --- /dev/null +++ b/include/aidge/backend/cpu/data/Interpolation.hpp @@ -0,0 +1,117 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_CPU_DATA_INTERPOLATION_H_ +#define AIDGE_CPU_DATA_INTERPOLATION_H_ + +#include <vector> + +#include <aidge/data/Interpolation.hpp> +#include <aidge/utils/Types.h> + +namespace Aidge { +class InterpolationCPU : public Interpolation { + public: + /* + * @brief Interpolates values given via input in given mode. + * + * Values are contiguously arranged in a "square" shape around the point to + * interpolate. Depending on interpolation mode. + * The point that will be interpolated is located right in the + * middle of all points. + * Immediate neighbours : + * 1D interp : 2D interp : + * . . . . . . + * . . 1 2 . . . . . . . . + * . . 1 2 . . + * . . 3 4 . . + * . . . . . . + * . . . . . . + * + * 2 neighbours : + * 1D interp : 2D interp : + * . . . . . . . . + * . . . . . . . . + * . 
. 1 2 3 4 . . . . 1 2 3 4 . . + * . . 5 6 7 8 . . + * . . 9 10 11 12 . . + * . . 13 14 15 16 . . + * . . . . . . . . + * . . . . . . . . + * + * @param[in] originalCoords: coord of the point to interpolate in the + * original picture. These coords are generated with + * Interpolation::untransformCoords(coordsInInterpolatedTensor) + * @param[in] points : points to interpolate, arranged in a vector of a + * pairs ((point_coord), value) : + * [[[X1, X2, ..., XN], Xval], ...., [[A1, A2, ..., AN],Aval]]. + * With : + * - N: the number of dimensions. + * - A: the number of points of the grid to interpolate. + * - All coordinates expressed in originalTensor frame. + * @param[in] interpMode: interpolation mode + * @return interpolated value + */ + template <typename T> + static T interpolate(const std::vector<float> &coordsToInterpolate, + const std::set<Point<T>> &points, + const Mode interpMode = Interpolation::Mode::Linear); + + /** + * @brief performs linear interpolation on given points. + * @param[in] values: values to interpolate, since we only do an average of + * all values, their indexes isn't useful. + * @return interpolated value + */ + template <typename T> + static T linear(const std::vector<float> &originalCoords, + const std::set<Point<T>> &points); + + /** + * @brief performs nearest interpolation on given points. + * @note it is a wrapper for linearRecurse() private method + * @param[in] coordsToInterpolate: coordinates to interpolate + * @param[in] points: points to interpolate + * @param[in] interpMode: interpolation method, must be a Nearest... + * otherwise function will throw an error. + * @return interpolated value + */ + template <typename T> + static T nearest(const std::vector<float> &coordsToInterpolate, + const std::set<Point<T>> &points, + const Interpolation::Mode nearestMode); + + private: + /** + * @brief actual linear interpolation function. + * will : + * - Split all points along each dimension depending of if their coords at + * idx alongDim are above or under coordsToInterpolate until they are + * 1-to-1. + * - Perform interpolation in 2 leftover points and return interpolated + * point to parent call with a set of size 1. + * - repeat until all dimensions have been interpolated. + * @param[in] coordsToInterpolate: coordinates to interpolate + * @param[in] points: points to interpolate + * @param[in] alongDim: discriminant on along which dimension are being + * segregated. 
+ * @return + */ + template <typename T> + static std::set<Interpolation::Point<T>> + linearRecurse(const std::vector<float> &coordsToInterpolate, + const std::set<Point<T>> &points, + const DimIdx_t alongDim = 0); +}; + +} // namespace Aidge + +#endif // AIDGE_CPU_DATA_INTERPOLATION_H_ diff --git a/include/aidge/backend/cpu/operator/AddImpl.hpp b/include/aidge/backend/cpu/operator/AddImpl.hpp index 5e795922a67be178dde588e8e5e346ec268efe86..e39c35b42fdb6065aa72aee092cd1cd23b2b1011 100644 --- a/include/aidge/backend/cpu/operator/AddImpl.hpp +++ b/include/aidge/backend/cpu/operator/AddImpl.hpp @@ -25,7 +25,7 @@ namespace Aidge { // Operator implementation entry point for the backend using AddImpl_cpu = OperatorImpl_cpu<Add_Op, - void(const std::vector<const void*>, const std::vector<std::vector<std::size_t>>&, const std::size_t, const std::vector<std::size_t>&, void*)>; + void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*, void*)>; // Implementation entry point registration to Operator REGISTRAR(Add_Op, "cpu", Aidge::AddImpl_cpu::create); diff --git a/include/aidge/backend/cpu/operator/AddImpl_kernels.hpp b/include/aidge/backend/cpu/operator/AddImpl_kernels.hpp index 4a4ba2a8999c4dc33fc743b5a3a7dad023f9e0dd..e6d13fcf3699824a8410015d35ff766adf617c11 100644 --- a/include/aidge/backend/cpu/operator/AddImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/AddImpl_kernels.hpp @@ -14,31 +14,137 @@ #include "aidge/utils/Registrar.hpp" -#include <cstdint> // std::int32_t, std::int64_t +#include <cstddef> // std::size_t #include "aidge/backend/cpu/data/Broadcasting.hpp" #include "aidge/backend/cpu/operator/AddImpl.hpp" namespace Aidge { +namespace { +// suppose values are contiguous in memory template <class I, class O> -void AddImpl_cpu_forward_kernel(const std::vector<const void*> inputs_, const std::vector<std::vector<std::size_t>>& inputDims, const std::size_t outputLength, const std::vector<std::size_t>& outDims, void* output_) { - // FIXME: missing Add attributes as arguments - std::vector<const I*> inputs; - for (const auto& input_ : inputs_) { - inputs.push_back(static_cast<const I*>(input_)); +void add_contiguous_arrays(const std::size_t input1size, + const std::size_t input2size, + const std::size_t output1size, + const I* input1, + const I* input2, + O* output) +{ + for (std::size_t i = 0; i < output1size; ++i) + { + const std::size_t in1_id = (input1size != 1) ? i : 0; + const std::size_t in2_id = (input2size != 1) ? i : 0; + output[i] = static_cast<O>(input1[in1_id] + input2[in2_id]); } +} +} + +template <class I, class O> +void AddImpl_cpu_forward_kernel(std::vector<std::size_t> dims0, + std::vector<std::size_t> dims1, + const std::vector<std::size_t>& outputDims, + const void* input0_, + const void* input1_, + void* output_) { + + const I* input_0 = static_cast<const I*>(input0_); + const I* input_1 = static_cast<const I*>(input1_); O* output = static_cast<O*>(output_); - for (std::size_t oIndex = 0; oIndex < outputLength; ++oIndex) - { - output[oIndex] = 0; - std::vector<size_t> indexes = getMultiDimIndices(outDims, oIndex); - for(std::size_t iIndex = 0; iIndex < inputs.size(); ++iIndex) { - std::size_t idx = getFlattenedIndex(inputDims[iIndex], indexes); - output[oIndex] += inputs[iIndex][idx]; - } - } + // [5,2,1,7] & [2,6,7] + // 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7] + // 2. 
Find the highest equal dimension -> 3 + // Exception: if the first diverging dimension is the last one, then -> 4 (dims.size()) + // 3. Compute the highest number of contiguous data -> 7 + // 4. Compute stride and offset step for the broadcast mechanism + // 5. Call a simple kernel + + // special case for equal dimensions, the kernel is called with the entire arrays at once + if (dims0 == dims1) { + const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); + for (std::size_t i = 0; i < input0_contiguous_size; ++i) + { + output[i] = static_cast<O>(input_0[i] + input_1[i]); + } + return; + } + + // set dimensions to be of equal size by filling the smallest one with ones. + if (dims0.size() > dims1.size()) { + dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1)); + } + else if (dims1.size() > dims0.size()) { + dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1)); + } + + const std::size_t nbDims = dims0.size(); + + // Find the highest equal dimension + // std::size_t contiguousIdx = nbDims - 1; + std::size_t contiguousIdx = nbDims; + while (contiguousIdx-- > 0) { + // for (; contiguousIdx+1 > 0; --contiguousIdx) { + if (dims0[contiguousIdx] != dims1[contiguousIdx]) { + if (contiguousIdx == (nbDims -1)) { // last dimensions of one of the input Tensor are of size 1 + const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1; + while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) { + --contiguousIdx; + } + } + break; + } + } + ++contiguousIdx; + + // Compute the highest number of contiguous data for each Tensor + const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + + // initialize strides to iterate through data because of broadcasting + std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx); + if (contiguousIdx > 0) { + stride_post0[contiguousIdx - 1] = 1; + stride_post1[contiguousIdx - 1] = 1; + for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) { + stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]); + stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]); + } + for (std::size_t i = 0; i != contiguousIdx; ++i) { + stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1; + stride_step1[i] = (dims1[i] == 1) ? 
1 - stride_post1[i] : 1; + } + } + + // variables for arrays offsets + std::size_t offsetIn0 = 0; + std::size_t offsetIn1 = 0; + std::size_t offsetOut = 0; + + + std::size_t dim = contiguousIdx - 1; + const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>()); + for (std::size_t stack = 0; stack < nbStacks;) { + add_contiguous_arrays<I,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size, + input_0 + offsetIn0*input0_contiguous_size, + input_1 + offsetIn1*input1_contiguous_size, + output + offsetOut*output_contiguous_size); + if (++stack < nbStacks) { + std::size_t tmp_stack = stack; + while(tmp_stack % outputDims[dim] == 0) { + tmp_stack /= outputDims[dim]; + dim--; + } + offsetIn0 += stride_step0[dim]; + offsetIn1 += stride_step1[dim]; + ++offsetOut; + dim = contiguousIdx - 1; + } + } } // Kernels registration to implementation entry point @@ -48,6 +154,12 @@ REGISTRAR(AddImpl_cpu, REGISTRAR(AddImpl_cpu, {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float64}}, {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<double, double>, nullptr}); +REGISTRAR(AddImpl_cpu, + {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int8}}, + {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::int8_t, std::int8_t>, nullptr}); +REGISTRAR(AddImpl_cpu, + {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::UInt8}}, + {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::uint8_t, std::uint8_t>, nullptr}); REGISTRAR(AddImpl_cpu, {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int32}}, {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr}); diff --git a/include/aidge/backend/cpu/operator/AndImpl.hpp b/include/aidge/backend/cpu/operator/AndImpl.hpp index 316a2fb922596642088d133a7fec49c988739bb7..8814df2fac36be56332035731679b724b169efe7 100644 --- a/include/aidge/backend/cpu/operator/AndImpl.hpp +++ b/include/aidge/backend/cpu/operator/AndImpl.hpp @@ -23,7 +23,7 @@ namespace Aidge { // Operator implementation entry point for the backend using AndImpl_cpu = OperatorImpl_cpu<And_Op, - void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*,void*)>; + void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*, void*)>; // Implementation entry point registration to Operator REGISTRAR(And_Op, "cpu", Aidge::AndImpl_cpu::create); diff --git a/include/aidge/backend/cpu/operator/AndImpl_kernels.hpp b/include/aidge/backend/cpu/operator/AndImpl_kernels.hpp index 197e829f3527ce2f36c3ef5ee812a26477633703..73b710e021ac5031923eb1e9a2492502c02a3633 100644 --- a/include/aidge/backend/cpu/operator/AndImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/AndImpl_kernels.hpp @@ -12,52 +12,152 @@ #ifndef AIDGE_CPU_OPERATOR_ANDIMPL_KERNELS_H_ #define AIDGE_CPU_OPERATOR_ANDIMPL_KERNELS_H_ -#include "aidge/backend/cpu/data/Broadcasting.hpp" #include "aidge/backend/cpu/operator/AndImpl.hpp" #include "aidge/utils/Registrar.hpp" namespace Aidge { -template <class I1, class I2, class O> -void AndImpl_cpu_forward_kernel(const std::vector<std::size_t>& input1Dims, - const std::vector<std::size_t>& input2Dims, + +namespace { +// suppose values are contiguous in memory +template <class I, class O> +void equal_contiguous_arrays(const std::size_t input1size, + const std::size_t 
input2size, + const std::size_t output1size, + const I* input1, + const I* input2, + O* output) +{ + for (std::size_t i = 0; i < output1size; ++i) + { + const std::size_t in1_id = (input1size != 1) ? i : 0; + const std::size_t in2_id = (input2size != 1) ? i : 0; + output[i] = static_cast<O>(input1[in1_id] == input2[in2_id]); + } +} +} + + +template <class I, class O> +void EqualImpl_cpu_forward_kernel(std::vector<std::size_t> dims0, + std::vector<std::size_t> dims1, const std::vector<std::size_t>& outputDims, + const void* input0_, const void* input1_, - const void* input2_, void* output_) { - const I1* input_1 = static_cast<const I1*>(input1_); - const I2* input_2 = static_cast<const I2*>(input2_); + const I* input_0 = static_cast<const I*>(input0_); + const I* input_1 = static_cast<const I*>(input1_); O* output = static_cast<O*>(output_); - size_t totalElements = 1; - for (size_t dimSize : outputDims) { - totalElements *= dimSize; + // [5,2,1,7] & [2,6,7] + // 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7] + // 2. Find the highest equal dimension -> 3 + // Exception: if the first diverging dimension is the last one, then -> 4 (dims.size()) + // 3. Compute the highest number of contiguous data -> 7 + // 4. Compute stride and offset step for the broadcast mechanism + // 5. Call a simple kernel + + // special case for equal dimensions, the kernel is called with the entire arrays at once + if (dims0 == dims1) { + const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); + for (std::size_t i = 0; i < input0_contiguous_size; ++i) + { + output[i] = static_cast<O>(input_0[i] == input_1[i]); + } + return; } - for (std::size_t oIndex = 0; oIndex < totalElements; ++oIndex) - { - std::vector<size_t> indexes = getMultiDimIndices(outputDims, oIndex); + // set dimensions to be of equal size by filling the smallest one with ones. + if (dims0.size() > dims1.size()) { + dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1)); + } + else if (dims1.size() > dims0.size()) { + dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1)); + } - std::size_t idx1 = getFlattenedIndex(input1Dims, indexes); - std::size_t idx2 = getFlattenedIndex(input2Dims, indexes); + const std::size_t nbDims = dims0.size(); - output[oIndex] = static_cast<O>(input_1[idx1] == input_2[idx2]); + // Find the highest equal dimension + // std::size_t contiguousIdx = nbDims - 1; + std::size_t contiguousIdx = nbDims; + while (contiguousIdx-- > 0) { + // for (; contiguousIdx+1 > 0; --contiguousIdx) { + if (dims0[contiguousIdx] != dims1[contiguousIdx]) { + if (contiguousIdx == (nbDims -1)) { // last dimensions of one of the input Tensor are of size 1 + const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? 
dims0 : dims1; + while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) { + --contiguousIdx; + } + } + break; + } + } + ++contiguousIdx; + + // Compute the highest number of contiguous data for each Tensor + const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + + // initialize strides to iterate through data because of broadcasting + std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx); + if (contiguousIdx > 0) { + stride_post0[contiguousIdx - 1] = 1; + stride_post1[contiguousIdx - 1] = 1; + for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) { + stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]); + stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]); + } + for (std::size_t i = 0; i != contiguousIdx; ++i) { + stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1; + stride_step1[i] = (dims1[i] == 1) ? 1 - stride_post1[i] : 1; + } + } + + // variables for arrays offsets + std::size_t offsetIn0 = 0; + std::size_t offsetIn1 = 0; + std::size_t offsetOut = 0; + + + std::size_t dim = contiguousIdx - 1; + const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>()); + for (std::size_t stack = 0; stack < nbStacks;) { + equal_contiguous_arrays<I,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size, + input_0 + offsetIn0*input0_contiguous_size, + input_1 + offsetIn1*input1_contiguous_size, + output + offsetOut*output_contiguous_size); + if (++stack < nbStacks) { + std::size_t tmp_stack = stack; + while(tmp_stack % outputDims[dim] == 0) { + tmp_stack /= outputDims[dim]; + dim--; + } + offsetIn0 += stride_step0[dim]; + offsetIn1 += stride_step1[dim]; + ++offsetOut; + dim = contiguousIdx - 1; + } } } // Kernels registration to implementation entry point REGISTRAR(AndImpl_cpu, {DataType::Float32}, - {ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<float, float, float>, nullptr}); + {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<float, float>, nullptr}); REGISTRAR(AndImpl_cpu, {DataType::Float64}, - {ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<double, double, double>, nullptr}); + {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<double, double>, nullptr}); REGISTRAR(AndImpl_cpu, {DataType::Int32}, - {ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t>, nullptr}); + {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr}); REGISTRAR(AndImpl_cpu, {DataType::Int64}, - {ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<std::int64_t, std::int64_t, std::int64_t>, nullptr}); + {ProdConso::inPlaceModel, 
Aidge::EqualImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, nullptr}); + } // namespace Aidge #endif /* AIDGE_CPU_OPERATOR_ANDIMPL_KERNELS_H_ */ diff --git a/include/aidge/backend/cpu/operator/AtanImpl.hpp b/include/aidge/backend/cpu/operator/AtanImpl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..2f1b4bf0ad666ff9856c24fa675b70d6f830b07c --- /dev/null +++ b/include/aidge/backend/cpu/operator/AtanImpl.hpp @@ -0,0 +1,33 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_CPU_OPERATOR_ATAN_H_ +#define AIDGE_CPU_OPERATOR_ATAN_H_ + +#include "aidge/backend/cpu/operator/OperatorImpl.hpp" +#include "aidge/operator/Atan.hpp" +#include "aidge/utils/Registrar.hpp" +#include "aidge/utils/Types.h" +#include "aidge/backend/cpu/data/GetCPUPtr.h" +#include <memory> +#include <vector> + +namespace Aidge { +// Operator implementation entry point for the backend +using AtanImpl_cpu = OperatorImpl_cpu<Atan_Op, + void(const std::size_t, const void*, void*), + void(const std::size_t, const void*, const void*, void*)>; + +// Implementation entry point registration to Operator +REGISTRAR(Atan_Op, "cpu", Aidge::AtanImpl_cpu::create); +} // namespace Aidge + +#endif /* AIDGE_CPU_OPERATOR_ATAN_H_ */ diff --git a/include/aidge/backend/cpu/operator/AtanImpl_kernels.hpp b/include/aidge/backend/cpu/operator/AtanImpl_kernels.hpp new file mode 100644 index 0000000000000000000000000000000000000000..2a786339503354514416705b61cfedfcc0b7c321 --- /dev/null +++ b/include/aidge/backend/cpu/operator/AtanImpl_kernels.hpp @@ -0,0 +1,60 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_CPU_OPERATOR_ATANIMPL_KERNELS_H_ +#define AIDGE_CPU_OPERATOR_ATANIMPL_KERNELS_H_ + +#include "aidge/utils/Registrar.hpp" + +#include "aidge/backend/cpu/operator/AtanImpl.hpp" +#include <cmath> // For atan() + + +namespace Aidge { +template <class I, class O> +void AtanImpl_cpu_forward_kernel(std::size_t inputLenght, + const void* input_, + void* output_) { + const I* input = static_cast<const I*>(input_); + O* output = static_cast<O*>(output_); + + for (size_t i = 0; i < inputLenght; ++i) { + output[i] = static_cast<O>(atan(input[i])); + } + +} + +template <class O, class GI, class GO> +void AtanImpl_cpu_backward_kernel(const std::size_t inputLenght, + const void* output_, const void* grad_output_, + void* grad_input_) { + const O* output = static_cast<const O*>(output_); + const GO* grad_output = static_cast<const GO*>(grad_output_); + GI* grad_input = static_cast<GI*>(grad_input_); + + // Apply the derivative of atan for each element in the input array + for (size_t i = 0; i < inputLenght; ++i) { + // dx = dy * (1 / (1 + x^2)) + grad_input[i] = grad_output[i] * static_cast<O>(1.0 / (1.0 + output[i] * output[i])); + } +} + + +// Kernels registration to implementation entry point +REGISTRAR(AtanImpl_cpu, + {DataType::Float32}, + {ProdConso::inPlaceModel, Aidge::AtanImpl_cpu_forward_kernel<float, float>, Aidge::AtanImpl_cpu_backward_kernel<float, float, float>}); +REGISTRAR(AtanImpl_cpu, + {DataType::Float64}, + {ProdConso::inPlaceModel, Aidge::AtanImpl_cpu_forward_kernel<double, double>, Aidge::AtanImpl_cpu_backward_kernel<double, double, double>}); +} // namespace Aidge + +#endif /* AIDGE_CPU_OPERATOR_ATANIMPL_KERNELS_H_ */ diff --git a/include/aidge/backend/cpu/operator/BatchNormImpl.hpp b/include/aidge/backend/cpu/operator/BatchNormImpl.hpp index 36a100b21edc6cd63a0176c89f2f1e57c10001c7..03dd5d1d04d5263eb84843925a1ce9ee3263423f 100644 --- a/include/aidge/backend/cpu/operator/BatchNormImpl.hpp +++ b/include/aidge/backend/cpu/operator/BatchNormImpl.hpp @@ -29,7 +29,7 @@ using BatchNorm2D_Op = BatchNorm_Op<2>; using BatchNormImpl2D_cpu = OperatorImpl_cpu<BatchNorm_Op<2>, void(float, float, - const std::array<DimSize_t, 4> &, + const std::vector<DimSize_t> &, const void *, const void *, const void *, diff --git a/include/aidge/backend/cpu/operator/BatchNormImpl_kernels.hpp b/include/aidge/backend/cpu/operator/BatchNormImpl_kernels.hpp index ec71e3b8e37e344c551fd643dc7b3957bdddcb67..cf97f7372ac528ef28d0f378beb2650af32bfa30 100644 --- a/include/aidge/backend/cpu/operator/BatchNormImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/BatchNormImpl_kernels.hpp @@ -38,7 +38,7 @@ namespace Aidge { * @param output_ Output Tensor. 
*/ template <class I, class P, class O> -void BatchNormImpl2D_cpu_forward_kernel(float epsilon, float momentum, const std::array<DimSize_t, 4> &dims, +void BatchNormImpl2D_cpu_forward_kernel(float epsilon, float momentum, const std::vector<DimSize_t> &dims, const void *input_, const void *scale_, const void *shift_, void *batchMean_, void *batchVar_, void *output_, const bool freeze) { // FIXME: missing convolution attributes as arguments const I *input = static_cast<const I *>(input_); @@ -49,9 +49,8 @@ void BatchNormImpl2D_cpu_forward_kernel(float epsilon, float momentum, const std O *output = static_cast<O *>(output_); const DimSize_t nbBatch = dims[0]; - const DimSize_t nbChannels = dims[1]; - const DimSize_t featureMapSize = dims[2]*dims[3]; - + const DimSize_t nbChannels = (dims.size() > 1) ? dims[1] : 1; + const DimSize_t featureMapSize = (dims.size() > 2) ? std::accumulate(dims.begin() + 2, dims.end(), 1, std::multiplies<DimSize_t>()) : 1; if ((freeze == true) || (momentum == 0.0f)) { for (std::size_t batch = 0; batch < nbBatch; ++batch) { diff --git a/include/aidge/backend/cpu/operator/BitShiftImpl.hpp b/include/aidge/backend/cpu/operator/BitShiftImpl.hpp index 6da67bb7dd4469b6ca609c5aea1ae70dfca3f939..807d2b972ba385f9382d4121173a75207600d098 100644 --- a/include/aidge/backend/cpu/operator/BitShiftImpl.hpp +++ b/include/aidge/backend/cpu/operator/BitShiftImpl.hpp @@ -24,13 +24,13 @@ namespace Aidge { // Operator implementation entry point for the backend using BitShiftImpl_cpu = OperatorImpl_cpu<BitShift_Op, void(const BitShift_Op::BitShiftDirection, - const std::vector<std::size_t>&, - const std::vector<std::size_t>&, - const std::vector<std::size_t>&, - const void*, + std::vector<std::size_t>, + std::vector<std::size_t>, + const std::vector<std::size_t>&, + const void*, const void*, void*)>; - + // Implementation entry point registration to Operator REGISTRAR(BitShift_Op,"cpu",Aidge::BitShiftImpl_cpu::create); } // namespace Aidge diff --git a/include/aidge/backend/cpu/operator/BitShiftImpl_kernels.hpp b/include/aidge/backend/cpu/operator/BitShiftImpl_kernels.hpp index f815e946ea2e4abaff48a6e5155368d564e88e8c..1f2561afe0be9997116cbd82f754c485a1760090 100644 --- a/include/aidge/backend/cpu/operator/BitShiftImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/BitShiftImpl_kernels.hpp @@ -12,47 +12,150 @@ #ifndef AIDGE_CPU_OPERATOR_BITSHIFTIMPL_KERNELS_H_ #define AIDGE_CPU_OPERATOR_BITSHIFTIMPL_KERNELS_H_ -#include "aidge/utils/Registrar.hpp" -#include <cstdint> // std::int32_t, std::int64_t -#include "aidge/operator/BitShift.hpp" +#include <cstdint> // std::int32_t, std::int64_t +#include <cstddef> // std::size_t #include "aidge/backend/cpu/data/Broadcasting.hpp" #include "aidge/backend/cpu/operator/BitShiftImpl.hpp" +#include "aidge/operator/BitShift.hpp" +#include "aidge/utils/Registrar.hpp" +namespace { +// suppose values are contiguous in memory +template <class I1, class I2, class O> +void bitshift_contiguous_arrays( + const Aidge::BitShift_Op::BitShiftDirection direction, + const std::size_t input1size, + const std::size_t input2size, + const std::size_t output1size, + const I1* input_1, + const I2* input_2, + O* output) +{ + if(direction == Aidge::BitShift_Op::BitShiftDirection::right) { + for (std::size_t i = 0; i < output1size; ++i) { + const std::size_t idx1 = (input1size != 1) ? i : 0; + const std::size_t idx2 = (input2size != 1) ? 
i : 0; + output[i]= input_1[idx1] >> input_2[idx2]; + } + + } else { + for (std::size_t i = 0; i < output1size; ++i) { + const std::size_t idx1 = (input1size != 1) ? i : 0; + const std::size_t idx2 = (input2size != 1) ? i : 0; + output[i] = input_1[idx1] << input_2[idx2]; + } + } +} +} namespace Aidge { template <class I1, class I2, class O> void BitShiftImpl_cpu_forward_kernel( const BitShift_Op::BitShiftDirection direction, - const std::vector<std::size_t>& input1Dims, - const std::vector<std::size_t>& input2Dims, + std::vector<std::size_t> dims0, + std::vector<std::size_t> dims1, const std::vector<std::size_t>& outputDims, + const void* input0_, const void* input1_, - const void* input2_, void* output_ ) { - const I1* input_1 = static_cast<const I1*>(input1_); - const I2* input_2 = static_cast<const I2*>(input2_); + const I1* input_0 = static_cast<const I1*>(input0_); + const I2* input_1 = static_cast<const I2*>(input1_); O* output = static_cast<O*>(output_); - const size_t totalElements = std::accumulate(outputDims.begin(), outputDims.end(), std::size_t(1), std::multiplies<std::size_t>()); - - for (std::size_t oIndex = 0; oIndex < totalElements; ++oIndex) - { - std::vector<size_t> indexes = getMultiDimIndices(outputDims, oIndex); - std::size_t idx1 = getFlattenedIndex(input1Dims, indexes); - std::size_t idx2 = getFlattenedIndex(input2Dims, indexes); - if(direction == BitShift_Op::BitShiftDirection::right) - - { - output[oIndex]= input_1[idx1] >> input_2[idx2]; + // [5,2,1,7] & [2,6,7] + // 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7] + // 2. Find the highest equal dimension -> 3 + // Exception: if the first diverging dimension is the last one, then -> 4 (dims.size()) + // 3. Compute the highest number of contiguous data -> 7 + // 4. Compute stride and offset step for the broadcast mechanism + // 5. Call a simple kernel + + // ## Compute compatible input dimensions + // special case for equal dimensions, the kernel is called with the entire arrays at once + if (dims0 == dims1) { + const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); + bitshift_contiguous_arrays(direction, input0_contiguous_size, input0_contiguous_size, input0_contiguous_size, input_0, input_1, output); + return; + } + + // set dimensions to be of equal size by filling the smallest one with ones. + if (dims0.size() > dims1.size()) { + dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1)); + } + else if (dims1.size() > dims0.size()) { + dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1)); + } + + const std::size_t nbDims = dims0.size(); + + // Find the highest equal dimension + // std::size_t contiguousIdx = nbDims - 1; + std::size_t contiguousIdx = nbDims; + while (contiguousIdx-- > 0) { + // for (; contiguousIdx+1 > 0; --contiguousIdx) { + if (dims0[contiguousIdx] != dims1[contiguousIdx]) { + if (contiguousIdx == (nbDims -1)) { // last dimensions of one of the input Tensor are of size 1 + const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? 
dims0 : dims1; + while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) { + --contiguousIdx; + } + } + break; } - else - { - output[oIndex] = input_1[idx1] << input_2[idx2]; + } + ++contiguousIdx; + + // Compute the highest number of contiguous data for each Tensor + const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + + // initialize strides to iterate through data because of broadcasting + std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx); + if (contiguousIdx > 0) { + stride_post0[contiguousIdx - 1] = 1; + stride_post1[contiguousIdx - 1] = 1; + for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) { + stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]); + stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]); + } + for (std::size_t i = 0; i != contiguousIdx; ++i) { + stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1; + stride_step1[i] = (dims1[i] == 1) ? 1 - stride_post1[i] : 1; + } + } + + // variables for arrays offsets + std::size_t offsetIn0 = 0; + std::size_t offsetIn1 = 0; + std::size_t offsetOut = 0; + + + std::size_t dim = contiguousIdx - 1; + const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>()); + for (std::size_t stack = 0; stack < nbStacks;) { + bitshift_contiguous_arrays<I1,I2,O>(direction, input0_contiguous_size, input1_contiguous_size, output_contiguous_size, + input_0 + offsetIn0*input0_contiguous_size, + input_1 + offsetIn1*input1_contiguous_size, + output + offsetOut*output_contiguous_size); + if (++stack < nbStacks) { + std::size_t tmp_stack = stack; + while(tmp_stack % outputDims[dim] == 0) { + tmp_stack /= outputDims[dim]; + dim--; + } + offsetIn0 += stride_step0[dim]; + offsetIn1 += stride_step1[dim]; + ++offsetOut; + dim = contiguousIdx - 1; } } } diff --git a/include/aidge/backend/cpu/operator/ClipImpl.hpp b/include/aidge/backend/cpu/operator/ClipImpl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..c83836d5aa1d6aae27e3fdce1bbb9561b70ec31e --- /dev/null +++ b/include/aidge/backend/cpu/operator/ClipImpl.hpp @@ -0,0 +1,46 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_CPU_OPERATOR_CLIPIMPL_H_ +#define AIDGE_CPU_OPERATOR_CLIPIMPL_H_ + +#include <cstddef> // std::size_t +#include <memory> +#include <tuple> // std::tuple +#include <vector> +#include <algorithm> + +#include "aidge/backend/cpu/operator/OperatorImpl.hpp" +#include "aidge/operator/Clip.hpp" +#include "aidge/utils/Registrar.hpp" +#include "aidge/utils/Types.h" +#include "aidge/backend/cpu/data/GetCPUPtr.h" + + +namespace Aidge { +// Operator implementation entry point for the backend + using ClipImpl_cpu = OperatorImpl_cpu<Clip_Op, + void(float, //Forward Types + float, + const void*, + const std::size_t, + void*), + void(float,//Backward Types + float, + const std::size_t, + const void*, + const void*, + void*)>; + + REGISTRAR(Clip_Op,"cpu",Aidge::ClipImpl_cpu::create); +} // namespace Aidge + +#endif /* AIDGE_CPU_OPERATOR_CLIPIMPL_H_ */ diff --git a/include/aidge/backend/cpu/operator/ClipImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ClipImpl_kernels.hpp new file mode 100644 index 0000000000000000000000000000000000000000..1afac4698be2a63790ebac671ecc1e59166c5f94 --- /dev/null +++ b/include/aidge/backend/cpu/operator/ClipImpl_kernels.hpp @@ -0,0 +1,77 @@ + +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_CPU_OPERATOR_CLIPIMPL_KERNELS_H_ +#define AIDGE_CPU_OPERATOR_CLIPIMPL_KERNELS_H_ + +#include "aidge/utils/Registrar.hpp" +#include "aidge/backend/cpu/operator/ClipImpl.hpp" + +namespace Aidge { +template <class I, class O> +void ClipImpl_cpu_forward_kernel( + float min_, + float max_, + const void* input_, + const std::size_t length, + void* output_) +{ + const I* input = static_cast<const I*>(input_); + O* output = static_cast<O*>(output_); + + for (std::size_t i = 0; i < length; ++i) { + output[i] = std::min(std::max(static_cast<float>(input[i]), min_), max_); + } +} + +template <class I, class GI, class GO> +void ClipImpl_cpu_backward_kernel( + float min_, + float max_, + const std::size_t length, + const void* input_, + const void* grad_output_, + void* grad_input_) +{ + const I* input = static_cast<const I*>(input_); + const GO* grad_output = static_cast<const GO*>(grad_output_); + GI* grad_input = static_cast<GI*>(grad_input_); + + for (std::size_t i = 0; i < length; ++i) { + grad_input[i] = ((input[i] > min_) && (input[i] < max_)) ? 
grad_output[i] : 0; + } +} + +REGISTRAR(ClipImpl_cpu, +{DataType::Float32}, +{ProdConso::inPlaceModel, +Aidge::ClipImpl_cpu_forward_kernel<float,float>, +Aidge::ClipImpl_cpu_backward_kernel<float,float,float>}); +REGISTRAR(ClipImpl_cpu, +{DataType::Float64}, +{ProdConso::inPlaceModel, +Aidge::ClipImpl_cpu_forward_kernel<double,double>, +Aidge::ClipImpl_cpu_backward_kernel<double,double,double>}); +REGISTRAR(ClipImpl_cpu, +{DataType::Int32}, +{ProdConso::inPlaceModel, +Aidge::ClipImpl_cpu_forward_kernel<std::int32_t,std::int32_t>, +Aidge::ClipImpl_cpu_backward_kernel<std::int32_t,std::int32_t,std::int32_t>}); +REGISTRAR(ClipImpl_cpu, +{DataType::Int64}, +{ProdConso::inPlaceModel, +Aidge::ClipImpl_cpu_forward_kernel<std::int64_t,std::int64_t>, +Aidge::ClipImpl_cpu_backward_kernel<std::int64_t,std::int64_t,std::int64_t>}); + +} // namespace Aidge + +#endif /* AIDGE_CPU_OPERATOR_CLIPIMPL_KERNELS_H_ */ diff --git a/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp index 59a471aee82f7c706be390d80b5db569bd3c6f1e..46ae59877bee1b87a9a17be242434d3caca7aae2 100644 --- a/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp @@ -137,6 +137,7 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri const std::size_t oxSize = static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - dilated_kernel_x + strideDims[0]) / static_cast<float>(strideDims[0]))); + // output W size const DimSize_t dilated_kernel_y = dilationDims[1]*(kernelDims[1] - 1) + 1; const std::size_t oySize = @@ -148,54 +149,106 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri // input (batch, ch, Xin, Yin) // weight (outCh, ch, kernelX, kernelY) // does not take Dilation attribute into account - using signedsize = std::make_signed<std::size_t>::type; - for (std::size_t batch = 0; batch < inputDims[0]; ++batch) { - for (std::size_t ch = 0; ch < inputDims[1]; ++ch) { - const std::size_t oIndex = (ch + batch*inputDims[1]) * oxSize * oySize; - B biasVal = (biases != nullptr) ? biases[ch] : B(0); - std::fill(output + oIndex, output+(oIndex+oxSize*oySize), biasVal); - const std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3]; - const std::size_t wIndex = ch * kernelDims[0] * kernelDims[1]; - for (std::size_t ox = 0; ox < oxSize; ++ox) { - // const signedsize difx = static_cast<signedsize>(- ox * strideDims[0]); - // const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0))); - // const std::size_t sxMax = (static_cast<signedsize>(inputDims[2]) + difx) < 0 ? 0 : ((inputDims[2] + difx) > kernelDims[0] ? kernelDims[0] : inputDims[2] + difx); - const std::size_t sxMin = 0; - const std::size_t sxMax = dilated_kernel_x; - for (std::size_t oy = 0; oy < oySize; ++oy) { - // const signedsize dify = static_cast<signedsize>(- oy * strideDims[1]); - // const std::size_t syMin = static_cast<std::size_t>(std::max(dify, signedsize(0))); - // const std::size_t syMax = (static_cast<signedsize>(inputDims[3]) + dify) < 0 ? 0 : ((inputDims[3] + dify) > kernelDims[1] ? 
kernelDims[1] : inputDims[3] + dify); - const std::size_t syMin = 0; - const std::size_t syMax = dilated_kernel_y; - const std::size_t oIndexFull = oIndex + ox*oySize + oy; - const signedsize ix = static_cast<signedsize>(ox * strideDims[0]); - const signedsize iy = static_cast<signedsize>(oy * strideDims[1]); - - if (sxMin == 0 && syMin == 0 && sxMax == 3 && syMax == 3) { - output[oIndexFull] += (weights[wIndex + 0*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+0)] + - weights[wIndex + 0*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+1)] + - weights[wIndex + 0*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+2)] + - weights[wIndex + 1*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+0)] + - weights[wIndex + 1*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+1)] + - weights[wIndex + 1*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+2)] + - weights[wIndex + 2*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+0)] + - weights[wIndex + 2*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+1)] + - weights[wIndex + 2*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+2)]); - } else { - for (std::size_t sx = sxMin; sx*dilationDims[0] < sxMax; ++sx) { - for (std::size_t sy = syMin; sy*dilationDims[1] < syMax; ++sy) { + const std::size_t outChannels_s = oxSize * oySize; + + if (dilated_kernel_x ==3 && dilated_kernel_y == 3) { + for (std::size_t batch = 0; batch < inputDims[0]; ++batch) { + for (std::size_t ch = 0; ch < inputDims[1]; ++ch) { + + B biasVal = (biases != nullptr) ? 
biases[ch] : B(0); + + std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3]; + const std::size_t wIndex = ch * 9; + + if (strideDims[0] == 1 && strideDims[1]==1) { + for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex-=inputDims[3]) { + for (std::size_t oy = 0; oy < oySize; ++oy) { + output[oIndex + oy] = biasVal + weights[wIndex+0]*input[iIndex+oy]+weights[wIndex+1]*input[iIndex+oy+1]+weights[wIndex+2]*input[iIndex+oy+2]; + } + iIndex+=inputDims[3]; + for (std::size_t oy = 0; oy < oySize; ++oy) { + output[oIndex + oy] += weights[wIndex+3]*input[iIndex+oy]+weights[wIndex+4]*input[iIndex+oy+1]+weights[wIndex+5]*input[iIndex+oy+2]; + } + iIndex+=inputDims[3]; + for (std::size_t oy = 0; oy < oySize; ++oy) { + output[oIndex + oy] += weights[wIndex+6]*input[iIndex+oy]+weights[wIndex+7]*input[iIndex+oy+1]+weights[wIndex+8]*input[iIndex+oy+2]; + } + } + } else { + for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex-=strideDims[0]*inputDims[3]) { + for (std::size_t oy = 0; oy < oySize; ++oy) { + output[oIndex + oy] += weights[wIndex+0]*input[iIndex+oy]+weights[wIndex+1]*input[iIndex+oy+strideDims[0]]+weights[wIndex+2]*input[iIndex+oy+strideDims[0]*2]; + } + iIndex+=strideDims[0]*inputDims[3]; + for (std::size_t oy = 0; oy < oySize; ++oy) { + output[oIndex + oy] += weights[wIndex+3]*input[iIndex+oy]+weights[wIndex+4]*input[iIndex+oy+strideDims[0]]+weights[wIndex+5]*input[iIndex+oy+strideDims[0]*2]; + } + iIndex+=strideDims[0]*inputDims[3]; + for (std::size_t oy = 0; oy < oySize; ++oy) { + output[oIndex + oy] += weights[wIndex+6]*input[iIndex+oy]+weights[wIndex+7]*input[iIndex+oy+strideDims[0]]+weights[wIndex+8]*input[iIndex+oy+strideDims[0]*2]; + } + } + } + output += outChannels_s; + } + } + } else if (dilated_kernel_x == 1 && dilated_kernel_y == 1) { + std::size_t index = 0; + for (std::size_t batch = 0; batch < inputDims[0]; ++batch) { + for (std::size_t ch = 0; ch < inputDims[1]; ++ch) { + + B biasVal = (biases != nullptr) ? biases[ch] : B(0); + + const std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3]; + const std::size_t wIndex = ch; + + if (strideDims[0] == 1 && strideDims[1] == 1) { + for (; index < iIndex + oxSize*oySize; ++index) { + output[index] = biasVal + weights[wIndex] * input[index]; + } + } else { + std::size_t oIndex = (ch + batch*inputDims[1]) * oxSize * oySize; + for (std::size_t ox = 0; ox < oxSize; ++ox, oIndex+=oySize) { + index = iIndex + strideDims[0]*inputDims[3]; + for (std::size_t oy = 0, iy = 0; oy < oySize; ++oy, iy+=strideDims[1]) { + output[oIndex + oy] += weights[wIndex]*input[index+iy]; + } + } + } + } + } + } else { + for (std::size_t batch = 0; batch < inputDims[0]; ++batch) { + for (std::size_t ch = 0; ch < inputDims[1]; ++ch) { + + B biasVal = (biases != nullptr) ? 
biases[ch] : B(0); + std::fill(output, output+outChannels_s, biasVal); + + const std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3]; + const std::size_t wIndex = ch * kernelDims[0] * kernelDims[1]; + + for (std::size_t ox = 0; ox < oxSize; ++ox) { + for (std::size_t oy = 0; oy < oySize; ++oy) { + + const std::size_t oIndexFull = ox*oySize + oy; + const std::size_t ix = ox * strideDims[0]; + const std::size_t iy = oy * strideDims[1]; + + for (std::size_t sx = 0; sx*dilationDims[0] < dilated_kernel_x; ++sx) { + for (std::size_t sy = 0; sy*dilationDims[1] < dilated_kernel_y; ++sy) { output[oIndexFull] += weights[wIndex + sx*kernelDims[1] + sy] * - input[iIndex + static_cast<std::size_t>(ix+static_cast<signedsize>(sx*dilationDims[0]))*inputDims[3] + static_cast<std::size_t>(iy+static_cast<signedsize>(sy*dilationDims[1]))]; + input[iIndex + static_cast<std::size_t>(ix + sx*dilationDims[0])*inputDims[3] + static_cast<std::size_t>(iy + sy*dilationDims[1])]; } } } } } + output += outChannels_s; } } } + // Kernels registration to implementation entry point REGISTRAR(ConvDepthWiseImpl2D_cpu, {{DataType::Any, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}}, diff --git a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp index e800c252676ec5247a776abf458f808289b278c8..e3b709bf308288a93fd72865a2fdef0e58908134 100644 --- a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp @@ -141,15 +141,15 @@ void ConvImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims, O *output = static_cast<O *>(output_); // output H size + const DimSize_t dilated_kernel_x = dilationDims[0]*(kernelDims[0] - 1) + 1; const std::size_t oxSize = - static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - dilationDims[0]*(kernelDims[0] - 1) - 1 + strideDims[0]) / + static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - dilated_kernel_x + strideDims[0]) / static_cast<float>(strideDims[0]))); - const DimSize_t dilated_kernel_x = dilationDims[0]*(kernelDims[0] - 1) + 1; // output W size + const DimSize_t dilated_kernel_y = dilationDims[1]*(kernelDims[1] - 1) + 1; const std::size_t oySize = - static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[3] - dilationDims[1]*(kernelDims[1] - 1) - 1 + strideDims[1]) / + static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[3] - dilated_kernel_y + strideDims[1]) / static_cast<float>(strideDims[1]))); - const DimSize_t dilated_kernel_y = dilationDims[1]*(kernelDims[1] - 1) + 1; // TODO: kernel computation @@ -157,57 +157,107 @@ void ConvImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims, // input (batch, inCh, Xin, Yin) // weight (outCh, inCh, kernelX, kernelY) // does not take Dilation attribute into account - using signedsize = std::make_signed<std::size_t>::type; - for (std::size_t batch = 0; batch < inputDims[0]; ++batch) { - for (std::size_t outCh = 0; outCh < outChannels; ++outCh) { - const std::size_t oIndex = (outCh + batch*outChannels) * oxSize * oySize; - // If bias = nullptr, set B(0) - B biasVal = (biases != nullptr) ? 
biases[outCh] : B(0); - std::fill(output + oIndex, output+(oIndex+oxSize*oySize), biasVal); - for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) { - const std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3]; - const std::size_t wIndex = (inCh + outCh*inputDims[1]) * kernelDims[0] * kernelDims[1]; - for (std::size_t ox = 0; ox < oxSize; ++ox) { - // const signedsize difx = static_cast<signedsize>(- ox * strideDims[0]); - // const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0))); - // const std::size_t sxMax = (static_cast<signedsize>(inputDims[2]) + difx) < 0 ? 0 : ((inputDims[2] + difx) > kernelDims[0] ? kernelDims[0] : inputDims[2] + difx); - const std::size_t sxMin = 0; - const std::size_t sxMax = dilated_kernel_x; - for (std::size_t oy = 0; oy < oySize; ++oy) { - // const signedsize dify = static_cast<signedsize>(- oy * strideDims[1]); - // const std::size_t syMin = static_cast<std::size_t>(std::max(dify, signedsize(0))); - // const std::size_t syMax = (static_cast<signedsize>(inputDims[3]) + dify) < 0 ? 0 : ((inputDims[3] + dify) > kernelDims[1] ? kernelDims[1] : inputDims[3] + dify); - const std::size_t syMin = 0; - const std::size_t syMax = dilated_kernel_y; - const std::size_t oIndexFull = oIndex + ox*oySize + oy; - const signedsize ix = static_cast<signedsize>(ox * strideDims[0]); - const signedsize iy = static_cast<signedsize>(oy * strideDims[1]); - - if (sxMin == 0 && syMin == 0 && sxMax == 3 && syMax == 3) { - output[oIndexFull] += (weights[wIndex + 0*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+0)] + - weights[wIndex + 0*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+1)] + - weights[wIndex + 0*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+2)] + - weights[wIndex + 1*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+0)] + - weights[wIndex + 1*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+1)] + - weights[wIndex + 1*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+2)] + - weights[wIndex + 2*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+0)] + - weights[wIndex + 2*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+1)] + - weights[wIndex + 2*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+2)]); - } else { - for (std::size_t sx = sxMin; sx*dilationDims[0] < sxMax; ++sx) { - for (std::size_t sy = syMin; sy*dilationDims[1] < syMax; ++sy) { - output[oIndexFull] += weights[wIndex + sx*kernelDims[1] + sy] * - input[iIndex + static_cast<std::size_t>(ix+static_cast<signedsize>(sx*dilationDims[0]))*inputDims[3] + static_cast<std::size_t>(iy+static_cast<signedsize>(sy*dilationDims[1]))]; + const std::size_t outChannels_s = oxSize * oySize; + + if (dilated_kernel_x == 3 && dilated_kernel_y == 3) { + for (std::size_t batch = 0; batch < inputDims[0]; ++batch) { + for (std::size_t outCh = 0; outCh < outChannels; ++outCh) { + // If bias = nullptr, set B(0) + B biasVal = (biases != nullptr) ? 
biases[outCh] : B(0); + std::fill(output, output+outChannels_s, biasVal); + for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) { + std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3]; + const std::size_t wIndex = (inCh + outCh*inputDims[1]) * 9; + if (strideDims[0] == 1 && strideDims[1]==1) { + for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex-=inputDims[3]) { + for (std::size_t oy = 0; oy < oySize; ++oy) { + output[oIndex + oy] += weights[wIndex+0]*input[iIndex+oy]+weights[wIndex+1]*input[iIndex+oy+1]+weights[wIndex+2]*input[iIndex+oy+2]; + } + iIndex+=inputDims[3]; + for (std::size_t oy = 0; oy < oySize; ++oy) { + output[oIndex + oy] += weights[wIndex+3]*input[iIndex+oy]+weights[wIndex+4]*input[iIndex+oy+1]+weights[wIndex+5]*input[iIndex+oy+2]; + } + iIndex+=inputDims[3]; + for (std::size_t oy = 0; oy < oySize; ++oy) { + output[oIndex + oy] += weights[wIndex+6]*input[iIndex+oy]+weights[wIndex+7]*input[iIndex+oy+1]+weights[wIndex+8]*input[iIndex+oy+2]; + } + } + } else { + for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex-=strideDims[0]*inputDims[3]) { + for (std::size_t oy = 0; oy < oySize; ++oy) { + output[oIndex + oy] += weights[wIndex+0]*input[iIndex+oy]+weights[wIndex+1]*input[iIndex+oy+strideDims[0]]+weights[wIndex+2]*input[iIndex+oy+strideDims[0]*2]; + } + iIndex+=strideDims[0]*inputDims[3]; + for (std::size_t oy = 0; oy < oySize; ++oy) { + output[oIndex + oy] += weights[wIndex+3]*input[iIndex+oy]+weights[wIndex+4]*input[iIndex+oy+strideDims[0]]+weights[wIndex+5]*input[iIndex+oy+strideDims[0]*2]; + } + iIndex+=strideDims[0]*inputDims[3]; + for (std::size_t oy = 0; oy < oySize; ++oy) { + output[oIndex + oy] += weights[wIndex+6]*input[iIndex+oy]+weights[wIndex+7]*input[iIndex+oy+strideDims[0]]+weights[wIndex+8]*input[iIndex+oy+strideDims[0]*2]; + } + } + } + } + output += outChannels_s; + } + } + } else if (dilated_kernel_x == 1 && dilated_kernel_y == 1) { + for (std::size_t batch = 0; batch < inputDims[0]; ++batch) { + for (std::size_t outCh = 0; outCh < outChannels; ++outCh) { + // If bias = nullptr, set B(0) + B biasVal = (biases != nullptr) ? biases[outCh] : B(0); + std::fill(output, output+outChannels_s, biasVal); + for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) { + std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3]; + const std::size_t wIndex = (inCh + outCh*inputDims[1]); + if (strideDims[0] == 1 && strideDims[1] == 1) { + for (std::size_t oIndex = 0; oIndex < oxSize*oySize; ++oIndex, ++iIndex) { + output[oIndex] += weights[wIndex] * input[iIndex]; + } + } else { + for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex+=inputDims[3]*strideDims[0]) { + for (std::size_t oy = 0, iy = 0; oy < oySize; ++oy, iy+=strideDims[1]) { + output[oIndex + oy] += weights[wIndex+0]*input[iIndex+iy]; + } + } + } + } + output += outChannels_s; + } + } + } else { + for (std::size_t batch = 0; batch < inputDims[0]; ++batch) { + for (std::size_t outCh = 0; outCh < outChannels; ++outCh) { + // If bias = nullptr, set B(0) + B biasVal = (biases != nullptr) ? 
biases[outCh] : B(0); + std::fill(output, output+outChannels_s, biasVal); + for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) { + std::size_t iIndex_channel = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3]; + const std::size_t wIndex = (inCh + outCh*inputDims[1]) * kernelDims[0] * kernelDims[1]; + + // loop over each output line + for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex_channel+=inputDims[3]*strideDims[0]) { + // loop over associated input line + for (std::size_t ky = 0, ix = 0; ky < kernelDims[0]; ++ky, ix += inputDims[3]*dilationDims[0]) { + // loop over the entire line + for (std::size_t oy = 0, iy = 0; oy < oySize; ++oy, iy+=strideDims[1]) { + const std::size_t iIndex = iIndex_channel + ix + iy; + // loop over elements associated with one output + for (std::size_t kx = 0; kx < kernelDims[0]; ++kx) { + output[oIndex + oy] += weights[wIndex+kernelDims[0]*ky+kx]*input[iIndex+kx*dilationDims[1]]; } } } } } + output += outChannels_s; } } } } + + // Kernels registration to implementation entry point REGISTRAR(ConvImpl2D_cpu, {{DataType::Any, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}}, diff --git a/include/aidge/backend/cpu/operator/GridSampleImpl_kernels.hpp b/include/aidge/backend/cpu/operator/GridSampleImpl_kernels.hpp index ea62fd010db8c155a3ff86ff8396797da5ebb6be..3461b254b7beecf3e7a41e90a7e40d3f6ecf6a36 100644 --- a/include/aidge/backend/cpu/operator/GridSampleImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/GridSampleImpl_kernels.hpp @@ -65,7 +65,7 @@ static float update_normalized_coord_with_padding(float coord, Aidge::GridSample return coord; } -static inline std::int64_t update_unnormalized_coord_with_padding(std::int64_t coord, std::int64_t size, Aidge::GridSample_Op::PaddingMode padding_mode) { +static std::int64_t update_unnormalized_coord_with_padding(std::int64_t coord, std::int64_t size, Aidge::GridSample_Op::PaddingMode padding_mode) { if (!in_bound(coord, 0, size)) { // out of bound.
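A minimal standalone sketch of the output-size computation that the convolution kernels above factor into dilated_kernel_x / oxSize; the numeric values (input width 7, kernel 3, dilation 2, stride 2) are arbitrary examples, not taken from the patch.

#include <cmath>
#include <cstddef>
#include <cstdio>

int main() {
    const std::size_t inW = 7, kernel = 3, dilation = 2, stride = 2;  // example values only
    const std::size_t dilated_kernel = dilation * (kernel - 1) + 1;   // 5
    const std::size_t oxSize = static_cast<std::size_t>(
        std::floor(static_cast<float>(inW - dilated_kernel + stride) /
                   static_cast<float>(stride)));                      // floor(4 / 2) = 2 output positions
    std::printf("dilated kernel = %zu, output size = %zu\n", dilated_kernel, oxSize);
    return 0;
}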
switch padding mode if (padding_mode == Aidge::GridSample_Op::PaddingMode::Border) { @@ -96,11 +96,11 @@ void GridSampleImpl1D_cpu_forward_kernel(const GridSample_Op& op, const std::shared_ptr<Tensor>& in1, const std::shared_ptr<Tensor>& out) { - const I* const input = static_cast<const I * const>(in0->getImpl()->rawPtr()); + const I* const input = static_cast<const I *>(in0->getImpl()->rawPtr()); const I* input_ptr = input; - float* const grid = static_cast<float* const>(in1->getImpl()->rawPtr()); + float* const grid = static_cast<float*>(in1->getImpl()->rawPtr()); float* grid_ptr = grid; - O* const output = static_cast<O* const>(out->getImpl()->rawPtr()); + O* const output = static_cast<O*>(out->getImpl()->rawPtr()); O* output_ptr = output; const std::size_t N = in0->dim(0); @@ -243,9 +243,9 @@ void GridSampleImpl2D_cpu_forward_kernel(const GridSample_Op& op, { const I* input = static_cast<const I *>(in0->getImpl()->rawPtr()); const I* input_ptr = input; - float* const grid = static_cast<float* const>(in0->getImpl()->rawPtr()); + float* const grid = static_cast<float*>(in0->getImpl()->rawPtr()); float* grid_ptr = grid; - O* const output = static_cast<O* const>(out->getImpl()->rawPtr()); + O* const output = static_cast<O*>(out->getImpl()->rawPtr()); const std::size_t N = in0->dim(0); const std::size_t C = in0->dim(1); diff --git a/include/aidge/backend/cpu/operator/LRNImpl.hpp b/include/aidge/backend/cpu/operator/LRNImpl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..81956c8763010d6294bd4a11a943e66fb93a64eb --- /dev/null +++ b/include/aidge/backend/cpu/operator/LRNImpl.hpp @@ -0,0 +1,32 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_CPU_OPERATOR_LRNIMPL_H_ +#define AIDGE_CPU_OPERATOR_LRNIMPL_H_ + +#include "aidge/backend/cpu/operator/OperatorImpl.hpp" +#include "aidge/operator/LRN.hpp" +#include "aidge/utils/Registrar.hpp" +#include "aidge/utils/Types.h" +#include "aidge/backend/cpu/data/GetCPUPtr.h" +#include <memory> +#include <vector> + +namespace Aidge { +// Operator implementation entry point for the backend +using LRNImpl_cpu = OperatorImpl_cpu<LRN_Op, + void(float, float, float, std::size_t, const std::vector<DimSize_t>&, const void*, void*)>; + +// Implementation entry point registration to Operator +REGISTRAR(LRN_Op, "cpu", Aidge::LRNImpl_cpu::create); +} // namespace Aidge + +#endif /* AIDGE_CPU_OPERATOR_LRNIMPL_H_ */ diff --git a/include/aidge/backend/cpu/operator/LRNImpl_kernels.hpp b/include/aidge/backend/cpu/operator/LRNImpl_kernels.hpp new file mode 100644 index 0000000000000000000000000000000000000000..02018c9f8e002965584df38a95364ca10f69f8b7 --- /dev/null +++ b/include/aidge/backend/cpu/operator/LRNImpl_kernels.hpp @@ -0,0 +1,69 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
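The new implementation headers above (e.g. LRNImpl.hpp) all follow the same two-level registration: the operator is bound to the "cpu" backend, and each ImplSpec is then bound to a concrete kernel. A sketch of that pattern with purely hypothetical names (MyOp_Op, MyOpImpl_cpu, myForwardKernel); it reuses Aidge's REGISTRAR macro and OperatorImpl_cpu template as they appear throughout this patch and is not itself part of it.

// Hypothetical operator, shown only to illustrate the registration pattern.
using MyOpImpl_cpu = OperatorImpl_cpu<MyOp_Op,
    void(const std::size_t, const void*, void*)>;   // forward kernel signature

// 1) Operator -> backend entry point
REGISTRAR(MyOp_Op, "cpu", Aidge::MyOpImpl_cpu::create);

// 2) ImplSpec (data types / formats) -> {ProdConso model, forward kernel, backward kernel}
REGISTRAR(MyOpImpl_cpu,
          {DataType::Float32},
          {ProdConso::inPlaceModel, myForwardKernel<float, float>, nullptr});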
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_CPU_OPERATOR_LRNIMPL_KERNELS_H_ +#define AIDGE_CPU_OPERATOR_LRNIMPL_KERNELS_H_ + +#include "aidge/utils/Registrar.hpp" +#include <cstddef> +#include <cmath> +#include "aidge/data/Data.hpp" +#include "aidge/utils/Types.h" +#include "aidge/backend/cpu/data/GetCPUPtr.h" + +#include "aidge/backend/cpu/operator/LRNImpl.hpp" + +namespace Aidge { +template <class I, class O> +void LRNImpl_cpu_forward_kernel(float alpha, float beta, float bias, std::size_t size, const std::vector<DimSize_t>& inputDims, const void* input_, void* output_) +{ + const I* input = static_cast<const I*>(input_); + O* output = static_cast<O*>(output_); + + const DimSize_t nbBatch = inputDims[0]; + const DimSize_t nbChannels = (inputDims.size() > 1) ? inputDims[1] : 1; + const DimSize_t featureMapSize = (inputDims.size() > 2) ? std::accumulate(inputDims.begin() + 2, inputDims.end(), 1, std::multiplies<DimSize_t>()) : 1; + + for (std::size_t batch = 0; batch < nbBatch; ++batch) { + for (std::size_t ch = 0; ch < nbChannels; ++ch) { + const std::size_t ioIndex = (ch + batch*nbChannels) * featureMapSize; + const unsigned int channelMin + = std::max<int>(0, ch - size / 2); + const unsigned int channelMax + = std::min<size_t>(nbChannels - 1, ch + size / 2); + + for (std::size_t feature = 0; feature<featureMapSize; ++feature) { + // For each input channel, accumulate the value + O accAccrossChannels(0.0); + + for (unsigned int accChannel = channelMin; + accChannel < channelMax; ++accChannel) + { + accAccrossChannels += input[ioIndex + feature]; + } + + // Compute the output signal + output[ioIndex + feature] = input[ioIndex + feature] + / std::pow((bias + (accAccrossChannels * accAccrossChannels) * alpha), beta); + } + } + } +} + +REGISTRAR(LRNImpl_cpu, + {DataType::Float32}, + {ProdConso::inPlaceModel, Aidge::LRNImpl_cpu_forward_kernel<float, float>, nullptr}); +REGISTRAR(LRNImpl_cpu, + {DataType::Float64}, + {ProdConso::inPlaceModel, Aidge::LRNImpl_cpu_forward_kernel<double, double>, nullptr}); +} // namespace Aidge + +#endif /* AIDGE_CPU_OPERATOR_LRNIMPL_KERNELS_H_ */ diff --git a/include/aidge/backend/cpu/operator/MulImpl.hpp b/include/aidge/backend/cpu/operator/MulImpl.hpp index 05fceba17471229d83d9f8738614b2e747121b49..c927af9ebd4d658c764cc059df9778c273ba178e 100644 --- a/include/aidge/backend/cpu/operator/MulImpl.hpp +++ b/include/aidge/backend/cpu/operator/MulImpl.hpp @@ -23,21 +23,21 @@ namespace Aidge { // Operator implementation entry point for the backend using MulImpl_cpu = OperatorImpl_cpu<Mul_Op, - void(const std::vector<std::size_t>&, - const std::vector<std::size_t>&, - const std::vector<std::size_t>&, - const void*, + void(std::vector<std::size_t>, + std::vector<std::size_t>, + const std::vector<std::size_t>&, + const void*, const void*, void*), - void(const std::size_t, - const std::size_t, + void(const std::size_t, + const std::size_t, const std::size_t, const std::vector<std::size_t>, const std::vector<std::size_t>, - const void*, - const void*, - const void*, - void*, + const void*, + const void*, + const void*, + void*, void*)>; // Implementation entry point registration to Operator diff --git a/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp b/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp index c015b8f0182608fecd3da94220e9411decfd186c..556dd56cd32f28de14a43d20b97deb0083341fee 100644 --- a/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp +++ 
b/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp @@ -19,44 +19,143 @@ #include "aidge/backend/cpu/data/Broadcasting.hpp" #include "aidge/backend/cpu/operator/MulImpl.hpp" +namespace { +// suppose values are contiguous in memory +template <class I1, class I2, class O> +void mul_contiguous_arrays(const std::size_t input1size, + const std::size_t input2size, + const std::size_t output1size, + const I1* input1, + const I2* input2, + O* output) +{ + for (std::size_t i = 0; i < output1size; ++i) + { + const std::size_t in1_id = (input1size != 1) ? i : 0; + const std::size_t in2_id = (input2size != 1) ? i : 0; + output[i] = static_cast<O>(input1[in1_id] * input2[in2_id]); + } +} +} + namespace Aidge { + template <class I1, class I2, class O> -void MulImpl_cpu_forward_kernel(const std::vector<std::size_t>& input1Dims, - const std::vector<std::size_t>& input2Dims, +void MulImpl_cpu_forward_kernel(std::vector<std::size_t> dims0, + std::vector<std::size_t> dims1, const std::vector<std::size_t>& outputDims, + const void* input0_, const void* input1_, - const void* input2_, void* output_) { - - const I1* input_1 = static_cast<const I1*>(input1_); - const I2* input_2 = static_cast<const I2*>(input2_); + const I1* input_0 = static_cast<const I1*>(input0_); + const I2* input_1 = static_cast<const I2*>(input1_); O* output = static_cast<O*>(output_); - size_t totalElements = 1; - for (size_t dimSize : outputDims) { - totalElements *= dimSize; + // [5,2,1,7] & [2,6,7] + // 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7] + // 2. Find the highest equal dimension -> 3 + // Exception: if the first diverging dimension is the last one, then -> 4 (dims.size()) + // 3. Compute the highest number of contiguous data -> 7 + // 4. Compute stride and offset step for the broadcast mechanism + // 5. Call a simple kernel + + // ## Compute compatible input dimensions + // special case for equal dimensions, the kernel is called with the entire arrays at once + if (dims0 == dims1) { + const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); + for (std::size_t i = 0; i < input0_contiguous_size; ++i) + { + output[i] = static_cast<O>(input_0[i] * input_1[i]); + } + return; } - for (std::size_t oIndex = 0; oIndex < totalElements; ++oIndex) - { - std::vector<size_t> indexes = getMultiDimIndices(outputDims, oIndex); + // set dimensions to be of equal size by filling the smallest one with ones. + if (dims0.size() > dims1.size()) { + dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1)); + } + else if (dims1.size() > dims0.size()) { + dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1)); + } - std::size_t idx1 = getFlattenedIndex(input1Dims, indexes); - std::size_t idx2 = getFlattenedIndex(input2Dims, indexes); + const std::size_t nbDims = dims0.size(); + + // Find the highest equal dimension + // std::size_t contiguousIdx = nbDims - 1; + std::size_t contiguousIdx = nbDims; + while (contiguousIdx-- > 0) { + // for (; contiguousIdx+1 > 0; --contiguousIdx) { + if (dims0[contiguousIdx] != dims1[contiguousIdx]) { + if (contiguousIdx == (nbDims -1)) { // last dimensions of one of the input Tensor are of size 1 + const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? 
dims0 : dims1; + while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) { + --contiguousIdx; + } + } + break; + } + } + ++contiguousIdx; + + // Compute the highest number of contiguous data for each Tensor + const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + + // initialize strides to iterate through data because of broadcasting + std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx); + if (contiguousIdx > 0) { + stride_post0[contiguousIdx - 1] = 1; + stride_post1[contiguousIdx - 1] = 1; + for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) { + stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]); + stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]); + } + for (std::size_t i = 0; i != contiguousIdx; ++i) { + stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1; + stride_step1[i] = (dims1[i] == 1) ? 1 - stride_post1[i] : 1; + } + } - output[oIndex] = input_1[idx1] * input_2[idx2]; + // variables for arrays offsets + std::size_t offsetIn0 = 0; + std::size_t offsetIn1 = 0; + std::size_t offsetOut = 0; + + + std::size_t dim = contiguousIdx - 1; + const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>()); + for (std::size_t stack = 0; stack < nbStacks;) { + mul_contiguous_arrays<I1,I2,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size, + input_0 + offsetIn0*input0_contiguous_size, + input_1 + offsetIn1*input1_contiguous_size, + output + offsetOut*output_contiguous_size); + if (++stack < nbStacks) { + std::size_t tmp_stack = stack; + while(tmp_stack % outputDims[dim] == 0) { + tmp_stack /= outputDims[dim]; + dim--; + } + offsetIn0 += stride_step0[dim]; + offsetIn1 += stride_step1[dim]; + ++offsetOut; + dim = contiguousIdx - 1; + } } } template <class I1, class I2, class O> -void MulImpl_cpu_backward_kernel(const std::size_t input0Length, +void MulImpl_cpu_backward_kernel(const std::size_t input0Length, const std::size_t input1Length, const std::size_t grad0Length, const std::vector<std::size_t> input0Dims, const std::vector<std::size_t> input1Dims, - const void* input0_, - const void* input1_, - const void* grad_output_, + const void* input0_, + const void* input1_, + const void* grad_output_, void* gradientInput0, void* gradientInput1) { diff --git a/include/aidge/backend/cpu/operator/OperatorImpl.hpp b/include/aidge/backend/cpu/operator/OperatorImpl.hpp index abf94ab9069a07e8f87819cb29c027b1adbfd9c6..45f099e8140395181d8be1600c61024efaa9c6a7 100644 --- a/include/aidge/backend/cpu/operator/OperatorImpl.hpp +++ b/include/aidge/backend/cpu/operator/OperatorImpl.hpp @@ -38,8 +38,10 @@ public: return impl.prodConso(mOp); } - virtual std::set<ImplSpec> 
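For readers tracing the broadcast bookkeeping in the Mul kernel above, here is its own [5,2,1,7] x [2,6,7] example worked through by hand; the values follow directly from the code above, with the usual broadcast output shape [5,2,6,7].

/*
 * dims0 = [5,2,1,7], dims1 = [2,6,7], outputDims = [5,2,6,7]
 * 1. Pad the shorter shape with leading ones  -> dims1 = [1,2,6,7]
 * 2. Scan from the last axis for equal sizes  -> axis 3 matches (7 == 7), axis 2 differs (1 vs 6),
 *    so contiguousIdx = 3
 * 3. Contiguous chunk sizes                   -> input0: 7, input1: 7, output: 7
 * 4. nbStacks = 5 * 2 * 6 = 60; stride_step0 / stride_step1 rewind an input's offset on the axes
 *    where its size is 1 (here dims0[2] == 1 and dims1[0] == 1), so that data is reused across the broadcast
 * 5. mul_contiguous_arrays is called once per stack, each time on 7 contiguous values
 */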
getAvailableImplSpecs() const override { - return Registrar<OperatorImpl_cpu>::getKeys(); + virtual std::vector<ImplSpec> getAvailableImplSpecs() const override { + // return Registrar<OperatorImpl_cpu>::getKeys(); // Note: cannot return set due to python binding + std::set<ImplSpec> implSpecsSet = Registrar<OperatorImpl_cpu>::getKeys(); + return std::vector<ImplSpec>(implSpecsSet.begin(), implSpecsSet.end()); } void forward() override; diff --git a/include/aidge/backend/cpu/operator/PadImpl_kernels.hpp b/include/aidge/backend/cpu/operator/PadImpl_kernels.hpp index a362be0944aa18c36dd74a2f0066aaa21a1fc4c0..6d218cb1d719e8576f6c013ac5a1b9c60a739852 100644 --- a/include/aidge/backend/cpu/operator/PadImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/PadImpl_kernels.hpp @@ -55,19 +55,19 @@ void PadImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 2>& beginEndBorder O outputValue = static_cast<O>(borderValue); if (borderType == PadBorderType::Constant) { - int ix = static_cast<int>(ox) - static_cast<int>(beginEndBorders[1]); + int ix = static_cast<int>(ox) - static_cast<int>(beginEndBorders[0]); if (ix >= 0 && ix < static_cast<int>(dims[2])) { outputValue = input[iIndex + static_cast<std::size_t>(ix)]; } } else if (borderType == PadBorderType::Edge) { - int ix = std::max(0, std::min(static_cast<int>(dims[2]) - 1, static_cast<int>(ox) - static_cast<int>(beginEndBorders[1]))); + int ix = std::max(0, std::min(static_cast<int>(dims[2]) - 1, static_cast<int>(ox) - static_cast<int>(beginEndBorders[0]))); outputValue = input[iIndex + static_cast<std::size_t>(ix)]; } else if (borderType == PadBorderType::Reflect) { - int ix = static_cast<int>(ox) - static_cast<int>(beginEndBorders[1]); + int ix = static_cast<int>(ox) - static_cast<int>(beginEndBorders[0]); if (ix < 0) ix = 0 - ix; @@ -77,7 +77,7 @@ void PadImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 2>& beginEndBorder outputValue = input[iIndex + static_cast<std::size_t>(ix)]; } else if (borderType == PadBorderType::Wrap) { - int ix = (static_cast<int>(dims[2]) + static_cast<int>(ox) - static_cast<int>(beginEndBorders[1])) % static_cast<int>(dims[2]); + int ix = (static_cast<int>(dims[2]) + static_cast<int>(ox) - static_cast<int>(beginEndBorders[0])) % static_cast<int>(dims[2]); outputValue = input[iIndex + static_cast<std::size_t>(ix)]; } @@ -120,8 +120,8 @@ void PadImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 4>& beginEndBorder const I *input = static_cast<const I *>(input_); O *output = static_cast<O *>(output_); - const std::size_t oySize = dims[2] + beginEndBorders[0] + beginEndBorders[1]; - const std::size_t oxSize = dims[3] + beginEndBorders[2] + beginEndBorders[3]; + const std::size_t oySize = dims[2] + beginEndBorders[0] + beginEndBorders[2]; + const std::size_t oxSize = dims[3] + beginEndBorders[1] + beginEndBorders[3]; for (std::size_t batch = 0; batch < dims[0]; ++batch) { for (std::size_t ch = 0; ch < dims[1]; ++ch) { @@ -135,22 +135,22 @@ void PadImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 4>& beginEndBorder O outputValue = static_cast<O>(borderValue); if (borderType == PadBorderType::Constant) { - std::int32_t ix = static_cast<std::int32_t>(ox) - static_cast<std::int32_t>(beginEndBorders[3]); - std::int32_t iy = static_cast<std::int32_t>(oy) - static_cast<std::int32_t>(beginEndBorders[1]); + std::int32_t ix = static_cast<std::int32_t>(ox) - static_cast<std::int32_t>(beginEndBorders[1]); + std::int32_t iy = static_cast<std::int32_t>(oy) - static_cast<std::int32_t>(beginEndBorders[0]); if (ix >= 0 && 
ix < static_cast<std::int32_t>(dims[3]) && iy >= 0 && iy < static_cast<std::int32_t>(dims[2])) { outputValue = input[iIndex + static_cast<std::size_t>(iy)*dims[3] + static_cast<std::size_t>(ix)]; } } else if (borderType == PadBorderType::Edge) { - std::int32_t ix = std::max(0, std::min(static_cast<std::int32_t>(dims[3]) - 1, static_cast<std::int32_t>(ox) - static_cast<std::int32_t>(beginEndBorders[3]))); - std::int32_t iy = std::max(0, std::min(static_cast<std::int32_t>(dims[2]) - 1, static_cast<std::int32_t>(oy) - static_cast<std::int32_t>(beginEndBorders[1]))); + std::int32_t ix = std::max(0, std::min(static_cast<std::int32_t>(dims[3]) - 1, static_cast<std::int32_t>(ox) - static_cast<std::int32_t>(beginEndBorders[1]))); + std::int32_t iy = std::max(0, std::min(static_cast<std::int32_t>(dims[2]) - 1, static_cast<std::int32_t>(oy) - static_cast<std::int32_t>(beginEndBorders[0]))); outputValue = input[iIndex + static_cast<std::size_t>(iy)*dims[3] + static_cast<std::size_t>(ix)]; } else if (borderType == PadBorderType::Reflect) { - std::int32_t ix = static_cast<std::int32_t>(ox) - static_cast<std::int32_t>(beginEndBorders[3]); - std::int32_t iy = static_cast<std::int32_t>(oy) - static_cast<std::int32_t>(beginEndBorders[1]); + std::int32_t ix = static_cast<std::int32_t>(ox) - static_cast<std::int32_t>(beginEndBorders[1]); + std::int32_t iy = static_cast<std::int32_t>(oy) - static_cast<std::int32_t>(beginEndBorders[0]); if (ix < 0) ix = 0 - ix; @@ -164,8 +164,8 @@ void PadImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 4>& beginEndBorder outputValue = input[iIndex + static_cast<std::size_t>(iy)*dims[3] + static_cast<std::size_t>(ix)]; } else if (borderType == PadBorderType::Wrap) { - std::int32_t ix = (static_cast<std::int32_t>(dims[3]) + static_cast<std::int32_t>(ox) - static_cast<std::int32_t>(beginEndBorders[3])) % static_cast<std::int32_t>(dims[3]); - std::int32_t iy = (static_cast<std::int32_t>(dims[2]) + static_cast<std::int32_t>(oy) - static_cast<std::int32_t>(beginEndBorders[1])) % static_cast<std::int32_t>(dims[2]); + std::int32_t ix = (static_cast<std::int32_t>(dims[3]) + static_cast<std::int32_t>(ox) - static_cast<std::int32_t>(beginEndBorders[1])) % static_cast<std::int32_t>(dims[3]); + std::int32_t iy = (static_cast<std::int32_t>(dims[2]) + static_cast<std::int32_t>(oy) - static_cast<std::int32_t>(beginEndBorders[0])) % static_cast<std::int32_t>(dims[2]); outputValue = input[iIndex + static_cast<std::size_t>(iy)*dims[3] + static_cast<std::size_t>(ix)]; } diff --git a/include/aidge/backend/cpu/operator/PaddedConvImpl.hpp b/include/aidge/backend/cpu/operator/PaddedConvImpl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..e1e2a89ceb1356dacae965903eaf405a3d524866 --- /dev/null +++ b/include/aidge/backend/cpu/operator/PaddedConvImpl.hpp @@ -0,0 +1,59 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
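The Pad fixes above swap the begin/end border indices; below is a minimal standalone sketch of the corrected 1D constant-padding index mapping, with illustrative values (begin = 2, end = 1, input width 4, border value 0) that are not taken from the patch.

#include <cstdio>

int main() {
    const int begin = 2, end = 1;                      // beginEndBorders[0], beginEndBorders[1]
    const float in[4] = {10.f, 11.f, 12.f, 13.f};      // dims[2] == 4
    const int oxSize = 4 + begin + end;                // 7 output positions
    for (int ox = 0; ox < oxSize; ++ox) {
        const int ix = ox - begin;                     // offset by the *begin* border, as in the fix above
        const float v = (ix >= 0 && ix < 4) ? in[ix] : 0.f;
        std::printf("out[%d] = %g\n", ox, v);          // prints 0 0 10 11 12 13 0
    }
    return 0;
}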
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_CPU_OPERATOR_PADDEDCONVIMPL_H_ +#define AIDGE_CPU_OPERATOR_PADDEDCONVIMPL_H_ + +#include <array> +#include <memory> +#include <tuple> +#include <vector> + +#include "aidge/backend/cpu/operator/OperatorImpl.hpp" +#include "aidge/operator/MetaOperatorDefs.hpp" +#include "aidge/utils/Registrar.hpp" +#include "aidge/utils/Types.h" + +namespace Aidge { +// Operator implementation entry point for the backend +using PaddedConv1D_Op = MetaOperator_Op; +using PaddedConvImpl1D_cpu = OperatorImpl_cpu<MetaOperator_Op, + void(const std::array<DimSize_t, 2>&, + const std::array<DimSize_t, 1>&, + const std::array<DimSize_t, 1>&, + const std::array<DimSize_t, 1>&, + const std::array<DimSize_t, 3> &, + DimSize_t, + const void *, + const void *, + const void *, + void *)>; + +using PaddedConv2D_Op = MetaOperator_Op; +using PaddedConvImpl2D_cpu = OperatorImpl_cpu<MetaOperator_Op, + void(const std::array<DimSize_t, 4>&, + const std::array<DimSize_t, 2>&, + const std::array<DimSize_t, 2>&, + const std::array<DimSize_t, 2>&, + const std::array<DimSize_t, 4> &, + DimSize_t, + const void *, + const void *, + const void *, + void *)>; + +// Implementation entry point registration to Operator +// Uncomment to activate implementation for PaddedConv. It is currently less efficient, hence why it is commented. +// REGISTRAR(PaddedConv1D_Op, std::array<std::string, 2>({"cpu", "PaddedConv1D"}), Aidge::PaddedConvImpl1D_cpu::create); +// REGISTRAR(PaddedConv2D_Op, std::array<std::string, 2>({"cpu", "PaddedConv2D"}), Aidge::PaddedConvImpl2D_cpu::create); +} // namespace Aidge + +#endif /* AIDGE_CPU_OPERATOR_PADDEDCONVIMPL_H_ */ diff --git a/include/aidge/backend/cpu/operator/PaddedConvImpl_kernels.hpp b/include/aidge/backend/cpu/operator/PaddedConvImpl_kernels.hpp new file mode 100644 index 0000000000000000000000000000000000000000..85fb72435421f0024f11db6a13c5ddfbae4a0aeb --- /dev/null +++ b/include/aidge/backend/cpu/operator/PaddedConvImpl_kernels.hpp @@ -0,0 +1,228 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_CPU_OPERATOR_PADDEDCONVIMPL_KERNELS_H_ +#define AIDGE_CPU_OPERATOR_PADDEDCONVIMPL_KERNELS_H_ + +#include <array> +#include <cstddef> +#include <vector> + +#include "aidge/backend/cpu/operator/OperatorImpl.hpp" +#include "aidge/backend/cpu/operator/PaddedConvImpl.hpp" +#include "aidge/operator/Pad.hpp" +#include "aidge/utils/Registrar.hpp" +#include "aidge/utils/Types.h" + +namespace Aidge { +// Only works for constant padding zero +/** + * @brief Forward kernel for 1D Convolution on CPU backend. + * @tparam I Input data type. + * @tparam W Weight data type. + * @tparam B Bias data type. + * @tparam O Output data type. + * @param params tuple of Attributes from the Operator + * @param inputDims Array of input dimensions. + * @param input_ const input Tensor. + * @param weights_ const weight Tensor. + * @param biases_ const Biais Tensor. + * @param output_ Output Tensor. 
+ */ +template <class I, class W, class B, class O> +void PaddedConvImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 2>& beginEndBorders, + const std::array<DimSize_t, 1>& strideDims, + const std::array<DimSize_t, 1>& dilationDims, + const std::array<DimSize_t, 1>& kernelDims, + const std::array<DimSize_t, 3>& inputDims, + DimSize_t outChannels, + const void *input_, + const void *weights_, + const void *biases_, + void *output_) +{ + // FIXME: missing convolution attributes as arguments + const I *input = static_cast<const I *>(input_); + const W *weights = static_cast<const W *>(weights_); + const B *biases = static_cast<const B *>(biases_); + O *output = static_cast<O *>(output_); + + // output H size + const DimSize_t dilated_kernel_x = dilationDims[0]*(kernelDims[0] - 1) + 1; + const std::size_t oxSize = + static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - dilated_kernel_x + strideDims[0]) / + static_cast<float>(strideDims[0]))); + + // TODO: kernel computation + // output (batch, outCh, Xout, Yout) + // input (batch, inCh, Xin, Yin) + // weight (outCh, inCh, kernelX, kernelY) + // does not take Dilation attribute into account + using signedsize = std::make_signed<std::size_t>::type; + for (std::size_t batch = 0; batch < inputDims[0]; ++batch) { + for (std::size_t outCh = 0; outCh < outChannels; ++outCh) { + const std::size_t oIndex = (outCh + batch*outChannels) * oxSize; + // If bias = nullptr, set B(0) + B biasVal = (biases != nullptr) ? biases[outCh] : B(0); + std::fill(output + oIndex, output+(oIndex+oxSize), biasVal); + for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) { + const std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2]; + const std::size_t wIndex = (inCh + outCh*inputDims[1]) * kernelDims[0]; + for (std::size_t ox = 0; ox < oxSize; ++ox) { + const signedsize difx = static_cast<signedsize>(ox * strideDims[0]); + const std::size_t sxMin = static_cast<std::size_t>(std::max(static_cast<signedsize>(beginEndBorders[0]) - difx, signedsize(0))); + const std::size_t sxMax = (static_cast<signedsize>(inputDims[2]) + static_cast<signedsize>(beginEndBorders[1]) - difx) < 0 ? 0 : ((inputDims[2] + difx) > kernelDims[0] ? 
kernelDims[0] : inputDims[2] + difx); + + const std::size_t oIndexFull = oIndex + ox; + const signedsize ix = static_cast<signedsize>(ox * strideDims[0]) - static_cast<signedsize>(beginEndBorders[0]); + + for (std::size_t sx = sxMin; sx*dilationDims[0] < sxMax; ++sx) { + output[oIndexFull] += weights[wIndex + sx] * + input[iIndex + static_cast<std::size_t>(ix+static_cast<signedsize>(sx*dilationDims[0]))]; + } + } + } + } + } +} + +// Kernels registration to implementation entry point +REGISTRAR(PaddedConvImpl1D_cpu, + {{DataType::Any, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}, DynamicAttributes(std::map<std::string, future_std::any>({std::make_pair("type", future_std::any(std::string("PaddedConv1D")))}))}, + {ProdConso::inPlaceModel, Aidge::PaddedConvImpl1D_cpu_forward_kernel<float, float, float, float>, nullptr}); +REGISTRAR(PaddedConvImpl1D_cpu, + {{DataType::Any, DataFormat::NCHW}, {DataType::Float16, DataFormat::NCHW}, DynamicAttributes(std::map<std::string, future_std::any>({std::make_pair("type", future_std::any(std::string("PaddedConv1D")))}))}, + {ProdConso::inPlaceModel, Aidge::PaddedConvImpl1D_cpu_forward_kernel<half_float::half, half_float::half, half_float::half, half_float::half>, nullptr}); +REGISTRAR(PaddedConvImpl1D_cpu, + {{DataType::Any, DataFormat::NCHW}, {DataType::Int32, DataFormat::NCHW}, DynamicAttributes(std::map<std::string, future_std::any>({std::make_pair("type", future_std::any(std::string("PaddedConv1D")))}))}, + {ProdConso::inPlaceModel, Aidge::PaddedConvImpl1D_cpu_forward_kernel<int32_t, int32_t, int32_t, int32_t>, nullptr}); +REGISTRAR(PaddedConvImpl1D_cpu, + {{DataType::Any, DataFormat::NCHW}, {DataType::Float64, DataFormat::NCHW}, DynamicAttributes(std::map<std::string, future_std::any>({std::make_pair("type", future_std::any(std::string("PaddedConv1D")))}))}, + {ProdConso::inPlaceModel, Aidge::PaddedConvImpl1D_cpu_forward_kernel<double, double, double, double>, nullptr}); + + +/** + * @brief Forward kernel for 2D Convolution on CPU backend. + * @tparam I Input data type. + * @tparam W Weight data type. + * @tparam B Bias data type. + * @tparam O Output data type. + * @param params tuple of Attributes from the Operator + * @param inputDims Array of input dimensions. + * @param input_ const input Tensor. + * @param weights_ const weight Tensor. + * @param biases_ const Biais Tensor. + * @param output_ Output Tensor. 
+ */ +template <class I, class W, class B, class O> +void PaddedConvImpl2D_cpu_forward_kernel( + const std::array<DimSize_t, 4>& beginEndBorders, + const std::array<DimSize_t, 2>& strideDims, + const std::array<DimSize_t, 2>& dilationDims, + const std::array<DimSize_t, 2>& kernelDims, + const std::array<DimSize_t, 4> &inputDims, + DimSize_t outChannels, + const void *input_, + const void *weights_, + const void *biases_, + void *output_) +{ + // FIXME: missing convolution attributes as arguments + const I *input = static_cast<const I *>(input_); + const W *weights = static_cast<const W *>(weights_); + const B *biases = static_cast<const B *>(biases_); + O *output = static_cast<O *>(output_); + + // output H size + const DimSize_t dilated_kernel_x = dilationDims[0]*(kernelDims[0] - 1) + 1; + const std::size_t oxSize = + static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - dilated_kernel_x + beginEndBorders[0] + beginEndBorders[2] + strideDims[0]) / + static_cast<float>(strideDims[0]))); + // output W size + const DimSize_t dilated_kernel_y = dilationDims[1]*(kernelDims[1] - 1) + 1; + const std::size_t oySize = + static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[3] - dilated_kernel_y + beginEndBorders[1] + beginEndBorders[3] + strideDims[1]) / + static_cast<float>(strideDims[1]))); + + for (std::size_t batch = 0; batch < inputDims[0]; ++batch) { + for (std::size_t outCh = 0; outCh < outChannels; ++outCh) { + const std::size_t oIndex = (outCh + batch*outChannels) * oxSize * oySize; + // If bias = nullptr, set B(0) + B biasVal = (biases != nullptr) ? biases[outCh] : B(0); + std::fill(output + oIndex, output+(oIndex+oxSize*oySize), biasVal); + for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) { + const std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3]; + const std::size_t wIndex = (inCh + outCh*inputDims[1]) * kernelDims[0] * kernelDims[1]; + for (std::size_t ox = 0; ox < oxSize; ++ox) { + const std::size_t difx = ox * strideDims[0]; + const std::size_t sxMin = beginEndBorders[0] < difx ? std::size_t(0) : beginEndBorders[0] - difx; + const std::size_t sxMax = (inputDims[2] + beginEndBorders[2]) < difx ? + 0 : + ((inputDims[2] + beginEndBorders[2]) > dilated_kernel_x + difx ? + dilated_kernel_x : + (inputDims[2] + beginEndBorders[2] - difx)); + + for (std::size_t oy = 0; oy < oySize; ++oy) { + const std::size_t dify = oy * strideDims[1]; + const std::size_t syMin = beginEndBorders[1] < dify ? std::size_t(0) : beginEndBorders[1] - dify; + const std::size_t syMax = (inputDims[3] + beginEndBorders[3]) < dify ? + 0 : + ((inputDims[3] + beginEndBorders[3]) > dilated_kernel_y + dify ? 
+ dilated_kernel_y : + (inputDims[3] + beginEndBorders[3] - dify)); + const std::size_t oIndexFull = oIndex + ox*oySize + oy; + const std::size_t ix = ox * strideDims[0] - beginEndBorders[0]; + const std::size_t iy = oy * strideDims[1] - beginEndBorders[1]; + + + if (sxMin == 0 && syMin == 0 && sxMax == 3 && syMax == 3) { + output[oIndexFull] += (weights[wIndex + 0*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+0)] + + weights[wIndex + 0*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+1)] + + weights[wIndex + 0*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+2)] + + weights[wIndex + 1*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+0)] + + weights[wIndex + 1*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+1)] + + weights[wIndex + 1*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+2)] + + weights[wIndex + 2*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+0)] + + weights[wIndex + 2*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+1)] + + weights[wIndex + 2*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+2)]); + } else { + for (std::size_t sx = sxMin; sx*dilationDims[0] < sxMax; ++sx) { + for (std::size_t sy = syMin; sy*dilationDims[1] < syMax; ++sy) { + output[oIndexFull] += weights[wIndex + sx*kernelDims[1] + sy] * + input[iIndex + (sx*dilationDims[0] + ix)*inputDims[3] + sy*dilationDims[1] + iy]; + } + } + } + } + } + } + } + } +} + + +// Kernels registration to implementation entry point +REGISTRAR(PaddedConvImpl2D_cpu, + // ImplSpec{std::vector<ImplSpec::IOSpec>({ImplSpec::IOSpec{DataType::Any, DataFormat::NCHW}, ImplSpec::IOSpec{DataType::Any, DataFormat::NCHW}}) , std::vector<ImplSpec::IOSpec>({ImplSpec::IOSpec{DataType::Int32, DataFormat::NCHW}})}, + {{DataType::Any, DataFormat::NCHW}, {DataType::Int32, DataFormat::NCHW}, DynamicAttributes(std::map<std::string, future_std::any>({std::make_pair("type", future_std::any(std::string("PaddedConv2D")))}))}, + {ProdConso::inPlaceModel, Aidge::PaddedConvImpl2D_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t, std::int32_t>, nullptr}); +REGISTRAR(PaddedConvImpl2D_cpu, + {{DataType::Any, DataFormat::NCHW}, {DataType::Float16, DataFormat::NCHW}, DynamicAttributes(std::map<std::string, future_std::any>({std::make_pair("type", future_std::any(std::string("PaddedConv2D")))}))}, + {ProdConso::inPlaceModel, Aidge::PaddedConvImpl2D_cpu_forward_kernel<half_float::half, half_float::half, half_float::half, half_float::half>, nullptr}); +REGISTRAR(PaddedConvImpl2D_cpu, + {{DataType::Any, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}, DynamicAttributes(std::map<std::string, future_std::any>({std::make_pair("type", future_std::any(std::string("PaddedConv2D")))}))}, + {ProdConso::inPlaceModel, Aidge::PaddedConvImpl2D_cpu_forward_kernel<float, float, float, float>, nullptr}); +REGISTRAR(PaddedConvImpl2D_cpu, + {{DataType::Any, DataFormat::NCHW}, {DataType::Float64, DataFormat::NCHW}, DynamicAttributes(std::map<std::string, future_std::any>({std::make_pair("type", 
future_std::any(std::string("PaddedConv2D")))}))}, + {ProdConso::inPlaceModel, Aidge::PaddedConvImpl2D_cpu_forward_kernel<double, double, double, double>, nullptr}); +} // namespace Aidge + +#endif /* AIDGE_CPU_OPERATOR_PADDEDCONVIMPL_KERNELS_H_ */ diff --git a/include/aidge/backend/cpu/operator/PowImpl.hpp b/include/aidge/backend/cpu/operator/PowImpl.hpp index cfbb8173d1f83162519016a8f2b3c3166977a5b7..b31ce08c9089df05bd2e711fd87f09690fd2df23 100644 --- a/include/aidge/backend/cpu/operator/PowImpl.hpp +++ b/include/aidge/backend/cpu/operator/PowImpl.hpp @@ -12,18 +12,21 @@ #ifndef AIDGE_CPU_OPERATOR_POWIMPL_H_ #define AIDGE_CPU_OPERATOR_POWIMPL_H_ +#include <cstddef> // std::size_t +#include <memory> // std::unique_ptr, std::make_unique +#include <string> +#include <vector> + #include "aidge/backend/cpu/operator/OperatorImpl.hpp" #include "aidge/operator/Pow.hpp" #include "aidge/utils/Registrar.hpp" #include "aidge/utils/Types.h" -#include "aidge/backend/cpu/data/GetCPUPtr.h" -#include <memory> -#include <vector> + namespace Aidge { // Operator implementation entry point for the backend using PowImpl_cpu = OperatorImpl_cpu<Pow_Op, - void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*,void*), + void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*, void*), void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*, const void*, void*, void*)>; diff --git a/include/aidge/backend/cpu/operator/PowImpl_kernels.hpp b/include/aidge/backend/cpu/operator/PowImpl_kernels.hpp index ab9b2ccc7b823842decd044b90a5c6364cedc9c9..cae106632053366e1370b5ce1d3a2ee4cfd3b62b 100644 --- a/include/aidge/backend/cpu/operator/PowImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/PowImpl_kernels.hpp @@ -13,36 +13,141 @@ #define AIDGE_CPU_OPERATOR_POWIMPL_KERNELS_H_ #include "aidge/utils/Registrar.hpp" -#include <cmath> + +#include <cstddef> // std::size_t #include "aidge/backend/cpu/data/Broadcasting.hpp" #include "aidge/backend/cpu/operator/PowImpl.hpp" namespace Aidge { -template <class I1, class I2, class O> -void PowImpl_cpu_forward_kernel(const std::vector<std::size_t>& input1Dims, - const std::vector<std::size_t>& input2Dims, + +namespace { +// suppose values are contiguous in memory +template <class I, class O> +void pow_contiguous_arrays(const std::size_t input1size, + const std::size_t input2size, + const std::size_t output1size, + const I* input1, + const I* input2, + O* output) +{ + for (std::size_t i = 0; i < output1size; ++i) + { + const std::size_t in1_id = (input1size != 1) ? i : 0; + const std::size_t in2_id = (input2size != 1) ? 
i : 0; + output[i] = static_cast<O>(std::pow(input1[in1_id], input2[in2_id])); + } +} +} + +template <class I, class O> +void PowImpl_cpu_forward_kernel(std::vector<std::size_t> dims0, + std::vector<std::size_t> dims1, const std::vector<std::size_t>& outputDims, + const void* input0_, const void* input1_, - const void* input2_, void* output_) { - const I1* input_1 = static_cast<const I1*>(input1_); - const I2* input_2 = static_cast<const I2*>(input2_); + const I* input_0 = static_cast<const I*>(input0_); + const I* input_1 = static_cast<const I*>(input1_); O* output = static_cast<O*>(output_); - std::size_t totalElements = std::accumulate(outputDims.cbegin(), outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>()); - for (std::size_t oIndex = 0; oIndex < totalElements; ++oIndex) - { - std::vector<std::size_t> indexes = getMultiDimIndices(outputDims, oIndex); + // [5,2,1,7] & [2,6,7] + // 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7] + // 2. Find the highest equal dimension -> 3 + // Exception: if the first diverging dimension is the last one, then -> 4 (dims.size()) + // 3. Compute the highest number of contiguous data -> 7 + // 4. Compute stride and offset step for the broadcast mechanism + // 5. Call a simple kernel + + // special case for equal dimensions, the kernel is called with the entire arrays at once + if (dims0 == dims1) { + const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); + for (std::size_t i = 0; i < input0_contiguous_size; ++i) + { + output[i] = static_cast<O>(std::pow(input_0[i], input_1[i])); + } + return; + } + + // set dimensions to be of equal size by filling the smallest one with ones. + if (dims0.size() > dims1.size()) { + dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1)); + } + else if (dims1.size() > dims0.size()) { + dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1)); + } - std::size_t idx1 = getFlattenedIndex(input1Dims, indexes); - std::size_t idx2 = getFlattenedIndex(input2Dims, indexes); - - output[oIndex] = std::pow(input_1[idx1], input_2[idx2]); - } + const std::size_t nbDims = dims0.size(); + + // Find the highest equal dimension + // std::size_t contiguousIdx = nbDims - 1; + std::size_t contiguousIdx = nbDims; + while (contiguousIdx-- > 0) { + // for (; contiguousIdx+1 > 0; --contiguousIdx) { + if (dims0[contiguousIdx] != dims1[contiguousIdx]) { + if (contiguousIdx == (nbDims -1)) { // last dimensions of one of the input Tensor are of size 1 + const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? 
dims0 : dims1; + while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) { + --contiguousIdx; + } + } + break; + } + } + ++contiguousIdx; + + // Compute the highest number of contiguous data for each Tensor + const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + + // initialize strides to iterate through data because of broadcasting + std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx); + if (contiguousIdx > 0) { + stride_post0[contiguousIdx - 1] = 1; + stride_post1[contiguousIdx - 1] = 1; + for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) { + stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]); + stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]); + } + for (std::size_t i = 0; i != contiguousIdx; ++i) { + stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1; + stride_step1[i] = (dims1[i] == 1) ? 1 - stride_post1[i] : 1; + } + } + + // variables for arrays offsets + std::size_t offsetIn0 = 0; + std::size_t offsetIn1 = 0; + std::size_t offsetOut = 0; + + + std::size_t dim = contiguousIdx - 1; + const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>()); + for (std::size_t stack = 0; stack < nbStacks;) { + pow_contiguous_arrays<I,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size, + input_0 + offsetIn0*input0_contiguous_size, + input_1 + offsetIn1*input1_contiguous_size, + output + offsetOut*output_contiguous_size); + if (++stack < nbStacks) { + std::size_t tmp_stack = stack; + while(tmp_stack % outputDims[dim] == 0) { + tmp_stack /= outputDims[dim]; + dim--; + } + offsetIn0 += stride_step0[dim]; + offsetIn1 += stride_step1[dim]; + ++offsetOut; + dim = contiguousIdx - 1; + } + } } + template <class I1, class I2, class O> void PowImpl_cpu_backward_kernel(const std::vector<std::size_t>& input0Dims, const std::vector<std::size_t>& input1Dims, @@ -82,14 +187,23 @@ void PowImpl_cpu_backward_kernel(const std::vector<std::size_t>& input0Dims, // Kernels registration to implementation entry point REGISTRAR(PowImpl_cpu, - {DataType::Float32}, - {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<float, float, float>, Aidge::PowImpl_cpu_backward_kernel<float, float, float>}); + {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float32}}, + {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<float, float>, Aidge::PowImpl_cpu_backward_kernel<float, float, float>}); +REGISTRAR(PowImpl_cpu, + {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float64}}, + {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<double, double>, Aidge::PowImpl_cpu_backward_kernel<double, double, double>}); +REGISTRAR(PowImpl_cpu, + 
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int32}}, + {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<int32_t, int32_t>, Aidge::PowImpl_cpu_backward_kernel<int32_t, int32_t, int32_t>}); +REGISTRAR(PowImpl_cpu, + {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int64}}, + {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, Aidge::PowImpl_cpu_backward_kernel<std::int64_t, std::int64_t, std::int64_t>}); REGISTRAR(PowImpl_cpu, - {DataType::Float64}, - {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<double, double, double>, Aidge::PowImpl_cpu_backward_kernel<double, double, double>}); + {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int8}}, + {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<std::int8_t, std::int8_t>, Aidge::PowImpl_cpu_backward_kernel<std::int8_t, std::int8_t, std::int8_t>}); REGISTRAR(PowImpl_cpu, - {DataType::Int32}, - {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<int32_t, int32_t, int32_t>, Aidge::PowImpl_cpu_backward_kernel<int32_t, int32_t, int32_t>}); + {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::UInt8}}, + {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<std::uint8_t, std::uint8_t>, Aidge::PowImpl_cpu_backward_kernel<std::uint8_t, std::uint8_t, std::uint8_t>}); } // namespace Aidge #endif /* AIDGE_CPU_OPERATOR_POWIMPL_KERNELS_H_ */ diff --git a/include/aidge/backend/cpu/operator/ResizeImpl.hpp b/include/aidge/backend/cpu/operator/ResizeImpl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..2bf5c1e807c0b0a64ac0dd2d3ac87219ba6349df --- /dev/null +++ b/include/aidge/backend/cpu/operator/ResizeImpl.hpp @@ -0,0 +1,37 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_CPU_OPERATOR_RESIZEIMPL_H_ +#define AIDGE_CPU_OPERATOR_RESIZEIMPL_H_ + +#include "aidge/backend/cpu/operator/OperatorImpl.hpp" +#include "aidge/operator/Resize.hpp" +#include "aidge/utils/Registrar.hpp" +#include <aidge/data/Interpolation.hpp> +#include <aidge/operator/Pad.hpp> +#include <cstdint> + +namespace Aidge { +// Operator implementation entry point for the backend +using ResizeImpl_cpu = OperatorImpl_cpu< + Resize_Op, + void(const void *, // input + const std::vector<DimSize_t> &, // INput dims + const std::vector<DimSize_t> &, // OUTput dims + const Interpolation::CoordinateTransformation, // coord transfo + const Interpolation::Mode, // interpolation mode + const PadBorderType, // padding mode + void *)>; // output +// Implementation entry point registration to Operator +REGISTRAR(Resize_Op, "cpu", Aidge::ResizeImpl_cpu::create); +} // namespace Aidge + +#endif /* AIDGE_CPU_OPERATOR_RESIZEIMPL_H_ */ diff --git a/include/aidge/backend/cpu/operator/ResizeImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ResizeImpl_kernels.hpp new file mode 100644 index 0000000000000000000000000000000000000000..6a22ff4ec9d7beaf05be3b479b43dd3ad69bc74b --- /dev/null +++ b/include/aidge/backend/cpu/operator/ResizeImpl_kernels.hpp @@ -0,0 +1,160 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_CPU_OPERATOR_RESIZEIMPL_FORWARD_KERNEL_H_ +#define AIDGE_CPU_OPERATOR_RESIZEIMPL_FORWARD_KERNEL_H_ + +#include "aidge/backend/cpu/operator/ResizeImpl.hpp" + +#include <aidge/data/Data.hpp> +#include <aidge/data/half.hpp> +#include <aidge/operator/Pad.hpp> +#include <cmath> +#include <cstdint> +#include <numeric> + +#include "aidge/backend/cpu/data/Interpolation.hpp" +#include "aidge/data/Interpolation.hpp" +#include "aidge/data/Tensor.hpp" +#include "aidge/utils/Registrar.hpp" +#include "aidge/utils/Types.h" + +namespace Aidge { + +template <typename IO> +void ResizeImpl_cpu_forward_kernel( + const void *input_, + const std::vector<DimSize_t> &inputDims, + const std::vector<DimSize_t> &outputDims, + const Interpolation::CoordinateTransformation coordTransfoMode, + const Interpolation::Mode interpMode, + const PadBorderType paddingMode, + // const double * /*roi*/, + // const float * /*scales*/, + // const int64_t * /*sizes*/, + void *output_) { + + // Seting a data + const IO *input = static_cast<const IO *>(input_); + IO *output = static_cast<IO *>(output_); + + const DimSize_t outputLen = std::accumulate(outputDims.cbegin(), + outputDims.cend(), + 1, + std::multiplies<DimSize_t>()); + std::vector<float> coordInApprox(inputDims.size()); + std::vector<std::size_t> coordIn(inputDims.size()); + std::vector<DimSize_t> coordOut; + for (DimSize_t idxFlatOut = 0; idxFlatOut < outputLen; ++idxFlatOut) { + coordOut = Tensor::toCoord(outputDims, idxFlatOut); + coordInApprox = + Interpolation::untransformCoordinates(coordOut, + inputDims, + outputDims, + coordTransfoMode); + if ((interpMode == Interpolation::Mode::Ceil) || (interpMode == Interpolation::Mode::Floor) || 
(interpMode == Interpolation::Mode::RoundPreferCeil) || (interpMode == Interpolation::Mode::RoundPreferFloor)) { + for (std::size_t i = 0; i < coordInApprox.size(); ++i) { + if (interpMode == Interpolation::Mode::Ceil) { + coordInApprox[i] = std::ceil(coordInApprox[i]); + } else if (interpMode == Interpolation::Mode::Floor) { + coordInApprox[i] = std::floor(coordInApprox[i]); + } else if (interpMode == Interpolation::Mode::RoundPreferCeil) { + coordInApprox[i] = std::floor(coordInApprox[i] + 0.5f); + } else { // (interpMode == Interpolation::Mode::RoundPreferFloor) + coordInApprox[i] = std::ceil(coordInApprox[i] - 0.5f); + } + } + if (Tensor::isInBounds<float>(inputDims, coordInApprox)) { + for (std::size_t i = 0; i < coordInApprox.size(); ++i) { + coordIn[i] = static_cast<std::size_t>(coordInApprox[i]); + } + } else { + if (paddingMode == PadBorderType::Edge) { + for (std::size_t i = 0; i < coordInApprox.size(); ++i) { + coordIn[i] = coordInApprox[i] < 0 ? 0 : (coordInApprox[i] >=inputDims[i] ? inputDims[i] - 1 : static_cast<std::size_t>(coordInApprox[i])); + } + } else { + AIDGE_THROW_OR_ABORT(std::runtime_error, "Padding mode not supported"); + } + } + output[idxFlatOut] = input[Tensor::toIndex(inputDims, coordIn)]; + } else { + std::set<Interpolation::Point<IO>> neighbours = + InterpolationCPU::retrieveNeighbours(input, + inputDims, + coordInApprox, + paddingMode); + output[idxFlatOut] = InterpolationCPU::interpolate(coordInApprox, + neighbours, + interpMode); + } + } + return; +} +// Kernels registration to implementation entry point +REGISTRAR(ResizeImpl_cpu, + {{{DataType::Int16}, + {DataType::Float32}, + {DataType::Float32}, + {DataType::UInt64}}, + {DataType::Int16}}, + {ProdConso::inPlaceModel, + ResizeImpl_cpu_forward_kernel<int16_t>, + nullptr}); +REGISTRAR(ResizeImpl_cpu, + {{{DataType::Int32}, + {DataType::Float32}, + {DataType::Float32}, + {DataType::UInt64}}, + {DataType::Int32}}, + {ProdConso::inPlaceModel, + ResizeImpl_cpu_forward_kernel<int32_t>, + nullptr}); +REGISTRAR(ResizeImpl_cpu, + {{{DataType::Int64}, + {DataType::Float32}, + {DataType::Float32}, + {DataType::Int64}}, + {DataType::UInt64}}, + {ProdConso::inPlaceModel, + ResizeImpl_cpu_forward_kernel<int64_t>, + nullptr}); + +REGISTRAR(ResizeImpl_cpu, + {{{DataType::Float16}, + {DataType::Float32}, + {DataType::Float32}, + {DataType::UInt64}}, + {DataType::Float16}}, + {ProdConso::inPlaceModel, + ResizeImpl_cpu_forward_kernel<half_float::half>, + nullptr}); +REGISTRAR(ResizeImpl_cpu, + {{{DataType::Float32}, + {DataType::Float32}, + {DataType::Float32}, + {DataType::UInt64}}, + {DataType::Float32}}, + {ProdConso::inPlaceModel, + ResizeImpl_cpu_forward_kernel<float>, + nullptr}); +REGISTRAR(ResizeImpl_cpu, + {{{DataType::Float64}, + {DataType::Float32}, + {DataType::Float32}, + {DataType::UInt64}}, + {DataType::Float64}}, + {ProdConso::inPlaceModel, + ResizeImpl_cpu_forward_kernel<double>, + nullptr}); +} // namespace Aidge + +#endif /* AIDGE_CPU_OPERATOR_RESIZEIMPL_FORWARD_KERNEL_H_ */ diff --git a/include/aidge/backend/cpu/operator/RoundImpl.hpp b/include/aidge/backend/cpu/operator/RoundImpl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..c595e251cc18348b6f732f1c36a05de54f647204 --- /dev/null +++ b/include/aidge/backend/cpu/operator/RoundImpl.hpp @@ -0,0 +1,34 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public 
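The four nearest-neighbour modes handled by the Resize kernel above differ only in how a fractional source coordinate is snapped to an integer; here is a small standalone check with two example coordinates, using the same expressions as the kernel.

#include <cmath>
#include <cstdio>

int main() {
    const float coords[2] = {2.5f, -1.5f};              // example untransformed coordinates
    for (const float c : coords) {
        std::printf("coord=%+.1f  Ceil=%+.0f  Floor=%+.0f  RoundPreferCeil=%+.0f  RoundPreferFloor=%+.0f\n",
                    c,
                    std::ceil(c),                        // Ceil
                    std::floor(c),                       // Floor
                    std::floor(c + 0.5f),                // RoundPreferCeil:  2.5 -> 3, -1.5 -> -1
                    std::ceil(c - 0.5f));                // RoundPreferFloor: 2.5 -> 2, -1.5 -> -2
    }
    return 0;
}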
License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_CPU_OPERATOR_ROUNDIMPL_H_ +#define AIDGE_CPU_OPERATOR_ROUNDIMPL_H_ + +#include <cstddef> // std::size_t +#include <memory> +#include <tuple> +#include <vector> + +#include "aidge/backend/cpu/operator/OperatorImpl.hpp" +#include "aidge/operator/Round.hpp" +#include "aidge/utils/Registrar.hpp" +#include "aidge/utils/Types.h" + +namespace Aidge { +// Operator implementation entry point for the backend +using RoundImpl_cpu = OperatorImpl_cpu<Round_Op, + void(const std::size_t, const void*, void*)>; + +// Implementation entry point registration to Operator +REGISTRAR(Round_Op, "cpu", Aidge::RoundImpl_cpu::create); +} // namespace Aidge + +#endif /* AIDGE_CPU_OPERATOR_ROUNDIMPL_H_ */ diff --git a/include/aidge/backend/cpu/operator/RoundImpl_kernels.hpp b/include/aidge/backend/cpu/operator/RoundImpl_kernels.hpp new file mode 100644 index 0000000000000000000000000000000000000000..ba9c63bc3618ba81e238d7721147c894b54cf832 --- /dev/null +++ b/include/aidge/backend/cpu/operator/RoundImpl_kernels.hpp @@ -0,0 +1,46 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_CPU_OPERATOR_ROUNDIMPL_KERNELS_H_ +#define AIDGE_CPU_OPERATOR_ROUNDIMPL_KERNELS_H_ + +#include <cmath> //std::round +#include <cstddef> // std::size_t + +#include "aidge/utils/Registrar.hpp" + +#include "aidge/backend/cpu/operator/RoundImpl.hpp" + +namespace Aidge { +template <class I, class O> +void RoundImpl_cpu_forward_kernel(const std::size_t inputLength, + const void* input_, + void* output_) { + + const I* input = static_cast<const I*>(input_); + O* output = static_cast<O*>(output_); + + for (std::size_t i = 0; i < inputLength; ++i) { + // std::round() cannot be used here: it rounds halfway cases away from zero, whereas ONNX Round requires rounding them to the nearest even value; std::nearbyint() follows the default round-to-nearest-even mode (see ONNX Round) + output[i] = static_cast<O>(std::nearbyint(static_cast<float>(input[i]))); + } +} + + +REGISTRAR(RoundImpl_cpu, + {DataType::Float32}, + {ProdConso::inPlaceModel, Aidge::RoundImpl_cpu_forward_kernel<float, float>,nullptr}); +REGISTRAR(RoundImpl_cpu, + {DataType::Float64}, + {ProdConso::inPlaceModel, Aidge::RoundImpl_cpu_forward_kernel<double, double>,nullptr}); +} // namespace Aidge + +#endif /* AIDGE_CPU_OPERATOR_ROUNDIMPL_KERNELS_H_ */ diff --git a/include/aidge/backend/cpu/operator/SliceImpl_kernels.hpp b/include/aidge/backend/cpu/operator/SliceImpl_kernels.hpp index 1bf4c491723c570fa8bfd1774beca1630d2de9be..d290c40f26270a789c2d328f98560c65ecac1559 100644 --- a/include/aidge/backend/cpu/operator/SliceImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/SliceImpl_kernels.hpp @@ -89,13 +89,13 @@ void SliceImpl_cpu_forward_kernel(const std::vector<std::int64_t>& starts, } REGISTRAR(SliceImpl_cpu, - {DataType::Float32}, + {{DataType::Float32, DataType::Any}, {DataType::Float32}}, {ProdConso::inPlaceModel, Aidge::SliceImpl_cpu_forward_kernel<float, float>, nullptr}); REGISTRAR(SliceImpl_cpu, - {DataType::Float64}, + {{DataType::Float64, DataType::Any}, {DataType::Float64}}, {ProdConso::inPlaceModel,
Aidge::SliceImpl_cpu_forward_kernel<double, double>, nullptr}); REGISTRAR(SliceImpl_cpu, - {DataType::Int32}, + {{DataType::Int32, DataType::Any}, {DataType::Int32}}, {ProdConso::inPlaceModel, Aidge::SliceImpl_cpu_forward_kernel<int32_t, int32_t>, nullptr}); } // namespace Aidge diff --git a/include/aidge/backend/cpu/operator/SubImpl.hpp b/include/aidge/backend/cpu/operator/SubImpl.hpp index 2bb22bda74edf7db09404fd5613b6714ddcdf513..eed26ddcc9f57b3bb7796049a62f3f6be7de4eb5 100644 --- a/include/aidge/backend/cpu/operator/SubImpl.hpp +++ b/include/aidge/backend/cpu/operator/SubImpl.hpp @@ -23,7 +23,7 @@ namespace Aidge { // Operator implementation entry point for the backend using SubImpl_cpu = OperatorImpl_cpu<Sub_Op, - void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*,void*)>; + void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*,void*)>; // Implementation entry point registration to Operator REGISTRAR(Sub_Op, "cpu", Aidge::SubImpl_cpu::create); diff --git a/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp b/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp index 0486ed2105b23e95f9cdfcda578e14900fcb2c8e..1d789c3c8886d35ce6597d5704c76060bad196c1 100644 --- a/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp @@ -21,32 +21,132 @@ #include "aidge/backend/cpu/data/Broadcasting.hpp" #include "aidge/backend/cpu/operator/SubImpl.hpp" +namespace { +// suppose values are contiguous in memory +template <class I1, class I2, class O> +void sub_contiguous_arrays(const std::size_t input1size, + const std::size_t input2size, + const std::size_t output1size, + const I1* input1, + const I2* input2, + O* output) +{ + for (std::size_t i = 0; i < output1size; ++i) + { + const std::size_t in1_id = (input1size != 1) ? i : 0; + const std::size_t in2_id = (input2size != 1) ? i : 0; + output[i] = static_cast<O>(input1[in1_id] - input2[in2_id]); + } +} +} + namespace Aidge { template <class I1, class I2, class O> -void SubImpl_cpu_forward_kernel(const std::vector<std::size_t>& input1Dims, - const std::vector<std::size_t>& input2Dims, +void SubImpl_cpu_forward_kernel(std::vector<std::size_t> dims0, + std::vector<std::size_t> dims1, const std::vector<std::size_t>& outputDims, + const void* input0_, const void* input1_, - const void* input2_, void* output_) { - const I1* input_1 = static_cast<const I1*>(input1_); - const I2* input_2 = static_cast<const I2*>(input2_); + const I1* input_0 = static_cast<const I1*>(input0_); + const I2* input_1 = static_cast<const I2*>(input1_); O* output = static_cast<O*>(output_); - size_t totalElements = 1; - for (size_t dimSize : outputDims) { - totalElements *= dimSize; + // [5,2,1,7] & [2,6,7] + // 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7] + // 2. Find the highest equal dimension -> 3 + // Exception: if the first diverging dimension is the last one, then -> 4 (dims.size()) + // 3. Compute the highest number of contiguous data -> 7 + // 4. Compute stride and offset step for the broadcast mechanism + // 5. 
Call a simple kernel + + // special case for equal dimensions, the kernel is called with the entire arrays at once + if (dims0 == dims1) { + const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); + for (std::size_t i = 0; i < input0_contiguous_size; ++i) + { + output[i] = static_cast<O>(input_0[i] - input_1[i]); + } + return; + } + + // set dimensions to be of equal size by filling the smallest one with ones. + if (dims0.size() > dims1.size()) { + dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1)); + } + else if (dims1.size() > dims0.size()) { + dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1)); + } + + const std::size_t nbDims = dims0.size(); + + // Find the highest equal dimension + // std::size_t contiguousIdx = nbDims - 1; + std::size_t contiguousIdx = nbDims; + while (contiguousIdx-- > 0) { + // for (; contiguousIdx+1 > 0; --contiguousIdx) { + if (dims0[contiguousIdx] != dims1[contiguousIdx]) { + if (contiguousIdx == (nbDims -1)) { // last dimensions of one of the input Tensor are of size 1 + const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1; + while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) { + --contiguousIdx; + } + } + break; + } + } + ++contiguousIdx; + + // Compute the highest number of contiguous data for each Tensor + const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + + // initialize strides to iterate through data because of broadcasting + std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx); + if (contiguousIdx > 0) { + stride_post0[contiguousIdx - 1] = 1; + stride_post1[contiguousIdx - 1] = 1; + for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) { + stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]); + stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]); + } + for (std::size_t i = 0; i != contiguousIdx; ++i) { + stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1; + stride_step1[i] = (dims1[i] == 1) ? 
1 - stride_post1[i] : 1; + } } - for (std::size_t oIndex = 0; oIndex < totalElements; ++oIndex) - { - std::vector<size_t> indexes = getMultiDimIndices(outputDims, oIndex); - std::size_t idx1 = getFlattenedIndex(input1Dims, indexes); - std::size_t idx2 = getFlattenedIndex(input2Dims, indexes); - output[oIndex] = input_1[idx1] - input_2[idx2]; - } + // variables for arrays offsets + std::size_t offsetIn0 = 0; + std::size_t offsetIn1 = 0; + std::size_t offsetOut = 0; + + + std::size_t dim = contiguousIdx - 1; + const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>()); + for (std::size_t stack = 0; stack < nbStacks;) { + sub_contiguous_arrays<I1,I2,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size, + input_0 + offsetIn0*input0_contiguous_size, + input_1 + offsetIn1*input1_contiguous_size, + output + offsetOut*output_contiguous_size); + if (++stack < nbStacks) { + std::size_t tmp_stack = stack; + while(tmp_stack % outputDims[dim] == 0) { + tmp_stack /= outputDims[dim]; + dim--; + } + offsetIn0 += stride_step0[dim]; + offsetIn1 += stride_step1[dim]; + ++offsetOut; + dim = contiguousIdx - 1; + } + } } // Kernels registration to implementation entry point @@ -56,6 +156,12 @@ REGISTRAR(SubImpl_cpu, REGISTRAR(SubImpl_cpu, {DataType::Float64}, {ProdConso::inPlaceModel, Aidge::SubImpl_cpu_forward_kernel<double, double, double>, nullptr}); +REGISTRAR(SubImpl_cpu, + {DataType::Int8}, + {ProdConso::inPlaceModel, Aidge::SubImpl_cpu_forward_kernel<std::int8_t, std::int8_t, std::int8_t>, nullptr}); +REGISTRAR(SubImpl_cpu, + {DataType::UInt8}, + {ProdConso::inPlaceModel, Aidge::SubImpl_cpu_forward_kernel<std::uint8_t, std::uint8_t, std::uint8_t>, nullptr}); REGISTRAR(SubImpl_cpu, {DataType::Int32}, {ProdConso::inPlaceModel, Aidge::SubImpl_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t>, nullptr}); diff --git a/src/data/Interpolation.cpp b/src/data/Interpolation.cpp new file mode 100644 index 0000000000000000000000000000000000000000..fbf224d84f65c442e98967783d303605a177d390 --- /dev/null +++ b/src/data/Interpolation.cpp @@ -0,0 +1,436 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include "aidge/backend/cpu/data/Interpolation.hpp" + +#include <aidge/utils/Log.hpp> +#include <algorithm> +#include <cmath> +#include <cstdint> + +#include <iterator> +#include <stdexcept> +#include <utility> +#include <vector> + +#include <aidge/data/Interpolation.hpp> +#include <aidge/data/half.hpp> +#include <aidge/utils/ErrorHandling.hpp> +#include <aidge/utils/Types.h> + +namespace Aidge { + +template <typename T> +std::set<Interpolation::Point<T>> +InterpolationCPU::linearRecurse(const std::vector<float> &coordToInterpolate, + const std::set<Point<T>> &points, + const DimIdx_t alongDim) { + + // all points have been discriminated properly along given dimension. 
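+    // Worked example (illustrative sketch, mirroring the bilinear case checked in
+    // unit_tests/data/Test_Interpolation.cpp): with points
+    // {(14,20): 91, (14,21): 162, (15,20): 210, (15,21): 95} and
+    // coordToInterpolate = {14.5, 20.2}, the first call (alongDim = 0) splits the set
+    // into lower = {(14,20), (14,21)} and upper = {(15,20), (15,21)}; each pair is then
+    // reduced recursively along dim 1 with weight 0.2 (91/162 -> 105.2, 210/95 -> 187),
+    // and the two partial results are blended along dim 0 with weight 0.5, giving ~146.1.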
+ if (points.size() == 1) { + return points; + } + + auto extractPtCoords = [](std::set<Point<T>> pts) -> std::set<Coords> { + std::set<Coords> result; + for (const auto &pt : pts) { + result.insert(pt.first); + } + return result; + }; + /////////////////// + // ERROR CHECKING + if (alongDim > coordToInterpolate.size() || points.size() == 0) { + // retrieving points coords as points values can be in half_float & + // this type is not fmt compatible + std::vector<Coords> pointsCoords; + for (const auto &point : points) { + pointsCoords.push_back(point.first); + } + AIDGE_ASSERT( + alongDim >= coordToInterpolate.size(), + "InterpolationCPU::linearRecurse: alongDim value " + "exceeds the number of dimensions of coordsToInterpolate. " + "Interpolation has failed. Input values : \n - " + "coordsToInterpolate {}\n - pointsToInterpolate {}\n - alongDim " + "{}", + coordToInterpolate, + pointsCoords, + alongDim); + AIDGE_ASSERT( + points.size() == 0, + "InterpolationCPU::linearRecurse: entering recursive " + "function with 0 points. Interpolation has failed. " + "Please file a bug report to the aidge_backend_cpu repo: " + "https://gitlab.eclipse.org/eclipse/aidge/aidge_backend_cpu/-/" + "issues." + "\nInput values : \n - " + "coordsToInterpolate {}\n - pointsToInterpolate {}\n - alongDim " + "{}", + coordToInterpolate, + pointsCoords, + alongDim); + } + Log::debug("\nEntering linear recurse with {} points.", points.size()); + Log::debug("Points : {}", extractPtCoords(points)); + Log::debug("coordsToInterpolate : {}", coordToInterpolate); + Log::debug("alongDim : {}", alongDim); + + /////////////////// + // COMPUTATION + // split all points along each dimension + // depending on whether their coords[alongDim] are above or below + // the coordinates to interpolate + std::set<Point<T>> lowerPoints; + std::set<Point<T>> upperPoints; + for (const auto &point : points) { + if (point.first[alongDim] <= coordToInterpolate[alongDim]) { + lowerPoints.insert(point); + } else { + upperPoints.insert(point); + } + } + Log::debug("alongDim : {}", alongDim); + Log::debug("lowerPoints : {}", extractPtCoords(lowerPoints)); + Log::debug("upperPoints : {}", extractPtCoords(upperPoints)); + + // Here are 3 cases + // 1. upper/lowerPoints.size() == 0 + // Coordinates to interpolate along current dimension are round. + // That would be equivalent to a linear interpolation with a + // weight of 1 for lowerPoints & 0 for upperPoints (or the + // opposite), hence we will only take lower/upperPoints values + // from there. + // + // Why this happens : + // If coordinates are round, the floor()/ceil() operations called + // in retrieveNeighbours to generate direct neighbours of floating + // coordinates returned the same value. + // + // 2. lower/upperPoints.size() == 1 + // All dimensions have been discriminated, we can proceed to + // weighted interpolation + // + // 3.
lower/upperPoints.size() > 1 + // points have not all been discriminated and must be further split + // so we call linearRecurse() + switch (lowerPoints.size()) { + case 0: { + return linearRecurse(coordToInterpolate, upperPoints, alongDim + 1); + } + case 1: { + break; + } + default: { + lowerPoints = + linearRecurse(coordToInterpolate, lowerPoints, alongDim + 1); + break; + } + } + + switch (upperPoints.size()) { + case 0: { + return linearRecurse(coordToInterpolate, lowerPoints, alongDim + 1); + } + case 1: { + break; + } + default: { + upperPoints = + linearRecurse(coordToInterpolate, upperPoints, alongDim + 1); + break; + } + } + + // At this point lowerPoints & upperPoints are guaranteed to be + // single-element sets + AIDGE_ASSERT(lowerPoints.size() == 1, + "LowerPoints Size = {} != 1", + lowerPoints.size()); + AIDGE_ASSERT(upperPoints.size() == 1, + "upperPoints Size = {} != 1", + upperPoints.size()); + + // ( point[dim] - Pl[dim] ) + // t = ------------------------ + // ( Pu[dim] - Pl[dim] ) + float weight = + (coordToInterpolate[alongDim] - lowerPoints.begin()->first[alongDim]) / + (upperPoints.begin()->first[alongDim] - + lowerPoints.begin()->first[alongDim]); + + Point<T> interpolatedPoint = std::make_pair( + lowerPoints.begin()->first, + static_cast<T>((1.F - weight) * lowerPoints.begin()->second + + weight * upperPoints.begin()->second)); + // 0 is just a sanity marker used to check later that all dims have been + // interpolated + interpolatedPoint.first[alongDim] = 0; + Log::debug("successfully returned from alongDim : {}", alongDim); + return std::set<Point<T>>({interpolatedPoint}); +} + +template <typename T> +T InterpolationCPU::linear(const std::vector<float> &coordToInterpolate, + const std::set<Point<T>> &pointsToInterpolate) { + + auto result = linearRecurse(coordToInterpolate, pointsToInterpolate, 0); + AIDGE_ASSERT(result.size() == 1, + "Result size is not 1 but {}", + result.size()); + // if (!std::all_of(result.begin()->first.begin(), + // result.begin()->first.end(), + // [](DimSize_t coord) -> bool { return coord == 0; })) { + // std::vector<Coords> ptCoords; + // std::transform(pointsToInterpolate.begin(), + // pointsToInterpolate.end(), + // std::back_inserter(ptCoords), + // [](Point<T> pt) { return pt.first; }); + // AIDGE_THROW_OR_ABORT(std::runtime_error, + // "Not all dimensions have been interpolated."
+ // "Input data :" + // "\n\t coord to interpolate : {}" + // "\n\t pointsToInterpolate : {}", + // // "\n\tAll non 0 values show dimensions + // // that were not interpolated : {}", + // coordToInterpolate, + // ptCoords //, + // // result.begin()->first + // ); + // } + return result.begin()->second; +} + +template <typename T> +T InterpolationCPU::nearest(const std::vector<float> &coordsToInterpolate, + const std::set<Point<T>> &points, + const Interpolation::Mode nearestMode) { + + AIDGE_ASSERT( + coordsToInterpolate.size() == points.begin()->first.size(), + "Interpolation::nearest(): dimension mismatch : coordinate " + "to interpolate ({}) have not the same number of dimensions than " + "the points to interpolate({}).", + coordsToInterpolate, + points.begin()->first); + std::function<int64_t(const float &)> updateCoordinates; + switch (nearestMode) { + case Interpolation::Mode::Ceil: { + updateCoordinates = [](const float &coord) -> int64_t { + return ceil(coord); + }; + break; + } + case Interpolation::Mode::Floor: { + updateCoordinates = [](const float &coord) -> int64_t { + return floor(coord); + }; + break; + } + case Interpolation::Mode::RoundPreferFloor: { + updateCoordinates = [](const float &coord) -> int64_t { + return (coord - floor(coord)) == 0.5 ? floor(coord) + : std::round(coord); + }; + break; + } + case Interpolation::Mode::RoundPreferCeil: { + updateCoordinates = [](const float &coord) -> int64_t { + return (coord - floor(coord)) == 0.5 ? ceil(coord) + : std::round(coord); + }; + break; + } + default: { + AIDGE_THROW_OR_ABORT( + std::runtime_error, + "Invalid Interpolation mode for " + "InterpolationCPU::interpolateNearest. Accepted modes are : " + "Ceil({}),Floor({}),RoundPreferCeil({}), " + "RoundPreferFloor({}). Got {}.", + static_cast<int>(Ceil), + static_cast<int>(Floor), + static_cast<int>(RoundPreferCeil), + static_cast<int>(RoundPreferFloor), + static_cast<int>(nearestMode)); + } + } + Coords nearestCoords; + nearestCoords.reserve(coordsToInterpolate.size()); + for (const auto &coord : coordsToInterpolate) { + nearestCoords.push_back(updateCoordinates(coord)); + } + auto it = std::find_if( + points.begin(), + points.end(), + [nearestCoords](auto &point) { return nearestCoords == point.first; }); + if (it != points.end()) { + return it->second; + } else { + Log::warn("Interpolate::nearest(): did not find a fitting point in " + "the neighbours whose coordinates were {}, returning 0. 
" + "Available neighbours are at following indexes: ", + coordsToInterpolate); + for (const auto &point : points) { + Log::warn("idx : [{}]\t\tvalue {}", point.first); + } + return static_cast<T>(0); + } +} + +template <typename T> +T InterpolationCPU::interpolate(const std::vector<float> &coordsToInterpolate, + const std::set<Point<T>> &points, + const Mode interpMode) { + + T result{0}; + switch (interpMode) { + case Interpolation::Mode::Cubic: { + AIDGE_THROW_OR_ABORT( + std::runtime_error, + "Unsupported interpolation mode selected : Cubic."); + break; + } + case Interpolation::Mode::Linear: { + return linear(coordsToInterpolate, points); + break; + } + case Interpolation::Mode::Ceil: + case Interpolation::Mode::Floor: + case Interpolation::Mode::RoundPreferFloor: + case Interpolation::Mode::RoundPreferCeil: { + result = + InterpolationCPU::nearest(coordsToInterpolate, points, interpMode); + break; + } + default: { + AIDGE_THROW_OR_ABORT(std::runtime_error, + "InterpolationCPU::Interpolate({}): Unsupported " + "interpolation mode given as input.", + static_cast<int>(interpMode)); + break; + } + } + return result; +} + +////////////////////////////////////////////////////////////////////////////////////////////////////// +// TEMPLATE DECLARATION +////////////////////////////////////////////////////////////////////////////////////////////////////// + +////////////////////////////////// +// INTERPOLATE +template int8_t InterpolationCPU::interpolate<int8_t>( + const std::vector<float> &originalCoords, + const std::set<Point<int8_t>> &points, + const Mode interpMode); +template int16_t InterpolationCPU::interpolate<int16_t>( + const std::vector<float> &originalCoords, + const std::set<Point<int16_t>> &points, + const Mode interpMode); +template int32_t InterpolationCPU::interpolate<int32_t>( + const std::vector<float> &originalCoords, + const std::set<Point<int32_t>> &points, + const Mode interpMode); +template int64_t InterpolationCPU::interpolate<int64_t>( + const std::vector<float> &originalCoords, + const std::set<Point<int64_t>> &points, + const Mode interpMode); + +template half_float::half InterpolationCPU::interpolate<half_float::half>( + const std::vector<float> &originalCoords, + const std::set<Point<half_float::half>> &points, + const Mode interpMode); +template float InterpolationCPU::interpolate<float>( + const std::vector<float> &originalCoords, + const std::set<Point<float>> &points, + const Mode interpMode); +template double InterpolationCPU::interpolate<double>( + const std::vector<float> &originalCoords, + const std::set<Point<double>> &points, + const Mode interpMode); + +//////////////////////////////////////////////////////////////////// +// INTERPOLATE LINEAR (& its associated recursive function) +template int8_t +InterpolationCPU::linear(const std::vector<float> &coordsToInterpolate, + const std::set<Point<int8_t>> &points); +template std::set<Interpolation::Point<int8_t>> +InterpolationCPU::linearRecurse(const std::vector<float> &coordsToInterpolate, + const std::set<Point<int8_t>> &points, + DimIdx_t alongDim); +template int16_t +InterpolationCPU::linear(const std::vector<float> &coordsToInterpolate, + const std::set<Point<int16_t>> &points); +template std::set<Interpolation::Point<int16_t>> +InterpolationCPU::linearRecurse(const std::vector<float> &coordsToInterpolate, + const std::set<Point<int16_t>> &points, + DimIdx_t alongDim); +template int32_t +InterpolationCPU::linear(const std::vector<float> &coordsToInterpolate, + const std::set<Point<int32_t>> &points); 
+template std::set<Interpolation::Point<int32_t>> +InterpolationCPU::linearRecurse(const std::vector<float> &coordsToInterpolate, + const std::set<Point<int32_t>> &points, + DimIdx_t alongDim); + +template half_float::half +InterpolationCPU::linear(const std::vector<float> &coordsToInterpolate, + const std::set<Point<half_float::half>> &points); +template std::set<Interpolation::Point<half_float::half>> +InterpolationCPU::linearRecurse( + const std::vector<float> &coordsToInterpolate, + const std::set<Point<half_float::half>> &points, + DimIdx_t alongDim); +template float +InterpolationCPU::linear(const std::vector<float> &coordsToInterpolate, + const std::set<Point<float>> &points); +template std::set<Interpolation::Point<float>> +InterpolationCPU::linearRecurse(const std::vector<float> &coordsToInterpolate, + const std::set<Point<float>> &points, + DimIdx_t alongDim); +template double +InterpolationCPU::linear(const std::vector<float> &coordsToInterpolate, + const std::set<Point<double>> &points); +template std::set<Interpolation::Point<double>> +InterpolationCPU::linearRecurse(const std::vector<float> &coordsToInterpolate, + const std::set<Point<double>> &points, + DimIdx_t alongDim); + +////////////////////////////////// +// INTERPOLATE NEAREST +template int8_t +InterpolationCPU::nearest(const std::vector<float> &originalCoords, + const std::set<Point<int8_t>> &points, + const Interpolation::Mode nearestMode); +template int16_t +InterpolationCPU::nearest(const std::vector<float> &originalCoords, + const std::set<Point<int16_t>> &points, + const Interpolation::Mode nearestMode); +template int32_t +InterpolationCPU::nearest(const std::vector<float> &originalCoords, + const std::set<Point<int32_t>> &points, + const Interpolation::Mode nearestMode); + +template half_float::half +InterpolationCPU::nearest(const std::vector<float> &originalCoords, + const std::set<Point<half_float::half>> &points, + const Interpolation::Mode nearestMode); +template float +InterpolationCPU::nearest(const std::vector<float> &originalCoords, + const std::set<Point<float>> &points, + const Interpolation::Mode nearestMode); +template double +InterpolationCPU::nearest(const std::vector<float> &originalCoords, + const std::set<Point<double>> &points, + const Interpolation::Mode nearestMode); + +} // namespace Aidge diff --git a/src/operator/AddImpl.cpp b/src/operator/AddImpl.cpp index 457a0b17e531fac35ff873f9eedca7bbbe82d459..101743eccb606c998a38f49dd9b89f5ec279bcae 100644 --- a/src/operator/AddImpl.cpp +++ b/src/operator/AddImpl.cpp @@ -12,7 +12,6 @@ #include "aidge/backend/cpu/operator/AddImpl.hpp" #include <cassert> -#include <numeric> // std::accumulate #include <vector> #include "aidge/backend/cpu/data/GetCPUPtr.h" @@ -28,12 +27,11 @@ void Aidge::AddImpl_cpu::forward() { // Check inputs AIDGE_ASSERT(op.getInput(0), "missing input in Add operator"); AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run Add forward because the 0-th input has no implementation."); - DataType datatypeFirstInput = op.getInput(0)->dataType(); - for (IOIndex_t i = 1; i < op.nbInputs(); ++i) { - AIDGE_ASSERT(op.getInput(i), "missing input in Add operator"); - AIDGE_ASSERT(op.getInput(i)->hasImpl(), "cannot run Add forward because the {}-th input has no implementation.", i); - AIDGE_ASSERT(op.getInput(i)->dataType() == datatypeFirstInput, "Cannot add inputs with two differents data type."); - } + + AIDGE_ASSERT(op.getInput(1), "missing input in Add operator"); + AIDGE_ASSERT(op.getInput(1)->hasImpl(), "cannot run Add forward because the 
1st input has no implementation."); + + AIDGE_ASSERT(op.getInput(1)->dataType() == op.getInput(0)->dataType(), "Cannot add inputs with two different data types."); // Find the correct kernel type const auto impl = Registrar<AddImpl_cpu>::create(getBestMatch(getRequiredSpec())); @@ -42,28 +40,17 @@ void Aidge::AddImpl_cpu::forward() { // TODO: right now, if needed, memory will be allocated/deallocated at each // call to forward(). We might put the following shared_ptr as members of // this class to avoid that. - const std::size_t nbDims = op.getOutput(0)->nbDims(); - std::vector<std::vector<std::size_t>> inputsDims; - std::vector<const void*> opInputs; - std::vector<std::shared_ptr<Tensor>> inputsFallback(op.nbInputs()); - for (IOIndex_t i = 0; i < op.nbInputs(); ++i) { - std::vector<std::size_t> inputDims(nbDims, 1); - auto dims = op.getInput(i)->dims(); - for(std::size_t j=dims.size()-1; j+1>0; --j) - { - std::size_t idx = nbDims - (dims.size()-j); - inputDims[idx] = dims[j]; - } - inputsDims.push_back(inputDims); - const auto& input = op.getInput(i)->refCastFrom(inputsFallback[i], *op.getOutput(0)); - opInputs.push_back(input.getImpl()->rawPtr()); - } + std::shared_ptr<Tensor> input0Fallback, input1Fallback, input2Fallback; + const auto& input0 = op.getInput(0)->refCastFrom(input0Fallback, *op.getInput(0)); + const auto& input1 = op.getInput(1)->refCastFrom(input1Fallback, *op.getInput(1)); + - impl.forward(opInputs, - inputsDims, - op.getOutput(0)->size(), - op.getOutput(0)->dims(), - getCPUPtr(op.getRawOutput(0))); + impl.forward(op.getInput(0)->dims(), + op.getInput(1)->dims(), + op.getOutput(0)->dims(), + input0.getImpl()->rawPtr(), + input1.getImpl()->rawPtr(), + getCPUPtr(op.getRawOutput(0))); } template <> diff --git a/src/operator/AndImpl.cpp b/src/operator/AndImpl.cpp index 2e0f59769ad86f6e4143ab59d089706e34792244..0cff914a4d03f6ef1ef339d7c7b46e48b6f4c293 100644 --- a/src/operator/AndImpl.cpp +++ b/src/operator/AndImpl.cpp @@ -25,22 +25,34 @@ template <> void Aidge::AndImpl_cpu::forward() { - const std::vector<std::size_t> inputDims0 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(), - std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims()); - const std::vector<std::size_t> inputDims1 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(), - std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dims()); + const And_Op& op = static_cast<const And_Op&>(mOp); + // Check inputs + AIDGE_ASSERT(op.getInput(0), "missing input in And operator"); + AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run And forward because the 0-th input has no implementation."); + AIDGE_ASSERT(op.getInput(1), "missing input in And operator"); + AIDGE_ASSERT(op.getInput(1)->hasImpl(), "cannot run And forward because the 1st input has no implementation."); + + AIDGE_ASSERT(op.getInput(1)->dataType() == op.getInput(0)->dataType(), "Cannot And inputs with two different data types."); // Find the correct kernel type const auto impl = Registrar<AndImpl_cpu>::create(getBestMatch(getRequiredSpec())); - // Call kernel - impl.forward(inputDims0, - inputDims1, - std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(), - getCPUPtr(mOp.getRawInput(0)), - getCPUPtr(mOp.getRawInput(1)), - getCPUPtr(mOp.getRawOutput(0))); + // Convert input data (no overhead if not needed!) + // TODO: right now, if needed, memory will be allocated/deallocated at each + // call to forward(). 
We might put the following shared_ptr as members of + // this class to avoid that. + std::shared_ptr<Tensor> input0Fallback, input1Fallback, input2Fallback; + const auto& input0 = op.getInput(0)->refCastFrom(input0Fallback, *op.getInput(0)); + const auto& input1 = op.getInput(1)->refCastFrom(input1Fallback, *op.getInput(1)); + + + impl.forward(op.getInput(0)->dims(), + op.getInput(1)->dims(), + op.getOutput(0)->dims(), + input0.getImpl()->rawPtr(), + input1.getImpl()->rawPtr(), + getCPUPtr(op.getRawOutput(0))); } template <> diff --git a/src/operator/AtanImpl.cpp b/src/operator/AtanImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..af3393e7eb13fad4b414172edc7d1ab32ffcc573 --- /dev/null +++ b/src/operator/AtanImpl.cpp @@ -0,0 +1,54 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <cassert> +#include <chrono> // std::chrono::milliseconds +#include <numeric> // std::accumulate +#include <thread> // std::this_thread::sleep_for +#include <vector> + +#include "aidge/operator/Atan.hpp" +#include "aidge/utils/Types.h" +#include "aidge/backend/cpu/data/GetCPUPtr.h" + +#include "aidge/backend/cpu/operator/AtanImpl.hpp" +#include "aidge/backend/cpu/operator/AtanImpl_kernels.hpp" + +template <> +void Aidge::AtanImpl_cpu::forward() { + const Atan_Op& op_ = dynamic_cast<const Atan_Op&>(mOp); + std::shared_ptr<Tensor> in0 = op_.getInput(0); + std::shared_ptr<Tensor> out0 = op_.getOutput(0); + AIDGE_ASSERT(in0, "missing input #0"); + + // Find the correct kernel type + const auto impl = Registrar<AtanImpl_cpu>::create(getBestMatch(getRequiredSpec())); + + // Call kernel + impl.forward(in0->size(), + getCPUPtr(mOp.getRawInput(0)), + getCPUPtr(mOp.getRawOutput(0))); +} + +template <> +void Aidge::AtanImpl_cpu::backward() { + const Atan_Op& op_ = dynamic_cast<const Atan_Op&>(mOp); + std::shared_ptr<Tensor> out0 = op_.getOutput(0); + std::shared_ptr<Tensor> gra_int0 = op_.getInput(0)->grad(); + std::shared_ptr<Tensor> gra_out0 = op_.getOutput(0)->grad(); + AIDGE_ASSERT(out0, "missing output #0 for current {} operator", op_.type()); + + // Find the correct kernel type + const auto impl = Registrar<AtanImpl_cpu>::create(getBestMatch(getRequiredSpec())); + + // Call kernel + impl.backward(gra_int0->size(), getCPUPtr(out0), getCPUPtr(gra_out0), getCPUPtr(gra_int0)); +} diff --git a/src/operator/BatchNormImpl.cpp b/src/operator/BatchNormImpl.cpp index 9f1d986e63f14e6038c80054e5e3bc631ec24224..af59310830a865b496019e7620cfb661721ff39a 100644 --- a/src/operator/BatchNormImpl.cpp +++ b/src/operator/BatchNormImpl.cpp @@ -30,15 +30,13 @@ void Aidge::BatchNormImpl2D_cpu::forward() { AIDGE_ASSERT(op_.getInput(3), "missing input #3 for BatchNorm Operator"); AIDGE_ASSERT(op_.getInput(4), "missing input #4 for BatchNorm Operator"); - AIDGE_ASSERT(op_.getOutput(0)->nbDims() == 4, ""); - // Find the correct kernel type const auto impl = Registrar<BatchNormImpl2D_cpu>::create(getBestMatch(getRequiredSpec())); // Call kernel impl.forward(op_.epsilon(), op_.momentum(), - op_.getInput(0)->template dims<4>(), + op_.getInput(0)->dims(), getCPUPtr(op_.getRawInput(0)), getCPUPtr(op_.getRawInput(1)), 
getCPUPtr(op_.getRawInput(2)), diff --git a/src/operator/BitShiftImpl.cpp b/src/operator/BitShiftImpl.cpp index 1e0f79fd29fd140f0b41c64d245b9b240da80028..c6940554dd925905a18de66651707c3d58594ade 100644 --- a/src/operator/BitShiftImpl.cpp +++ b/src/operator/BitShiftImpl.cpp @@ -28,27 +28,18 @@ void Aidge::BitShiftImpl_cpu::forward() { const auto& op_ = dynamic_cast<const BitShift_Op&>(mOp); - const auto impl = Registrar<BitShiftImpl_cpu>::create(getBestMatch(getRequiredSpec())); - - const std::vector<std::size_t> inputDims0 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(), - std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims()); - const std::vector<std::size_t> inputDims1 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(), - std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dims()); - - BitShift_Op::BitShiftDirection direction = op_.direction(); - // Call kernel impl.forward( - direction, - inputDims0, - inputDims1, - std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(), + op_.direction(), + op_.getInput(0)->dims(), + op_.getInput(1)->dims(), + op_.getOutput(0)->dims(), getCPUPtr(mOp.getRawInput(0)), getCPUPtr(mOp.getRawInput(1)), getCPUPtr(mOp.getRawOutput(0))); - + } template <> diff --git a/src/operator/ClipImpl.cpp b/src/operator/ClipImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..931d25426a8f6e08363bfc08d23f1714e934634c --- /dev/null +++ b/src/operator/ClipImpl.cpp @@ -0,0 +1,67 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <memory> +#include <vector> + +#include "aidge/data/Tensor.hpp" +#include "aidge/operator/Clip.hpp" +#include "aidge/utils/Types.h" +#include "aidge/backend/cpu/data/GetCPUPtr.h" +#include "aidge/utils/ErrorHandling.hpp" + +#include "aidge/backend/cpu/operator/ClipImpl.hpp" +#include "aidge/backend/cpu/operator/ClipImpl_kernels.hpp" + +template<> +void Aidge::ClipImpl_cpu::forward() { + + const Clip_Op& op_ = dynamic_cast<const Clip_Op&>(mOp); + std::shared_ptr<Tensor> in0 = op_.getInput(0); + std::shared_ptr<Tensor> out0 = op_.getOutput(0); + AIDGE_ASSERT(in0, "missing input #0"); + /*AIDGE_ASSERT(in1, "missing input #1 -> Min value empty shape Tensor"); + AIDGE_ASSERT(in2, "missing input #2 -> Max value empty shape Tensor");*/ + // Find the correct kernel type + const auto impl = Registrar<ClipImpl_cpu>::create(getBestMatch(getRequiredSpec())); + + // Call kernel + impl.forward( + op_.min(), + op_.max(), + getCPUPtr(mOp.getRawInput(0)), + in0->size(), + getCPUPtr(mOp.getRawOutput(0)) + ); +} + +template<> +void Aidge::ClipImpl_cpu::backward() { + + const Clip_Op& op_ = dynamic_cast<const Clip_Op&>(mOp); + std::shared_ptr<Tensor> in0 = op_.getInput(0); + std::shared_ptr<Tensor> out0 = op_.getOutput(0); + std::shared_ptr<Tensor> gra_in0 = op_.getInput(0)->grad(); + std::shared_ptr<Tensor> gra_out0 = op_.getOutput(0)->grad(); + AIDGE_ASSERT(out0, "missing output #0 for current {} operator", op_.type()); + + // Find the correct kernel type + const auto impl = Registrar<ClipImpl_cpu>::create(getBestMatch(getRequiredSpec())); + // Call kernel + impl.backward( + op_.min(), + op_.max(), + gra_in0->size(), + getCPUPtr(in0), + getCPUPtr(gra_out0), + getCPUPtr(gra_in0) + ); +} diff --git a/src/operator/ConvDepthWiseImpl.cpp b/src/operator/ConvDepthWiseImpl.cpp index d86bba8d1abf348eb25e2d9c69d04b5c33a8a176..9b4ca3ad50d4b1db3367d39381191cf6d8b01314 100644 --- a/src/operator/ConvDepthWiseImpl.cpp +++ b/src/operator/ConvDepthWiseImpl.cpp @@ -65,7 +65,6 @@ void Aidge::ConvDepthWiseImpl2D_cpu::forward() { AIDGE_ASSERT(op_.getInput(0), "missing input #0 in ConvDepthWise Operator"); AIDGE_ASSERT(op_.getInput(1), "missing input #1 in ConvDepthWise Operator"); - AIDGE_ASSERT(op_.getInput(2), "missing input #2 in ConvDepthWise Operator"); AIDGE_ASSERT((op_.getInput(0)->nbDims() == 4), "support for 4-dimensions tensors only"); diff --git a/src/operator/LRNImpl.cpp b/src/operator/LRNImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b914ffac236e995c58fe2c6a10417c32493b791c --- /dev/null +++ b/src/operator/LRNImpl.cpp @@ -0,0 +1,46 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <cassert> +#include <chrono> // std::chrono::milliseconds +#include <numeric> // std::accumulate +#include <thread> // std::this_thread::sleep_for +#include <vector> + +#include "aidge/operator/LRN.hpp" +#include "aidge/utils/Types.h" +#include "aidge/backend/cpu/data/GetCPUPtr.h" + +#include "aidge/backend/cpu/operator/LRNImpl.hpp" +#include "aidge/backend/cpu/operator/LRNImpl_kernels.hpp" + +template <> +void Aidge::LRNImpl_cpu::forward() { + const auto& op_ = dynamic_cast<const LRN_Op&>(mOp); + AIDGE_ASSERT(!op_.getInput(0)->empty(), "LRN input empty"); + + // Find the correct kernel type + const auto impl = Registrar<LRNImpl_cpu>::create(getBestMatch(getRequiredSpec())); + + // Call kernel + impl.forward(op_.alpha(), + op_.beta(), + op_.bias(), + op_.size(), + std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims(), + std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->getImpl()->rawPtr(), + std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->rawPtr()); +} + +template <> +void Aidge::LRNImpl_cpu::backward() { + AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for LRN_Op on backend cpu"); +} diff --git a/src/operator/MulImpl.cpp b/src/operator/MulImpl.cpp index ea5e3d3ab8ac24934a0cb6f9042858fa094700af..422bdd005f058fc9200cf5f7962bfc8d5877e6e1 100644 --- a/src/operator/MulImpl.cpp +++ b/src/operator/MulImpl.cpp @@ -25,18 +25,15 @@ template <> void Aidge::MulImpl_cpu::forward() { - const std::vector<std::size_t> inputDims0 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(), - std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims()); - const std::vector<std::size_t> inputDims1 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(), - std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dims()); + const Mul_Op& op_ = dynamic_cast<const Mul_Op&>(mOp); // Find the correct kernel type const auto impl = Registrar<MulImpl_cpu>::create(getBestMatch(getRequiredSpec())); // Call kernel - impl.forward(inputDims0, - inputDims1, - std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(), + impl.forward(op_.getInput(0)->dims(), + op_.getInput(1)->dims(), + op_.getOutput(0)->dims(), getCPUPtr(mOp.getRawInput(0)), getCPUPtr(mOp.getRawInput(1)), getCPUPtr(mOp.getRawOutput(0))); @@ -45,7 +42,7 @@ void Aidge::MulImpl_cpu::forward() { template <> void Aidge::MulImpl_cpu::backward() { const Mul_Op& op_ = dynamic_cast<const Mul_Op&>(mOp); - + auto in0 = op_.getInput(0); auto in1 = op_.getInput(1); auto in0grad = op_.getInput(0)->grad(); @@ -56,14 +53,14 @@ void Aidge::MulImpl_cpu::backward() { const auto impl = Registrar<MulImpl_cpu>::create(getBestMatch(getRequiredSpec())); // Call kernel - impl.backward(/* input0Length */ in0grad->size(), + impl.backward(/* input0Length */ in0grad->size(), /* input1Length */ in1grad->size(), /* grad0Length */ out0grad->size(), /* input0Dims */ in0->dims(), /* input1Dims */ in1->dims(), - getCPUPtr(in0), - getCPUPtr(in1), - getCPUPtr(out0grad), - getCPUPtr(in0grad), + getCPUPtr(in0), + getCPUPtr(in1), + getCPUPtr(out0grad), + getCPUPtr(in0grad), getCPUPtr(in1grad)); } diff --git a/src/operator/PaddedConvImpl.cpp b/src/operator/PaddedConvImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b85039d1fb86484e7b7609a0cb335d5e41bbc21f --- /dev/null +++ b/src/operator/PaddedConvImpl.cpp @@ -0,0 +1,128 @@ 
+/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include "aidge/backend/cpu/operator/PaddedConvImpl.hpp" +#include "aidge/backend/cpu/operator/PaddedConvImpl_kernels.hpp" + +#include <memory> +#include <vector> + +#include "aidge/backend/cpu/data/GetCPUPtr.h" +#include "aidge/backend/OperatorImpl.hpp" +#include "aidge/data/Tensor.hpp" +#include "aidge/operator/MetaOperator.hpp" +#include "aidge/operator/Conv.hpp" +#include "aidge/operator/Pad.hpp" +#include "aidge/utils/ErrorHandling.hpp" +#include "aidge/utils/Registrar.hpp" +#include "aidge/utils/Types.h" +#include "aidge/backend/cpu/data/GetCPUPtr.h" + +template <> +void Aidge::PaddedConvImpl1D_cpu::forward() { + const auto& op_ = static_cast<const MetaOperator_Op&>(mOp); + + // FIXME: uncomment the following code once memory handling will work + AIDGE_ASSERT(op_.getInput(0), "missing input #0 in Conv Operator."); + AIDGE_ASSERT(op_.getInput(1), "missing input #1 in Conv Operator."); + + // Find the correct kernel type + const auto impl = Registrar<PaddedConvImpl1D_cpu>::create(getBestMatch(getRequiredSpec())); + + // Convert input data (no overhead if not needed!) + // TODO: right now, if needed, memory will be allocated/deallocated at each + // call to forward(). We might put the following shared_ptr as members of + // this class to avoid that. + std::shared_ptr<Tensor> input0Fallback, input1Fallback, input2Fallback; + const auto& input0 = op_.getInput(0)->refCastFrom(input0Fallback, *op_.getOutput(0)); + const auto& input1 = op_.getInput(1)->refCastFrom(input1Fallback, *op_.getOutput(0)); + const auto& input2 = (op_.getInput(2)) ? op_.getInput(2)->refCastFrom(input2Fallback, *op_.getOutput(0)) : Tensor(); + + std::shared_ptr<Conv_Op<1>> conv_op; + std::shared_ptr<Pad_Op<1>> pad_op; + for (const auto& n : op_.getMicroGraph()->getNodes()) { + if (n->getOperator()->type() == Conv_Op<1>::Type) { + conv_op = std::static_pointer_cast<Conv_Op<1>>(n->getOperator()); + } else { + pad_op = std::static_pointer_cast<Pad_Op<1>>(n->getOperator()); + } + } + + // Call kernel + impl.forward( + pad_op->beginEndBorders(), + conv_op->strideDims(), + conv_op->dilationDims(), + conv_op->kernelDims(), + op_.getInput(0)->template dims<3>(), // input dimensions + conv_op->outChannels(), // outChannels + input0.getImpl()->rawPtr(), // input + input1.getImpl()->rawPtr(), // weight + op_.getInput(2) ? 
input2.getImpl()->rawPtr() : nullptr, // bias + getCPUPtr(mOp.getRawOutput(0)) // output + ); +} + +template <> +void Aidge::PaddedConvImpl1D_cpu::backward() { + AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Conv_Op<1> on backend cpu"); +} + +template <> +void Aidge::PaddedConvImpl2D_cpu::forward() { + const auto& op_ = dynamic_cast<const MetaOperator_Op&>(mOp); + + // FIXME: uncomment the following code once memory handling will work + AIDGE_ASSERT(op_.getInput(0), "missing input #0 in Conv Operator."); + AIDGE_ASSERT(op_.getInput(1), "missing input #1 in Conv Operator."); + + // Find the correct kernel type + const auto impl = Registrar<PaddedConvImpl2D_cpu>::create(getBestMatch(getRequiredSpec())); + + // Convert input data (no overhead if not needed!) + // TODO: right now, if needed, memory will be allocated/deallocated at each + // call to forward(). We might put the following shared_ptr as members of + // this class to avoid that. + std::shared_ptr<Tensor> input0Fallback, input1Fallback, input2Fallback; + const auto& input0 = op_.getInput(0)->refCastFrom(input0Fallback, *op_.getOutput(0)); + const auto& input1 = op_.getInput(1)->refCastFrom(input1Fallback, *op_.getOutput(0)); + const auto& input2 = (op_.getInput(2)) ? op_.getInput(2)->refCastFrom(input2Fallback, *op_.getOutput(0)) : Tensor(); + + std::shared_ptr<Conv_Op<2>> conv_op; + std::shared_ptr<Pad_Op<2>> pad_op; + + for (const auto& n : op_.getMicroGraph()->getNodes()) { + if (n->getOperator()->type() == Conv_Op<2>::Type) { + conv_op = std::static_pointer_cast<Conv_Op<2>>(n->getOperator()); + } else { + pad_op = std::static_pointer_cast<Pad_Op<2>>(n->getOperator()); + } + } + + // Call kernel + impl.forward( + pad_op->beginEndBorders(), + conv_op->strideDims(), + conv_op->dilationDims(), + conv_op->kernelDims(), + op_.getInput(0)->template dims<4>(), // input dimensions + conv_op->outChannels(), // outChannels + input0.getImpl()->rawPtr(), // input + input1.getImpl()->rawPtr(), // weight + op_.getInput(2) ? 
input2.getImpl()->rawPtr() : nullptr, // bias + getCPUPtr(mOp.getRawOutput(0)) // output + ); +} + +template <> +void Aidge::PaddedConvImpl2D_cpu::backward() { + AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Conv_Op<2> on backend cpu"); +} diff --git a/src/operator/PowImpl.cpp b/src/operator/PowImpl.cpp index 74a7be71e176ba8e1cb8851050e575d6aa7465df..4448c8e9c455e59b584b084d32a8b17e8ae03453 100644 --- a/src/operator/PowImpl.cpp +++ b/src/operator/PowImpl.cpp @@ -25,21 +25,36 @@ template <> void Aidge::PowImpl_cpu::forward() { - const std::vector<std::size_t> inputDims0 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(), - std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims()); - const std::vector<std::size_t> inputDims1 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(), - std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dims()); + + const Pow_Op& op = static_cast<const Pow_Op&>(mOp); + // Check inputs + AIDGE_ASSERT(op.getInput(0), "missing input in Pow operator"); + AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run Pow forward because the 0-th input has no implementation."); + + AIDGE_ASSERT(op.getInput(1), "missing input in Pow operator"); + AIDGE_ASSERT(op.getInput(1)->hasImpl(), "cannot run Pow forward because the 1st input has no implementation."); + + AIDGE_ASSERT(op.getInput(1)->dataType() == op.getInput(0)->dataType(), "Cannot compute Pow with inputs of two differents data type."); // Find the correct kernel type const auto impl = Registrar<PowImpl_cpu>::create(getBestMatch(getRequiredSpec())); - // Call kernel - impl.forward(inputDims0, - inputDims1, - std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(), - getCPUPtr(mOp.getRawInput(0)), - getCPUPtr(mOp.getRawInput(1)), - getCPUPtr(mOp.getRawOutput(0))); + // Convert input data (no overhead if not needed!) + // TODO: right now, if needed, memory will be allocated/deallocated at each + // call to forward(). We might put the following shared_ptr as members of + // this class to avoid that. + std::shared_ptr<Tensor> input0Fallback, input1Fallback, input2Fallback; + const auto& input0 = op.getInput(0)->refCastFrom(input0Fallback, *op.getInput(0)); + const auto& input1 = op.getInput(1)->refCastFrom(input1Fallback, *op.getInput(1)); + + + impl.forward(op.getInput(0)->dims(), + op.getInput(1)->dims(), + op.getOutput(0)->dims(), + input0.getImpl()->rawPtr(), + input1.getImpl()->rawPtr(), + getCPUPtr(op.getRawOutput(0))); + } template <> @@ -69,4 +84,4 @@ void Aidge::PowImpl_cpu::backward() { getCPUPtr(out0grad), getCPUPtr(in0grad), getCPUPtr(in1grad)); -} \ No newline at end of file +} diff --git a/src/operator/ResizeImpl.cpp b/src/operator/ResizeImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..38e3639312879ed75dac13fd5ed1226620e0cbd9 --- /dev/null +++ b/src/operator/ResizeImpl.cpp @@ -0,0 +1,59 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ +#include "aidge/backend/cpu/operator/ResizeImpl.hpp" +#include "aidge/backend/cpu/operator/ResizeImpl_kernels.hpp" +#include "aidge/operator/Resize.hpp" + +#include <cassert> +#include <cstdint> +#include <sys/stat.h> + +#include "aidge/backend/OperatorImpl.hpp" +#include "aidge/utils/ErrorHandling.hpp" + +namespace Aidge { + +template <> void ResizeImpl_cpu::forward() { + auto &op = dynamic_cast<const Resize_Op &>(mOp); + + /** @brief input #0 */ + int8_t idxData = 0; + + const bool input0DataPresent = + op.getInput(idxData) && !op.getInput(idxData)->undefined(); + + /////////////////////////////////////// + // CHECKING NODE CONNECTIONS + AIDGE_ASSERT(input0DataPresent, "{}: missing data input #0", op.type()); + + /////////////////////////////////////// + // CALL TO FORWARD + const auto impl = + Registrar<ResizeImpl_cpu>::create(getBestMatch(getRequiredSpec())); + + impl.forward(op.getInput(idxData)->getImpl()->rawPtr(), + op.getInput(idxData)->dims(), + op.getOutput(0)->dims(), + + op.coordinateTransformationMode(), + op.interpolationMode(), + op.paddingMode(), + + op.getOutput(0)->getImpl()->rawPtr() // output pointer + ); +} + +template <> void Aidge::ResizeImpl_cpu::backward() { + AIDGE_THROW_OR_ABORT( + std::runtime_error, + "Backward not yet implemented for Slice_Op on backend cpu"); +} +} // namespace Aidge diff --git a/src/operator/RoundImpl.cpp b/src/operator/RoundImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6f19f064cabfaa6bde7b434b0defe53f5c1b78cf --- /dev/null +++ b/src/operator/RoundImpl.cpp @@ -0,0 +1,40 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <memory> +#include <vector> + +#include "aidge/backend/cpu/data/GetCPUPtr.h" +#include "aidge/data/Tensor.hpp" +#include "aidge/operator/Round.hpp" +#include "aidge/utils/ErrorHandling.hpp" +#include "aidge/utils/Types.h" +#include "aidge/backend/cpu/operator/RoundImpl.hpp" +#include "aidge/backend/cpu/operator/RoundImpl_kernels.hpp" + +template <> +void Aidge::RoundImpl_cpu::forward() { + std::shared_ptr<Tensor> in0 = std::static_pointer_cast<Tensor>(mOp.getRawInput(0)); + std::shared_ptr<Tensor> out0 = std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)); + AIDGE_ASSERT(in0, "missing input #0"); + + // Find the correct kernel type + const auto impl = Registrar<RoundImpl_cpu>::create(getBestMatch(getRequiredSpec())); + + // Call kernel + impl.forward(in0->size(), + getCPUPtr(mOp.getRawInput(0)), + getCPUPtr(mOp.getRawOutput(0))); +} +template <> +void Aidge::RoundImpl_cpu::backward() { + AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Round_Op on backend cpu"); +} \ No newline at end of file diff --git a/src/operator/SubImpl.cpp b/src/operator/SubImpl.cpp index d43771b967889183801cb93418c967ce9d9c8453..e36abe2a9d68a2b56ab1777aa04b0e911df514c8 100644 --- a/src/operator/SubImpl.cpp +++ b/src/operator/SubImpl.cpp @@ -25,18 +25,15 @@ template <> void Aidge::SubImpl_cpu::forward() { - const std::vector<std::size_t> inputDims0 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(), - std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims()); - const std::vector<std::size_t> inputDims1 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(), - std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dims()); + const Sub_Op& op_ = dynamic_cast<const Sub_Op&>(mOp); // Find the correct kernel type const auto impl = Registrar<SubImpl_cpu>::create(getBestMatch(getRequiredSpec())); // Call kernel - impl.forward(inputDims0, - inputDims1, - std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(), + impl.forward(op_.getInput(0)->dims(), + op_.getInput(1)->dims(), + op_.getOutput(0)->dims(), getCPUPtr(mOp.getRawInput(0)), getCPUPtr(mOp.getRawInput(1)), getCPUPtr(mOp.getRawOutput(0))); diff --git a/unit_tests/CMakeLists.txt b/unit_tests/CMakeLists.txt index 8178df93beb96a3a7538dae8d9a706380c06ecf8..5984524fdc8c596641e505897d16e12de78024cc 100644 --- a/unit_tests/CMakeLists.txt +++ b/unit_tests/CMakeLists.txt @@ -3,7 +3,7 @@ Include(FetchContent) FetchContent_Declare( Catch2 GIT_REPOSITORY https://github.com/catchorg/Catch2.git - GIT_TAG v3.0.1 # or a later release + GIT_TAG v3.7.1 # or a later release ) FetchContent_MakeAvailable(Catch2) diff --git a/unit_tests/data/Test_Interpolation.cpp b/unit_tests/data/Test_Interpolation.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5c3b56f02ab17092a6ba238cc74e1bf75e203718 --- /dev/null +++ b/unit_tests/data/Test_Interpolation.cpp @@ -0,0 +1,237 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <aidge/backend/cpu/data/Interpolation.hpp> +#include <aidge/data/Interpolation.hpp> +#include <aidge/data/Tensor.hpp> +#include <aidge/filler/Filler.hpp> +#include <aidge/utils/Types.h> +#include <catch2/catch_test_macros.hpp> +#include <limits> + +#include "aidge/backend/cpu/data/Interpolation.hpp" + +namespace Aidge { + +TEST_CASE("Interpolation", "[Interpolation][Data]") { + + SECTION("Linear") { + std::set<Interpolation::Point<int>> pointsToInterpolateInt; + std::set<Interpolation::Point<float>> pointsToInterpolateFloat; + + SECTION("1D") { + pointsToInterpolateInt = + std::set<Interpolation::Point<int>>({{{0}, 10}, {{1}, 20}}); + CHECK(abs(InterpolationCPU::linear({0.5}, pointsToInterpolateInt) - + 15) <= std::numeric_limits<int>::epsilon()); + + pointsToInterpolateFloat = std::set<Interpolation::Point<float>>( + {{{0}, .0F}, {{1}, 0.2F}}); + CHECK(fabs(InterpolationCPU::linear({0.3}, + pointsToInterpolateFloat) - + .06F) <= 1e-5); + } + SECTION("2D") { + // example taken from + // https://en.wikipedia.org/wiki/Bilinear_interpolation + pointsToInterpolateFloat = {{{14, 20}, 91.F}, + {{14, 21}, 162.F}, + {{15, 20}, 210.F}, + {{15, 21}, 95.F}}; + CHECK(fabs(InterpolationCPU::linear<float>( + {14.5F, 20.2F}, + pointsToInterpolateFloat) - + 146.1) < 1e-5); + // pointsToInterpolateFloat = {{{0, 0}, .10F}, + // {{0, 1}, .20F}, + // {{1, 0}, .30F}, + // {{1, 1}, .40F}}; + // CHECK(abs(InterpolationCPU::linear<float>({1.5, 0.5}, + // pointsToInterpolateInt) + // - + // 25) < std::numeric_limits<int>::epsilon()); + + // pointsToInterpolateFloat = std::vector({0.1F, 0.2F, 0.3F, + // 0.4F}); CHECK(InterpolationCPU::linear(pointsToInterpolateFloat) + // == .25f); + } + SECTION("3D") { + pointsToInterpolateFloat = {{{0, 0, 0}, .1F}, + {{0, 0, 1}, .2F}, + {{0, 1, 0}, .3F}, + {{0, 1, 1}, .4F}, + {{1, 0, 0}, .5F}, + {{1, 0, 1}, .6F}, + {{1, 1, 0}, .7F}, + {{1, 1, 1}, .8F}}; + CHECK(fabs(InterpolationCPU::linear({.5, .5, .5}, + pointsToInterpolateFloat) - + .45f) < 1e-5); + } + SECTION("4D") { + SECTION("Casual") { + pointsToInterpolateFloat = {{{0, 0, 0, 0}, .1F}, + {{0, 0, 0, 1}, .2F}, + {{0, 0, 1, 0}, .3F}, + {{0, 0, 1, 1}, .4F}, + {{0, 1, 0, 0}, .5F}, + {{0, 1, 0, 1}, .6F}, + {{0, 1, 1, 0}, .7F}, + {{0, 1, 1, 1}, .8F}, + {{1, 0, 0, 0}, .9F}, + {{1, 0, 0, 1}, 1.F}, + {{1, 0, 1, 0}, 1.1F}, + {{1, 0, 1, 1}, 1.2F}, + {{1, 1, 0, 0}, 1.3F}, + {{1, 1, 0, 1}, 1.4F}, + {{1, 1, 1, 0}, 1.5F}, + {{1, 1, 1, 1}, 1.6F}}; + CHECK(fabs(InterpolationCPU::linear<float>( + {.5, .5, .5, .5}, + pointsToInterpolateFloat) - + .85f) < 0.0001); + } + } + SECTION("Some of the coords to interpolate were round") { + // In this case retrieveNeighbours() + // only retrieved the neighbours against not round dimensions + auto tensor = + std::make_shared<Tensor>(std::vector<DimSize_t>({10, 10})); + tensor->setDataType(DataType::Float32); + tensor->setBackend("cpu"); + Aidge::constantFiller(tensor, 1337.F); + + std::set<Interpolation::Point<float>> expectedResult = { + {{0, 0, -1, -1}, 0.F}, + {{0, 0, 0, -1}, 0.F}, + {{0, 0, -1, 0}, 0.F}, + {{0, 0, 0, 0}, 1337.F}}; + + pointsToInterpolateFloat = Interpolation::retrieveNeighbours( + reinterpret_cast<float *>(tensor->getImpl()->rawPtr()), + tensor->dims(), + std::vector<float>({0.F, 0.F, -0.25F, -0.25F})); + + pointsToInterpolateFloat = {{{0, 0, -1, -1}, 1337.F}, + {{0, 0, 0, -1}, 1337.F}, + {{0, 0, -1, 0}, 1337.F}, + {{0, 0, 0, 0}, 1337.F}}; + 
} + } + SECTION("Nearest") { + std::set<Interpolation::Point<float>> pointsToInterpolate; + std::vector<float> coordToInterpolate; + SECTION("1D") { + coordToInterpolate = {0.5F}; + pointsToInterpolate = + std::set<Interpolation::Point<float>>{{{0}, 1.0F}, + {{1}, 2.0F}, + {{2}, 3.0F}, + {{3}, 4.0F}, + {{4}, 5.0F}}; + + SECTION("Floor") { + CHECK(InterpolationCPU::nearest( + coordToInterpolate, + pointsToInterpolate, + Interpolation::Mode::Floor) == 1); + } + SECTION("Ceil") { + CHECK(InterpolationCPU::nearest( + coordToInterpolate, + pointsToInterpolate, + Interpolation::Mode::Ceil) == 2); + } + SECTION("RoundPreferFloor") { + CHECK(InterpolationCPU::nearest( + coordToInterpolate, + pointsToInterpolate, + Interpolation::Mode::RoundPreferFloor) == 1); + } + SECTION("RoundPreferCeil") { + CHECK(InterpolationCPU::nearest( + coordToInterpolate, + pointsToInterpolate, + Interpolation::Mode::RoundPreferCeil) == 2); + } + } + SECTION("2D") { + coordToInterpolate = {2.5F, 3.97F}; + pointsToInterpolate = {{{0, 0}, 10.0}, + {{1, 1}, 20.0}, + {{2, 3}, 30.0}, + {{2, 4}, 40.0}, + {{3, 3}, 50.0}, + {{3, 4}, 60.0}}; + SECTION("Floor") { + CHECK(InterpolationCPU::nearest( + coordToInterpolate, + pointsToInterpolate, + Interpolation::Mode::Floor) == 30.); + } + SECTION("Ceil") { + CHECK(InterpolationCPU::nearest( + coordToInterpolate, + pointsToInterpolate, + Interpolation::Mode::Ceil) == 60.); + } + SECTION("RoundPreferFloor") { + CHECK(InterpolationCPU::nearest( + coordToInterpolate, + pointsToInterpolate, + Interpolation::Mode::RoundPreferFloor) == + 40.); + } + SECTION("RoundPreferCeil") { + CHECK(InterpolationCPU::nearest( + coordToInterpolate, + pointsToInterpolate, + Interpolation::Mode::RoundPreferCeil) == 60.); + } + } + SECTION("3D") { + coordToInterpolate = {1.9, 2.1, 3.6}; + pointsToInterpolate = {{{0, 0, 0}, 5.0}, + {{1, 2, 3}, 10.0}, + {{2, 1, 4}, 20.0}, + {{2, 2, 4}, 30.0}, + {{2, 3, 3}, 40.0}, + {{2, 3, 4}, 50.0}, + {{3, 3, 4}, 60.0}}; + SECTION("Floor") { + CHECK(InterpolationCPU::nearest( + coordToInterpolate, + pointsToInterpolate, + Interpolation::Mode::Floor) == 10.); + } + SECTION("Ceil") { + CHECK(InterpolationCPU::nearest( + coordToInterpolate, + pointsToInterpolate, + Interpolation::Mode::Ceil) == 50.); + } + SECTION("RoundPreferFloor") { + CHECK(InterpolationCPU::nearest( + coordToInterpolate, + pointsToInterpolate, + Interpolation::Mode::RoundPreferFloor) == + 30.); + } + SECTION("RoundPreferCeil") { + CHECK(InterpolationCPU::nearest( + coordToInterpolate, + pointsToInterpolate, + Interpolation::Mode::RoundPreferCeil) == 30.); + } + } + } +} +} // namespace Aidge diff --git a/unit_tests/data/Test_TensorImpl.cpp b/unit_tests/data/Test_TensorImpl.cpp index 5f870acfb44366632474b7290228658d7a4701dd..fd938f10a947d1520600a1d00022eeb970cd76e6 100644 --- a/unit_tests/data/Test_TensorImpl.cpp +++ b/unit_tests/data/Test_TensorImpl.cpp @@ -25,7 +25,7 @@ namespace Aidge { -TEST_CASE("Test addition of Tensors","[TensorImpl][Add]") { +TEST_CASE("Test addition of Tensors","[TensorImpl][Add][Data]") { constexpr std::uint16_t NBTRIALS = 10; // Create a random number generator std::random_device rd; @@ -35,7 +35,7 @@ TEST_CASE("Test addition of Tensors","[TensorImpl][Add]") { std::uniform_int_distribution<int> boolDist(0,1); // Create MatMul Operator - std::shared_ptr<Node> mySub = Add(2); + std::shared_ptr<Node> mySub = Add(); auto op = std::static_pointer_cast<OperatorTensor>(mySub-> getOperator()); op->setDataType(DataType::Float32); op->setBackend("cpu"); @@ -193,4 +193,100 @@ TEST_CASE("Test 
division of Tensors","[TensorImpl][Div]") { Tensor T3(T1.dims()); REQUIRE_THROWS(T0 / T3); } + +TEST_CASE("Tensor arithmetic operators", "[Tensor][Operator][CPU]") { + SECTION("Addition") { + const Tensor t = Array1D<std::int32_t, 5>{1,2,3,4,5}; + const Tensor t2 = Array1D<std::int32_t, 5>{10,20,30,40,50}; + const Tensor t3 = Tensor(std::int32_t(3)); + + SECTION("operator+") { + auto a = t.clone(); + auto b = t2.clone(); + auto c = t3.clone(); + + // simple addition + auto r1 = a + b; + const Tensor expected_res_simple = Array1D<std::int32_t, 5>{11,22,33,44,55}; + + // input tensors are not modified + REQUIRE(a == t); + REQUIRE(b == t2); + // result is right + REQUIRE(r1 == expected_res_simple); + + // simple addition of arithmetic value + auto r2 = a + 10; + const Tensor expected_res_simple_arithmetic = Array1D<std::int32_t, 5>{11,12,13,14,15}; + + // input tensors are not modified + REQUIRE(a == t); + // result is right + REQUIRE(r2 == expected_res_simple_arithmetic); + + + // chained addition a+b+c + auto r3 = a + b + c; + const Tensor expected_res_chained = Array1D<std::int32_t, 5>{14,25,36,47,58}; + + // input Tensors are not modified + REQUIRE(a == t); + REQUIRE(b == t2); + REQUIRE(c == t3); + // result is right + REQUIRE(r3 == expected_res_chained); + } + SECTION("operator+=") { + auto a = t.clone(); + auto b = t2.clone(); + + a += b; + const Tensor expected_res = Array1D<std::int32_t, 5>{11,22,33,44,55}; + + // input tensors are not modified + REQUIRE(b == t2); + // result is right + REQUIRE(a == expected_res); + + // simple addition of arithmetic value + a = t.clone(); + a += 10; + const Tensor expected_res_arithmetic = Array1D<std::int32_t, 5>{11,12,13,14,15}; + + // result is right + REQUIRE(a == expected_res_arithmetic); + } + } + SECTION("Substraction") { + const Tensor t = Array1D<std::int32_t, 5>{1,2,3,4,5}; + const Tensor t2 = Tensor(std::int32_t(3)); + + SECTION("operator-") { + auto a = t.clone(); + auto b = t2.clone(); + + // simple substraction + auto r1 = a - b; + const Tensor expected_res_simple = Array1D<std::int32_t, 5>{-2,-1,0,1,2}; + + // input tensors are not modified + REQUIRE(a == t); + REQUIRE(b == t2); + // result is right + REQUIRE(r1 == expected_res_simple); + } + SECTION("operator-=") { + auto a = t.clone(); + auto b = t2.clone(); + + a -= b; + const Tensor expected_res = Array1D<std::int32_t, 5>{-2,-1,0,1,2}; + + // input tensors are not modified + REQUIRE(b == t2); + // result is right + REQUIRE(a == expected_res); + } + } +} } // namespace Aidge diff --git a/unit_tests/operator/Test_AddImpl.cpp b/unit_tests/operator/Test_AddImpl.cpp index 95a0e96fe6cf8c19beeef2bdbae3c07873996dcf..bca4025705cb1c851dcf3e9accbf016c4535120a 100644 --- a/unit_tests/operator/Test_AddImpl.cpp +++ b/unit_tests/operator/Test_AddImpl.cpp @@ -39,17 +39,6 @@ TEST_CASE("[cpu/operator] Add(forward)", "[Add][CPU]") { } // }); // - SECTION("One input") { - std::shared_ptr<Node> myAdd = Add(1); - auto op = std::static_pointer_cast<OperatorTensor>(myAdd -> getOperator()); - op->associateInput(0, input1); - op->setBackend("cpu"); - op->setDataType(DataType::Int32); - myAdd->forward(); - - REQUIRE(*(op->getOutput(0)) == *input1); - } - SECTION("Two inputs") { std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,3,3,3,2> { { @@ -71,7 +60,7 @@ TEST_CASE("[cpu/operator] Add(forward)", "[Add][CPU]") { } }); - std::shared_ptr<Node> myAdd = Add(2); + std::shared_ptr<Node> myAdd = Add(); auto op = std::static_pointer_cast<OperatorTensor>(myAdd -> getOperator()); 
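The chained addition checked in the Tensor arithmetic tests above (a + b + c with a rank-0 tensor c == 3) boils down to element-wise integer sums with a broadcast scalar. A standalone arithmetic sketch, not using the Aidge Tensor API, confirms the expected {14, 25, 36, 47, 58}:

#include <array>
#include <cassert>
#include <cstddef>

int main() {
    const std::array<int, 5> a{1, 2, 3, 4, 5};
    const std::array<int, 5> b{10, 20, 30, 40, 50};
    const int c = 3; // stands in for the rank-0 tensor broadcast over both operands
    const std::array<int, 5> expected{14, 25, 36, 47, 58};
    for (std::size_t i = 0; i < a.size(); ++i) {
        assert(a[i] + b[i] + c == expected[i]);
    }
    return 0;
}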
op->associateInput(0, input1); op->associateInput(1, input1); @@ -82,39 +71,6 @@ TEST_CASE("[cpu/operator] Add(forward)", "[Add][CPU]") { REQUIRE(*(op->getOutput(0)) == *expectedOutput); } - SECTION("Three inputs") { - std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,3,3,3,2> { - { - { - {{ 60, 141},{ 63, 144},{ 66, 147}}, - {{ 69, 150},{ 72, 153},{ 75, 156}}, - {{ 78, 159},{ 81, 162},{ 84, 165}} - }, - { - {{ 87, 168},{ 90, 171},{ 93, 174}}, - {{ 96, 177},{ 99, 180},{102, 183}}, - {{105, 186},{108, 189},{111, 192}} - }, - { - {{114, 195},{117, 198},{120, 201}}, - {{123, 204},{126, 207},{129, 210}}, - {{132, 213},{135, 216},{138, 219}} - } - } - }); - - std::shared_ptr<Node> myAdd = Add(3); - auto op = std::static_pointer_cast<OperatorTensor>(myAdd -> getOperator()); - op->associateInput(0, input1); - op->associateInput(1, input1); - op->associateInput(2, input1); - op->setDataType(DataType::Int32); - op->setBackend("cpu"); - myAdd->forward(); - - REQUIRE(*op->getOutput(0) == *expectedOutput); - } - SECTION("Broadcasting") { std::shared_ptr<Tensor> input_0 = std::make_shared<Tensor>(Array4D<int,3,1,3,2> { { // @@ -139,7 +95,7 @@ TEST_CASE("[cpu/operator] Add(forward)", "[Add][CPU]") { } // }); // - std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<int,2> {{100,200}}); + std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<int,2> {{100,200}}); std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,3,3,3,2> { { // { // @@ -160,16 +116,23 @@ TEST_CASE("[cpu/operator] Add(forward)", "[Add][CPU]") { } // }); // - std::shared_ptr<Node> myAdd = Add(3); - auto op = std::static_pointer_cast<OperatorTensor>(myAdd -> getOperator()); - op->associateInput(0, input_0); - op->associateInput(1, input_1); - op->associateInput(2, input_2); - op->setDataType(DataType::Int32); - op->setBackend("cpu"); - myAdd->forward(); - op->getOutput(0)->print(); + std::shared_ptr<Node> myAdd_0 = Add(); + std::shared_ptr<Node> myAdd_1 = Add(); + auto op_0 = std::static_pointer_cast<OperatorTensor>(myAdd_0 -> getOperator()); + auto op_1 = std::static_pointer_cast<OperatorTensor>(myAdd_1 -> getOperator()); + op_0->associateInput(0, input_0); + op_0->associateInput(1, input_1); + + op_1->associateInput(0, input_2); + op_1->associateInput(1, op_0->getOutput(0)); + op_0->setDataType(DataType::Int32); + op_1->setDataType(DataType::Int32); + op_0->setBackend("cpu"); + op_1->setBackend("cpu"); + myAdd_0->forward(); + myAdd_1->forward(); + op_1->getOutput(0)->print(); expectedOutput->print(); - REQUIRE(*op->getOutput(0) == *expectedOutput); + REQUIRE(*op_1->getOutput(0) == *expectedOutput); } } \ No newline at end of file diff --git a/unit_tests/operator/Test_Atan.cpp b/unit_tests/operator/Test_Atan.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9548e35d81b0423125424a4198d82558c4e57df4 --- /dev/null +++ b/unit_tests/operator/Test_Atan.cpp @@ -0,0 +1,77 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <catch2/catch_test_macros.hpp> + +#include "aidge/data/Tensor.hpp" +#include "aidge/operator/Atan.hpp" + +#include "aidge/backend/cpu.hpp" + +#include <memory> + +using namespace Aidge; + +TEST_CASE("[cpu/operator] Atan(forward)") { + SECTION("1D Tensor") { + std::shared_ptr<Tensor> input0 = + std::make_shared<Tensor>(Array1D<float, 10>{ + {0.41384590, 0.43120754, 0.93762982, 0.31049860, 0.77547199, + 0.09514862, 0.16145366, 0.42776686, 0.43487436, 0.41170865}}); + std::shared_ptr<Tensor> expectedOutput = + std::make_shared<Tensor>(Array1D<float, 10>{ + {0.39238522, 0.40711672, 0.75322037, 0.30106049, 0.65960488, + 0.09486303, 0.16007232, 0.40421187, 0.4102045, 0.39055911}}); + + std::shared_ptr<Node> myAtan = Atan(); + auto op = std::static_pointer_cast<OperatorTensor>(myAtan->getOperator()); + op->associateInput(0, input0); + op->setDataType(DataType::Float32); + op->setBackend("cpu"); + myAtan->forward(); + + float* resPtr = static_cast<float*>(op->getOutput(0)->getImpl()->rawPtr()); + float* expectedPtr = + static_cast<float*>(expectedOutput->getImpl()->rawPtr()); + for (std::size_t i = 0; i < expectedOutput->size(); ++i) { + REQUIRE(std::abs(resPtr[i] - expectedPtr[i]) < 0.00001); + } + } + + SECTION("3D Tensor") { + std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>( + Array3D<float, 2, 2, 3>{{{ + {0.97037154, 0.86208081, 0.77767169}, + {0.38160080, 0.11422747, 0.77284443}, + }, + {{0.51592529, 0.72543722, 0.54641193}, + {0.93866944, 0.97767913, 0.34172094}}}}); + std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>( + Array3D<float, 2, 2, 3>{{{{0.77036231, 0.71146592, 0.66097706}, + {0.36454508, 0.11373451, 0.65796196}}, + {{0.47630652, 0.62759472, 0.50008428}, + {0.75377332, 0.77411225, 0.32928031}}}}); + + std::shared_ptr<Node> myAtan = Atan(); + auto op = std::static_pointer_cast<OperatorTensor>(myAtan->getOperator()); + op->associateInput(0, input0); + op->setDataType(DataType::Float32); + op->setBackend("cpu"); + myAtan->forward(); + + float* resPtr = static_cast<float*>(op->getOutput(0)->getImpl()->rawPtr()); + float* expectedPtr = + static_cast<float*>(expectedOutput->getImpl()->rawPtr()); + for (std::size_t i = 0; i < expectedOutput->size(); ++i) { + REQUIRE(std::abs(resPtr[i] - expectedPtr[i]) < 0.00001); + } + } +} diff --git a/unit_tests/operator/Test_ClipImpl.cpp b/unit_tests/operator/Test_ClipImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..45c8da5bf7ecc84fad6b3e694fe204540f579af3 --- /dev/null +++ b/unit_tests/operator/Test_ClipImpl.cpp @@ -0,0 +1,318 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <catch2/catch_test_macros.hpp> +#include <cstddef> // std::size_t +#include <cstdint> // std::uint16_t +#include <chrono> +#include <iostream> +#include <vector> +#include <algorithm> +#include <iomanip> +#include <memory> +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution + +#include "aidge/data/Tensor.hpp" +#include "aidge/operator/Clip.hpp" +#include "aidge/operator/OperatorTensor.hpp" +#include "aidge/utils/TensorUtils.hpp" +#include "aidge/backend/cpu.hpp" + +void ComputeClipBackward(const std::vector<float>& vec1, std::vector<float>& vec2, float min, float max) { + if (vec1.size() != vec2.size()) { + std::cerr << "Vectors should have the same sizes." << std::endl; + return; + } + + for (size_t i = 0; i < vec1.size(); ++i) { + if (vec1[i] < min || vec1[i] > max) { + vec2[i] = 0.0f; + } + } +} +namespace Aidge +{ +TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]") + { + const std::uint16_t NBTRIALS = 10; + // Create a random number generator + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<float> dis(0.0, 10.0); + std::uniform_real_distribution<float> dismin(0.0, 4.5); + std::uniform_real_distribution<float> dismax(5.5, 10.0); + std::uniform_int_distribution<std::size_t> distDims(5,15); + std::uniform_int_distribution<std::size_t> distNbMatrix(1, 5); + + // Create MatMul Operator + std::shared_ptr<Node> myClip = Aidge::Clip("nop"); + auto op = std::static_pointer_cast<OperatorTensor>(myClip -> getOperator()); + + // To measure execution time of 'MatMul_Op::forward()' member function call + std::chrono::time_point<std::chrono::system_clock> start; + std::chrono::time_point<std::chrono::system_clock> end; + std::chrono::duration<double, std::micro> duration; + + SECTION("Simple clip test [Forward]") { + std::size_t totalComputation = 0; + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { + // generate Tensors dimensions + const std::size_t dim0 = distDims(gen); + const std::size_t dim1 = distDims(gen); + totalComputation += dim0*dim1; + + // Create and populate the array with random float values + float* Array = new float[dim0*dim1]; + for (int i = 0; i < dim0*dim1; ++i) { + Array[i] = dis(gen); // Generate random float value + } + + // Convert Input to Tensor + std::shared_ptr<Tensor> TInput = std::make_shared<Tensor>(DataType::Float32); + TInput -> resize({dim0,dim1}); + TInput -> setBackend("cpu"); + TInput -> getImpl() -> setRawPtr(Array, dim0*dim1); + + float min = dismin(gen); + std::shared_ptr<Tensor> Tmin = std::make_shared<Tensor>(DataType::Float32); + Tmin -> resize({}); + Tmin -> setBackend("cpu"); + Tmin -> getImpl() -> setRawPtr(&min,1); + + float max = dismax(gen); + std::shared_ptr<Tensor> Tmax = std::make_shared<Tensor>(DataType::Float32); + Tmax -> resize({}); + Tmax -> setBackend("cpu"); + Tmax -> getImpl() -> setRawPtr(&max,1); + // convert res to Tensordf + std::vector<float> GT(Array, Array + (dim0*dim1)); + for (float& val : GT) + { + val = std::max(min, std::min(val, max)); + } + std::shared_ptr<Tensor> Tres = std::make_shared<Tensor>(DataType::Float32); + Tres -> resize({dim0,dim1}); + Tres -> setBackend("cpu"); + Tres -> getImpl() -> setRawPtr(GT.data(), dim0*dim1); + + op->associateInput(0, TInput); + op->associateInput(1, Tmin); + op->associateInput(2, Tmax); + op->setDataType(DataType::Float32); + op->setBackend("cpu"); + op->forwardDims(true); + 
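The ground truth used by these Clip sections follows numpy/ONNX-style clipping: for min <= max the output is max(min, min(v, max)), and in the degenerate min > max case every value collapses to max, which is what the "min >= max" SECTION below hard-codes. A minimal standalone reference, with an illustrative clip_ref helper rather than the backend kernel:

#include <algorithm>
#include <cassert>

static float clip_ref(float v, float lo, float hi) {
    // Apply the lower bound first, then the upper bound; std::clamp is
    // undefined when lo > hi, so the degenerate case is spelled out here.
    return std::min(std::max(v, lo), hi);
}

int main() {
    assert(clip_ref(7.f, 2.f, 5.f) == 5.f); // above the range
    assert(clip_ref(1.f, 2.f, 5.f) == 2.f); // below the range
    assert(clip_ref(3.f, 2.f, 5.f) == 3.f); // inside the range
    assert(clip_ref(3.f, 6.f, 4.f) == 4.f); // min > max: everything becomes max
    return 0;
}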
+ start = std::chrono::system_clock::now(); + myClip->forward(); + end = std::chrono::system_clock::now(); + + duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + + REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres)); + } + std::cout << "multiplications over time spent: " << totalComputation/duration.count() << std::endl; + std::cout << "total time: " << duration.count() << std::endl; + } + SECTION("Clip test with min >= max [Forward]") { + std::size_t totalComputation = 0; + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { + // generate Tensors dimensions + const std::size_t dim0 = distDims(gen); + const std::size_t dim1 = distDims(gen); + totalComputation += dim0*dim1; + + // Create and populate the array with random float values + float* Array = new float[dim0*dim1]; + for (int i = 0; i < dim0*dim1; ++i) { + Array[i] = dis(gen); // Generate random float value + } + + // Convert Input to Tensor + std::shared_ptr<Tensor> TInput = std::make_shared<Tensor>(DataType::Float32); + TInput -> resize({dim0,dim1}); + TInput -> setBackend("cpu"); + TInput -> getImpl() -> setRawPtr(Array, dim0*dim1); + + float min = dismax(gen); + std::shared_ptr<Tensor> Tmin = std::make_shared<Tensor>(DataType::Float32); + Tmin -> resize({}); + Tmin -> setBackend("cpu"); + Tmin -> getImpl() -> setRawPtr(&min,1); + + float max = dismin(gen); //We generate max and min so that max is always <= min + std::shared_ptr<Tensor> Tmax = std::make_shared<Tensor>(DataType::Float32); + Tmax -> resize({}); + Tmax -> setBackend("cpu"); + Tmax -> getImpl() -> setRawPtr(&max,1); + // convert res to Tensor + std::vector<float> GT(Array, Array + (dim0*dim1)); + for (float& val : GT) + { + val = max; + } + std::shared_ptr<Tensor> Tres = std::make_shared<Tensor>(DataType::Float32); + Tres -> resize({dim0,dim1}); + Tres -> setBackend("cpu"); + Tres -> getImpl() -> setRawPtr(GT.data(), dim0*dim1); + + op->associateInput(0, TInput); + op->associateInput(1, Tmin); + op->associateInput(2, Tmax); + op->setDataType(DataType::Float32); + op->setBackend("cpu"); + op->forwardDims(true); + + start = std::chrono::system_clock::now(); + myClip->forward(); + end = std::chrono::system_clock::now(); + + duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + + REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres)); + } + std::cout << "multiplications over time spent: " << totalComputation/duration.count() << std::endl; + std::cout << "total time: " << duration.count() << std::endl; + } + SECTION("Clip with Clip Attr [Forward]") + { + std::size_t totalComputation = 0; + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) + { + + float min = dismin(gen); + float max = dismax(gen); + std::shared_ptr<Node> myCl = Aidge::Clip("",min,max); + auto op = std::static_pointer_cast<OperatorTensor>(myCl -> getOperator()); + + + // generate Tensors dimensions + const std::size_t dim0 = 3; + const std::size_t dim1 = 3; + totalComputation += dim0*dim1; + + // Create and populate the array with random float values + float* Array = new float[dim0*dim1]; + for (int i = 0; i < dim0*dim1; ++i) { + Array[i] = dis(gen); // Generate random float value + } + // Convert Input to Tensor + std::shared_ptr<Tensor> TInput = std::make_shared<Tensor>(DataType::Float32); + TInput -> resize({dim0,dim1}); + TInput -> setBackend("cpu"); + TInput -> getImpl() -> setRawPtr(Array, dim0*dim1); + + // convert res to Tensordf + std::vector<float> GT(Array, Array + (dim0*dim1)); + for (float& val : GT) + { + val = 
std::max(min, std::min(val, max)); + } + std::shared_ptr<Tensor> Tres = std::make_shared<Tensor>(DataType::Float32); + Tres -> resize({dim0,dim1}); + Tres -> setBackend("cpu"); + Tres -> getImpl() -> setRawPtr(GT.data(), dim0*dim1); + op->associateInput(0, TInput); + op->setDataType(DataType::Float32); + op->setBackend("cpu"); + op->forwardDims(true); + start = std::chrono::system_clock::now(); + myCl->forward(); + end = std::chrono::system_clock::now(); + + duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + + REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres)); + } + std::cout << "multiplications over time spent: " << totalComputation/duration.count() << std::endl; + std::cout << "total time: " << duration.count() << std::endl; + } + SECTION("Simple clip test [Backward]") { + std::size_t totalComputation = 0; + duration = std::chrono::duration<double, std::micro>::zero(); + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { + std::size_t totalComputation = 0; + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { + // generate Tensors dimensions + const std::size_t dim0 = distDims(gen); + const std::size_t dim1 = distDims(gen); + + totalComputation += dim0*dim1; + + // Create and populate the array with random float values + float* Array = new float[dim0*dim1]; + float* gradArray = new float[dim0*dim1]; + for (int i = 0; i < dim0*dim1; ++i) { + Array[i] = dis(gen); // Generate random float value + gradArray[i] = dis(gen); + } + + std::shared_ptr<Tensor> TGrad = std::make_shared<Tensor>(DataType::Float32); + TGrad -> resize({dim0,dim1}); + TGrad -> setBackend("cpu"); + TGrad -> getImpl() -> setRawPtr(gradArray, dim0*dim1); + + // Convert Input to Tensor + std::shared_ptr<Tensor> TInput = std::make_shared<Tensor>(DataType::Float32); + TInput -> resize({dim0,dim1}); + TInput -> setBackend("cpu"); + TInput -> getImpl() -> setRawPtr(Array, dim0*dim1); + + float min = dismin(gen); + std::shared_ptr<Tensor> Tmin = std::make_shared<Tensor>(DataType::Float32); + Tmin -> resize({}); + Tmin -> setBackend("cpu"); + Tmin -> getImpl() -> setRawPtr(&min,1); + + float max = dismax(gen); + std::shared_ptr<Tensor> Tmax = std::make_shared<Tensor>(DataType::Float32); + Tmax -> resize({}); + Tmax -> setBackend("cpu"); + Tmax -> getImpl() -> setRawPtr(&max,1); + // convert res to Tensor + std::vector<float> GT(Array, Array + (dim0*dim1)); + for (float& val : GT) + { + val = std::max(min, std::min(val, max));//Clip operation + } + std::shared_ptr<Tensor> Tres = std::make_shared<Tensor>(DataType::Float32); + Tres -> resize({dim0,dim1}); + Tres -> setBackend("cpu"); + Tres -> getImpl() -> setRawPtr(GT.data(), dim0*dim1); + + op->associateInput(0, TInput); + op->associateInput(1, Tmin); + op->associateInput(2, Tmax); + op->setDataType(DataType::Float32); + op->setBackend("cpu"); + op->forwardDims(true); + myClip->forward(); + + op->getOutput(0)->setGrad(TGrad); + + start = std::chrono::system_clock::now(); + REQUIRE_NOTHROW(myClip->backward()); + end = std::chrono::system_clock::now(); + + auto GradTensor = op->getInput(0)->grad(); + float* BackwardTensor = (float*)GradTensor->getImpl()->rawPtr(); + std::vector<float> GT0(Array,Array+(dim0*dim1)); + std::vector<float> GT1(gradArray,gradArray+(dim0*dim1)); + std::vector<float> BackwardTensorVec(BackwardTensor,BackwardTensor+(dim0*dim1)); + ComputeClipBackward(GT0,GT1,min,max); + duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + REQUIRE(GT1 == BackwardTensorVec); + } + std::cout << 
"multiplications over time spent: " << totalComputation/duration.count() << std::endl; + std::cout << "total time: " << duration.count() << std::endl; + } + } +} // namespace Aidge +} \ No newline at end of file diff --git a/unit_tests/operator/Test_ConvDepthWiseImpl.cpp b/unit_tests/operator/Test_ConvDepthWiseImpl.cpp index e4e46de91bfbc38f41520f1edfc7e99d197e5c83..f1594ef5a21070803a7b86861eac513708ec03a2 100644 --- a/unit_tests/operator/Test_ConvDepthWiseImpl.cpp +++ b/unit_tests/operator/Test_ConvDepthWiseImpl.cpp @@ -11,144 +11,219 @@ #include <catch2/catch_test_macros.hpp> #include <memory> +#include <vector> +#include "aidge/backend/cpu/operator/ConvDepthWiseImpl.hpp" +#include "aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp" #include "aidge/data/Tensor.hpp" #include "aidge/operator/ConvDepthWise.hpp" - -#include "aidge/backend/cpu.hpp" +#include "aidge/utils/TensorUtils.hpp" using namespace Aidge; TEST_CASE("[cpu/operator] ConvDepthWise(forward)", "[ConvDepthWise][CPU]") { - std::shared_ptr<Node> myCDW = ConvDepthWise(4, {3,3}, "mycdw"); - auto op = std::static_pointer_cast<OperatorTensor>(myCDW -> getOperator()); - std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array4D<int,4,1,3,3> { - { - {{ - { 0, 1, 2}, - { 3, 4, 5}, - { 6, 7, 8} - - }}, - {{ - { 27, 28, 29}, - { 30, 31, 32}, - { 33, 34, 35} - - }}, - {{ - { 54, 55, 56}, - { 57, 58, 59}, - { 60, 61, 62} - }}, - {{ - { 81, 82, 83}, - { 84, 85, 86}, - { 87, 88, 89} - }} - } - }); - std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<int,4> {{7,0,9,0}}); - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<int,2,4,5,5> { //NCHW - { - { - {{ 0, 1, 2, 3, 4}, - { 5, 6, 7, 8, 9}, - { 10, 11, 12, 13, 14}, - { 15, 16, 17, 18, 19}, - { 20, 21, 22, 23, 24}}, - - {{ 25, 26, 27, 28, 29}, - { 30, 31, 32, 33, 34}, - { 35, 36, 37, 38, 39}, - { 40, 41, 42, 43, 44}, - { 45, 46, 47, 48, 49}}, - - {{ 50, 51, 52, 53, 54}, - { 55, 56, 57, 58, 59}, - { 60, 61, 62, 63, 64}, - { 65, 66, 67, 68, 69}, - { 70, 71, 72, 73, 74}}, - - {{ 75, 76, 77, 78, 79}, - { 80, 81, 82, 83, 84}, - { 85, 86, 87, 88, 89}, - { 90, 91, 92, 93, 94}, - { 95, 96, 97, 98, 99}} - }, + SECTION("k[3,3]") { + std::shared_ptr<Node> myCDW = ConvDepthWise(4, {3,3}, "mycdw"); + auto op = std::static_pointer_cast<OperatorTensor>(myCDW -> getOperator()); + std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array4D<int,4,1,3,3> { { - {{100, 101, 102, 103, 104}, - {105, 106, 107, 108, 109}, - {110, 111, 112, 113, 114}, - {115, 116, 117, 118, 119}, - {120, 121, 122, 123, 124}}, - - {{125, 126, 127, 128, 129}, - {130, 131, 132, 133, 134}, - {135, 136, 137, 138, 139}, - {140, 141, 142, 143, 144}, - {145, 146, 147, 148, 149}}, - - {{150, 151, 152, 153, 154}, - {155, 156, 157, 158, 159}, - {160, 161, 162, 163, 164}, - {165, 166, 167, 168, 169}, - {170, 171, 172, 173, 174}}, - - {{175, 176, 177, 178, 179}, - {180, 181, 182, 183, 184}, - {185, 186, 187, 188, 189}, - {190, 191, 192, 193, 194}, - {195, 196, 197, 198, 199}} + {{ + { 0, 1, 2}, + { 3, 4, 5}, + { 6, 7, 8} + + }}, + {{ + { 27, 28, 29}, + { 30, 31, 32}, + { 33, 34, 35} + + }}, + {{ + { 54, 55, 56}, + { 57, 58, 59}, + { 60, 61, 62} + }}, + {{ + { 81, 82, 83}, + { 84, 85, 86}, + { 87, 88, 89} + }} } - } - }); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<int,2,4,3,3> { - { + }); + std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<int,4> {{7,0,9,0}}); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<int,2,4,5,5> { //NCHW 
{ - {{ 319, 355, 391}, - { 499, 535, 571}, - { 679, 715, 751}}, - - {{ 8745, 9024, 9303}, - { 10140, 10419, 10698}, - { 11535, 11814, 12093}}, - - {{ 29337, 29859, 30381}, - { 31947, 32469, 32991}, - { 34557, 35079, 35601}}, - - {{ 62061, 62826, 63591}, - { 65886, 66651, 67416}, - { 69711, 70476, 71241}} - }, + { + {{ 0, 1, 2, 3, 4}, + { 5, 6, 7, 8, 9}, + { 10, 11, 12, 13, 14}, + { 15, 16, 17, 18, 19}, + { 20, 21, 22, 23, 24}}, + + {{ 25, 26, 27, 28, 29}, + { 30, 31, 32, 33, 34}, + { 35, 36, 37, 38, 39}, + { 40, 41, 42, 43, 44}, + { 45, 46, 47, 48, 49}}, + + {{ 50, 51, 52, 53, 54}, + { 55, 56, 57, 58, 59}, + { 60, 61, 62, 63, 64}, + { 65, 66, 67, 68, 69}, + { 70, 71, 72, 73, 74}}, + + {{ 75, 76, 77, 78, 79}, + { 80, 81, 82, 83, 84}, + { 85, 86, 87, 88, 89}, + { 90, 91, 92, 93, 94}, + { 95, 96, 97, 98, 99}} + }, + { + {{100, 101, 102, 103, 104}, + {105, 106, 107, 108, 109}, + {110, 111, 112, 113, 114}, + {115, 116, 117, 118, 119}, + {120, 121, 122, 123, 124}}, + + {{125, 126, 127, 128, 129}, + {130, 131, 132, 133, 134}, + {135, 136, 137, 138, 139}, + {140, 141, 142, 143, 144}, + {145, 146, 147, 148, 149}}, + + {{150, 151, 152, 153, 154}, + {155, 156, 157, 158, 159}, + {160, 161, 162, 163, 164}, + {165, 166, 167, 168, 169}, + {170, 171, 172, 173, 174}}, + + {{175, 176, 177, 178, 179}, + {180, 181, 182, 183, 184}, + {185, 186, 187, 188, 189}, + {190, 191, 192, 193, 194}, + {195, 196, 197, 198, 199}} + } + } + }); + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<int,2,4,3,3> { { - {{ 3919, 3955, 3991}, - { 4099, 4135, 4171}, - { 4279, 4315, 4351}}, - - {{ 36645, 36924, 37203}, - { 38040, 38319, 38598}, - { 39435, 39714, 39993}}, - - {{ 81537, 82059, 82581}, - { 84147, 84669, 85191}, - { 86757, 87279, 87801}}, - - {{138561, 139326, 140091}, - {142386, 143151, 143916}, - {146211, 146976, 147741}} + { + {{ 319, 355, 391}, + { 499, 535, 571}, + { 679, 715, 751}}, + + {{ 8745, 9024, 9303}, + { 10140, 10419, 10698}, + { 11535, 11814, 12093}}, + + {{ 29337, 29859, 30381}, + { 31947, 32469, 32991}, + { 34557, 35079, 35601}}, + + {{ 62061, 62826, 63591}, + { 65886, 66651, 67416}, + { 69711, 70476, 71241}} + }, + { + {{ 3919, 3955, 3991}, + { 4099, 4135, 4171}, + { 4279, 4315, 4351}}, + + {{ 36645, 36924, 37203}, + { 38040, 38319, 38598}, + { 39435, 39714, 39993}}, + + {{ 81537, 82059, 82581}, + { 84147, 84669, 85191}, + { 86757, 87279, 87801}}, + + {{138561, 139326, 140091}, + {142386, 143151, 143916}, + {146211, 146976, 147741}} + } } - } - }); - op -> associateInput(0, myInput); - op -> associateInput(1, myWeights); - op -> associateInput(2, myBias); - op->setDataType(DataType::Int32); - op->setBackend("cpu"); - myCDW -> forward(); - op -> getOutput(0) -> print(); - REQUIRE(*(op -> getOutput(0)) == *myOutput); - - // std::cout << static_cast<Tensor>((*op)["weight"])[0][0][0][0] << std::endl; + }); + op -> associateInput(0, myInput); + op -> associateInput(1, myWeights); + op -> associateInput(2, myBias); + op->setDataType(DataType::Int32); + op->setBackend("cpu"); + myCDW -> forward(); + op -> getOutput(0) -> print(); + REQUIRE(*(op -> getOutput(0)) == *myOutput); + } + SECTION("point-wise") { + ConvDepthWise_Op<2> conv_op = ConvDepthWise_Op<2>({1,1}); + std::shared_ptr<Tensor> weights = std::make_shared<Tensor>(std::vector<std::size_t>({3,1,1,1})); + weights -> setBackend("cpu"); + std::shared_ptr<Tensor> biases = std::make_shared<Tensor>(std::vector<std::size_t>({3})); + biases -> setBackend("cpu"); + std::shared_ptr<Tensor> input = 
std::make_shared<Tensor>(std::vector<std::size_t>({2,3,5,5})); + input -> setBackend("cpu"); + std::shared_ptr<Tensor> expected_output = std::make_shared<Tensor>(std::vector<std::size_t>({2,3,5,5})); + expected_output -> setBackend("cpu"); + + float weighst_array[3] {-0.0045, -0.4223, -0.9452}; + weights->getImpl()->setRawPtr(weighst_array, 3); + + float biases_array[3] {-0.8595, 0.7062, -0.0062}; + biases->getImpl()->setRawPtr(biases_array, 3); + + float input_array[2*3*5*5] { + 0.6581, 0.2509, 0.2660, 0.8270, 0.8040, 0.3147, 0.5028, 0.2591, 0.8585, + 0.7762, 0.9972, 0.0305, 0.1202, 0.2682, 0.9306, 0.7927, 0.1494, 0.0678, + 0.5550, 0.4132, 0.4742, 0.6199, 0.1802, 0.6350, 0.2539, 0.5594, 0.0143, + 0.8656, 0.7105, 0.1420, 0.2464, 0.7883, 0.5715, 0.7642, 0.5492, 0.6628, + 0.4922, 0.7941, 0.8421, 0.7914, 0.0237, 0.8081, 0.0174, 0.6018, 0.7402, + 0.3770, 0.8786, 0.3651, 0.5355, 0.4267, 0.4457, 0.6756, 0.9631, 0.0145, + 0.4470, 0.5202, 0.2675, 0.5815, 0.3487, 0.3457, 0.7179, 0.0518, 0.1520, + 0.0573, 0.9219, 0.3615, 0.0866, 0.5237, 0.4725, 0.2565, 0.8726, 0.6434, + 0.6875, 0.2919, 0.3355, 0.1886, 0.1749, 0.0785, 0.4091, 0.1907, 0.4664, + 0.2738, 0.4784, 0.7807, 0.0687, 0.3091, 0.4557, 0.2277, 0.2424, 0.8691, + 0.1893, 0.2918, 0.5691, 0.1926, 0.2866, 0.0097, 0.5445, 0.5085, 0.1110, + 0.7099, 0.8927, 0.6182, 0.2538, 0.8694, 0.7872, 0.3196, 0.0710, 0.2888, + 0.0403, 0.1670, 0.6840, 0.7323, 0.4861, 0.3390, 0.1096, 0.5070, 0.3872, + 0.7473, 0.6224, 0.6910, 0.7530, 0.0149, 0.0866, 0.9022, 0.5027, 0.3849, + 0.5255, 0.1977, 0.0570, 0.9581, 0.5461, 0.4623, 0.0101, 0.2362, 0.5922, + 0.8398, 0.1497, 0.5160, 0.2862, 0.5931, 0.9728, 0.1353, 0.7790, 0.9137, + 0.9351, 0.4036, 0.7638, 0.3873, 0.0494, 0.7450}; + input->getImpl()->setRawPtr(input_array, 2*3*5*5); + + float expected_output_array[2*3*5*5] { + -0.8624, -0.8606, -0.8607, -0.8632, -0.8631, -0.8609, -0.8617, -0.8606, + -0.8633, -0.8629, -0.8639, -0.8596, -0.8600, -0.8607, -0.8636, -0.8630, + -0.8601, -0.8598, -0.8620, -0.8613, -0.8616, -0.8622, -0.8603, -0.8623, + -0.8606, 0.4700, 0.7002, 0.3407, 0.4062, 0.6463, 0.6022, 0.3733, + 0.4649, 0.3835, 0.4743, 0.4263, 0.4984, 0.3709, 0.3506, 0.3720, + 0.6962, 0.3650, 0.6989, 0.4521, 0.3936, 0.5470, 0.3352, 0.5520, + 0.4801, 0.5260, -0.4274, -0.6447, -0.9165, -0.0199, -0.4287, -0.4979, + -0.2590, -0.5559, -0.3358, -0.3329, -0.6847, -0.0552, -0.1499, -0.0603, + -0.8776, -0.3479, -0.0881, -0.5011, -0.4528, -0.2486, -0.8309, -0.6143, + -0.6561, -0.2821, -0.3233, -0.8603, -0.8603, -0.8598, -0.8613, -0.8603, + -0.8616, -0.8607, -0.8616, -0.8630, -0.8598, -0.8609, -0.8615, -0.8605, + -0.8606, -0.8634, -0.8603, -0.8608, -0.8620, -0.8603, -0.8608, -0.8595, + -0.8619, -0.8617, -0.8600, -0.8626, 0.3292, 0.4451, 0.5991, 0.3390, + 0.3738, 0.5712, 0.6762, 0.5843, 0.6892, 0.6357, 0.4174, 0.3969, + 0.5009, 0.5631, 0.6599, 0.4921, 0.5427, 0.3906, 0.4434, 0.4144, + 0.3882, 0.6999, 0.6697, 0.3252, 0.4939, -0.3700, -0.5029, -0.1931, + -0.0601, -0.9118, -0.5224, -0.4432, -0.0157, -0.2294, -0.5660, -0.7999, + -0.1477, -0.4939, -0.2767, -0.5668, -0.9257, -0.1341, -0.7425, -0.8698, + -0.8900, -0.3877, -0.7282, -0.3722, -0.0529, -0.7103}; + expected_output->getImpl()->setRawPtr(expected_output_array, 2*3*5*5); + + conv_op.associateInput(0, input); + conv_op.associateInput(1, weights); + conv_op.associateInput(2, biases); + + conv_op.setBackend("cpu"); + conv_op.setDataType(DataType::Float32); + conv_op.forwardDims(); + + conv_op.forward(); + + conv_op.getOutput(0)->print(); + + 
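With a 1x1 kernel, the depthwise convolution above reduces to a per-channel affine map, out[n][c][h][w] = in[n][c][h][w] * weight[c] + bias[c]. A standalone sanity check against the first expected value (channel 0: weight -0.0045, bias -0.8595, input 0.6581), independent of the Aidge kernels:

#include <cassert>
#include <cmath>

int main() {
    const float w = -0.0045f, b = -0.8595f, x = 0.6581f;
    const float y = x * w + b;                  // -0.86246...
    assert(std::fabs(y - (-0.8624f)) < 1e-3f);  // matches expected_output_array[0]
    return 0;
}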
REQUIRE(approxEq<float>(*(conv_op.getOutput(0)),*expected_output, 1e-3f, 1e-4f)); + } } \ No newline at end of file diff --git a/unit_tests/operator/Test_ConvImpl.cpp b/unit_tests/operator/Test_ConvImpl.cpp index b52085139294021de2fe9d72e173ad74db028ea3..e48d69c89eb0d6d52a834b3f32a41d8621fdd42b 100644 --- a/unit_tests/operator/Test_ConvImpl.cpp +++ b/unit_tests/operator/Test_ConvImpl.cpp @@ -15,6 +15,7 @@ #include "aidge/data/Tensor.hpp" #include "aidge/operator/Conv.hpp" +#include "aidge/utils/TensorUtils.hpp" #include "aidge/backend/cpu.hpp" @@ -153,7 +154,7 @@ TEST_CASE("[cpu/operator] Conv(forward)", "[Conv][CPU]") { op->setDataType(DataType::Int32); op->setBackend("cpu"); myConv->forward(); - // op->getOutput(0)->print(); + op->getOutput(0)->print(); REQUIRE(*(op->getOutput(0)) == *myOutput); } SECTION("Point-wise") { @@ -251,4 +252,147 @@ TEST_CASE("[cpu/operator] Conv(forward)", "[Conv][CPU]") { REQUIRE(std::abs(resPtr[i]-expectedPtr[i]) < 0.00001); } } + SECTION("Strided and dilated Conv") { + std::shared_ptr<Node> myConv = Conv(3,4,{3,3}, "myconv", {3,3},{2,2}); + auto op = std::static_pointer_cast<OperatorTensor>(myConv -> getOperator()); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,3,8,8> { + {{{ + {0.0107F, 0.5076F, 0.2293F, 0.0486F, 0.7375F, 0.2637F, 0.9615F, 0.9138F}, + {0.0678F, 0.5604F, 0.1940F, 0.0287F, 0.1029F, 0.2059F, 0.5058F, 0.9885F}, + {0.9904F, 0.2890F, 0.4606F, 0.1055F, 0.9028F, 0.1654F, 0.6499F, 0.4775F}, + {0.9499F, 0.4695F, 0.1713F, 0.0731F, 0.4913F, 0.8921F, 0.1782F, 0.1111F}, + {0.2479F, 0.4669F, 0.1078F, 0.6153F, 0.0299F, 0.6484F, 0.2397F, 0.1814F}, + {0.3779F, 0.9032F, 0.5651F, 0.3896F, 0.8439F, 0.6404F, 0.3813F, 0.0841F}, + {0.5566F, 0.8950F, 0.1226F, 0.8881F, 0.9870F, 0.6256F, 0.6387F, 0.0628F}, + {0.2857F, 0.0579F, 0.6247F, 0.1286F, 0.0951F, 0.1268F, 0.9510F, 0.3789F}}, + + {{0.7648F, 0.5340F, 0.1024F, 0.4098F, 0.9958F, 0.7941F, 0.1190F, 0.7328F}, + {0.4532F, 0.6598F, 0.9146F, 0.1690F, 0.6041F, 0.7230F, 0.5719F, 0.9282F}, + {0.2862F, 0.2329F, 0.7302F, 0.6717F, 0.1983F, 0.1876F, 0.4561F, 0.2126F}, + {0.7849F, 0.0239F, 0.7977F, 0.5935F, 0.9958F, 0.4703F, 0.4612F, 0.1627F}, + {0.6393F, 0.3544F, 0.8643F, 0.5039F, 0.8087F, 0.6521F, 0.5086F, 0.9331F}, + {0.7749F, 0.9798F, 0.6820F, 0.7869F, 0.5144F, 0.2941F, 0.8137F, 0.4561F}, + {0.6505F, 0.3974F, 0.6909F, 0.7019F, 0.2729F, 0.4240F, 0.0162F, 0.1536F}, + {0.3529F, 0.8821F, 0.1812F, 0.3426F, 0.3472F, 0.0300F, 0.8841F, 0.8088F}}, + + {{0.5099F, 0.3323F, 0.1488F, 0.3424F, 0.1494F, 0.6225F, 0.8103F, 0.5995F}, + {0.9198F, 0.5635F, 0.8908F, 0.9378F, 0.6689F, 0.3176F, 0.3755F, 0.3883F}, + {0.0626F, 0.5309F, 0.0307F, 0.3955F, 0.2794F, 0.1420F, 0.4758F, 0.7558F}, + {0.6154F, 0.5280F, 0.2318F, 0.3832F, 0.4435F, 0.3490F, 0.4043F, 0.5872F}, + {0.3705F, 0.3848F, 0.2182F, 0.8332F, 0.4559F, 0.5310F, 0.4611F, 0.4236F}, + {0.6141F, 0.8103F, 0.2260F, 0.9907F, 0.5615F, 0.4520F, 0.6949F, 0.0175F}, + {0.3969F, 0.5021F, 0.0970F, 0.9937F, 0.9270F, 0.4302F, 0.2868F, 0.3891F}, + {0.8693F, 0.5170F, 0.5348F, 0.2676F, 0.9769F, 0.3356F, 0.9427F, 0.3908F}} + }, + { + {{0.4803F, 0.5223F, 0.6395F, 0.8402F, 0.4442F, 0.6377F, 0.7852F, 0.9063F}, + {0.0361F, 0.0470F, 0.3104F, 0.6921F, 0.0543F, 0.4490F, 0.9541F, 0.7395F}, + {0.3832F, 0.3828F, 0.2236F, 0.2068F, 0.4369F, 0.7443F, 0.6952F, 0.6394F}, + {0.5309F, 0.8483F, 0.1991F, 0.9756F, 0.8969F, 0.7284F, 0.4657F, 0.5486F}, + {0.8839F, 0.3260F, 0.6892F, 0.4074F, 0.9473F, 0.5526F, 0.4147F, 0.4786F}, + {0.9674F, 0.0952F, 0.8379F, 0.2163F, 0.9420F, 0.4046F, 0.1339F, 
0.5234F}, + {0.4213F, 0.8392F, 0.3184F, 0.4576F, 0.9349F, 0.8267F, 0.0931F, 0.8009F}, + {0.5570F, 0.5871F, 0.4175F, 0.5465F, 0.6679F, 0.9224F, 0.0049F, 0.9421F}}, + + {{0.3739F, 0.6230F, 0.7613F, 0.1337F, 0.8527F, 0.0557F, 0.6424F, 0.8463F}, + {0.7179F, 0.5638F, 0.2457F, 0.4579F, 0.0487F, 0.8693F, 0.8216F, 0.0415F}, + {0.1724F, 0.5108F, 0.9103F, 0.0850F, 0.0080F, 0.8927F, 0.7706F, 0.3600F}, + {0.7751F, 0.8828F, 0.7872F, 0.4541F, 0.3181F, 0.1855F, 0.2486F, 0.0033F}, + {0.5558F, 0.3500F, 0.6034F, 0.1763F, 0.7418F, 0.5190F, 0.5147F, 0.4090F}, + {0.4476F, 0.1249F, 0.8116F, 0.9091F, 0.1738F, 0.6150F, 0.3285F, 0.3133F}, + {0.5657F, 0.4447F, 0.5049F, 0.3425F, 0.7443F, 0.2718F, 0.2466F, 0.5586F}, + {0.3684F, 0.7616F, 0.5165F, 0.9621F, 0.2864F, 0.7747F, 0.8110F, 0.7045F}}, + + {{0.4570F, 0.4577F, 0.0373F, 0.6084F, 0.4632F, 0.3472F, 0.9917F, 0.2011F}, + {0.7921F, 0.2202F, 0.9525F, 0.7274F, 0.3357F, 0.0076F, 0.5786F, 0.3034F}, + {0.6510F, 0.0798F, 0.2757F, 0.1738F, 0.3046F, 0.2197F, 0.3872F, 0.5650F}, + {0.1532F, 0.3204F, 0.6094F, 0.3287F, 0.8903F, 0.9773F, 0.7950F, 0.2845F}, + {0.2482F, 0.3395F, 0.8795F, 0.4325F, 0.1395F, 0.2457F, 0.2968F, 0.5424F}, + {0.8636F, 0.7426F, 0.2151F, 0.6900F, 0.3938F, 0.0062F, 0.4980F, 0.4098F}, + {0.8026F, 0.0464F, 0.2662F, 0.7835F, 0.8444F, 0.0688F, 0.8796F, 0.7625F}, + {0.2764F, 0.5341F, 0.1773F, 0.6671F, 0.7555F, 0.5235F, 0.7142F, 0.9423F}}}} + }); + std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float,4> {{ 0.1902F, -0.1789F, -0.0314F, -0.0589F}}); + std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array4D<float,4,3,3,3> { //NCHW + { + { + {{ 0.0039F, 0.1098F, -0.0834F}, + {-0.0890F, 0.0725F, -0.1178F}, + { 0.1056F, -0.0924F, -0.0574F}}, + {{ 0.0070F, -0.0730F, -0.0674F}, + {-0.0380F, -0.1025F, -0.0085F}, + {-0.1451F, -0.0656F, 0.1137F}}, + {{ 0.1020F, 0.1025F, -0.0678F}, + { 0.0028F, 0.1512F, -0.0871F}, + { 0.1563F, -0.1446F, -0.1636F}} + }, + { + {{ 0.1472F, 0.0025F, -0.0281F}, + { 0.0350F, 0.0296F, -0.1711F}, + {-0.1197F, -0.1198F, -0.1130F}}, + {{-0.1492F, 0.1554F, -0.1044F}, + { 0.1203F, -0.1596F, 0.0589F}, + {-0.0436F, -0.1876F, -0.0816F}}, + {{ 0.1572F, -0.0982F, 0.1293F}, + { 0.1358F, 0.1559F, 0.1322F}, + { 0.0296F, -0.0354F, -0.0632F}} + }, + { + {{-0.0941F, -0.0479F, 0.0908F}, + {-0.1319F, -0.1333F, 0.1223F}, + {-0.1098F, 0.1924F, 0.1075F}}, + {{ 0.1796F, 0.0213F, 0.0626F}, + { 0.0275F, 0.1883F, -0.0818F}, + { 0.0363F, 0.0684F, 0.1094F}}, + {{ 0.1131F, 0.1258F, -0.0558F}, + { 0.1498F, 0.0322F, -0.0186F}, + {-0.1801F, -0.0358F, 0.1727F}} + }, + { + {{-0.1500F, -0.0554F, -0.0994F}, + {-0.0818F, -0.1223F, 0.1365F}, + { 0.1281F, 0.1507F, -0.0890F}}, + {{-0.0444F, -0.1071F, -0.1632F}, + { 0.0757F, -0.1235F, 0.0408F}, + { 0.0401F, -0.1914F, 0.1772F}}, + {{-0.0714F, 0.1582F, -0.0065F}, + {-0.0119F, 0.1375F, -0.0727F}, + {-0.1532F, -0.1826F, -0.0417F}} + } + } + }); + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,4,2,2> { + { + { + {{-0.2174F, -0.0778F}, + {-0.2584F, 0.2303F}}, + {{-0.7686F, -0.3879F}, + {-0.1775F, 0.0119F}}, + {{ 0.5180F, 0.5087F}, + { 0.5398F, 0.3476F}}, + {{-0.5258F, -0.3128F}, + {-0.6673F, -0.1827F}} + }, + { + {{-0.1902F, -0.0467F}, + {-0.3327F, -0.1701F}}, + {{-0.5505F, -0.4875F}, + {-0.4119F, -0.5726F}}, + {{ 0.5777F, 0.4428F}, + { 0.6121F, 0.7221F}}, + {{-0.6009F, -0.6335F}, + {-0.5159F, -0.3353F}} + } + } + }); + op->associateInput(0,myInput); + op->associateInput(1,myWeights); + op->associateInput(2,myBias); + op->setDataType(DataType::Float32); + op->setBackend("cpu"); + 
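The 2x2 spatial size of the expected output above follows from the usual unpadded convolution size formula, out = floor((in - dilation * (k - 1) - 1) / stride) + 1, with in = 8, k = 3, stride = 3, dilation = 2. A standalone sketch (the conv_out_dim helper is illustrative, not an Aidge API):

#include <cassert>

static int conv_out_dim(int in, int k, int stride, int dilation) {
    return (in - dilation * (k - 1) - 1) / stride + 1; // integer division acts as floor here
}

int main() {
    assert(conv_out_dim(8, 3, 3, 2) == 2); // the strided and dilated case: 8x8 -> 2x2
    assert(conv_out_dim(5, 3, 1, 1) == 3); // the plain 3x3 depthwise case earlier: 5x5 -> 3x3
    return 0;
}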
op->forwardDims(); + myConv->forward(); + op->getOutput(0)->print(); + REQUIRE(approxEq<float>(*(op->getOutput(0)),*myOutput, 1e-3f, 1e-4f)); + } } \ No newline at end of file diff --git a/unit_tests/operator/Test_GlobalAveragePoolingImpl.cpp b/unit_tests/operator/Test_GlobalAveragePoolingImpl.cpp index d5f2065b624de431b43edef9a83bf079905129dd..43af544871ad6c2ac319de09f3c6fce5065e60d5 100644 --- a/unit_tests/operator/Test_GlobalAveragePoolingImpl.cpp +++ b/unit_tests/operator/Test_GlobalAveragePoolingImpl.cpp @@ -124,7 +124,9 @@ TEST_CASE("[cpu/operator] GlobalAveragePooling", dims_in[1]; // averaging per channel : 1 addition per element in // the channel + 1 division this for every batch // create out nb_elems - std::vector<std::size_t> dims_out{dims_in[0], dims_in[1]}; + std::vector<std::size_t> dims_out(dims_in.size(), 1); + dims_out[0] = dims_in[0]; + dims_out[1] = dims_in[1]; const std::size_t out_nb_elems = std::accumulate(dims_out.cbegin(), dims_out.cend(), std::size_t(1), std::multiplies<std::size_t>()); @@ -192,7 +194,9 @@ TEST_CASE("[cpu/operator] GlobalAveragePooling", // the channel + 1 division this for every batch // create out nb_elems - std::vector<std::size_t> dims_out{dims_in[0], dims_in[1]}; + std::vector<std::size_t> dims_out(dims_in.size(), 1); + dims_out[0] = dims_in[0]; + dims_out[1] = dims_in[1]; const std::size_t out_nb_elems = std::accumulate(dims_out.cbegin(), dims_out.cend(), std::size_t(1), std::multiplies<std::size_t>()); @@ -253,7 +257,9 @@ TEST_CASE("[cpu/operator] GlobalAveragePooling", SECTION("2D_img") { const std::vector<DimSize_t> in_dims{batch_size, channels, height, width}; - const std::vector<DimSize_t> out_dims{batch_size, channels}; + std::vector<std::size_t> out_dims(in_dims.size(), 1); + out_dims[0] = in_dims[0]; + out_dims[1] = in_dims[1]; DimSize_t in_nb_elems = batch_size * channels * height * width; DimSize_t out_nb_elems = batch_size * channels; number_of_operation += @@ -368,7 +374,9 @@ TEST_CASE("[cpu/operator] GlobalAveragePooling", SECTION("3D_img") { const std::vector<DimSize_t> in_dims{batch_size, channels, height, width, depth}; - const std::vector<DimSize_t> out_dims{batch_size, channels}; + std::vector<std::size_t> out_dims(in_dims.size(), 1); + out_dims[0] = in_dims[0]; + out_dims[1] = in_dims[1]; DimSize_t in_nb_elems = batch_size * channels * height * width * depth; number_of_operation += diff --git a/unit_tests/operator/Test_PadImpl.cpp b/unit_tests/operator/Test_PadImpl.cpp index 75233c0b97fc6f9812020d0e3d3c695d8cd388f0..cdd3a5f979085f3782776ce69ddd92c0d53150c4 100644 --- a/unit_tests/operator/Test_PadImpl.cpp +++ b/unit_tests/operator/Test_PadImpl.cpp @@ -134,7 +134,7 @@ TEST_CASE("[cpu/operator] Pad(forward)", "[Pad][CPU]") { SECTION("Asymmetric Pad") { const int pv = 0; // pad value - std::shared_ptr<Node> myPad = Pad<2>({0, 1, 1, 0}, "mypad", PadBorderType::Constant, static_cast<double>(pv)); + std::shared_ptr<Node> myPad = Pad<2>({1, 0, 0, 1}, "mypad", PadBorderType::Constant, static_cast<double>(pv)); auto op = std::static_pointer_cast<OperatorTensor>(myPad -> getOperator()); std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<int,2,3,5,5> { //NCHW { diff --git a/unit_tests/operator/Test_ResizeImpl.cpp b/unit_tests/operator/Test_ResizeImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6b3520fc88d36660ff44403bd41a47cd7ed96256 --- /dev/null +++ b/unit_tests/operator/Test_ResizeImpl.cpp @@ -0,0 +1,249 @@ 
+/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <cstdint> +#include <memory> + +#include <aidge/data/Data.hpp> +#include <aidge/data/Interpolation.hpp> +#include <aidge/data/half.hpp> +#include <aidge/operator/Pad.hpp> +#include <aidge/utils/ArrayHelpers.hpp> +#include <catch2/catch_test_macros.hpp> + +#include "aidge/data/Tensor.hpp" +#include "aidge/operator/OperatorTensor.hpp" +#include "aidge/operator/Resize.hpp" +#include "aidge/utils/TensorUtils.hpp" + +namespace Aidge { + +TEST_CASE("[cpu/operator] Resize(forward)", "[Resize][CPU]") { + + Log::setConsoleLevel(Log::Level::Debug); + + SECTION("Nearest") { + SECTION("Ceil") { + std::shared_ptr<Tensor> input_tensor = std::make_shared<Tensor>(Array4D<std::int32_t, 1, 1, 2, 2>{{ + { + { + { 1, 2}, + { 3, 4} + } + } + }}); + Tensor expected_out_tensor = Tensor(Array4D<std::int32_t, 1, 1, 4, 4>{{ + { + { + { 1, 1, 1, 2}, + { 1, 1, 1, 2}, + { 1, 1, 1, 2}, + { 3, 3, 3, 4} + } + } + }}); + + std::vector<float> scales = {1.0f, 1.0f, 2.0f, 2.0f}; + auto resize_node = Resize(scales, {}, Interpolation::CoordinateTransformation::HalfPixel, Interpolation::Mode::Floor); + auto op = std::static_pointer_cast<Resize_Op>(resize_node->getOperator()); + op->associateInput(0, input_tensor); + + + op->setDataType(DataType::Int32); + op->setBackend("cpu"); + op->forwardDims(true); + op->forward(); + + op->getOutput(0)->print(); + expected_out_tensor.print(); + + CHECK(*(op->getOutput(0)) == expected_out_tensor); + } + } + + SECTION("1-sized input tensor (upscaling)") { + std::shared_ptr<Tensor> input_tensor = std::make_shared<Tensor>(Array4D<float, 1, 1, 1, 1>{{{{{0.417022}}}}}); + + std::vector<std::size_t> sizes = {1, 1, 2, 2}; + auto resize_node = Resize({}, sizes, Interpolation::CoordinateTransformation::HalfPixel, Interpolation::Mode::Linear); + auto op = std::static_pointer_cast<Resize_Op>(resize_node->getOperator()); + op->associateInput(0, input_tensor); + + + op->setDataType(DataType::Float32); + op->setBackend("cpu"); + op->forwardDims(true); + op->forward(); + std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<float, 1, 1, 2, 2>{ + {{{{0.417022, 0.417022}, {0.417022, 0.417022}}}}}); + op->getOutput(0)->print(); + CHECK(approxEq<float>(*op->getOutput(0), *expectedOutput) == true); + } + SECTION("Upscaling from 5x5 to 10x10 (linear)") { + std::shared_ptr<Tensor> input_tensor = std::make_shared<Tensor>( + Array4D<float, 1, 1, 5, 5>{{{{{7.20324516e-01, + 1.14374816e-04, + 3.02332580e-01, + 1.46755889e-01, + 9.23385918e-02}, + {1.86260208e-01, + 3.45560730e-01, + 3.96767467e-01, + 5.38816750e-01, + 4.19194520e-01}, + {6.85219526e-01, + 2.04452246e-01, + 8.78117442e-01, + 2.73875929e-02, + 6.70467496e-01}, + {4.17304814e-01, + 5.58689833e-01, + 1.40386939e-01, + 1.98101491e-01, + 8.00744593e-01}, + {9.68261600e-01, + 3.13424170e-01, + 6.92322612e-01, + 8.76389146e-01, + 8.94606650e-01}}}}} + ); + + std::vector<std::size_t> sizes = {1, 1, 10, 10}; + auto resize_node = Resize({}, sizes, Interpolation::CoordinateTransformation::Asymmetric, Interpolation::Mode::Linear); + auto op = std::static_pointer_cast<Resize_Op>(resize_node->getOperator()); 
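With the Asymmetric coordinate transformation, the source coordinate is assumed to be x_in = x_out / scale (ONNX-style), and Linear mode blends the two bracketing samples; this is consistent with the expected 10x10 values below, e.g. output (0, 1) is the midpoint of the first two input samples. A standalone sketch with an illustrative lerp1d helper, not the Aidge Resize/Interpolation API:

#include <cassert>
#include <cmath>

static float lerp1d(float a, float b, float t) { return a + t * (b - a); }

int main() {
    // Output column 1 maps to x_in = 1 / 2.0 = 0.5, halfway between the
    // first two samples of the input row {0.720324516, 1.14374816e-04, ...}.
    const float out01 = lerp1d(0.720324516f, 1.14374816e-04f, 0.5f);
    assert(std::fabs(out01 - 0.360219449f) < 1e-6f);
    return 0;
}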
+ op->associateInput(0, input_tensor); + + op->setDataType(DataType::Float32); + op->setBackend("cpu"); + op->forwardDims(true); + op->forward(); + std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>( + Array4D<float, 1, 1, 10, 10>{{{{{7.20324516e-01, + 3.60219449e-01, + 1.14374816e-04, + 1.51223481e-01, + 3.02332580e-01, + 2.24544227e-01, + 1.46755889e-01, + 1.19547240e-01, + 9.23385918e-02, + 9.23385918e-02}, + + {4.53292370e-01, + 3.13064963e-01, + 1.72837555e-01, + 2.61193782e-01, + 3.49550009e-01, + 3.46168160e-01, + 3.42786312e-01, + 2.99276441e-01, + 2.55766571e-01, + 2.55766571e-01}, + + {1.86260208e-01, + 2.65910476e-01, + 3.45560730e-01, + 3.71164083e-01, + 3.96767467e-01, + 4.67792094e-01, + 5.38816750e-01, + 4.79005635e-01, + 4.19194520e-01, + 4.19194520e-01}, + + {4.35739875e-01, + 3.55373204e-01, + 2.75006473e-01, + 4.56224471e-01, + 6.37442470e-01, + 4.60272312e-01, + 2.83102185e-01, + 4.13966596e-01, + 5.44831038e-01, + 5.44831038e-01}, + + {6.85219526e-01, + 4.44835901e-01, + 2.04452246e-01, + 5.41284859e-01, + 8.78117442e-01, + 4.52752531e-01, + 2.73875929e-02, + 3.48927557e-01, + 6.70467496e-01, + 6.70467496e-01}, + + {5.51262140e-01, + 4.66416597e-01, + 3.81571054e-01, + 4.45411623e-01, + 5.09252191e-01, + 3.10998380e-01, + 1.12744540e-01, + 4.24175322e-01, + 7.35606015e-01, + 7.35606015e-01}, + + {4.17304814e-01, + 4.87997323e-01, + 5.58689833e-01, + 3.49538386e-01, + 1.40386939e-01, + 1.69244215e-01, + 1.98101491e-01, + 4.99423027e-01, + 8.00744593e-01, + 8.00744593e-01}, + + {6.92783237e-01, + 5.64420104e-01, + 4.36057001e-01, + 4.26205903e-01, + 4.16354775e-01, + 4.76800054e-01, + 5.37245333e-01, + 6.92460477e-01, + 8.47675622e-01, + 8.47675622e-01}, + + {9.68261600e-01, + 6.40842915e-01, + 3.13424170e-01, + 5.02873421e-01, + 6.92322612e-01, + 7.84355879e-01, + 8.76389146e-01, + 8.85497928e-01, + 8.94606650e-01, + 8.94606650e-01}, + + {9.68261600e-01, + 6.40842915e-01, + 3.13424170e-01, + 5.02873421e-01, + 6.92322612e-01, + 7.84355879e-01, + 8.76389146e-01, + 8.85497928e-01, + 8.94606650e-01, + 8.94606650e-01}}}}}); + Log::notice("Expected result : dims = {}", expectedOutput->dims()); + expectedOutput->print(); + Log::notice("\nActual result: dims = {}", op->getOutput(0)->dims()); + op->getOutput(0)->print(); + CHECK(approxEq<float>(*op->getOutput(0), + *expectedOutput, + 1e-5f, + 1e-5f) == true); + } +} + +} // namespace Aidge diff --git a/unit_tests/operator/Test_RoundImpl.cpp b/unit_tests/operator/Test_RoundImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b4cf9ffbedc18b35b42ebbc05971f86e0fa584e3 --- /dev/null +++ b/unit_tests/operator/Test_RoundImpl.cpp @@ -0,0 +1,115 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <catch2/catch_test_macros.hpp> +#include <cstddef> // std::size_t +#include <cstdint> // std::uint16_t +#include <chrono> +#include <iostream> +#include <memory> +#include <numeric> +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <iomanip> +#include "aidge/data/Tensor.hpp" +#include "aidge/operator/Round.hpp" +#include "aidge/utils/TensorUtils.hpp" + +namespace Aidge { + +TEST_CASE("[cpu/operator] Round_Test", "[Round][CPU]") { + constexpr std::uint16_t NBTRIALS = 15; + // Create a random number generator + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<float> valueDist(-15, 15); + std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(5)); + std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(3)); + + // Create BitShift Operator + std::shared_ptr<Node> myRound = Round(); + auto op = std::static_pointer_cast<OperatorTensor>(myRound-> getOperator()); + op->setDataType(DataType::Float32); + op->setBackend("cpu"); + + // Create 2 input Tensors + std::shared_ptr<Tensor> T0 = std::make_shared<Tensor>(); + op->associateInput(0,T0); + T0->setDataType(DataType::Float32); + T0->setBackend("cpu"); + // Create results Tensor + std::shared_ptr<Tensor> Tres = std::make_shared<Tensor>(); + Tres->setDataType(DataType::Float32); + Tres->setBackend("cpu"); + + // To measure execution time of 'Round_Op::forward()' member function call + std::chrono::time_point<std::chrono::system_clock> start; + std::chrono::time_point<std::chrono::system_clock> end; + std::chrono::duration<double, std::micro> duration{}; + + SECTION("Round [Forward]") { + SECTION("Test Forward Kernel") { + std::size_t number_of_operation = 0; + + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { + + // generate 2 random Tensors + const std::size_t nbDims = nbDimsDist(gen); + std::vector<std::size_t> dims; + for (std::size_t i = 0; i < nbDims; ++i) { + dims.push_back(dimSizeDist(gen)); + } + const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + number_of_operation += nb_elements; + + // without broadcasting + float* array0 = new float[nb_elements]; + float* result = new float[nb_elements]; + + for (std::size_t i = 0; i < nb_elements; ++i) { + array0[i] = valueDist(gen); + result[i] = std::nearbyint(array0[i]); + + } + + // input0 + T0->resize(dims); + T0 -> getImpl() -> setRawPtr(array0, nb_elements); + + // results + Tres->resize(dims); + Tres -> getImpl() -> setRawPtr(result, nb_elements); + + op->forwardDims(); + start = std::chrono::system_clock::now(); + myRound->forward(); + end = std::chrono::system_clock::now(); + duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + + bool is_eq = approxEq<float>(*(op->getOutput(0)), *Tres); + + auto Output = *(op->getOutput(0)); + + auto prt = Output.getImpl()->rawPtr(); + + REQUIRE(is_eq); + + + delete[] array0; + delete[] result; + + + } + std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl; + std::cout << "total time: " << duration.count() << "μs" << std::endl; + } + } +} // namespace Aidge +} \ No newline at end of file diff --git a/unit_tests/recipies/Test_ConstantFolding.cpp b/unit_tests/recipies/Test_ConstantFolding.cpp index 
c4866b1258702b93a1bce80501d9acd094a65741..cd035fd5336d3cb66fc70b1c0a4e5c82c9bef0d8 100644 --- a/unit_tests/recipies/Test_ConstantFolding.cpp +++ b/unit_tests/recipies/Test_ConstantFolding.cpp @@ -22,12 +22,12 @@ using namespace Aidge; -TEST_CASE("[ConstantFolding] test") { +TEST_CASE("[ConstantFolding] forward", "[ConstantFolding][forward][CPU]") { // generate the original GraphView auto matmul0 = MatMul("matmul0"); - auto add0 = Add(2, "add0"); + auto add0 = Add("add0"); auto matmul1 = MatMul("matmul1"); - auto add1 = Add(2, "add1"); + auto add1 = Add("add1"); auto b0 = Producer(std::make_shared<Tensor>(Array1D<float,5>{{1, 2, 3, 4, 5}}), "B0", true); auto w0 = Producer(std::make_shared<Tensor>(Array2D<float,5,5>{{{1, 2, 3, 4, 5}, {6, 7, 8, 9, 0}, {1, 2, 3, 4, 5}, {6, 7, 8, 9, 0}, {1, 2, 3, 4, 5}}}), "W0", true); diff --git a/unit_tests/recipies/Test_MatMulTiling.cpp b/unit_tests/recipies/Test_MatMulTiling.cpp new file mode 100644 index 0000000000000000000000000000000000000000..46d5418fd557fbb716f7e1d9c54eb76d94b0061e --- /dev/null +++ b/unit_tests/recipies/Test_MatMulTiling.cpp @@ -0,0 +1,107 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <cstddef> +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution + +#include <catch2/catch_test_macros.hpp> + +#include "aidge/recipes/Recipes.hpp" +#include "aidge/operator/MatMul.hpp" +#include "aidge/operator/AvgPooling.hpp" +#include "aidge/operator/MaxPooling.hpp" +#include "aidge/operator/GenericOperator.hpp" +#include "aidge/operator/Producer.hpp" +#include "aidge/graph/OpArgs.hpp" +#include "aidge/scheduler/SequentialScheduler.hpp" +#include "aidge/graph/Matching.hpp" +#include "aidge/utils/TensorUtils.hpp" + +using namespace Aidge; + +TEST_CASE("[MatMulTiling]") { + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<float> valueDist(-1.0f, 1.0f); + + auto dataProvider = Producer({2, 3, 80, 80}, "dataProvider"); + auto w1 = Producer({2, 3, 80, 80}, "w1"); + auto matmul1 = MatMul("matmul1"); + auto w2 = Producer({2, 3, 80, 80}, "w1"); + auto matmul2 = MatMul("matmul2"); + auto w3 = Producer({2, 3, 80, 80}, "w1"); + auto matmul3 = MatMul("matmul3"); + + dataProvider->addChild(matmul1, 0, 0); + w1->addChild(matmul1, 0, 1); + matmul1->addChild(matmul2, 0, 0); + w2->addChild(matmul2, 0, 1); + matmul2->addChild(matmul3, 0, 0); + w3->addChild(matmul3, 0, 1); + + auto g1 = getConnectedGraphView(matmul1); + g1->setBackend("cpu"); + g1->forwardDims(); + g1->save("MatMulSplitting_graph"); + + // Fill random values + fmt::println("Fill random values"); + auto tData = std::static_pointer_cast<OperatorTensor>(dataProvider->getOperator())->getOutput(0); + for (size_t i = 0; i < tData->size(); ++i) { + tData->set<float>(i, valueDist(gen)); + } + auto tw1 = std::static_pointer_cast<OperatorTensor>(w1->getOperator())->getOutput(0); + for (size_t i = 0; i < tw1->size(); ++i) { + tw1->set<float>(i, valueDist(gen)); + } + auto tw2 = std::static_pointer_cast<OperatorTensor>(w2->getOperator())->getOutput(0); + for (size_t i = 0; i < tw2->size(); ++i) { + tw2->set<float>(i, valueDist(gen)); + } + auto tw3 = 
std::static_pointer_cast<OperatorTensor>(w3->getOperator())->getOutput(0); + for (size_t i = 0; i < tw3->size(); ++i) { + tw3->set<float>(i, valueDist(gen)); + } + + fmt::println("Schedule forward graph"); + auto s1 = SequentialScheduler(g1); + s1.forward(); + + const auto tOut = std::static_pointer_cast<OperatorTensor>(g1->getOrderedOutputs()[0].first->getOperator())->getOutput(0)->clone(); + + // Tiling + fmt::println("Tiling"); + matMulTiling(matmul1, {16, 16}); + removeIdentity(g1); + + g1->setBackend("cpu"); + g1->save("MatMulSplitting_graph_split"); + + auto gm = SinglePassGraphMatching(g1); + gm.addNodeLambda("16x16", [](const NodePtr& node) { + const auto op = + std::static_pointer_cast<OperatorTensor>(node->getOperator()); + const auto dims = op->getOutput(0)->dims(); + return (dims.end()[-2] == 16 && dims.end()[-1] == 16); + }); + + const auto results = gm.match("MatMul[16x16]"); + REQUIRE(results.size() == 25); + + // Check result + fmt::println("Schedule forward tiled graph"); + s1 = SequentialScheduler(g1); + s1.resetScheduling(); + s1.forward(); + + const auto tOutTiled = std::static_pointer_cast<OperatorTensor>(g1->getOrderedOutputs()[0].first->getOperator())->getOutput(0)->clone(); + REQUIRE(approxEq<float>(tOut, tOutTiled)); +} diff --git a/unit_tests/scheduler/Test_Scheduler.cpp b/unit_tests/scheduler/Test_Scheduler.cpp index 16112628053a35ef71d5819a53aacc85425da88d..78a10c308a60f026b83ea64cfbd25a848099eb90 100644 --- a/unit_tests/scheduler/Test_Scheduler.cpp +++ b/unit_tests/scheduler/Test_Scheduler.cpp @@ -147,10 +147,13 @@ TEST_CASE("[cpu/scheduler] SequentialScheduler(forward)") { std::shared_ptr<GraphView> g = Sequential({Conv(1, 3, {3, 3}, "inputConv"), Parallel({ - Conv(3, 3, {1, 1}, "conv1.1"), - Conv(3, 3, {1, 1}, "conv1.2"), + Sequential({ + Parallel({ + Conv(3, 3, {1, 1}, "conv1.1"), + Conv(3, 3, {1, 1}, "conv1.2")}), + Add("add1")}), Conv(3, 3, {1, 1}, "conv1.3")}), - Add(3, "add1"), + Add("add2"), Conv(3, 2, {1, 1}, "conv2"), FC(18, 5, false, "out")}); @@ -216,9 +219,9 @@ TEST_CASE("[cpu/scheduler] SequentialScheduler(forward)") { std::shared_ptr<Tensor> biasTensor = std::make_shared<Tensor>( Array2D<int, 2, 3>{{{2, 0, 0}, {1, 0, 0}}}); - auto add1 = Add(2, "add1"); + auto add1 = Add("add1"); auto mem = Memorize(3, "mem1"); - auto add2 = Add(2, "add2"); + auto add2 = Add("add2"); auto bias = Producer(biasTensor, "bias"); auto init = Producer(initTensor, "init"); auto input = Producer(in, "input"); @@ -260,9 +263,9 @@ TEST_CASE("[cpu/scheduler] SequentialScheduler(forward)") { std::shared_ptr<Tensor> biasTensor = std::make_shared<Tensor>( Array2D<int, 2, 3>{{{2, 0, 0}, {1, 0, 0}}}); - auto add1 = Add(2, "add1"); + auto add1 = Add("add1"); auto mem = Memorize(3, "mem1"); - auto add2 = Add(2, "add2"); + auto add2 = Add("add2"); auto bias = Producer(biasTensor, "bias"); auto init = Producer(initTensor, "init"); auto input = Producer(in, "input"); diff --git a/version.txt b/version.txt index d15723fbe8de36b1c3ae302c77d8095459ea88e6..1d0ba9ea182b0f7354f3daf12120744ec5e0c2f8 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -0.3.2 +0.4.0
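For reference on the MatMulTiling expectation above (REQUIRE(results.size() == 25)): tiling the 80x80 MatMul output into {16, 16} tiles gives 5 x 5 sub-MatMuls whose outputs are 16x16, which is what the SinglePassGraphMatching lambda counts. The per-output-tile reading is an assumption inferred from the arguments, and the tile_count helper is illustrative:

#include <cassert>

static int tile_count(int dim, int tile) { return (dim + tile - 1) / tile; } // ceiling division

int main() {
    assert(tile_count(80, 16) * tile_count(80, 16) == 25);
    return 0;
}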