diff --git a/CMakeLists.txt b/CMakeLists.txt index 66ef8ff28503a70de816d546b72e21d8528f0e33..ce1b50629a3e0ca97c986e7b3ce8d3df743f75e3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -64,6 +64,16 @@ if(NOT $ENV{AIDGE_INSTALL} STREQUAL "") endif() find_package(aidge_core REQUIRED) +find_package(OpenMP) + +find_package(OpenSSL QUIET) +if(OpenSSL_FOUND) + message(STATUS "OpenSSL found: ${OPENSSL_VERSION}") + add_definitions(-DWITH_OPENSSL) +else() + message(WARNING "OpenSSL not found, SHA256 will not be available.") +endif() + ############################################## # Create target and set properties file(GLOB_RECURSE src_files "src/*.cpp") @@ -78,6 +88,16 @@ target_link_libraries(${module_name} _aidge_core # _ is added because we link the exported target and not the project ) +if(OpenMP_CXX_FOUND) + target_link_libraries(${module_name} PRIVATE OpenMP::OpenMP_CXX) + set(AIDGE_REQUIRES_OPENMP TRUE) +endif() + +# Add definition _USE_MATH_DEFINES to enable math constant definitions from math.h/cmath. +if (WIN32) + target_compile_definitions(${module_name} PRIVATE _USE_MATH_DEFINES) +endif() + #Set target properties set_property(TARGET ${module_name} PROPERTY POSITION_INDEPENDENT_CODE ON) @@ -112,6 +132,12 @@ target_include_directories(${module_name} ${CMAKE_CURRENT_SOURCE_DIR}/src ) +set(AIDGE_REQUIRES_OPENSSL FALSE) +if(OpenSSL_FOUND) + target_link_libraries(${module_name} PRIVATE OpenSSL::SSL OpenSSL::Crypto) + set(AIDGE_REQUIRES_OPENSSL TRUE) +endif() + target_compile_features(${module_name} PRIVATE cxx_std_14) target_compile_options(${module_name} PRIVATE diff --git a/aidge_backend_cpu-config.cmake.in b/aidge_backend_cpu-config.cmake.in index d8e1372bc8a7b79bd09c79b654af4291c995ac58..35865c71a87aebbb04abe6cd964f54e0f08029a0 100644 --- a/aidge_backend_cpu-config.cmake.in +++ b/aidge_backend_cpu-config.cmake.in @@ -2,6 +2,14 @@ include(CMakeFindDependencyMacro) find_dependency(aidge_core) +set(AIDGE_REQUIRES_OPENMP @AIDGE_REQUIRES_OPENMP@) +if (AIDGE_REQUIRES_OPENMP) + find_dependency(OpenMP) +endif() +set(AIDGE_REQUIRES_OPENSSL @AIDGE_REQUIRES_OPENSSL@) +if (AIDGE_REQUIRES_OPENSSL) + find_dependency(OpenSSL) +endif() include(CMakeFindDependencyMacro) diff --git a/aidge_backend_cpu/__init__.py b/aidge_backend_cpu/__init__.py index bb320b2fe436a3be81dde8d643728bd5a30942e7..b88917a21cc2b9d134d9ff1894afeef55604a585 100644 --- a/aidge_backend_cpu/__init__.py +++ b/aidge_backend_cpu/__init__.py @@ -1,2 +1,3 @@ import aidge_core from aidge_backend_cpu.aidge_backend_cpu import * # import so generated by PyBind +from . 
import benchmark diff --git a/aidge_backend_cpu/benchmark.py b/aidge_backend_cpu/benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..81dfc466d3070b1393b5495155bbe9c20dc37595 --- /dev/null +++ b/aidge_backend_cpu/benchmark.py @@ -0,0 +1,40 @@ +import time + +import numpy as np + +import aidge_core + +def prepare_model_scheduler_inputs(model: aidge_core.GraphView, input_data: list[tuple[str, np.ndarray]]) -> tuple[aidge_core.GraphView, aidge_core.SequentialScheduler, list[aidge_core.Tensor]]: + # update model and inputs backend + model.set_backend("cpu") + ordered_inputs = [aidge_core.Tensor(i[1]) for i in input_data] + for ordered_input in ordered_inputs: + ordered_input.set_backend("cpu") + + scheduler = aidge_core.SequentialScheduler(model) + scheduler.generate_scheduling() + + return model, scheduler, ordered_inputs + + +def measure_inference_time(model: aidge_core.GraphView, input_data: list[tuple[str, np.ndarray]], nb_warmup: int = 10, nb_iterations: int = 50) -> list[float]: + model, scheduler, ordered_inputs = prepare_model_scheduler_inputs(model, input_data) + + timings = [] + # Warm-up runs are not timed; only the following iterations are measured. + for i in range(nb_warmup + nb_iterations): + if i < nb_warmup: + scheduler.forward(forward_dims=False, data=ordered_inputs) + else: + start = time.process_time() + scheduler.forward(forward_dims=False, data=ordered_inputs) + end = time.process_time() + timings.append((end - start)) + return timings + +def compute_output(model: aidge_core.GraphView, input_data: list[tuple[str, np.ndarray]]) -> list[np.ndarray]: + model, scheduler, ordered_inputs = prepare_model_scheduler_inputs(model, input_data) + + scheduler.forward(forward_dims=False, data=ordered_inputs) + + return [np.array(t[0].get_operator().get_output(t[1])) for t in model.get_ordered_outputs()] \ No newline at end of file diff --git a/aidge_backend_cpu/unit_tests/test_scheduler.py b/aidge_backend_cpu/unit_tests/test_scheduler.py index 494f34565ffd644971c97e9adfa06709dee9e36d..b60ff3f01307f22bd8bf635df2d776cb1267d0f5 100644 --- a/aidge_backend_cpu/unit_tests/test_scheduler.py +++ b/aidge_backend_cpu/unit_tests/test_scheduler.py @@ -57,9 +57,9 @@ class test_scheduler(unittest.TestCase): scheduler = aidge_core.SequentialScheduler(graph_view) scheduler.generate_scheduling() - self.assertEqual(len(scheduler.get_static_scheduling()), 10) + self.assertEqual(len(scheduler.get_sequential_static_scheduling()), 10) # Do not care about the order of execution of the producers - self.assertListEqual([i.name() for i in scheduler.get_static_scheduling()[-3:]], EXPECTED_SCHEDULE) + self.assertListEqual([i.name() for i in scheduler.get_sequential_static_scheduling()[-3:]], EXPECTED_SCHEDULE) def test_parallel_scheduling(self): @@ -83,9 +83,9 @@ class test_scheduler(unittest.TestCase): scheduler = aidge_core.SequentialScheduler(graph_view) scheduler.generate_scheduling() - self.assertEqual(len(scheduler.get_static_scheduling()), 11) + self.assertEqual(len(scheduler.get_sequential_static_scheduling()), 11) # Do not care about the order of execution of the producers - self.assertTrue([i.name() for i in scheduler.get_static_scheduling()[-4:]] in EXPECTED_SCHEDULE) + self.assertTrue([i.name() for i in scheduler.get_sequential_static_scheduling()[-4:]] in EXPECTED_SCHEDULE) if __name__ == '__main__': unittest.main() diff --git a/cmake/PybindModuleCreation.cmake b/cmake/PybindModuleCreation.cmake index a520039f6505a7178acefaca076fa3f659e41bcb..e3fe6a7383656e053fe7f89da2fda1083d6374ae 100644 --- a/cmake/PybindModuleCreation.cmake +++ b/cmake/PybindModuleCreation.cmake @@ 
-1,10 +1,10 @@ -function(generate_python_binding pybind_module_name target_to_bind) +function(generate_python_binding pybind_module_name target_to_bind) find_package(Python COMPONENTS Interpreter Development.Module) Include(FetchContent) - set(PYBIND_VERSION v2.10.4) + set(PYBIND_VERSION v2.13.6) message(STATUS "Retrieving pybind ${PYBIND_VERSION} from git") FetchContent_Declare( diff --git a/include/aidge/backend/cpu.hpp b/include/aidge/backend/cpu.hpp index 5db19a2b7a2f88dae13d8baf24cf95f961e730a0..6d090403c40995ced7dd098dc5fe67847119335c 100644 --- a/include/aidge/backend/cpu.hpp +++ b/include/aidge/backend/cpu.hpp @@ -27,8 +27,12 @@ #include "aidge/backend/cpu/operator/ClipImpl.hpp" #include "aidge/backend/cpu/operator/ConvDepthWiseImpl.hpp" #include "aidge/backend/cpu/operator/ConvImpl.hpp" +#include "aidge/backend/cpu/operator/ConvTransposeImpl.hpp" #include "aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp" +#include "aidge/backend/cpu/operator/CryptoHashImpl.hpp" #include "aidge/backend/cpu/operator/DivImpl.hpp" +#include "aidge/backend/cpu/operator/DropoutImpl.hpp" +#include "aidge/backend/cpu/operator/EqualImpl.hpp" #include "aidge/backend/cpu/operator/ErfImpl.hpp" #include "aidge/backend/cpu/operator/ExpandImpl.hpp" #include "aidge/backend/cpu/operator/FCImpl.hpp" @@ -39,6 +43,8 @@ #include "aidge/backend/cpu/operator/LeakyReLUImpl.hpp" #include "aidge/backend/cpu/operator/LnImpl.hpp" #include "aidge/backend/cpu/operator/MatMulImpl.hpp" +#include "aidge/backend/cpu/operator/MaxPoolingImpl.hpp" +#include "aidge/backend/cpu/operator/ModImpl.hpp" #include "aidge/backend/cpu/operator/MulImpl.hpp" #include "aidge/backend/cpu/operator/PadImpl.hpp" #include "aidge/backend/cpu/operator/PaddedConvImpl.hpp" @@ -54,9 +60,10 @@ #include "aidge/backend/cpu/operator/SliceImpl.hpp" #include "aidge/backend/cpu/operator/SoftmaxImpl.hpp" #include "aidge/backend/cpu/operator/SubImpl.hpp" +#include "aidge/backend/cpu/operator/TopKImpl.hpp" #include "aidge/backend/cpu/operator/TanhImpl.hpp" #include "aidge/backend/cpu/operator/WeightInterleavedImpl.hpp" #include "aidge/backend/cpu/data/TensorImpl.hpp" -#endif /* AIDGE_CPU_IMPORTS_H_ */ +#endif /* AIDGE_CPU_IMPORTS_H_ */ \ No newline at end of file diff --git a/include/aidge/backend/cpu/operator/AbsImpl_kernels.hpp b/include/aidge/backend/cpu/operator/AbsImpl_kernels.hpp index 16e5f9dee26a6f8b760e14a1ad66a40d8f0f7e93..e6474cf2cca459601f8a7a564ce45742e74f01b5 100644 --- a/include/aidge/backend/cpu/operator/AbsImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/AbsImpl_kernels.hpp @@ -20,14 +20,14 @@ namespace Aidge { template <class I, class O> -void AbsImpl_cpu_forward_kernel(std::size_t inputLenght, +void AbsImpl_cpu_forward_kernel(std::size_t inputLength, const void* input_, void* output_) { const I* input = static_cast<const I*>(input_); O* output = static_cast<O*>(output_); - for (std::size_t i = 0; i < inputLenght; ++i) { + for (std::size_t i = 0; i < inputLength; ++i) { output[i] = std::abs(input[i]); } } diff --git a/include/aidge/backend/cpu/operator/AddImpl.hpp b/include/aidge/backend/cpu/operator/AddImpl.hpp index e39c35b42fdb6065aa72aee092cd1cd23b2b1011..cfb85ecfa6a4c65d89079dc23944d6d85d99a785 100644 --- a/include/aidge/backend/cpu/operator/AddImpl.hpp +++ b/include/aidge/backend/cpu/operator/AddImpl.hpp @@ -25,7 +25,17 @@ namespace Aidge { // Operator implementation entry point for the backend using AddImpl_cpu = OperatorImpl_cpu<Add_Op, - void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, 
const void*, const void*, void*)>; + void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*, void*), + void(const std::size_t, + const std::size_t, + const std::size_t, + const std::vector<std::size_t>&, + const std::vector<std::size_t>&, + const std::vector<std::size_t>&, + const void*, + void*, + void*) +>; // Implementation entry point registration to Operator REGISTRAR(Add_Op, "cpu", Aidge::AddImpl_cpu::create); diff --git a/include/aidge/backend/cpu/operator/AddImpl_kernels.hpp b/include/aidge/backend/cpu/operator/AddImpl_kernels.hpp index e6d13fcf3699824a8410015d35ff766adf617c11..4be47849db2fd5ee4e21d59a4f1199f13f60b3a9 100644 --- a/include/aidge/backend/cpu/operator/AddImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/AddImpl_kernels.hpp @@ -147,25 +147,71 @@ void AddImpl_cpu_forward_kernel(std::vector<std::size_t> dims0, } } +template <class I, class O> +void AddImpl_cpu_backward_kernel(const std::size_t input0Length, + const std::size_t input1Length, + const std::size_t gradOutputLength, + const std::vector<std::size_t>& dims0, + const std::vector<std::size_t>& dims1, + const std::vector<std::size_t>& outputDims, + const void* grad_output_, + void* gradientInput0_, + void* gradientInput1_) +{ + // TODO: Remove input0/1 from the function + const O* gradOutput = static_cast<const O*>(grad_output_); + auto* gradInput0 = static_cast<I*>(gradientInput0_); + auto* gradInput1 = static_cast<I*>(gradientInput1_); + + std::fill_n(gradInput0, input0Length, static_cast<I>(0)); + std::fill_n(gradInput1, input1Length, static_cast<I>(0)); + + auto broadcastedDims0 = getBroadcastedDims(outputDims, dims0); + auto broadcastedDims1 = getBroadcastedDims(outputDims, dims1); + + for (std::size_t i = 0; i < gradOutputLength; ++i) { + auto idxOutputGrad = getMultiDimIndices(outputDims, i); + std::vector<std::size_t> idxInput0(broadcastedDims0.size()); + std::vector<std::size_t> idxInput1(broadcastedDims1.size()); + + for (std::size_t dimension = 0; dimension < broadcastedDims0.size(); ++dimension) { + idxInput0[dimension] = (broadcastedDims0[dimension] == 1) ? 0 : idxOutputGrad[dimension]; + } + + for (std::size_t dimension = 0; dimension < broadcastedDims1.size(); ++dimension) { + idxInput1[dimension] = (broadcastedDims1[dimension] == 1) ? 
0 : idxOutputGrad[dimension]; + } + + auto idx0 = getFlattenedIndex(broadcastedDims0, idxInput0); + auto idx1 = getFlattenedIndex(broadcastedDims1, idxInput1); + + // For addition: gradient of both inputs is just the output gradient + // (unlike multiplication where we need to multiply by the other input, + // or subtraction where we need to negate one of them) + gradInput0[idx0] += static_cast<I>(gradOutput[i]); + gradInput1[idx1] += static_cast<I>(gradOutput[i]); + } +} + // Kernels registration to implementation entry point REGISTRAR(AddImpl_cpu, {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float32}}, - {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<float, float>, nullptr}); + {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<float, float>, Aidge::AddImpl_cpu_backward_kernel<float, float>}); REGISTRAR(AddImpl_cpu, {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float64}}, - {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<double, double>, nullptr}); + {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<double, double>, Aidge::AddImpl_cpu_backward_kernel<double, double>}); REGISTRAR(AddImpl_cpu, {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int8}}, - {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::int8_t, std::int8_t>, nullptr}); + {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::int8_t, std::int8_t>, Aidge::AddImpl_cpu_backward_kernel<std::int8_t, std::int8_t>}); REGISTRAR(AddImpl_cpu, {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::UInt8}}, - {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::uint8_t, std::uint8_t>, nullptr}); + {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::uint8_t, std::uint8_t>, Aidge::AddImpl_cpu_backward_kernel<std::uint8_t, std::uint8_t>}); REGISTRAR(AddImpl_cpu, {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int32}}, - {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr}); + {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, Aidge::AddImpl_cpu_backward_kernel<std::int32_t, std::int32_t>}); REGISTRAR(AddImpl_cpu, {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int64}}, - {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, nullptr}); + {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, Aidge::AddImpl_cpu_backward_kernel<std::int64_t, std::int64_t>}); } // namespace Aidge -#endif /* AIDGE_CPU_OPERATOR_ADDIMPL_CPU_KERNELS_H_ */ \ No newline at end of file +#endif /* AIDGE_CPU_OPERATOR_ADDIMPL_CPU_KERNELS_H_ */ diff --git a/include/aidge/backend/cpu/operator/AndImpl_kernels.hpp b/include/aidge/backend/cpu/operator/AndImpl_kernels.hpp index 73b710e021ac5031923eb1e9a2492502c02a3633..d7c8ebcf19f64cb60aa2b62f312f4b46351e6ec2 100644 --- a/include/aidge/backend/cpu/operator/AndImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/AndImpl_kernels.hpp @@ -20,7 +20,7 @@ namespace Aidge { namespace { // suppose values are contiguous in memory template <class I, class O> -void equal_contiguous_arrays(const std::size_t input1size, +void and_contiguous_arrays(const std::size_t input1size, const std::size_t input2size, const std::size_t output1size, const I* input1, @@ -31,14 +31,14 @@ void equal_contiguous_arrays(const std::size_t input1size, { const std::size_t in1_id = (input1size != 1) ? 
i : 0; const std::size_t in2_id = (input2size != 1) ? i : 0; - output[i] = static_cast<O>(input1[in1_id] == input2[in2_id]); + output[i] = static_cast<O>(input1[in1_id] && input2[in2_id]); } } } template <class I, class O> -void EqualImpl_cpu_forward_kernel(std::vector<std::size_t> dims0, +void AndImpl_cpu_forward_kernel(std::vector<std::size_t> dims0, std::vector<std::size_t> dims1, const std::vector<std::size_t>& outputDims, const void* input0_, @@ -60,9 +60,8 @@ void EqualImpl_cpu_forward_kernel(std::vector<std::size_t> dims0, // special case for equal dimensions, the kernel is called with the entire arrays at once if (dims0 == dims1) { const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); - for (std::size_t i = 0; i < input0_contiguous_size; ++i) - { - output[i] = static_cast<O>(input_0[i] == input_1[i]); + for (std::size_t i = 0; i < input0_contiguous_size; ++i) { + output[i] = static_cast<O>(input_0[i] && input_1[i]); } return; } @@ -126,7 +125,7 @@ void EqualImpl_cpu_forward_kernel(std::vector<std::size_t> dims0, std::size_t dim = contiguousIdx - 1; const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>()); for (std::size_t stack = 0; stack < nbStacks;) { - equal_contiguous_arrays<I,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size, + and_contiguous_arrays<I,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size, input_0 + offsetIn0*input0_contiguous_size, input_1 + offsetIn1*input1_contiguous_size, output + offsetOut*output_contiguous_size); @@ -146,17 +145,17 @@ void EqualImpl_cpu_forward_kernel(std::vector<std::size_t> dims0, // Kernels registration to implementation entry point REGISTRAR(AndImpl_cpu, - {DataType::Float32}, - {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<float, float>, nullptr}); + {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float32}}, + {ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<float, float>, nullptr}); REGISTRAR(AndImpl_cpu, - {DataType::Float64}, - {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<double, double>, nullptr}); + {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float64}}, + {ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<double, double>, nullptr}); REGISTRAR(AndImpl_cpu, - {DataType::Int32}, - {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr}); + {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int32}}, + {ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr}); REGISTRAR(AndImpl_cpu, - {DataType::Int64}, - {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, nullptr}); + {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int64}}, + {ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, nullptr}); } // namespace Aidge diff --git a/include/aidge/backend/cpu/operator/AtanImpl_kernels.hpp b/include/aidge/backend/cpu/operator/AtanImpl_kernels.hpp index 2a786339503354514416705b61cfedfcc0b7c321..e82f34fcbd7c5dd2993e05184b76143c66976436 100644 --- a/include/aidge/backend/cpu/operator/AtanImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/AtanImpl_kernels.hpp @@ -20,20 +20,20 @@ namespace Aidge { template <class I, class O> -void 
AtanImpl_cpu_forward_kernel(std::size_t inputLenght, +void AtanImpl_cpu_forward_kernel(std::size_t inputLength, const void* input_, void* output_) { const I* input = static_cast<const I*>(input_); O* output = static_cast<O*>(output_); - for (size_t i = 0; i < inputLenght; ++i) { + for (size_t i = 0; i < inputLength; ++i) { output[i] = static_cast<O>(atan(input[i])); } } template <class O, class GI, class GO> -void AtanImpl_cpu_backward_kernel(const std::size_t inputLenght, +void AtanImpl_cpu_backward_kernel(const std::size_t inputLength, const void* output_, const void* grad_output_, void* grad_input_) { const O* output = static_cast<const O*>(output_); @@ -41,9 +41,9 @@ void AtanImpl_cpu_backward_kernel(const std::size_t inputLenght, GI* grad_input = static_cast<GI*>(grad_input_); // Apply the derivative of atan for each element in the input array - for (size_t i = 0; i < inputLenght; ++i) { + for (size_t i = 0; i < inputLength; ++i) { // dx = dy * (1 / (1 + x^2)) - grad_input[i] = grad_output[i] * static_cast<O>(1.0 / (1.0 + output[i] * output[i])); + grad_input[i] += grad_output[i] * static_cast<O>(1.0 / (1.0 + output[i] * output[i])); } } diff --git a/include/aidge/backend/cpu/operator/AvgPoolingImpl.hpp b/include/aidge/backend/cpu/operator/AvgPoolingImpl.hpp index adea96ca43a1ad9d2a49777426913ca4676e4f32..7c76657f7d100255f49ab1e675672407c5fbbaf8 100644 --- a/include/aidge/backend/cpu/operator/AvgPoolingImpl.hpp +++ b/include/aidge/backend/cpu/operator/AvgPoolingImpl.hpp @@ -28,8 +28,10 @@ namespace Aidge { using AvgPooling2D_Op = AvgPooling_Op<2>; using AvgPoolingImpl2D_cpu = OperatorImpl_cpu<AvgPooling_Op<2>, void(const std::array<DimSize_t, 2>&, + const std::array<DimSize_t, 2>&, const std::array<DimSize_t, 2>&, const std::array<DimSize_t, 4>&, + bool, const void *, void *)>; diff --git a/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp index f6da9dcb026101b93de862499d42ae8734532d52..f9cc13b5b0be6e63aa2ac7da8d3eccbaf7c9cd2e 100644 --- a/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp @@ -23,6 +23,22 @@ #include "aidge/utils/Types.h" namespace Aidge { + +template <typename T> +using Acc_T = typename std::conditional<std::is_floating_point<T>::value, T, double>::type; + +template <typename T> +typename std::enable_if<std::is_floating_point<T>::value, T>::type +castFromFloat(T value) { + return value; +} + +template <typename T> +typename std::enable_if<!std::is_floating_point<T>::value, T>::type +castFromFloat(double value) { + return static_cast<T>(std::nearbyint(value)); +} + /** * @brief Forward kernel for 2D AvgPoolingolution on CPU backend. * @tparam I Input data type. @@ -35,66 +51,71 @@ namespace Aidge { template <class I, class O> void AvgPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims, const std::array<DimSize_t, 2>& kernelDims, + const std::array<DimSize_t, 2>& dilations, const std::array<DimSize_t, 4> &dims, + bool ceilMode, const void *input_, void *output_) { - // FIXME: missing convolution attributes as arguments const I *input = static_cast<const I *>(input_); O *output = static_cast<O *>(output_); - // output H size - const std::size_t oxSize = - static_cast<std::size_t>(std::floor(static_cast<float>(dims[2] - kernelDims[0] + strideDims[0]) / - static_cast<float>(strideDims[0]))); + const std::size_t oxSize = + ceilMode + ? 
static_cast<std::size_t>(std::ceil(static_cast<float>(dims[2] - (kernelDims[0] - 1) * dilations[0] - 1 + strideDims[0]) / + static_cast<float>(strideDims[0]))) + : static_cast<std::size_t>(std::floor(static_cast<float>(dims[2] - (kernelDims[0] - 1) * dilations[0] - 1 + strideDims[0]) / + static_cast<float>(strideDims[0]))); // output W size - const std::size_t oySize = - static_cast<std::size_t>(std::floor(static_cast<float>(dims[3] - kernelDims[1] + strideDims[1]) / - static_cast<float>(strideDims[1]))); - - // TODO: kernel computation - // output (batch, outCh, Xout, Yout) - // input (batch, ch, Xin, Yin) - // weight (outCh, ch, kernelX, kernelY) - // does not take Dilation attribute into account + const std::size_t oySize = + ceilMode + ? static_cast<std::size_t>(std::ceil(static_cast<float>(dims[3] - (kernelDims[1] - 1) * dilations[1] - 1 + strideDims[1]) / + static_cast<float>(strideDims[1]))) + : static_cast<std::size_t>(std::floor(static_cast<float>(dims[3] - (kernelDims[1] - 1) * dilations[1] - 1 + strideDims[1]) / + static_cast<float>(strideDims[1]))); + using signedsize = std::make_signed<std::size_t>::type; - for (std::size_t batch = 0; batch < dims[0]; ++batch) { - for (std::size_t ch = 0; ch < dims[1]; ++ch) { - const std::size_t oIndex = (ch + batch*dims[1]) * oxSize * oySize; - const std::size_t iIndex = (ch + batch*dims[1]) * dims[2] * dims[3]; - std::fill(output + oIndex, output+(oIndex+oxSize*oySize), 0); + +#ifdef _OPENMP + #pragma omp parallel for collapse(2) if (dims[0] * dims[1] >= 16) +#endif + for (int batch = 0; batch < static_cast<int>(dims[0]); ++batch) { + for (int ch = 0; ch < static_cast<int>(dims[1]); ++ch) { + const std::size_t oIndex = (ch + batch * dims[1]) * oxSize * oySize; + const std::size_t iIndex = (ch + batch * dims[1]) * dims[2] * dims[3]; + for (std::size_t ox = 0; ox < oxSize; ++ox) { - const signedsize difx = static_cast<signedsize>(- ox * strideDims[0]); + const signedsize difx = static_cast<signedsize>(-ox * strideDims[0]); const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0))); const std::size_t sxMax = (static_cast<signedsize>(dims[2]) + difx) < 0 ? 0 : ((dims[2] + difx) > kernelDims[0] ? kernelDims[0] : dims[2] + difx); + for (std::size_t oy = 0; oy < oySize; ++oy) { - const signedsize dify = static_cast<signedsize>(- oy * strideDims[1]); + const signedsize dify = static_cast<signedsize>(-oy * strideDims[1]); const std::size_t syMin = static_cast<std::size_t>(std::max(dify, signedsize(0))); const std::size_t syMax = (static_cast<signedsize>(dims[3]) + dify) < 0 ? 0 : ((dims[3] + dify) > kernelDims[1] ? 
kernelDims[1] : dims[3] + dify); - const std::size_t oIndexFull = oIndex + ox*oySize + oy; + + const std::size_t oIndexFull = oIndex + ox * oySize + oy; const std::size_t ix = ox * strideDims[0]; const std::size_t iy = oy * strideDims[1]; - if (sxMin == 0 && syMin == 0 && sxMax == 3 && syMax == 3) { - output[oIndexFull] += static_cast<O>( - input[iIndex + (ix+0)*dims[3] + (iy+0)] + - input[iIndex + (ix+0)*dims[3] + (iy+1)] + - input[iIndex + (ix+0)*dims[3] + (iy+2)] + - input[iIndex + (ix+1)*dims[3] + (iy+0)] + - input[iIndex + (ix+1)*dims[3] + (iy+1)] + - input[iIndex + (ix+1)*dims[3] + (iy+2)] + - input[iIndex + (ix+2)*dims[3] + (iy+0)] + - input[iIndex + (ix+2)*dims[3] + (iy+1)] + - input[iIndex + (ix+2)*dims[3] + (iy+2)]) / O(9); - } else { - for (std::size_t sx = sxMin; sx < sxMax; ++sx) { - for (std::size_t sy = syMin; sy < syMax; ++sy) { - output[oIndexFull] += input[iIndex + (ix+sx)*dims[3] + (iy+sy)]; + Acc_T<I> sum = static_cast<Acc_T<I>>(0); + std::size_t count = 0; + + for (unsigned int sy = syMin; sy < syMax; ++sy) { + for (unsigned int sx = sxMin; sx < sxMax; ++sx) { + // Apply dilation factor + const std::size_t dilated_sx = sx * dilations[0]; + const std::size_t dilated_sy = sy * dilations[1]; + + // Ensure within bounds + if ((ix + dilated_sx) < dims[2] && (iy + dilated_sy) < dims[3]) { + sum += static_cast<Acc_T<I>>(input[iIndex + (ix + dilated_sx) * dims[3] + (iy + dilated_sy)]); + ++count; } } - // padding not used - output[oIndexFull] /= (sxMax - sxMin) * (syMax - syMin); } + + output[oIndexFull] = count > 0 ? castFromFloat<O>(sum / count) : 0; } } } diff --git a/include/aidge/backend/cpu/operator/BatchNormImpl_kernels.hpp b/include/aidge/backend/cpu/operator/BatchNormImpl_kernels.hpp index cf97f7372ac528ef28d0f378beb2650af32bfa30..d1d7d529756c1bbad2880579a5dac57ebd9e07c7 100644 --- a/include/aidge/backend/cpu/operator/BatchNormImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/BatchNormImpl_kernels.hpp @@ -53,8 +53,11 @@ void BatchNormImpl2D_cpu_forward_kernel(float epsilon, float momentum, const std const DimSize_t featureMapSize = (dims.size() > 2) ? 
std::accumulate(dims.begin() + 2, dims.end(), 1, std::multiplies<DimSize_t>()) : 1; if ((freeze == true) || (momentum == 0.0f)) { - for (std::size_t batch = 0; batch < nbBatch; ++batch) { - for (std::size_t ch = 0; ch < nbChannels; ++ch) { +#ifdef _OPENMP + #pragma omp parallel for collapse(2) if (nbBatch * nbChannels >= 16) +#endif + for (int batch = 0; batch < static_cast<int>(nbBatch); ++batch) { + for (int ch = 0; ch < static_cast<int>(nbChannels); ++ch) { const std::size_t ioIndex = (ch + batch*nbChannels) * featureMapSize; std::fill(output + ioIndex, output + ioIndex + featureMapSize, shift[ch]); const P var = std::sqrt(batchVar[ch] + static_cast<P>(epsilon)); diff --git a/include/aidge/backend/cpu/operator/BitShiftImpl.hpp b/include/aidge/backend/cpu/operator/BitShiftImpl.hpp index 807d2b972ba385f9382d4121173a75207600d098..79b0c5a3cc62c3fac7c5529186506d3c86cd9f3f 100644 --- a/include/aidge/backend/cpu/operator/BitShiftImpl.hpp +++ b/include/aidge/backend/cpu/operator/BitShiftImpl.hpp @@ -24,6 +24,7 @@ namespace Aidge { // Operator implementation entry point for the backend using BitShiftImpl_cpu = OperatorImpl_cpu<BitShift_Op, void(const BitShift_Op::BitShiftDirection, + const bool, std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, diff --git a/include/aidge/backend/cpu/operator/BitShiftImpl_kernels.hpp b/include/aidge/backend/cpu/operator/BitShiftImpl_kernels.hpp index 1f2561afe0be9997116cbd82f754c485a1760090..89921d36526ea7c95a2e06edb33013dd31225ada 100644 --- a/include/aidge/backend/cpu/operator/BitShiftImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/BitShiftImpl_kernels.hpp @@ -27,6 +27,7 @@ namespace { template <class I1, class I2, class O> void bitshift_contiguous_arrays( const Aidge::BitShift_Op::BitShiftDirection direction, + const bool rounding, const std::size_t input1size, const std::size_t input2size, const std::size_t output1size, @@ -34,13 +35,18 @@ void bitshift_contiguous_arrays( const I2* input_2, O* output) { - if(direction == Aidge::BitShift_Op::BitShiftDirection::right) { + if (direction == Aidge::BitShift_Op::BitShiftDirection::right) { for (std::size_t i = 0; i < output1size; ++i) { const std::size_t idx1 = (input1size != 1) ? i : 0; const std::size_t idx2 = (input2size != 1) ? i : 0; - output[i]= input_1[idx1] >> input_2[idx2]; + const int shift = input_2[idx2]; + + if (rounding && shift > 0) { + output[i] = ((input_1[idx1] >> (shift - 1)) + 1) >> 1; + } else { + output[i] = input_1[idx1] >> shift; + } } - } else { for (std::size_t i = 0; i < output1size; ++i) { const std::size_t idx1 = (input1size != 1) ? 
i : 0; @@ -55,6 +61,7 @@ namespace Aidge { template <class I1, class I2, class O> void BitShiftImpl_cpu_forward_kernel( const BitShift_Op::BitShiftDirection direction, + const bool rounding, std::vector<std::size_t> dims0, std::vector<std::size_t> dims1, const std::vector<std::size_t>& outputDims, @@ -79,7 +86,7 @@ void BitShiftImpl_cpu_forward_kernel( // special case for equal dimensions, the kernel is called with the entire arrays at once if (dims0 == dims1) { const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); - bitshift_contiguous_arrays(direction, input0_contiguous_size, input0_contiguous_size, input0_contiguous_size, input_0, input_1, output); + bitshift_contiguous_arrays(direction, rounding, input0_contiguous_size, input0_contiguous_size, input0_contiguous_size, input_0, input_1, output); return; } @@ -142,7 +149,7 @@ void BitShiftImpl_cpu_forward_kernel( std::size_t dim = contiguousIdx - 1; const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>()); for (std::size_t stack = 0; stack < nbStacks;) { - bitshift_contiguous_arrays<I1,I2,O>(direction, input0_contiguous_size, input1_contiguous_size, output_contiguous_size, + bitshift_contiguous_arrays<I1,I2,O>(direction,rounding,input0_contiguous_size, input1_contiguous_size, output_contiguous_size, input_0 + offsetIn0*input0_contiguous_size, input_1 + offsetIn1*input1_contiguous_size, output + offsetOut*output_contiguous_size); diff --git a/include/aidge/backend/cpu/operator/ClipImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ClipImpl_kernels.hpp index 1afac4698be2a63790ebac671ecc1e59166c5f94..65bf5094debe887d2ef7018fbf4880916d4d48d1 100644 --- a/include/aidge/backend/cpu/operator/ClipImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/ClipImpl_kernels.hpp @@ -23,13 +23,14 @@ void ClipImpl_cpu_forward_kernel( float max_, const void* input_, const std::size_t length, - void* output_) + void* output_) { const I* input = static_cast<const I*>(input_); O* output = static_cast<O*>(output_); - + I minCasted = static_cast<I>(min_); + I maxCasted = static_cast<I>(max_); for (std::size_t i = 0; i < length; ++i) { - output[i] = std::min(std::max(static_cast<float>(input[i]), min_), max_); + output[i] = std::min(std::max(input[i], minCasted), maxCasted); } } @@ -38,16 +39,16 @@ void ClipImpl_cpu_backward_kernel( float min_, float max_, const std::size_t length, - const void* input_, + const void* input_, const void* grad_output_, - void* grad_input_) + void* grad_input_) { const I* input = static_cast<const I*>(input_); const GO* grad_output = static_cast<const GO*>(grad_output_); GI* grad_input = static_cast<GI*>(grad_input_); for (std::size_t i = 0; i < length; ++i) { - grad_input[i] = ((input[i] > min_) && (input[i] < max_)) ? grad_output[i] : 0; + grad_input[i] += ((input[i] > min_) && (input[i] < max_)) ? 
grad_output[i] : 0; } } diff --git a/include/aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp b/include/aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp index 83e7e030f526e0db3cff4741eabe39e287130562..b595ec9300c27740408993fc501923600967edff 100644 --- a/include/aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp +++ b/include/aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp @@ -12,23 +12,21 @@ #ifndef AIDGE_CPU_OPERATOR_CONSTANTOFSHAPEIMPL_H_ #define AIDGE_CPU_OPERATOR_CONSTANTOFSHAPEIMPL_H_ -#include <cstddef> #include <memory> -#include <vector> #include "aidge/backend/cpu/operator/OperatorImpl.hpp" #include "aidge/operator/ConstantOfShape.hpp" #include "aidge/utils/Registrar.hpp" -#include "aidge/utils/Types.h" namespace Aidge { + +class Tensor; // Operator implementation entry point for the backend using ConstantOfShapeImpl_cpu = OperatorImpl_cpu<ConstantOfShape_Op, - void(const std::vector<DimSize_t>, const Tensor&, void *)>; + void(const std::shared_ptr<Tensor>&, const Tensor&)>; // Implementation entry point registration to Operator REGISTRAR(ConstantOfShape_Op, "cpu", Aidge::ConstantOfShapeImpl_cpu::create); } // namespace Aidge #endif /* _AIDGE_CPU_OPERATOR_CONSTANTOFSHAPEIMPL_H_ */ - diff --git a/include/aidge/backend/cpu/operator/ConstantOfShapeImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ConstantOfShapeImpl_kernels.hpp index 18ab9c0a77c4545c955fc4fe1f1fc1cbcb763bf7..c42cc76a67dbd25564d3ebefa8580454ce34cf0d 100644 --- a/include/aidge/backend/cpu/operator/ConstantOfShapeImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/ConstantOfShapeImpl_kernels.hpp @@ -30,20 +30,11 @@ namespace Aidge { template <class O> void ConstantOfShapeimpl_cpu_forward_kernel( - const std::vector<DimSize_t> output_dims, const Tensor &value, - void *output_) { + const std::shared_ptr<Tensor>& output_, const Tensor &value) { - O *output = static_cast<O *>(output_); - O val; - std::copy(static_cast<O *>(value.getImpl()->hostPtr()), - static_cast<O *>(value.getImpl()->hostPtr()) + - static_cast<NbElts_t>(1), - &val); - const size_t output_size = std::accumulate( - output_dims.begin(), output_dims.end(), 1, std::multiplies<DimSize_t>()); - for (size_t i = 0; i < output_size; ++i) { - output[i] = val; - } + O* output = static_cast<O*>(output_->getImpl()->hostPtr()); + const O val = *reinterpret_cast<O*>(value.getImpl()->hostPtr()); + std::fill_n(output, output_->size(), val); } // Kernels registration to implementation entry point diff --git a/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp index 906ea1adf744353372c844fd3e16b9dbd13e7f7d..0e2f5a72e4ad1a7e2c8bd239e43914642121965f 100644 --- a/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp @@ -65,8 +65,11 @@ void ConvDepthWiseImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 1>& stri // weight (outCh, ch, kernelX, kernelY) // does not take Dilation attribute into account using signedsize = std::make_signed<std::size_t>::type; - for (std::size_t batch = 0; batch < inputDims[0]; ++batch) { - for (std::size_t ch = 0; ch < inputDims[1]; ++ch) { +#ifdef _OPENMP + #pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] >= 16) +#endif + for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) { + for (int ch = 0; ch < static_cast<int>(inputDims[1]); ++ch) { const std::size_t oIndex = (ch + batch*inputDims[1]) * oxSize; B biasVal = (biases != nullptr) 
? biases[ch] : B(0); std::fill(output + oIndex, output+(oIndex+oxSize), biasVal); @@ -152,16 +155,19 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri const std::size_t outChannels_s = oxSize * oySize; if (dilated_kernel_x ==3 && dilated_kernel_y == 3) { - for (std::size_t batch = 0; batch < inputDims[0]; ++batch) { - for (std::size_t ch = 0; ch < inputDims[1]; ++ch) { - +#ifdef _OPENMP + #pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] >= 16) +#endif + for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) { + for (int ch = 0; ch < static_cast<int>(inputDims[1]); ++ch) { B biasVal = (biases != nullptr) ? biases[ch] : B(0); + std::size_t oIndex = (ch + batch*inputDims[1]) * outChannels_s; std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3]; const std::size_t wIndex = ch * 9; if (strideDims[0] == 1 && strideDims[1]==1) { - for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex-=inputDims[3]) { + for (std::size_t ox = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex-=inputDims[3]) { for (std::size_t oy = 0; oy < oySize; ++oy) { output[oIndex + oy] = biasVal + weights[wIndex+0]*input[iIndex+oy]+weights[wIndex+1]*input[iIndex+oy+1]+weights[wIndex+2]*input[iIndex+oy+2]; } @@ -175,7 +181,7 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri } } } else { - for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex+=(strideDims[0]-2)*inputDims[3]) { + for (std::size_t ox = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex+=(strideDims[0]-2)*inputDims[3]) { for (std::size_t oy = 0; oy < oySize; ++oy) { output[oIndex + oy] = biasVal + weights[wIndex+0]*input[iIndex+oy*strideDims[1]]+weights[wIndex+1]*input[iIndex+oy*strideDims[1]+1]+weights[wIndex+2]*input[iIndex+oy*strideDims[1]+2]; } @@ -189,24 +195,25 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri } } } - output += outChannels_s; } } } else if (dilated_kernel_x == 1 && dilated_kernel_y == 1) { - for (std::size_t batch = 0; batch < inputDims[0]; ++batch) { - for (std::size_t ch = 0; ch < inputDims[1]; ++ch) { - +#ifdef _OPENMP + #pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] >= 16) +#endif + for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) { + for (int ch = 0; ch < static_cast<int>(inputDims[1]); ++ch) { B biasVal = (biases != nullptr) ? biases[ch] : B(0); + std::size_t oIndex = (ch + batch*inputDims[1]) * outChannels_s; std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3]; const std::size_t wIndex = ch; if (strideDims[0] == 1 && strideDims[1] == 1) { - for (std::size_t i = iIndex; i < iIndex + oxSize*oySize; ++i) { - output[i] = biasVal + weights[wIndex] * input[i]; + for (std::size_t i = 0; i < oxSize*oySize; ++i) { + output[oIndex + i] = biasVal + weights[wIndex] * input[iIndex + i]; } } else { - std::size_t oIndex = (ch + batch*inputDims[1]) * oxSize * oySize; for (std::size_t ox = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex+=strideDims[0]*inputDims[3]) { for (std::size_t oy = 0, iy = 0; oy < oySize; ++oy, iy+=strideDims[1]) { output[oIndex + oy] = biasVal + weights[wIndex]*input[iIndex+iy]; @@ -216,19 +223,22 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri } } } else { - for (std::size_t batch = 0; batch < inputDims[0]; ++batch) { - for (std::size_t ch = 0; ch < inputDims[1]; ++ch) { - - B biasVal = (biases != nullptr) ? 
biases[ch] : B(0); - std::fill(output, output+outChannels_s, biasVal); - +#ifdef _OPENMP + #pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] >= 16) +#endif + for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) { + for (int ch = 0; ch < static_cast<int>(inputDims[1]); ++ch) { + const std::size_t oIndex = (ch + batch*inputDims[1]) * outChannels_s; const std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3]; const std::size_t wIndex = ch * kernelDims[0] * kernelDims[1]; + B biasVal = (biases != nullptr) ? biases[ch] : B(0); + std::fill(output + oIndex, output + oIndex + outChannels_s, biasVal); + for (std::size_t ox = 0; ox < oxSize; ++ox) { for (std::size_t oy = 0; oy < oySize; ++oy) { - const std::size_t oIndexFull = ox*oySize + oy; + const std::size_t oIndexFull = oIndex + ox*oySize + oy; const std::size_t ix = ox * strideDims[0]; const std::size_t iy = oy * strideDims[1]; @@ -240,7 +250,6 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri } } } - output += outChannels_s; } } } diff --git a/include/aidge/backend/cpu/operator/ConvImpl.hpp b/include/aidge/backend/cpu/operator/ConvImpl.hpp index c06d0912f419909013f930867ce3c3238c1a5555..e480697b6452440f043901140a07cb643f3cbdb6 100644 --- a/include/aidge/backend/cpu/operator/ConvImpl.hpp +++ b/include/aidge/backend/cpu/operator/ConvImpl.hpp @@ -13,45 +13,64 @@ #define AIDGE_CPU_OPERATOR_CONVIMPL_H_ #include <array> -#include <memory> -#include <tuple> -#include <vector> #include "aidge/backend/cpu/operator/OperatorImpl.hpp" #include "aidge/operator/Conv.hpp" #include "aidge/utils/Registrar.hpp" #include "aidge/utils/Types.h" -#include "aidge/backend/cpu/data/GetCPUPtr.h" namespace Aidge { + // Operator implementation entry point for the backend using Conv1D_Op = Conv_Op<1>; using ConvImpl1D_cpu = OperatorImpl_cpu<Conv_Op<1>, - void(const std::array<DimSize_t, 1>&, - const std::array<DimSize_t, 1>&, - const std::array<DimSize_t, 1>&, - const std::array<DimSize_t, 3> &, - DimSize_t, - const void *, - const void *, - const void *, - void *)>; + void(const std::array<DimSize_t, 1> &, + const std::array<DimSize_t, 1> &, + const std::array<DimSize_t, 1> &, + const std::array<DimSize_t, 3> &, + DimSize_t, + const void *, + const void *, + const void *, + void *), + void(const std::array<DimSize_t, 1> &, + const std::array<DimSize_t, 1> &, + const std::array<DimSize_t, 1> &, + const std::array<DimSize_t, 3> &, + const std::array<DimSize_t, 3> &, + const void *, + const void *, + const void *, + void *, + void *, + void *)>; using Conv2D_Op = Conv_Op<2>; -using ConvImpl2D_cpu = OperatorImpl_cpu<Conv_Op<2>, - void(const std::array<DimSize_t, 2>&, - const std::array<DimSize_t, 2>&, - const std::array<DimSize_t, 2>&, - const std::array<DimSize_t, 4> &, - DimSize_t, - const void *, - const void *, - const void *, - void *)>; +using ConvImpl2D_cpu = OperatorImpl_cpu<Conv2D_Op, + void(const std::array<DimSize_t, 2> &, + const std::array<DimSize_t, 2> &, + const std::array<DimSize_t, 2> &, + const std::array<DimSize_t, 4> &, + DimSize_t, + const void *, + const void *, + const void *, + void *), + void(const std::array<DimSize_t, 2> &, + const std::array<DimSize_t, 2> &, + const std::array<DimSize_t, 2> &, + const std::array<DimSize_t, 4> &, + const std::array<DimSize_t, 4> &, + const void *, + const void *, + const void *, + void *, + void *, + void *)>; // Implementation entry point registration to Operator REGISTRAR(Conv1D_Op, "cpu", Aidge::ConvImpl1D_cpu::create); 
REGISTRAR(Conv2D_Op, "cpu", Aidge::ConvImpl2D_cpu::create); -} // namespace Aidge +} // namespace Aidge #endif /* AIDGE_CPU_OPERATOR_CONVIMPL_H_ */ diff --git a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp index 1229d5714e6b0cbae4e42ece9130c2c2305f133e..d2b942f6b6f72235f5d079c0fbb402b1b4ed1373 100644 --- a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp @@ -13,18 +13,15 @@ #define AIDGE_CPU_OPERATOR_CONVIMPL_KERNELS_H_ #include <array> -#include <memory> -#include <tuple> -#include <vector> +#include <cstdint> -#include "aidge/backend/cpu/operator/OperatorImpl.hpp" #include "aidge/backend/cpu/operator/ConvImpl.hpp" -#include "aidge/operator/Conv.hpp" #include "aidge/utils/Registrar.hpp" #include "aidge/utils/Types.h" -#include "aidge/backend/cpu/data/GetCPUPtr.h" namespace Aidge { +using std::array; + /** * @brief Forward kernel for 1D Convolution on CPU backend. * @tparam I Input data type. @@ -39,16 +36,15 @@ namespace Aidge { * @param output_ Output Tensor. */ template <class I, class W, class B, class O> -void ConvImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 1>& strideDims, - const std::array<DimSize_t, 1>& dilationDims, - const std::array<DimSize_t, 1>& kernelDims, - const std::array<DimSize_t, 3>& inputDims, - DimSize_t outChannels, - const void *input_, - const void *weights_, - const void *biases_, - void *output_) -{ +void ConvImpl1D_cpu_forward_kernel(const array<DimSize_t, 1> &strideDim, + const array<DimSize_t, 1> &dilationDim, + const array<DimSize_t, 1> &kernelDim, + const std::array<DimSize_t, 3> &inputDims, + DimSize_t outChannels, + const void *input_, + const void *weights_, + const void *biases_, + void *output_) { // FIXME: missing convolution attributes as arguments const I *input = static_cast<const I *>(input_); const W *weights = static_cast<const W *>(weights_); @@ -56,38 +52,192 @@ void ConvImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 1>& strideDims, O *output = static_cast<O *>(output_); // output H size - const std::size_t oxSize = - static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - dilationDims[0]*(kernelDims[0] - 1) - 1 + strideDims[0]) / - static_cast<float>(strideDims[0]))); - const DimSize_t dilated_kernel_x = dilationDims[0]*(kernelDims[0] - 1) + 1; + const std::size_t oxSize = static_cast<std::size_t>(std::floor( + static_cast<float>(inputDims[2] - dilationDim[0] * (kernelDim[0] - 1) - + 1 + strideDim[0]) / + static_cast<float>(strideDim[0]))); + const DimSize_t dilated_kernel_x = dilationDim[0] * (kernelDim[0] - 1) + 1; - // TODO: kernel computation - // output (batch, outCh, Xout, Yout) - // input (batch, inCh, Xin, Yin) - // weight (outCh, inCh, kernelX, kernelY) - // does not take Dilation attribute into account using signedsize = std::make_signed<std::size_t>::type; - for (std::size_t batch = 0; batch < inputDims[0]; ++batch) { - for (std::size_t outCh = 0; outCh < outChannels; ++outCh) { - const std::size_t oIndex = (outCh + batch*outChannels) * oxSize; +#ifdef _OPENMP + #pragma omp parallel for collapse(2) if (inputDims[0] * outChannels >= 16) +#endif + for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) { + for (int outCh = 0; outCh < static_cast<int>(outChannels); ++outCh) { + const std::size_t oIndex = (outCh + batch * outChannels) * oxSize; // If bias = nullptr, set B(0) B biasVal = (biases != nullptr) ? 
biases[outCh] : B(0); - std::fill(output + oIndex, output+(oIndex+oxSize), biasVal); + std::fill(output + oIndex, output + (oIndex + oxSize), biasVal); for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) { - const std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2]; - const std::size_t wIndex = (inCh + outCh*inputDims[1]) * kernelDims[0]; + const std::size_t iIndex = + (inCh + batch * inputDims[1]) * inputDims[2]; + const std::size_t wIndex = + (inCh + outCh * inputDims[1]) * kernelDim[0]; for (std::size_t ox = 0; ox < oxSize; ++ox) { - // const signedsize difx = static_cast<signedsize>(- ox * strideDims[0]); - // const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0))); - // const std::size_t sxMax = (static_cast<signedsize>(inputDims[2]) + difx) < 0 ? 0 : ((inputDims[2] + difx) > kernelDims[0] ? kernelDims[0] : inputDims[2] + difx); const std::size_t sxMin = 0; const std::size_t sxMax = dilated_kernel_x; const std::size_t oIndexFull = oIndex + ox; - const signedsize ix = static_cast<signedsize>(ox * strideDims[0]); + const signedsize ix = + static_cast<signedsize>(ox * strideDim[0]); + + for (std::size_t sx = sxMin; sx * dilationDim[0] < sxMax; + ++sx) { + output[oIndexFull] += + weights[wIndex + sx] * + input[iIndex + static_cast<std::size_t>( + ix + static_cast<signedsize>( + sx * dilationDim[0]))]; + } + } + } + } + } +} + +/** + * @brief perform 1D backpropagation for the data input + * @note INPUT & OUTPUT convention is the same as in the + * forward function + * @note formula : + * for i in 0..input_size: + * for n in 0..weight_size: + * dL dYn dL + * ---- = ---- ---- + * dXi dXi Yn + * with : dYn / dXi = w_k + * for each input value + * for each weight + * for each output + * multiply the weight with the associated value + * @note kernel & stride are passed as single integers as they are just arrays + * of length 1 + * @note reminder that kernel dimensions are + * {outChannels, inChannels, {kernelDims}} + * <=> {oDims[1], iDims[1], kernelDim} + * @tparam I Input data type. + * @tparam W Weight data type. + * @tparam O Output data type. 
+ * @param[in] stride stride parameter of the convolution operator + * @param[in] dilation dilation parameter of the convolution operator + * @param[in] kDims dimension of the kernel + * @param[in] kStrides nb of elements contained per dimension of the kernel + * @param[in] weights kernel weights + * @param[in] oDims dimensions of the output + * @param[in] oStrides nb of elements contained per dimension of the output + * @param[in] oGrad output gradient + * @param[in] iDims input dimensions + * @param[in] iStrides nb of elements contained per dimension of the input + * @param[inout] iGrad gradients of the input to update + */ +template <class I, class W, class O> +void conv1DBackwardInput(const array<DimSize_t, 1> &stride, + const array<DimSize_t, 1> &dilation, + const array<DimSize_t, 1> &kDim, + const array<DimSize_t, 2> &kStrides, + const W *weights, + const array<DimSize_t, 3> &oDims, + const array<DimSize_t, 2> &oStrides, + const O *oGrad, + const array<DimSize_t, 3> &iDims, + const array<DimSize_t, 2> &iStrides, + I *iGrad) { + + array<DimSize_t, 2> iOffsets{0, 0}; + array<DimSize_t, 2> oOffsets{0, 0}; + array<DimSize_t, 2> kOffsets{0, 0}; + + for (std::size_t batch = 0; batch < iDims[0]; ++batch) { + iOffsets[0] = batch * iStrides[0]; + oOffsets[0] = batch * oStrides[0]; + + for (DimSize_t oChannel = 0; oChannel < oDims[1]; oChannel++) { + oOffsets[1] = (oChannel * oStrides[1]) + oOffsets[0]; + kOffsets[0] = oChannel * kStrides[0]; + + for (std::size_t iChannel = 0; iChannel < iDims[1]; ++iChannel) { + iOffsets[1] = (iChannel * iStrides[1]) + iOffsets[0]; + kOffsets[1] = iChannel * kStrides[1] + kOffsets[0]; + + for (DimSize_t oX = 0; oX < oDims[2]; ++oX) { + auto iX = oX * stride[0]; + auto inIdx = iX + iOffsets[1]; + + for (DimSize_t kX = 0; kX < kDim[0]; ++kX) { + auto dilatedKernelIdx = kX * dilation[0]; + + iGrad[inIdx + dilatedKernelIdx] += + weights[kOffsets[1] + kX] * + oGrad[oOffsets[1] + oX]; + } + } + } + } + } +} + +/** + * @brief computes weight backpropagation for conv1D + * @note INPUT & OUTPUT convention is the same as in the + * forward function + * weight grad + * for i in 0..weight_size: + * for n in 0..output_size: + * dL dYn dL + * ---- = ---- ---- + * dwi dwi Yn + * with : dYn / dwi = x_k + * @tparam I Input data type. + * @tparam W Weight data type. + * @tparam O Output data type. 
+ * @param[in] stride stride parameter of the convolution operator + * @param[in] dilation dilation parameter of the convolution operator + * @param[in] iDims input dimensions + * @param[in] iStrides nb of elements contained per dimension of the input + * @param[inout] iGrad gradients of the input to update + * @param[in] oDims dimensions of the output + * @param[in] oStrides nb of elements contained per dimension of the output + * @param[in] oGrad output gradient + * @param[in] kDims dimension of the kernel + * @param[in] kStrides nb of elements contained per dimension of the kernel + * @param[in] weights kernel weights + */ +template <class I, class W, class O> +static void conv1DBackwardWeights(const array<DimSize_t, 1> &stride, + const array<DimSize_t, 1> &dilation, + const array<DimSize_t, 3> &iDims, + const array<DimSize_t, 2> iStrides, + const I *input, + const array<DimSize_t, 3> &oDims, + const array<DimSize_t, 2> oStrides, + const O *oGrad, + const array<DimSize_t, 1> &kDim, + const array<DimSize_t, 2> kStrides, + W *weightsGrad) { + + array<DimSize_t, 2> iOffsets{0, 0}; + array<DimSize_t, 2> oOffsets{0, 0}; + array<DimSize_t, 2> kOffsets{0, 0}; + + for (DimSize_t batch = 0; batch < oDims[0]; ++batch) { + iOffsets[0] = batch * iStrides[0]; + oOffsets[0] = batch * oStrides[0]; + + for (DimSize_t oChannel = 0; oChannel < oDims[1]; ++oChannel) { + oOffsets[1] = oChannel * oStrides[1] + oOffsets[0]; + kOffsets[0] = oChannel * kStrides[0]; + + for (DimSize_t iChannel = 0; iChannel < iDims[1]; ++iChannel) { + kOffsets[1] = iChannel * kStrides[1] + kOffsets[0]; + iOffsets[1] = iChannel * iStrides[1] + iOffsets[0]; + oOffsets[1] = oChannel * oStrides[1] + oOffsets[0]; + + for (DimSize_t kX = 0; kX < kDim[0]; ++kX) { + + for (DimSize_t oX = 0; oX < oDims[2]; ++oX) { + const DimSize_t iX = oX * stride[0] + kX * dilation[0] ; - for (std::size_t sx = sxMin; sx*dilationDims[0] < sxMax; ++sx) { - output[oIndexFull] += weights[wIndex + sx] * - input[iIndex + static_cast<std::size_t>(ix+static_cast<signedsize>(sx*dilationDims[0]))]; + weightsGrad[kOffsets[1] + kX] += + input[iOffsets[1] + iX] * oGrad[oOffsets[1] + oX]; } } } @@ -95,20 +245,191 @@ void ConvImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 1>& strideDims, } } +/** + * @brief computes bias backpropagation for conv1D operation + * @note INPUT & OUTPUT convention is the same as in the + * forward function + * @note formula : + * Bias grad: + * for i in 0..bias_size: + * for n in 0..output_size: + * dL dYn dL + * ---- = ---- ---- + * dbi dbi Yn + * with : dYn / dbi = 1 + * + * Hence the partial derivative of the loss wrt bias is the + * output loss. Hence the bias grad is just the sum of the + * loss values over the batch + * @tparam I Input data type. + * @tparam W Weight data type. + * @tparam B Bias data type. + * @tparam O Output data type. 
+ * @param[in] oDims output tensor dimensions + * @param[in] oStrides nb of elements contained per dimension of the output + * tensor + * @param[in] oGrad output tensor gradients + * @param[inout] biasesGrad biases gradients + */ +template <class B, class O> +static void conv1DBackwardBias(const array<DimSize_t, 3> &oDims, + const array<DimSize_t, 2> &oStrides, + const O *oGrad, + B *biasesGrad) { + array<DimSize_t, 2> oOffsets{0, 0}; + + for (DimSize_t batchIdx = 0; batchIdx < oDims[0]; ++batchIdx) { + oOffsets[0] = batchIdx * oStrides[0]; + + for (DimSize_t oChannel = 0; oChannel < oDims[1]; ++oChannel) { + oOffsets[1] = oChannel * oStrides[1] + oOffsets[0]; + + for (DimSize_t oIdx = 0; oIdx < oDims[2]; oIdx++) { + biasesGrad[oChannel] += oGrad[oOffsets[1] + oIdx]; + } + } + } +} + +/** + * @brief Backward kernel for 1D Convolution on CPU backend. + * @note INPUT & OUTPUT convention is the same as in the + * forward function + * + * @tparam I Input data type. + * @tparam W Weight data type. + * @tparam B Bias data type. + * @tparam O Output data type. + * @param[in] const stride + * @param[in] const kernelDims + * @param[in] const iDims input data dimensions + * @param[in] const oDims output data dimmensions + * @param[in] const oChannels output channel number + * @param[in] const input_ const input Tensor. + * @param[in] const weights_ const weight Tensor. + * @param[in] const biases_ const Biais Tensor. + * @param[in] const output_ Output Tensor. + * @param[in] const oGrad_ gradients of output data + * @param[inout] iGrad_ gradients of input data + * @param[inout] weightsGrad_ gradients of the kernel weights + * @param[inout] biasesGrad_ gradients of the kernel biases + */ +template <class I, class W, class B, class O> +void ConvImpl1D_cpu_backward_kernel(const array<DimSize_t,1> &stride, + const array<DimSize_t,1> &dilation, + const array<DimSize_t,1> &kernelDim, + const array<DimSize_t, 3> &inputDims, + const array<DimSize_t, 3> &outputDims, + const void *input_, + const void *weights_, + const void *oGrad_, + void *iGrad_, + void *weightsGrad_, + void *biasesGrad_) { + + const I *input = static_cast<const I *>(input_); + I *iGrad = static_cast<I *>(iGrad_); + const I *oGrad = static_cast<const I *>(oGrad_); + const W *weights = static_cast<const W *>(weights_); + W *weightsGrad = static_cast<W *>(weightsGrad_); + + ////////////////////////////// + // COMPUTING STRIDES + ////////////////////////////// + // NOTE: The ...Stride var represent the number of values contained in + // each dimension they will be used to compute the index offset of + // values while iterating on each tensor + // NOTE: They are 1 item shorter than their corresponding tensor as the + // number of total elements is not used except for gradient initialization + + // {batch_stride, channel_stride, dim0_stride, dim1_stride} + const array<DimSize_t, 2> inputStrides{inputDims[1] * inputDims[2], + inputDims[2]}; + const DimSize_t nbEltsInput = inputDims[0] * inputStrides[0]; + + // {batch_stride, channel_stride, dim0_stride, dim1_stride} + const array<DimSize_t, 2> outputStrides{outputDims[1] * outputDims[2], + outputDims[2]}; + + // NOTE: kernel dims = {iChannel, oChannel, kernelDim0, kernelDim1} + // kernel_strides = {iChannel, oChannel, kernelDim0} + const array<DimSize_t, 2> kernelStrides{ + inputDims[1] * kernelDim[0], + kernelDim[0], + }; + const DimSize_t nbEltsKernel = outputDims[1] * kernelStrides[0]; + + std::fill(iGrad, iGrad + nbEltsInput, I(0)); + std::fill(weightsGrad, weightsGrad + nbEltsKernel, W(0)); 
+ + conv1DBackwardInput(stride, + dilation, + kernelDim, + kernelStrides, + weights, + outputDims, + outputStrides, + oGrad, + inputDims, + inputStrides, + iGrad); + + conv1DBackwardWeights(stride, + dilation, + inputDims, + inputStrides, + input, + outputDims, + outputStrides, + oGrad, + kernelDim, + kernelStrides, + weightsGrad); + + if (biasesGrad_ != nullptr) { + B *biasesGrad = static_cast<B *>(biasesGrad_); + std::fill(biasesGrad, biasesGrad + outputDims[1], B(0)); + conv1DBackwardBias(outputDims, outputStrides, oGrad, biasesGrad); + } +} + // Kernels registration to implementation entry point REGISTRAR(ConvImpl1D_cpu, - {{DataType::Any, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}}, - {ProdConso::inPlaceModel, Aidge::ConvImpl1D_cpu_forward_kernel<float, float, float, float>, nullptr}); + {{DataType::Any, DataFormat::NCHW}, + {DataType::Float32, DataFormat::NCHW}}, + {ProdConso::inPlaceModel, + ConvImpl1D_cpu_forward_kernel<float, float, float, float>, + ConvImpl1D_cpu_backward_kernel<float, float, float, float>}); REGISTRAR(ConvImpl1D_cpu, - {{DataType::Any, DataFormat::NCHW}, {DataType::Float16, DataFormat::NCHW}}, - {ProdConso::inPlaceModel, Aidge::ConvImpl1D_cpu_forward_kernel<half_float::half, half_float::half, half_float::half, half_float::half>, nullptr}); + {{DataType::Any, DataFormat::NCHW}, + {DataType::Float16, DataFormat::NCHW}}, + {ProdConso::inPlaceModel, + ConvImpl1D_cpu_forward_kernel<half_float::half, + half_float::half, + half_float::half, + half_float::half>, + ConvImpl1D_cpu_backward_kernel<half_float::half, + half_float::half, + half_float::half, + half_float::half>}); REGISTRAR(ConvImpl1D_cpu, - {{DataType::Any, DataFormat::NCHW}, {DataType::Int32, DataFormat::NCHW}}, - {ProdConso::inPlaceModel, Aidge::ConvImpl1D_cpu_forward_kernel<int32_t, int32_t, int32_t, int32_t>, nullptr}); + {{DataType::Any, DataFormat::NCHW}, + {DataType::Float64, DataFormat::NCHW}}, + {ProdConso::inPlaceModel, + ConvImpl1D_cpu_forward_kernel<double, double, double, double>, + ConvImpl1D_cpu_backward_kernel<double, double, double, double>}); REGISTRAR(ConvImpl1D_cpu, - {{DataType::Any, DataFormat::NCHW}, {DataType::Float64, DataFormat::NCHW}}, - {ProdConso::inPlaceModel, Aidge::ConvImpl1D_cpu_forward_kernel<double, double, double, double>, nullptr}); - + {{DataType::Any, DataFormat::NCHW}, + {DataType::Int32, DataFormat::NCHW}}, + {ProdConso::inPlaceModel, + ConvImpl1D_cpu_forward_kernel<std::int32_t, + std::int32_t, + std::int32_t, + std::int32_t>, + ConvImpl1D_cpu_backward_kernel<std::int32_t, + std::int32_t, + std::int32_t, + std::int32_t>}); /** * @brief Forward kernel for 2D Convolution on CPU backend. @@ -124,16 +445,15 @@ REGISTRAR(ConvImpl1D_cpu, * @param output_ Output Tensor. 
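+ * @note The implementation below provides specialised fast paths for
+ * (dilated) 3x3 and 1x1 kernel extents and falls back to a generic loop
+ * otherwise; each path is parallelised over (batch, output channel) pairs
+ * with OpenMP when _OPENMP is defined and the workload is large enough.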
*/ template <class I, class W, class B, class O> -void ConvImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims, - const std::array<DimSize_t, 2>& dilationDims, - const std::array<DimSize_t, 2>& kernelDims, - const std::array<DimSize_t, 4> &inputDims, - DimSize_t outChannels, - const void *input_, - const void *weights_, - const void *biases_, - void *output_) -{ +void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims, + const array<DimSize_t, 2> &dilationDims, + const array<DimSize_t, 2> &kernelDims, + const array<DimSize_t, 4> &inputDims, + DimSize_t outChannels, + const void *input_, + const void *weights_, + const void *biases_, + void *output_) { // FIXME: missing convolution attributes as arguments const I *input = static_cast<const I *>(input_); const W *weights = static_cast<const W *>(weights_); @@ -141,136 +461,575 @@ void ConvImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims, O *output = static_cast<O *>(output_); // output H size - const DimSize_t dilated_kernel_x = dilationDims[0]*(kernelDims[0] - 1) + 1; - const std::size_t oxSize = - static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - dilated_kernel_x + strideDims[0]) / - static_cast<float>(strideDims[0]))); + const DimSize_t dilated_kernel_x = + dilationDims[0] * (kernelDims[0] - 1) + 1; + const std::size_t oxSize = static_cast<std::size_t>(std::floor( + static_cast<float>(inputDims[2] - dilated_kernel_x + strideDims[0]) / + static_cast<float>(strideDims[0]))); // output W size - const DimSize_t dilated_kernel_y = dilationDims[1]*(kernelDims[1] - 1) + 1; - const std::size_t oySize = - static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[3] - dilated_kernel_y + strideDims[1]) / - static_cast<float>(strideDims[1]))); - + const DimSize_t dilated_kernel_y = + dilationDims[1] * (kernelDims[1] - 1) + 1; + const std::size_t oySize = static_cast<std::size_t>(std::floor( + static_cast<float>(inputDims[3] - dilated_kernel_y + strideDims[1]) / + static_cast<float>(strideDims[1]))); // TODO: kernel computation // output (batch, outCh, Xout, Yout) // input (batch, inCh, Xin, Yin) // weight (outCh, inCh, kernelX, kernelY) // does not take Dilation attribute into account - const std::size_t outChannels_s = oxSize * oySize; + const std::size_t outChannels_s = oxSize * oySize; if (dilated_kernel_x == 3 && dilated_kernel_y == 3) { - for (std::size_t batch = 0; batch < inputDims[0]; ++batch) { - for (std::size_t outCh = 0; outCh < outChannels; ++outCh) { +#ifdef _OPENMP + #pragma omp parallel for collapse(2) if (inputDims[0] * outChannels >= 16) +#endif + for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) { + for (int outCh = 0; outCh < static_cast<int>(outChannels); ++outCh) { + std::size_t oIndex = (outCh + batch*outChannels) * outChannels_s; + // If bias = nullptr, set B(0) B biasVal = (biases != nullptr) ? 
biases[outCh] : B(0); - std::fill(output, output+outChannels_s, biasVal); + std::fill(output + oIndex, output + oIndex + outChannels_s, biasVal); for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) { - std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3]; - const std::size_t wIndex = (inCh + outCh*inputDims[1]) * 9; - if (strideDims[0] == 1 && strideDims[1]==1) { - for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex-=inputDims[3]) { + oIndex = (outCh + batch*outChannels) * outChannels_s; + std::size_t iIndex = (inCh + batch * inputDims[1]) * + inputDims[2] * inputDims[3]; + const std::size_t wIndex = + (inCh + outCh * inputDims[1]) * 9; + if (strideDims[0] == 1 && strideDims[1] == 1) { + for (std::size_t ox = 0; ox < oxSize; + ++ox, oIndex += oySize, iIndex -= inputDims[3]) { for (std::size_t oy = 0; oy < oySize; ++oy) { - output[oIndex + oy] += weights[wIndex+0]*input[iIndex+oy]+weights[wIndex+1]*input[iIndex+oy+1]+weights[wIndex+2]*input[iIndex+oy+2]; + output[oIndex + oy] += + weights[wIndex + 0] * input[iIndex + oy] + + weights[wIndex + 1] * + input[iIndex + oy + 1] + + weights[wIndex + 2] * + input[iIndex + oy + 2]; } - iIndex+=inputDims[3]; + iIndex += inputDims[3]; for (std::size_t oy = 0; oy < oySize; ++oy) { - output[oIndex + oy] += weights[wIndex+3]*input[iIndex+oy]+weights[wIndex+4]*input[iIndex+oy+1]+weights[wIndex+5]*input[iIndex+oy+2]; + output[oIndex + oy] += + weights[wIndex + 3] * input[iIndex + oy] + + weights[wIndex + 4] * + input[iIndex + oy + 1] + + weights[wIndex + 5] * + input[iIndex + oy + 2]; } - iIndex+=inputDims[3]; + iIndex += inputDims[3]; for (std::size_t oy = 0; oy < oySize; ++oy) { - output[oIndex + oy] += weights[wIndex+6]*input[iIndex+oy]+weights[wIndex+7]*input[iIndex+oy+1]+weights[wIndex+8]*input[iIndex+oy+2]; + output[oIndex + oy] += + weights[wIndex + 6] * input[iIndex + oy] + + weights[wIndex + 7] * + input[iIndex + oy + 1] + + weights[wIndex + 8] * + input[iIndex + oy + 2]; } } } else { - for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex+=(strideDims[0]-2)*inputDims[3]) { + for (std::size_t ox = 0; ox < oxSize; ++ox, + oIndex += oySize, + iIndex += (strideDims[0] - + 2) * inputDims[3]) { for (std::size_t oy = 0; oy < oySize; ++oy) { - output[oIndex + oy] += weights[wIndex+0]*input[iIndex+oy*strideDims[1]]+weights[wIndex+1]*input[iIndex+oy*strideDims[1]+1]+weights[wIndex+2]*input[iIndex+oy*strideDims[1]+2]; + output[oIndex + oy] += + weights[wIndex + 0] * + input[iIndex + oy * strideDims[1]] + + weights[wIndex + 1] * + input[iIndex + oy * strideDims[1] + + 1] + + weights[wIndex + 2] * + input[iIndex + oy * strideDims[1] + 2]; } - iIndex+=inputDims[3]; + iIndex += inputDims[3]; for (std::size_t oy = 0; oy < oySize; ++oy) { - output[oIndex + oy] += weights[wIndex+3]*input[iIndex+oy*strideDims[1]]+weights[wIndex+4]*input[iIndex+oy*strideDims[1]+1]+weights[wIndex+5]*input[iIndex+oy*strideDims[1]+2]; + output[oIndex + oy] += + weights[wIndex + 3] * + input[iIndex + oy * strideDims[1]] + + weights[wIndex + 4] * + input[iIndex + oy * strideDims[1] + + 1] + + weights[wIndex + 5] * + input[iIndex + oy * strideDims[1] + 2]; } - iIndex+=inputDims[3]; + iIndex += inputDims[3]; for (std::size_t oy = 0; oy < oySize; ++oy) { - output[oIndex + oy] += weights[wIndex+6]*input[iIndex+oy*strideDims[1]]+weights[wIndex+7]*input[iIndex+oy*strideDims[1]+1]+weights[wIndex+8]*input[iIndex+oy*strideDims[1]+2]; + output[oIndex + oy] += + weights[wIndex + 6] * + input[iIndex + oy * 
strideDims[1]] + + weights[wIndex + 7] * + input[iIndex + oy * strideDims[1] + + 1] + + weights[wIndex + 8] * + input[iIndex + oy * strideDims[1] + 2]; } } } } - output += outChannels_s; } } } else if (dilated_kernel_x == 1 && dilated_kernel_y == 1) { - for (std::size_t batch = 0; batch < inputDims[0]; ++batch) { - for (std::size_t outCh = 0; outCh < outChannels; ++outCh) { +#ifdef _OPENMP + #pragma omp parallel for collapse(2) if (inputDims[0] * outChannels >= 16) +#endif + for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) { + for (int outCh = 0; outCh < static_cast<int>(outChannels); ++outCh) { + std::size_t oIndex = (outCh + batch*outChannels) * outChannels_s; + // If bias = nullptr, set B(0) B biasVal = (biases != nullptr) ? biases[outCh] : B(0); - std::fill(output, output+outChannels_s, biasVal); + std::fill(output + oIndex, output + oIndex + outChannels_s, biasVal); for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) { - std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3]; - const std::size_t wIndex = (inCh + outCh*inputDims[1]); + oIndex = (outCh + batch*outChannels) * outChannels_s; + std::size_t iIndex = (inCh + batch * inputDims[1]) * + inputDims[2] * inputDims[3]; + const std::size_t wIndex = (inCh + outCh * inputDims[1]); if (strideDims[0] == 1 && strideDims[1] == 1) { - for (std::size_t oIndex = 0; oIndex < oxSize*oySize; ++oIndex, ++iIndex) { - output[oIndex] += weights[wIndex] * input[iIndex]; + for (std::size_t i = 0; i < outChannels_s; ++i) { + output[oIndex + i] += weights[wIndex] * input[iIndex + i]; } - } else { - for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex+=inputDims[3]*strideDims[0]) { - for (std::size_t oy = 0, iy = 0; oy < oySize; ++oy, iy+=strideDims[1]) { - output[oIndex + oy] += weights[wIndex+0]*input[iIndex+iy]; + } else { + for (std::size_t ox = 0; ox < oxSize; + ++ox, + oIndex += oySize, + iIndex += + inputDims[3] * strideDims[0]) { + for (std::size_t oy = 0, iy = 0; oy < oySize; + ++oy, iy += strideDims[1]) { + output[oIndex + oy] += + weights[wIndex + 0] * input[iIndex + iy]; } } } } - output += outChannels_s; } } } else { - for (std::size_t batch = 0; batch < inputDims[0]; ++batch) { - for (std::size_t outCh = 0; outCh < outChannels; ++outCh) { +#ifdef _OPENMP + #pragma omp parallel for collapse(2) if (inputDims[0] * outChannels >= 16) +#endif + for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) { + for (int outCh = 0; outCh < static_cast<int>(outChannels); ++outCh) { + std::size_t oIndex = (outCh + batch*outChannels) * outChannels_s; + // If bias = nullptr, set B(0) B biasVal = (biases != nullptr) ? 
biases[outCh] : B(0); - std::fill(output, output+outChannels_s, biasVal); + std::fill(output + oIndex, output + oIndex + outChannels_s, biasVal); for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) { - std::size_t iIndex_channel = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3]; - const std::size_t wIndex = (inCh + outCh*inputDims[1]) * kernelDims[0] * kernelDims[1]; + oIndex = (outCh + batch*outChannels) * outChannels_s; + std::size_t iIndex_channel = + (inCh + batch * inputDims[1]) * inputDims[2] * + inputDims[3]; + const std::size_t wIndex = (inCh + outCh * inputDims[1]) * + kernelDims[0] * kernelDims[1]; // loop over each ouput line - for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex_channel+=inputDims[3]*strideDims[0]) { + for (std::size_t ox = 0; ox < oxSize; + ++ox, + oIndex += oySize, + iIndex_channel += + inputDims[3] * strideDims[0]) { // loop over associated input line - for (std::size_t ky = 0, ix = 0; ky < kernelDims[0]; ++ky, ix += inputDims[3]*dilationDims[0]) { + for (std::size_t ky = 0, ix = 0; ky < kernelDims[1]; + ++ky, ix += inputDims[3] * dilationDims[0]) { // loop over the entire line - for (std::size_t oy = 0, iy = 0; oy < oySize; ++oy, iy+=strideDims[1]) { - const std::size_t iIndex = iIndex_channel + ix + iy; - // loop over elements assosicated with one output - for (std::size_t kx = 0; kx < kernelDims[0]; ++kx) { - output[oIndex + oy] += weights[wIndex+kernelDims[0]*ky+kx]*input[iIndex+kx*dilationDims[1]]; + for (std::size_t oy = 0, iy = 0; oy < oySize; + ++oy, iy += strideDims[1]) { + const std::size_t iIndex = + iIndex_channel + ix + iy; + // loop over elements assosicated with one + // output + for (std::size_t kx = 0; kx < kernelDims[0]; + ++kx) { + output[oIndex + oy] += + weights[wIndex + kernelDims[0] * ky + + kx] * + input[iIndex + kx * dilationDims[1]]; } } } } } - output += outChannels_s; } } } } +/** + * @brief perform backpropagation for the input + * @note INPUT & OUTPUT convention is the same as in the + * forward function + * @note formula : + * for i in 0..input_size: + * for n in 0..weight_size: + * dL dYn dL + * ---- = ---- ---- + * dXi dXi Yn + * with : dYn / dXi = w_k + * for each input value + * for each weight + * for each output + * multiply the weight with the associated value + * @note kernel & stride are passed as single integers as they are just arrays + * of length 1 + * @note reminder that kernel dimensions are + * {outChannels, inChannels, {kernelDims}} + * <=> {oDims[1], iDims[1], kernelDim} + * @tparam I Input data type. + * @tparam W Weight data type. + * @tparam O Output data type. 
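+ * @note Concretely, the loops below scatter each output-gradient value:
+ * oGrad(n, oc, oX, oY) adds weights(oc, ic, kX, kY) * oGrad(n, oc, oX, oY) to
+ * iGrad(n, ic, oX * stride[0] + kX * dilation[0],
+ *             oY * stride[1] + kY * dilation[1])
+ * for every input channel ic and kernel position (kX, kY).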
+ * @param[in] stride stride parameter of the convolution operator + * @param[in] dilation dilation parameter of the convolution operator + * @param[in] kDims dimension of the kernel + * @param[in] kStrides nb of elements contained per dimension of the kernel + * @param[in] weights weights values + * @param[in] oDims dimensions of the output + * @param[in] oStrides nb of elements contained per dimension of the output + * @param[in] oGrad output gradient + * @param[in] iDims input dimensions + * @param[in] iStrides nb of elements contained per dimension of the input + * @param[inout] iGrad gradients of the input to update + */ +template <class I, class W, class O> +void conv2DBackwardInput(const array<DimSize_t, 2> &stride, + const array<DimSize_t, 2> &dilation, + const array<DimSize_t, 2> &kDims, + const array<DimSize_t, 3> &kStrides, + const W *weights, + const array<DimSize_t, 4> &oDims, + const array<DimSize_t, 3> &oStrides, + const O *oGrad, + const array<DimSize_t, 4> &iDims, + const array<DimSize_t, 3> &iStrides, + I *iGrad) { + // records index offsets for each dimension that have a stride (== all + // dimension except the last) for every parsed tensor + array<DimSize_t, 3> kOffset{}; + array<DimSize_t, 3> iOffset{}; + array<DimSize_t, 3> oOffset{}; + + for (std::size_t batch = 0; batch < iDims[0]; ++batch) { + iOffset[0] = batch * iStrides[0]; + oOffset[0] = batch * oStrides[0]; + + for (DimSize_t oChannel = 0; oChannel < oDims[1]; oChannel++) { + oOffset[1] = (oChannel * oStrides[1]) + oOffset[0]; + kOffset[0] = (oChannel * kStrides[0]); + + for (std::size_t iChannel = 0; iChannel < iDims[1]; ++iChannel) { + iOffset[1] = (iChannel * iStrides[1]) + iOffset[0]; + kOffset[1] = iChannel * kStrides[1] + kOffset[0]; + + for (DimSize_t oX = 0; oX < oDims[2]; ++oX) { + oOffset[2] = (oX * oStrides[2]) + oOffset[1]; + + auto iX = oX * stride[0]; + iOffset[2] = (iX * iStrides[2]) + iOffset[1]; + + for (DimSize_t oY = 0; oY < oDims[3]; ++oY) { + auto oIdx = oOffset[2] + oY; + + auto iY = oY * stride[1]; + auto iIdx = iOffset[2] + iY; + + for (DimSize_t kX = 0; kX < kDims[0]; ++kX) { + auto kDilX = kX * dilation[0]; + auto iDilKXOffset = kDilX * iStrides[2]; + + kOffset[2] = (kX * kStrides[2]) + kOffset[1]; + + for (DimSize_t kY = 0; kY < kDims[1]; ++kY) { + auto kDilY = kY * dilation[1]; + + iGrad[iIdx + iDilKXOffset + kDilY] += + weights[kOffset[2] + kY] * oGrad[oIdx]; + } + } + } + } + } + } + } +} + +/** + * @brief computes weight backpropagation for conv2D operation + * @note INPUT & OUTPUT convention is the same as in the + * forward function + * weight grad + * for i in 0..weight_size: + * for n in 0..output_size: + * dL dYn dL + * ---- = ---- ---- + * dwi dwi Yn + * with : dYn / dwi = x_k + * @tparam I input dtype + * @tparam W weight dtype + * @tparam O output dtype + * @param[in] iDims input data dimensions + * @param[in] iBatchStride nb element in each input data batch + * @param[in] iChannelStride nb element in each input data channel + * @param[in] input input data + * @param[in] oDims output data dimmensions + * @param[in] oBatchStride nb element in each output data batch + * @param[in] oChannelStride nb element in each output data channel + * @param[in] oGrad gradients of output data + * @param[in] stride + * @param[in] kernelDims + * @param[inout] weightsGrad gradients of the kernel weights + */ +template <class I, class W, class O> +void conv2DBackwardWeights(const array<DimSize_t, 4> &iDims, + const array<DimSize_t, 3> &iStrides, + const I *input, + const array<DimSize_t, 4> 
&oDims, + const array<DimSize_t, 3> &oStrides, + const O *oGrad, + const array<DimSize_t, 2> &kDim, + const array<DimSize_t, 3> &kStrides, + const array<DimSize_t, 2> &stride, + const array<DimSize_t, 2> &dilation, + W *weightsGrad) { + // records index offsets for each dimension that have a stride (== all + // dimension except the last) for every parsed tensor + array<DimSize_t, 3> iOffsets{0, 0, 0}; + array<DimSize_t, 3> oOffsets{0, 0, 0}; + array<DimSize_t, 3> kOffsets{0, 0, 0}; + + for (DimSize_t batchIdx = 0; batchIdx < oDims[0]; ++batchIdx) { + iOffsets[0] = batchIdx * iStrides[0]; + oOffsets[0] = batchIdx * oStrides[0]; + + for (DimSize_t iChannel = 0; iChannel < iDims[1]; ++iChannel) { + iOffsets[1] = iChannel * iStrides[1] + iOffsets[0]; + kOffsets[0] = iChannel * kStrides[0]; + + for (DimSize_t oChannel = 0; oChannel < oDims[1]; ++oChannel) { + oOffsets[1] = oChannel * oStrides[1] + oOffsets[0]; + kOffsets[1] = oChannel * kStrides[1] + kOffsets[0]; + + for (DimSize_t kX = 0; kX < kDim[0]; ++kX) { + kOffsets[2] = kX * kStrides[2] + kOffsets[1]; + for (DimSize_t kY = 0; kY < kDim[1]; ++kY) { + + for (DimSize_t oX = 0; oX < oDims[2]; ++oX) { + const DimSize_t iX = + oX * stride[0] + kX * dilation[0]; + + oOffsets[2] = oX * oStrides[2] + oOffsets[1]; + iOffsets[2] = iX * iStrides[2] + iOffsets[1]; + + for (DimSize_t oY = 0; oY < oDims[3]; ++oY) { + const DimSize_t iY = + oY * stride[1] + kY * dilation[1]; + + weightsGrad[kOffsets[2] + kY] += + input[iOffsets[2] + iY] * + oGrad[oOffsets[2] + oY]; + } + } + } + } + } + } + } +} + +/** + * @brief computes bias backpropagation for conv2D operation + * @note INPUT & OUTPUT convention is the same as in the + * forward function + * @note formula : + * Bias grad: + * for i in 0..bias_size: + * for n in 0..output_size: + * dL dYn dL + * ---- = ---- ---- + * dbi dbi Yn + * with : dYn / dbi = 1 + * + * Hence the partial derivative of the loss wrt bias is the + * output loss Hence the bias grad is just the sum of the + * loss values over the batch + * @tparam I Input data type. + * @tparam W Weight data type. + * @tparam B Bias data type. + * @tparam O Output data type. + * @param[in] oDims output tensor dimensions + * @param[in] oStrides nb of elements contained per dimension of the + * output + * @param[in] oGrad output tensor gradients + * @param[inout] biasesGrad biases gradients + */ +template <class B, class O> +static void conv2DBackwardBias(const array<DimSize_t, 4> &oDims, + const array<DimSize_t, 3> &oStrides, + const O *oGrad, + B *biasesGrad) { + // records all index offsets for output tensor + array<DimSize_t, 3> oOffsets{}; + for (DimSize_t batchIdx = 0; batchIdx < oDims[0]; ++batchIdx) { + oOffsets[0] = batchIdx * oStrides[0]; + + for (DimSize_t oChannel = 0; oChannel < oDims[1]; ++oChannel) { + oOffsets[1] = oChannel * oStrides[1] + oOffsets[0]; + for (DimSize_t oX = 0; oX < oDims[2]; ++oX) { + oOffsets[2] = oX * oStrides[2] + oOffsets[1]; + + for (DimSize_t oY = 0; oY < oDims[3]; ++oY) { + biasesGrad[oChannel] += oGrad[oOffsets[2] + oY]; + } + } + } + } +} + +/** + * @brief Backward kernel for 2D Convolution on CPU backend. + * @note INPUT & OUTPUT convention is the same as in the + * forward function + * + * @tparam I Input data type. + * @tparam W Weight data type. + * @tparam B Bias data type. + * @tparam O Output data type. 
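+ * @note The kernel zero-fills iGrad_, weightsGrad_ and (when present)
+ * biasesGrad_ before accumulating, then delegates to conv2DBackwardInput,
+ * conv2DBackwardWeights and conv2DBackwardBias. biasesGrad_ may be nullptr
+ * for bias-less convolutions, in which case the bias pass is skipped.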
+ * @param[in] const stride attribute of conv operator + * @param[in] const dilation attribute of conv operator + * @param[in] const kernelDims + * @param[in] const iDims input data dimensions + * @param[in] const oDims output data dimmensions + * @param[in] const input_ input tensor. + * @param[in] const weights_ kernel tensor. + * @param[in] const oGrad_ output tensor gradient. + * @param[inout] iGrad_ input tensor gradient. + * @param[inout] weightsGrad_ kernel weights tensor gradients + * @param[inout] biasesGrad_ kernel biases tensor gradients + */ +template <class I, class W, class B, class O> +void ConvImpl2D_cpu_backward_kernel(const array<DimSize_t, 2> &stride, + const array<DimSize_t, 2> &dilation, + const array<DimSize_t, 2> &kernelDims, + const array<DimSize_t, 4> &inputDims, + const array<DimSize_t, 4> &outputDims, + const void *input_, + const void *weights_, + const void *oGrad_, + void *iGrad_, + void *weightsGrad_, + void *biasesGrad_) { + + const I *input = static_cast<const I *>(input_); + I *iGrad = static_cast<I *>(iGrad_); + const I *outputGrad = static_cast<const I *>(oGrad_); + const W *weights = static_cast<const W *>(weights_); + W *weightsGrad = static_cast<W *>(weightsGrad_); + + ////////////////////////////// + // COMPUTING STRIDES + ////////////////////////////// + // NOTE: The ...Stride var represent the number of values contained in + // each dimension they will be used to compute the index offset of + // values while iterating on each tensor + // NOTE: They are 1 item shorter than their corresponding tensor as the + // number of total elements is not used except for gradient initialization + + // {batch_stride, channel_stride, dim0_stride, dim1_stride} + const array<DimSize_t, 3> inputStrides{ + inputDims[1] * inputDims[2] * inputDims[3], + inputDims[2] * inputDims[3], + inputDims[3]}; + const DimSize_t nbEltsInput = inputDims[0] * inputStrides[0]; + + // {batch_stride, channel_stride, dim0_stride, dim1_stride} + const array<DimSize_t, 3> outputStrides{ + outputDims[1] * outputDims[2] * outputDims[3], + outputDims[2] * outputDims[3], + outputDims[3]}; + + // NOTE: kernel dims = {iChannel, oChannel, kernelDim0, kernelDim1} + // kernel_strides = {iChannel, oChannel, kernelDim0} + const array<DimSize_t, 3> kernelStrides{ + inputDims[1] * kernelDims[0] * kernelDims[1], + kernelDims[0] * kernelDims[1], + kernelDims[1]}; + + const DimSize_t nbEltsKernel = outputDims[1] * kernelStrides[0]; + + //////////////////////////// + // prepping gradient arrays + std::fill(iGrad, iGrad + nbEltsInput, I(0)); + std::fill(weightsGrad, weightsGrad + nbEltsKernel, W(0)); + + conv2DBackwardInput(stride, + dilation, + kernelDims, + kernelStrides, + weights, + outputDims, + outputStrides, + outputGrad, + inputDims, + inputStrides, + iGrad); + + conv2DBackwardWeights(inputDims, + inputStrides, + input, + outputDims, + outputStrides, + outputGrad, + kernelDims, + kernelStrides, + stride, + dilation, + weightsGrad); + + if (biasesGrad_ != nullptr) { + B *biasesGrad = static_cast<B *>(biasesGrad_); + std::fill(biasesGrad, biasesGrad + outputDims[1], B(0)); + conv2DBackwardBias(outputDims, outputStrides, outputGrad, biasesGrad); + } +} // Kernels registration to implementation entry point REGISTRAR(ConvImpl2D_cpu, - {{DataType::Any, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}}, - {ProdConso::inPlaceModel, Aidge::ConvImpl2D_cpu_forward_kernel<float, float, float, float>, nullptr}); -REGISTRAR(ConvImpl2D_cpu, - {{DataType::Any, DataFormat::NCHW}, {DataType::Float16, 
DataFormat::NCHW}}, - {ProdConso::inPlaceModel, Aidge::ConvImpl2D_cpu_forward_kernel<half_float::half, half_float::half, half_float::half, half_float::half>, nullptr}); -REGISTRAR(ConvImpl2D_cpu, - {{DataType::Any, DataFormat::NCHW}, {DataType::Int32, DataFormat::NCHW}}, - {ProdConso::inPlaceModel, Aidge::ConvImpl2D_cpu_forward_kernel<int32_t, int32_t, int32_t, int32_t>, nullptr}); + {{DataType::Any, DataFormat::NCHW}, + {DataType::Float32, DataFormat::NCHW}}, + {ProdConso::inPlaceModel, + Aidge::ConvImpl2D_cpu_forward_kernel<float, float, float, float>, + Aidge::ConvImpl2D_cpu_backward_kernel<float, float, float, float>}); REGISTRAR(ConvImpl2D_cpu, + {{DataType::Any, DataFormat::NCHW}, + {DataType::Float16, DataFormat::NCHW}}, + {ProdConso::inPlaceModel, + Aidge::ConvImpl2D_cpu_forward_kernel<half_float::half, + half_float::half, + half_float::half, + half_float::half>, + Aidge::ConvImpl2D_cpu_backward_kernel<half_float::half, + half_float::half, + half_float::half, + half_float::half>}); +REGISTRAR( + ConvImpl2D_cpu, {{DataType::Any, DataFormat::NCHW}, {DataType::Float64, DataFormat::NCHW}}, - {ProdConso::inPlaceModel, Aidge::ConvImpl2D_cpu_forward_kernel<double, double, double, double>, nullptr}); -} // namespace Aidge + {ProdConso::inPlaceModel, + Aidge::ConvImpl2D_cpu_forward_kernel<double, double, double, double>, + Aidge::ConvImpl2D_cpu_backward_kernel<double, double, double, double>}); +REGISTRAR(ConvImpl2D_cpu, + {{DataType::Any, DataFormat::NCHW}, + {DataType::Int32, DataFormat::NCHW}}, + {ProdConso::inPlaceModel, + ConvImpl2D_cpu_forward_kernel<std::int32_t, + std::int32_t, + std::int32_t, + std::int32_t>, + ConvImpl2D_cpu_backward_kernel<std::int32_t, + std::int32_t, + std::int32_t, + std::int32_t>}); +} // namespace Aidge #endif /* AIDGE_CPU_OPERATOR_CONVIMPL_KERNELS_H_ */ diff --git a/include/aidge/backend/cpu/operator/ConvTransposeImpl.hpp b/include/aidge/backend/cpu/operator/ConvTransposeImpl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..7604a96a18e7be44f4c2e8970a0b60b1c4ad918b --- /dev/null +++ b/include/aidge/backend/cpu/operator/ConvTransposeImpl.hpp @@ -0,0 +1,59 @@ + +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_CPU_OPERATOR_CONVTRANSPOSEIMPL_H_ +#define AIDGE_CPU_OPERATOR_CONVTRANSPOSEIMPL_H_ + +#include <array> + +#include "aidge/backend/cpu/operator/OperatorImpl.hpp" +#include "aidge/operator/ConvTranspose.hpp" +#include "aidge/utils/Registrar.hpp" +#include "aidge/utils/Types.h" + +namespace Aidge { + +using std::array; + +// Operator implementation entry point for the backend +using ConvTranspose1D_Op = ConvTranspose_Op<1>; +using ConvTransposeImpl1D_cpu = + OperatorImpl_cpu<ConvTranspose1D_Op, + void(const array<DimSize_t,1> &, + const array<DimSize_t,1> &, + const array<DimSize_t,1> &, + const array<DimSize_t, 3> &, + const array<DimSize_t, 3> &, + const void *, + const void *, + const void *, + void *)>; + +using ConvTranspose2D_Op = ConvTranspose_Op<2>; +using ConvTransposeImpl2D_cpu = + OperatorImpl_cpu<ConvTranspose2D_Op, + void(const array<DimSize_t, 2> &, + const array<DimSize_t, 2> &, + const array<DimSize_t, 2> &, + const array<DimSize_t, 4> &, + const array<DimSize_t, 4> &, + const void *, + const void *, + const void *, + void *)>; + +// Implementation entry point registration to Operator +REGISTRAR(ConvTranspose1D_Op, "cpu", ConvTransposeImpl1D_cpu::create); +REGISTRAR(ConvTranspose2D_Op, "cpu", ConvTransposeImpl2D_cpu::create); +} // namespace Aidge + +#endif /* AIDGE_CPU_OPERATOR_CONVTRANSPOSEIMPL_H_ */ diff --git a/include/aidge/backend/cpu/operator/ConvTransposeImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ConvTransposeImpl_kernels.hpp new file mode 100644 index 0000000000000000000000000000000000000000..e11dd2625ae1645a8e7c5482b1635b85fb475b06 --- /dev/null +++ b/include/aidge/backend/cpu/operator/ConvTransposeImpl_kernels.hpp @@ -0,0 +1,305 @@ +/******************************************************************************** + * Copyright (c) 2025 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_CPU_OPERATOR_CONVTRANSPOSEIMPL_KERNELS_H_ +#define AIDGE_CPU_OPERATOR_CONVTRANSPOSEIMPL_KERNELS_H_ + +#include <array> + +#include "aidge/backend/cpu/operator/ConvTransposeImpl.hpp" +#include "aidge/utils/Registrar.hpp" +#include <aidge/backend/cpu/operator/ConvImpl_kernels.hpp> +#include <aidge/data/Data.hpp> +#include <aidge/data/half.hpp> +#include <aidge/scheduler/ProdConso.hpp> +#include <aidge/utils/Types.h> + +namespace Aidge { + +using std::array; + +//////////////////////////////////////////////////////// +//////////////////////////////////////////////////////// +// 1D +//////////////////////////////////////////////////////// +//////////////////////////////////////////////////////// + +/** + * @brief performs forward bias operation for convtranspose operator + * + * @tparam B Bias data type. + * @tparam O Output data type. 
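+ * @note Every output channel of every batch is simply pre-filled with its
+ * bias value; the transposed convolution then accumulates on top of this
+ * initialisation.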
+ * @param[in] bias bias values + * @param[in] oDims dimensions of the output + * @param[in] oStrides nb of elements contained per dimension of the output + * @param[out] output + */ +template <class B, class O> +static void convTranspose1DForwardBias(const B *biases, + const array<DimSize_t, 3> &oDims, + const array<DimSize_t, 2> &oStrides, + O *output) { + array<DimSize_t, 2> outOffsets{0, 0}; + for (DimSize_t batch = 0; batch < oDims[0]; ++batch) { + outOffsets[0] = batch * oStrides[0]; + for (DimSize_t outCh = 0; outCh < oDims[1]; ++outCh) { + outOffsets[1] = outCh * oStrides[1] + outOffsets[0]; + // If bias = nullptr, set B(0) + B biasVal = (biases != nullptr) ? biases[outCh] : B(0); + std::fill(output + outOffsets[1], + output + (outOffsets[1] + oDims[2]), + biasVal); + } + } +} + +/** + * @brief forward kernel for convtranspose + * @note ConvTranspose forward is simply convolution backward kernel. + * Check convolution functions for more in-depth details on how the + subfunctions are built. + * @tparam I Input data type. + * @tparam W Weight data type. + * @tparam B Bias data type. + * @tparam O Output data type. + * @param[in] stride stride parameter of the convTranspose operator + * @param[in] dilation dilation parameter of the convTranspose operator + * @param[in] inputDims input dimensions + * @param[in] outputDims output tensor dimensions + * @param[in] oStrides nb of elements contained per dimension of the output + * @param[in] input_ values + * @param[in] weight_ values + * @param[in] biases_ values + * @param[out] output + */ +template <class I, class W, class B, class O> +void ConvTransposeImpl1D_cpu_forward_kernel( + const array<DimSize_t, 1> &stride, + const array<DimSize_t, 1> &dilation, + const array<DimSize_t, 1> &kernelDim, + const array<DimSize_t, 3> &inputDims, + const array<DimSize_t, 3> &outputDims, + const void *input_, + const void *weights_, + const void *biases_, + void *output_) { + + const I *input = static_cast<const I *>(input_); + const W *weights = static_cast<const W *>(weights_); + O *output = static_cast<O *>(output_); + + // {batch_stride, channel_stride, dim0_stride} + const array<DimSize_t, 2> inputStrides{inputDims[1] * inputDims[2], + inputDims[2]}; + + // {batch_stride, channel_stride, dim0_stride} + const array<DimSize_t, 2> outputStrides{outputDims[1] * outputDims[2], + outputDims[2]}; + + // NOTE: kernel dims = {inChannels, outChannels, kernelDims[0]} + const array<DimSize_t, 2> kernelStrides{ + outputDims[1] * kernelDim[0], + kernelDim[0], + }; + + if (biases_ != nullptr) { + const B *biases = static_cast<const B *>(biases_); + convTranspose1DForwardBias(biases, outputDims, outputStrides, output); + } + + conv1DBackwardInput(stride, + dilation, + kernelDim, + kernelStrides, + weights, + inputDims, + inputStrides, + input, + outputDims, + outputStrides, + output); +} + +REGISTRAR(ConvTransposeImpl1D_cpu, + {{DataType::Any, DataFormat::NCHW}, + {DataType::Int32, DataFormat::NCHW}}, + {ProdConso::inPlaceModel, + ConvTransposeImpl1D_cpu_forward_kernel<std::int32_t, + std::int32_t, + std::int32_t, + std::int32_t>, + nullptr}); +REGISTRAR(ConvTransposeImpl1D_cpu, + {{DataType::Any, DataFormat::NCHW}, + {DataType::Float32, DataFormat::NCHW}}, + {ProdConso::inPlaceModel, + ConvTransposeImpl1D_cpu_forward_kernel<float, float, float, float>, + nullptr}); +REGISTRAR(ConvTransposeImpl1D_cpu, + {{DataType::Any, DataFormat::NCHW}, + {DataType::Float16, DataFormat::NCHW}}, + {ProdConso::inPlaceModel, + 
ConvTransposeImpl1D_cpu_forward_kernel<half_float::half, + half_float::half, + half_float::half, + half_float::half>, + nullptr}); +REGISTRAR( + ConvTransposeImpl1D_cpu, + {{DataType::Any, DataFormat::NCHW}, {DataType::Float64, DataFormat::NCHW}}, + {ProdConso::inPlaceModel, + ConvTransposeImpl1D_cpu_forward_kernel<double, double, double, double>, + nullptr}); + +//////////////////////////////////////////////////////// +//////////////////////////////////////////////////////// +// 2D +//////////////////////////////////////////////////////// +//////////////////////////////////////////////////////// + +/** + * @brief performs forward bias operation for convtranspose operator + * + * @tparam B Bias data type. + * @tparam O Output data type. + * @param[in] bias bias values + * @param[in] oDims dimensions of the output + * @param[in] oStrides nb of elements contained per dimension of the output + * @param[out] output + */ +template <class B, class O> +static void convTranspose2DForwardBias(const B *biases, + const array<DimSize_t, 4> &oDims, + const array<DimSize_t, 3> &oStrides, + O *output) { + array<DimSize_t, 2> outOffsets{0, 0}; + + for (DimSize_t batch = 0; batch < oDims[0]; ++batch) { + outOffsets[0] = batch * oStrides[0]; + + for (DimSize_t outCh = 0; outCh < oDims[1]; ++outCh) { + outOffsets[1] = outCh * oStrides[1] + outOffsets[0]; + // If bias = nullptr, set B(0) + B biasVal = (biases != nullptr) ? biases[outCh] : B(0); + std::fill(output + outOffsets[1], + (output + outOffsets[1]) + oStrides[1], + biasVal); + } + } +} + +/** + * @brief forward kernel for convtranspose + * @note ConvTranspose forward is simply convolution backward kernel. + * Check convolution functions for more in-depth details on how the + subfunctions are built. + * @tparam I Input data type. + * @tparam W Weight data type. + * @tparam B Bias data type. + * @tparam O Output data type. 
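+ * @note The output is first initialised with the biases, when given,
+ * (convTranspose2DForwardBias) and conv2DBackwardInput is then reused with
+ * the input/output roles swapped: each input value input(n, ic, x, y) is
+ * scattered to output(n, oc, x * stride[0] + kX * dilation[0],
+ * y * stride[1] + kY * dilation[1]), weighted by weights(ic, oc, kX, kY).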
+ * @param[in] stride stride parameter of the convTranspose operator + * @param[in] dilation dilation parameter of the convTranspose operator + * @param[in] inputDims input dimensions + * @param[in] outputDims output tensor dimensions + * @param[in] oStrides nb of elements contained per dimension of the output + * @param[in] input_ values + * @param[in] weight_ values + * @param[in] biases_ values + * @param[out] output + */ +template <class I, class W, class B, class O> +void ConvTransposeImpl2D_cpu_forward_kernel( + const array<DimSize_t, 2> &stride, + const array<DimSize_t, 2> &dilation, + const array<DimSize_t, 2> &kernelDims, + const array<DimSize_t, 4> &inputDims, + const array<DimSize_t, 4> &outputDims, + const void *input_, + const void *weights_, + const void *biases_, + void *output_) { + + auto input = static_cast<const I *>(input_); + auto weights = static_cast<const W *>(weights_); + auto output = static_cast<O *>(output_); + + // {channel_stride, dim0_stride, dim1_stride} + const array<DimSize_t, 3> inputStrides{ + inputDims[1] * inputDims[2] * inputDims[3], + inputDims[2] * inputDims[3], + inputDims[3]}; + + // {channel_stride, dim0_stride, dim1_stride} + const array<DimSize_t, 3> outputStrides{ + outputDims[1] * outputDims[2] * outputDims[3], + outputDims[2] * outputDims[3], + outputDims[3]}; + + // NOTE: kernel dims = {inChannels, outChannels, kernelDims[0], + // kernelDims[1]} + const array<DimSize_t, 3> kernelStrides{ + outputDims[1] * kernelDims[0] * kernelDims[1], + kernelDims[0] * kernelDims[1], + kernelDims[1], + }; + + if (biases_ != nullptr) { + auto biases = static_cast<const B *>(biases_); + convTranspose2DForwardBias(biases, outputDims, outputStrides, output); + } + + conv2DBackwardInput(stride, + dilation, + kernelDims, + kernelStrides, + weights, + inputDims, + inputStrides, + input, + outputDims, + outputStrides, + output); +} + +REGISTRAR(ConvTransposeImpl2D_cpu, + {{DataType::Any, DataFormat::NCHW}, + {DataType::Int32, DataFormat::NCHW}}, + {ProdConso::inPlaceModel, + ConvTransposeImpl2D_cpu_forward_kernel<std::int32_t, + std::int32_t, + std::int32_t, + std::int32_t>, + nullptr}); +REGISTRAR(ConvTransposeImpl2D_cpu, + {{DataType::Any, DataFormat::NCHW}, + {DataType::Float16, DataFormat::NCHW}}, + {ProdConso::inPlaceModel, + ConvTransposeImpl2D_cpu_forward_kernel<half_float::half, + half_float::half, + half_float::half, + half_float::half>, + nullptr}); +REGISTRAR(ConvTransposeImpl2D_cpu, + {{DataType::Any, DataFormat::NCHW}, + {DataType::Float32, DataFormat::NCHW}}, + {ProdConso::inPlaceModel, + ConvTransposeImpl2D_cpu_forward_kernel<float, float, float, float>, + nullptr}); +REGISTRAR( + ConvTransposeImpl2D_cpu, + {{DataType::Any, DataFormat::NCHW}, {DataType::Float64, DataFormat::NCHW}}, + {ProdConso::inPlaceModel, + ConvTransposeImpl2D_cpu_forward_kernel<double, double, double, double>, + nullptr}); + +} // namespace Aidge + +#endif /* AIDGE_CPU_OPERATOR_CONVTRANSPOSEIMPL_KERNELS_H_ */ diff --git a/include/aidge/backend/cpu/operator/CryptoHashImpl.hpp b/include/aidge/backend/cpu/operator/CryptoHashImpl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..8b616c1a2e7fb1a2e0d38abe906951d4b92efefa --- /dev/null +++ b/include/aidge/backend/cpu/operator/CryptoHashImpl.hpp @@ -0,0 +1,36 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is 
available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_CPU_OPERATOR_CRYPTOHASHIMPL_H_ +#define AIDGE_CPU_OPERATOR_CRYPTOHASHIMPL_H_ + +#include "aidge/backend/cpu/operator/OperatorImpl.hpp" +#include "aidge/operator/CryptoHash.hpp" +#include "aidge/utils/Registrar.hpp" +#include "aidge/utils/Types.h" +#include "aidge/backend/cpu/data/GetCPUPtr.h" +#include <memory> +#include <vector> + +#ifdef WITH_OPENSSL +#include <openssl/sha.h> + +namespace Aidge { +// Operator implementation entry point for the backend +using CryptoHashImpl_cpu = OperatorImpl_cpu<CryptoHash_Op, + void(const std::size_t, const void*, void*)>; + +// Implementation entry point registration to Operator +REGISTRAR(CryptoHash_Op, "cpu", Aidge::CryptoHashImpl_cpu::create); +} // namespace Aidge +#endif + +#endif /* AIDGE_CPU_OPERATOR_CRYPTOHASHIMPL_H_ */ diff --git a/include/aidge/backend/cpu/operator/CryptoHashImpl_kernels.hpp b/include/aidge/backend/cpu/operator/CryptoHashImpl_kernels.hpp new file mode 100644 index 0000000000000000000000000000000000000000..cd596b6905988050666c7c2dff15a4cf8078e52a --- /dev/null +++ b/include/aidge/backend/cpu/operator/CryptoHashImpl_kernels.hpp @@ -0,0 +1,52 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_CPU_OPERATOR_CRYPTOHASHIMPL_KERNELS_H_ +#define AIDGE_CPU_OPERATOR_CRYPTOHASHIMPL_KERNELS_H_ + +#include "aidge/utils/Registrar.hpp" + +#include "aidge/backend/cpu/operator/CryptoHashImpl.hpp" + +#ifdef WITH_OPENSSL +namespace Aidge { +template <class I, class O> +void CryptoHashImpl_cpu_forward_kernel(std::size_t inputLength, + const void* input_, + void* output_) { + + const I* input = static_cast<const I*>(input_); + O* output = static_cast<O*>(output_); + + // output must be at least SHA256_DIGEST_LENGTH bytes length + SHA256(reinterpret_cast<const uint8_t*>(input), inputLength * sizeof(I), reinterpret_cast<uint8_t*>(output)); +} + +// Kernels registration to implementation entry point +REGISTRAR(CryptoHashImpl_cpu, + {{DataType::UInt8, DataFormat::Any}, {DataType::UInt8}}, + {ProdConso::inPlaceModel, Aidge::CryptoHashImpl_cpu_forward_kernel<uint8_t, uint8_t>, nullptr}); +REGISTRAR(CryptoHashImpl_cpu, + {{DataType::UInt8, DataFormat::Any}, {DataType::UInt64}}, + {ProdConso::inPlaceModel, Aidge::CryptoHashImpl_cpu_forward_kernel<uint8_t, uint64_t>, nullptr}); +REGISTRAR(CryptoHashImpl_cpu, + {{DataType::Float32, DataFormat::Any}, {DataType::UInt8}}, + {ProdConso::inPlaceModel, Aidge::CryptoHashImpl_cpu_forward_kernel<float, uint8_t>, nullptr}); +REGISTRAR(CryptoHashImpl_cpu, + {{DataType::Float32, DataFormat::Any}, {DataType::UInt64}}, + {ProdConso::inPlaceModel, Aidge::CryptoHashImpl_cpu_forward_kernel<float, uint64_t>, nullptr}); +REGISTRAR(CryptoHashImpl_cpu, + {{DataType::Float64, DataFormat::Any}, {DataType::UInt8}}, + {ProdConso::inPlaceModel, Aidge::CryptoHashImpl_cpu_forward_kernel<double, uint8_t>, nullptr}); +} // namespace Aidge +#endif + +#endif /* AIDGE_CPU_OPERATOR_CRYPTOHASHIMPL_KERNELS_H_ */ diff --git 
a/include/aidge/backend/cpu/operator/DivImpl.hpp b/include/aidge/backend/cpu/operator/DivImpl.hpp index 40c1b678a78713d6c3b27629ae898c715797b9b2..a507690b28f115b355423c296186b2fa4a9fcb72 100644 --- a/include/aidge/backend/cpu/operator/DivImpl.hpp +++ b/include/aidge/backend/cpu/operator/DivImpl.hpp @@ -24,7 +24,18 @@ namespace Aidge { // Operator implementation entry point for the backend using DivImpl_cpu = OperatorImpl_cpu<Div_Op, - void(const std::size_t, const std::size_t, const std::size_t, const void*, const void*,void*)>; + void(const std::size_t, const std::size_t, const std::size_t, const void*, const void*,void*), + void(const std::size_t, + const std::size_t, + const std::size_t, + const std::vector<std::size_t>, + const std::vector<std::size_t>, + const std::vector<std::size_t>, + const void*, + const void*, + const void*, + void*, + void*)>; // Implementation entry point registration to Operator REGISTRAR(Div_Op, "cpu", Aidge::DivImpl_cpu::create); diff --git a/include/aidge/backend/cpu/operator/DivImpl_kernels.hpp b/include/aidge/backend/cpu/operator/DivImpl_kernels.hpp index ed6e55a79acbe23a689a67c22477f64f785a3aef..5d3ee7f656cc3599f199d08b7c7d319bdc6cb1bb 100644 --- a/include/aidge/backend/cpu/operator/DivImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/DivImpl_kernels.hpp @@ -17,6 +17,7 @@ #include <cstdint> // std::int32_t, std::int64_t #include <functional> // std::multiplies +#include "aidge/backend/cpu/operator/MulImpl_kernels.hpp" #include "aidge/utils/Registrar.hpp" #include "aidge/backend/cpu/data/Broadcasting.hpp" @@ -69,16 +70,70 @@ constexpr void DivImpl_cpu_forward_kernel(const std::size_t input1size_, } } + +template <class I1, class I2, class O> +void DivImpl_cpu_backward_kernel(const std::size_t input0Length, + const std::size_t input1Length, + const std::size_t gradOutputLength, + const std::vector<std::size_t>& dims0, + const std::vector<std::size_t>& dims1, + const std::vector<std::size_t>& outputDims, + const void* input0_, + const void* input1_, + const void* grad_output_, + void* gradientInput0_, + void* gradientInput1_) +{ + const I1* input0 = static_cast<const I1*>(input0_); // a + const I2* input1 = static_cast<const I2*>(input1_); // b + const O* grad_output = static_cast<const O*>(grad_output_); + auto* grad_input_0 = static_cast<I1*>(gradientInput0_); // gradient w.r.t. a + auto* grad_input_1 = static_cast<I2*>(gradientInput1_); // gradient w.r.t. b + + std::fill_n(grad_input_0, input0Length, static_cast<I1>(0)); + std::fill_n(grad_input_1, input1Length, static_cast<I2>(0)); + + // Broadcast dims0 and dims1 to match the shape of outputDims + auto broadcastedDims0 = getBroadcastedDims(outputDims, dims0); + auto broadcastedDims1 = getBroadcastedDims(outputDims, dims1); + + for (std::size_t i = 0; i < gradOutputLength; ++i) { + auto idxOutputGrad = getMultiDimIndices(outputDims, i); + std::vector<std::size_t> idxInput0(broadcastedDims0.size()); + std::vector<std::size_t> idxInput1(broadcastedDims1.size()); + + // Map output indices to input indices, considering broadcasting + for (std::size_t dimension = 0; dimension < broadcastedDims0.size(); ++dimension) { + idxInput0[dimension] = (broadcastedDims0[dimension] == 1) ? 0 : idxOutputGrad[dimension]; + } + + for (std::size_t dimension = 0; dimension < broadcastedDims1.size(); ++dimension) { + idxInput1[dimension] = (broadcastedDims1[dimension] == 1) ? 
0 : idxOutputGrad[dimension]; + } + + auto idx0 = getFlattenedIndex(broadcastedDims0, idxInput0); + auto idx1 = getFlattenedIndex(broadcastedDims1, idxInput1); + + // grad_a = grad_output * (1/b) + grad_input_0[idx0] += static_cast<I1>(grad_output[i] / input1[idx1]); + + // grad_b = grad_output * (-a/b²) + grad_input_1[idx1] += static_cast<I2>(grad_output[i] * (-input0[idx0] / (input1[idx1] * input1[idx1]))); + } +} + + // Kernels registration to implementation entry point REGISTRAR(DivImpl_cpu, {DataType::Float32}, - {ProdConso::inPlaceModel, Aidge::DivImpl_cpu_forward_kernel<float, float, float>, nullptr}); + {ProdConso::inPlaceModel, Aidge::DivImpl_cpu_forward_kernel<float, float, float>, Aidge::DivImpl_cpu_backward_kernel<float, float, float>}); REGISTRAR(DivImpl_cpu, {DataType::Float64}, - {ProdConso::inPlaceModel, Aidge::DivImpl_cpu_forward_kernel<double, double, double>, nullptr}); + {ProdConso::inPlaceModel, Aidge::DivImpl_cpu_forward_kernel<double, double, double>, Aidge::DivImpl_cpu_backward_kernel<double, double, double>}); REGISTRAR(DivImpl_cpu, {DataType::Int32}, - {ProdConso::inPlaceModel, Aidge::DivImpl_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t>, nullptr}); + {ProdConso::inPlaceModel, Aidge::DivImpl_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t>, + Aidge::DivImpl_cpu_backward_kernel<std::int32_t, std::int32_t, std::int32_t>}); } // namespace Aidge #endif /* AIDGE_CPU_OPERATOR_DIVIMPL_KERNELS_H_ */ diff --git a/include/aidge/backend/cpu/operator/DropoutImpl.hpp b/include/aidge/backend/cpu/operator/DropoutImpl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..e3f0d41ee3bc029a81f2895b5494b4d762131ac3 --- /dev/null +++ b/include/aidge/backend/cpu/operator/DropoutImpl.hpp @@ -0,0 +1,35 @@ +/******************************************************************************** + * Copyright (c) 2025 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + ********************************************************************************/ + +#ifndef AIDGE_CPU_OPERATOR_DROPOUTIMPL_H_ +#define AIDGE_CPU_OPERATOR_DROPOUTIMPL_H_ + +#include <cstddef> // std::size_t + +#include "aidge/backend/cpu/operator/OperatorImpl.hpp" +#include "aidge/operator/Dropout.hpp" +#include "aidge/utils/Registrar.hpp" + +namespace Aidge { + +// Operator implementation entry point for the backend +using DropoutImpl_cpu = OperatorImpl_cpu<Dropout_Op, + void(float, + std::size_t, + unsigned int, + const void*, + void*)>; + +// Implementation entry point registration to Operator +REGISTRAR(Dropout_Op, "cpu", Aidge::DropoutImpl_cpu::create); + +} // namespace Aidge + +#endif /* AIDGE_CPU_OPERATOR_DROPOUTIMPL_H_ */ diff --git a/include/aidge/backend/cpu/operator/DropoutImpl_kernels.hpp b/include/aidge/backend/cpu/operator/DropoutImpl_kernels.hpp new file mode 100644 index 0000000000000000000000000000000000000000..61e616804ec328e4b7ad5c49278d2b8b827800ff --- /dev/null +++ b/include/aidge/backend/cpu/operator/DropoutImpl_kernels.hpp @@ -0,0 +1,61 @@ +/******************************************************************************** + * Copyright (c) 2025 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_CPU_OPERATOR_DROPOUTIMPL_KERNELS_H_ +#define AIDGE_CPU_OPERATOR_DROPOUTIMPL_KERNELS_H_ + +#include <cstddef> // std::size_t +#include <memory> +#include <random> + +#include "aidge/backend/cpu/operator/DropoutImpl.hpp" +#include "aidge/data/DataType.hpp" +#include "aidge/utils/Registrar.hpp" + + +namespace Aidge { + +template <DataType DT_I, DataType DT_O = DT_I> +void DropoutImpl_cpu_forward_kernel(float probability, + std::size_t nb_elements, + unsigned int seed, + const void* input_, + void* output_) +{ + using I = cpptype_t<DT_I>; + using O = cpptype_t<DT_O>; + const I *input = static_cast<const I *>(input_); + O *output = static_cast<O *>(output_); + + // const unsigned int seed = static_cast<unsigned int>(std::random_device{}()); + std::mt19937 rng(seed); + std::bernoulli_distribution bernoulli_dist(1.0f - probability); //bernoulli keep_prob + + const I scale = I(1.0) / static_cast<I>(1.0f - probability); + + for (std::size_t i = 0; i < nb_elements; ++i) + { + output[i] = bernoulli_dist(rng) ? static_cast<O>(input[i] * scale) : static_cast<O>(0.0); + } + +} + +REGISTRAR(DropoutImpl_cpu, + {DataType::Float32}, + {ProdConso::defaultModel, DropoutImpl_cpu_forward_kernel<DataType::Float32>, nullptr}); + +REGISTRAR(DropoutImpl_cpu, + {DataType::Float64}, + {ProdConso::defaultModel, DropoutImpl_cpu_forward_kernel<DataType::Float64>, nullptr}); + +} // namespace aidge + +#endif // AIDGE_CPU_OPERATOR_DROPOUTIMPL_KERNELS_H_ diff --git a/include/aidge/backend/cpu/operator/EqualImpl.hpp b/include/aidge/backend/cpu/operator/EqualImpl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..e2489096067a49139f6291898056d525a77db522 --- /dev/null +++ b/include/aidge/backend/cpu/operator/EqualImpl.hpp @@ -0,0 +1,32 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_CPU_OPERATOR_EQUALIMPL_H_ +#define AIDGE_CPU_OPERATOR_EQUALIMPL_H_ + +#include "aidge/backend/cpu/operator/OperatorImpl.hpp" +#include "aidge/operator/Equal.hpp" +#include "aidge/utils/Registrar.hpp" +#include "aidge/utils/Types.h" +#include "aidge/backend/cpu/data/GetCPUPtr.h" +#include <memory> +#include <vector> + +namespace Aidge { +// Operator implementation entry point for the backend +using EqualImpl_cpu = OperatorImpl_cpu<Equal_Op, + void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*, void*)>; + +// Implementation entry point registration to Operator +REGISTRAR(Equal_Op, "cpu", Aidge::EqualImpl_cpu::create); +} // namespace Aidge + +#endif /* AIDGE_CPU_OPERATOR_EQUALIMPL_H_ */ diff --git a/include/aidge/backend/cpu/operator/EqualImpl_kernels.hpp b/include/aidge/backend/cpu/operator/EqualImpl_kernels.hpp new file mode 100644 index 0000000000000000000000000000000000000000..3c8ff0f4742e0393efd8cbbf637822c443edffb3 --- /dev/null +++ b/include/aidge/backend/cpu/operator/EqualImpl_kernels.hpp @@ -0,0 +1,163 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_CPU_OPERATOR_EQUALIMPL_KERNELS_H_ +#define AIDGE_CPU_OPERATOR_EQUALIMPL_KERNELS_H_ + +#include "aidge/backend/cpu/operator/EqualImpl.hpp" +#include "aidge/utils/Registrar.hpp" + +namespace Aidge { + +namespace { +// suppose values are contiguous in memory +template <class I, class O> +void equal_contiguous_arrays(const std::size_t input1size, + const std::size_t input2size, + const std::size_t output1size, + const I* input1, + const I* input2, + O* output) +{ + for (std::size_t i = 0; i < output1size; ++i) + { + const std::size_t in1_id = (input1size != 1) ? i : 0; + const std::size_t in2_id = (input2size != 1) ? i : 0; + output[i] = static_cast<O>(input1[in1_id] == input2[in2_id]); + } +} +} + + +template <class I, class O> +void EqualImpl_cpu_forward_kernel(std::vector<std::size_t> dims0, + std::vector<std::size_t> dims1, + const std::vector<std::size_t>& outputDims, + const void* input0_, + const void* input1_, + void* output_) { + + const I* input_0 = static_cast<const I*>(input0_); + const I* input_1 = static_cast<const I*>(input1_); + O* output = static_cast<O*>(output_); + + // [5,2,1,7] & [2,6,7] + // 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7] + // 2. Find the highest equal dimension -> 3 + // Exception: if the first diverging dimension is the last one, then -> 4 (dims.size()) + // 3. Compute the highest number of contiguous data -> 7 + // 4. Compute stride and offset step for the broadcast mechanism + // 5. 
Call a simple kernel + + // special case for equal dimensions, the kernel is called with the entire arrays at once + if (dims0 == dims1) { + const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); + for (std::size_t i = 0; i < input0_contiguous_size; ++i) + { + output[i] = static_cast<O>(input_0[i] == input_1[i]); + } + return; + } + + // set dimensions to be of equal size by filling the smallest one with ones. + if (dims0.size() > dims1.size()) { + dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1)); + } + else if (dims1.size() > dims0.size()) { + dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1)); + } + + const std::size_t nbDims = dims0.size(); + + // Find the highest equal dimension + // std::size_t contiguousIdx = nbDims - 1; + std::size_t contiguousIdx = nbDims; + while (contiguousIdx-- > 0) { + // for (; contiguousIdx+1 > 0; --contiguousIdx) { + if (dims0[contiguousIdx] != dims1[contiguousIdx]) { + if (contiguousIdx == (nbDims -1)) { // last dimensions of one of the input Tensor are of size 1 + const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1; + while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) { + --contiguousIdx; + } + } + break; + } + } + ++contiguousIdx; + + // Compute the highest number of contiguous data for each Tensor + const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + + // initialize strides to iterate through data because of broadcasting + std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx); + if (contiguousIdx > 0) { + stride_post0[contiguousIdx - 1] = 1; + stride_post1[contiguousIdx - 1] = 1; + for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) { + stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]); + stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]); + } + for (std::size_t i = 0; i != contiguousIdx; ++i) { + stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1; + stride_step1[i] = (dims1[i] == 1) ? 
1 - stride_post1[i] : 1; + } + } + + // variables for arrays offsets + std::size_t offsetIn0 = 0; + std::size_t offsetIn1 = 0; + std::size_t offsetOut = 0; + + + std::size_t dim = contiguousIdx - 1; + const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>()); + for (std::size_t stack = 0; stack < nbStacks;) { + equal_contiguous_arrays<I,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size, + input_0 + offsetIn0*input0_contiguous_size, + input_1 + offsetIn1*input1_contiguous_size, + output + offsetOut*output_contiguous_size); + if (++stack < nbStacks) { + std::size_t tmp_stack = stack; + while(tmp_stack % outputDims[dim] == 0) { + tmp_stack /= outputDims[dim]; + dim--; + } + offsetIn0 += stride_step0[dim]; + offsetIn1 += stride_step1[dim]; + ++offsetOut; + dim = contiguousIdx - 1; + } + } +} + +// Kernels registration to implementation entry point +REGISTRAR(EqualImpl_cpu, + {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float32}}, + {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<float, float>, nullptr}); +REGISTRAR(EqualImpl_cpu, + {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float64}}, + {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<double, double>, nullptr}); +REGISTRAR(EqualImpl_cpu, + {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int32}}, + {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr}); +REGISTRAR(EqualImpl_cpu, + {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int64}}, + {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, nullptr}); + +} // namespace Aidge + +#endif /* AIDGE_CPU_OPERATOR_EQUALIMPL_KERNELS_H_ */ diff --git a/include/aidge/backend/cpu/operator/ErfImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ErfImpl_kernels.hpp index 02041f55ce9a1b2476db575b40340b1bb6517ce1..709f4a6ff208aa384478f3787710fdb010835bdf 100644 --- a/include/aidge/backend/cpu/operator/ErfImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/ErfImpl_kernels.hpp @@ -20,14 +20,14 @@ namespace Aidge { template <class I, class O> -void ErfImpl_cpu_forward_kernel(std::size_t inputLenght, +void ErfImpl_cpu_forward_kernel(std::size_t inputLength, const void* input_, void* output_) { const I* input = static_cast<const I*>(input_); O* output = static_cast<O*>(output_); - for (std::size_t i = 0; i < inputLenght; ++i) { + for (std::size_t i = 0; i < inputLength; ++i) { output[i] = std::erf(input[i]); } } diff --git a/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp b/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp index c57f86e6ac6e74acebb48f471991e7181920f7c3..b03e7f58c19b119ec72306f7d9979607a707cde7 100644 --- a/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp @@ -96,21 +96,16 @@ void FCImpl_cpu_forward_kernel(const DimSize_t batchSize, const B* biases = static_cast<const B*>(biases_); O* output = static_cast<O*>(output_); - if (biases == nullptr) { - std::fill(output, output+(batchSize*outputFeatureSize), B(0)); - } - else { - for (std::size_t batch = 0; batch < batchSize; ++batch) { - std::copy(biases, biases+outputFeatureSize, output+(batch*outputFeatureSize)); - } - } - - for (std::size_t batch = 0; batch < batchSize; ++batch) { - for (std::size_t out = 0; out < outputFeatureSize; ++out) { +#ifdef _OPENMP + #pragma omp parallel for 
collapse(2) if (batchSize * outputFeatureSize >= 16) +#endif + for (int batch = 0; batch < static_cast<int>(batchSize); ++batch) { + for (int out = 0; out < static_cast<int>(outputFeatureSize); ++out) { + const auto biasVal = (biases) ? biases[out] : B(0); output[out + batch*outputFeatureSize] = std::inner_product(input + batch*inputFeatureSize, input + (batch + 1)*inputFeatureSize, weights + out*inputFeatureSize, - output[out + batch*outputFeatureSize]); + biasVal); } } } @@ -136,15 +131,13 @@ void FCImpl_cpu_backward_kernel(const DimSize_t batchSize, // bias grad - if (biasesGrad == nullptr) { // no bias - std::fill(biasesGrad, biasesGrad + outputFeatureSize, B(0)); - } else { + if (biasesGrad != nullptr) { for (std::size_t o = 0; o < outputFeatureSize; ++o) { // nb outputs B sum{0}; for (std::size_t b = 0; b < batchSize; ++b) { sum += input[b*outputFeatureSize + o]; } - biasesGrad[o] = sum; + biasesGrad[o]+= sum; } } @@ -155,7 +148,7 @@ void FCImpl_cpu_backward_kernel(const DimSize_t batchSize, for (std::size_t b = 0; b < batchSize; ++b) { sum += originalInput[b*inputFeatureSize + c]*input[b*outputFeatureSize + o]; } - weightGrad[o*inputFeatureSize + c] = sum; + weightGrad[o*inputFeatureSize + c]+= sum; } } @@ -166,7 +159,7 @@ void FCImpl_cpu_backward_kernel(const DimSize_t batchSize, for (std::size_t o = 0; o < outputFeatureSize; ++o) { sum += weight[o*inputFeatureSize + c] * input[b*outputFeatureSize + o]; } - output[b*inputFeatureSize + c] = sum; + output[b*inputFeatureSize + c]+= sum; } } } diff --git a/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl.hpp b/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl.hpp index 4e04b1a595a8660b1528e49921e7e3e7a567829a..a71174c03216dc04e27325d59062d0383f5224ea 100644 --- a/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl.hpp +++ b/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl.hpp @@ -18,12 +18,11 @@ #include "aidge/backend/cpu/operator/OperatorImpl.hpp" #include "aidge/operator/GlobalAveragePooling.hpp" #include "aidge/utils/Registrar.hpp" -#include "aidge/utils/Types.h" namespace Aidge { // Operator implementation entry point for the backend using GlobalAveragePoolingImpl_cpu = OperatorImpl_cpu<GlobalAveragePooling_Op, - void(const std::vector<DimSize_t> &, const void *, void *)>; + void(const std::shared_ptr<Tensor>&, void *)>; // Implementation entry point registration to Operator REGISTRAR(GlobalAveragePooling_Op, "cpu", Aidge::GlobalAveragePoolingImpl_cpu::create); diff --git a/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp index d5e5561d02aacd8532f74d2bfd4ee2fb5a5b5dc3..3cab0ad9647a974170bf682fcf3b57b306bd76bd 100644 --- a/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp @@ -12,92 +12,90 @@ #ifndef AIDGE_CPU_OPERATOR_GLOBALAVERAGEPOOLINGIMPL_KERNELS_H_ #define AIDGE_CPU_OPERATOR_GLOBALAVERAGEPOOLINGIMPL_KERNELS_H_ -#include <cstddef> -#include <functional> // std::multiplies -#include <numeric> // std::accumulate +#include <cstddef> // std::size_t #include <vector> #include "aidge/backend/cpu/operator/GlobalAveragePoolingImpl.hpp" -#include "aidge/data/Data.hpp" -#include "aidge/utils/ErrorHandling.hpp" +#include "aidge/data/Tensor.hpp" #include "aidge/utils/Registrar.hpp" #include "aidge/utils/Types.h" - namespace Aidge { template <typename T> -typename 
std::enable_if<std::is_floating_point<T>::value, T>::type -stableMean(const T* vec, size_t size) { - T mean = 0; - for (size_t i = 0; i < size; ++i) { - mean = std::fma<T>(vec[i] - mean, 1.0f / (i + 1), mean); - } - return mean; +typename std::enable_if_t<std::is_floating_point<T>::value, T> +static stableMean(const T* vec, std::size_t size) { + T mean{0}; + for (std::size_t i = 0; i < size; ++i) { + mean = std::fma(vec[i] - mean, static_cast<T>(1) / static_cast<T>(i + 1), mean); + } + return mean; } // Specialization for integers: perform the mean computation in float template <typename T> -typename std::enable_if<!std::is_floating_point<T>::value, T>::type -stableMean(const T* vec, size_t size) { - double mean = 0; - for (size_t i = 0; i < size; ++i) { - mean = std::fma<double>(vec[i] - mean, 1.0f / (i + 1), mean); - } - return mean; +typename std::enable_if_t<!std::is_floating_point<T>::value, double> +static stableMean(const T* vec, std::size_t size) { + double mean{0}; + for (std::size_t i = 0; i < size; ++i) { + mean = std::fma<double>(static_cast<double>(vec[i]) - mean, 1.0 / static_cast<double>(i + 1), mean); + } + return mean; } template <typename T> -typename std::enable_if<std::is_floating_point<T>::value, T>::type -castFromFloat(T value) { - return value; +typename std::enable_if_t<std::is_floating_point<T>::value, T> +static castFromFloat(T value) { + return value; } template <typename T> -typename std::enable_if<!std::is_floating_point<T>::value, T>::type -castFromFloat(double value) { - return static_cast<T>(std::nearbyint(value)); +typename std::enable_if_t<!std::is_floating_point<T>::value, T> +static castFromFloat(double value) { + return static_cast<T>(std::nearbyint(value)); } -template <class I, class O> -void GlobalAveragePoolingImpl_cpu_forward_kernel( - const std::vector<DimSize_t> &dims, const void *input_, void *output_) { - // error checking - AIDGE_ASSERT(dims.size() >= 3,"GlobalAveragePool needs at least a 3 dimensions " - "input, number of input dim : {}", - dims.size()); +template <DataType DT_I, DataType DT_O = DT_I> +void GlobalAveragePoolingImpl_cpu_forward_kernel(const std::shared_ptr<Tensor>& inputTensor, void *output_) { - // computation - const I *input = static_cast<const I *>(input_); - O *output = static_cast<O *>(output_); + // computation + using I = cpptype_t<DT_I>; + using O = cpptype_t<DT_O>; + const I *input = static_cast<const I *>(inputTensor->getImpl()->rawPtr()); + O *output = static_cast<O *>(output_); - DimSize_t nb_elems = std::accumulate(dims.begin(), dims.end(), std::size_t(1), - std::multiplies<std::size_t>()); + const auto& dims = inputTensor->dims(); + DimSize_t nb_elems = std::accumulate(dims.begin(), dims.end(), std::size_t(1), + std::multiplies<std::size_t>()); + + const DimSize_t in_batch_nb_elems{nb_elems / dims[0]}; + const DimSize_t in_channel_nb_elems{in_batch_nb_elems / dims[1]}; + const DimSize_t out_batch_nb_elems{dims[1]}; - const DimSize_t in_batch_nb_elems{nb_elems / dims[0]}; - const DimSize_t in_channel_nb_elems{in_batch_nb_elems / dims[1]}; - const DimSize_t out_batch_nb_elems{dims[1]}; - // parse channel by channel and fill each output with the average of the - // values in the channel - for (DimSize_t batch = 0; batch < dims[0]; ++batch) { - for (DimSize_t channel = 0; channel < dims[1]; ++channel) { - const I *filter_start = std::next( - input, (batch * in_batch_nb_elems) + (channel * in_channel_nb_elems)); - output[batch * out_batch_nb_elems + channel] = castFromFloat<O>(stableMean<I>(filter_start, 
in_channel_nb_elems)); + // parse channel by channel and fill each output with the average of the + // values in the channel +#ifdef _OPENMP + #pragma omp parallel for collapse(2) if (dims[0] * dims[1] >= 16) +#endif + for (int batch = 0; batch < static_cast<int>(dims[0]); ++batch) { + for (int channel = 0; channel < static_cast<int>(dims[1]); ++channel) { + const I *filter_start = std::next( + input, (batch * in_batch_nb_elems) + (channel * in_channel_nb_elems)); + output[batch * out_batch_nb_elems + channel] = castFromFloat<O>(stableMean<I>(filter_start, in_channel_nb_elems)); + } } - } } // Kernels registration to implementation entry point REGISTRAR(GlobalAveragePoolingImpl_cpu, {DataType::Float32}, - {ProdConso::defaultModel, Aidge::GlobalAveragePoolingImpl_cpu_forward_kernel<float, float>, nullptr}); + {ProdConso::defaultModel, Aidge::GlobalAveragePoolingImpl_cpu_forward_kernel<DataType::Float32>, nullptr}); REGISTRAR(GlobalAveragePoolingImpl_cpu, {DataType::Float64}, - {ProdConso::defaultModel, Aidge::GlobalAveragePoolingImpl_cpu_forward_kernel<double, double>, nullptr}); + {ProdConso::defaultModel, Aidge::GlobalAveragePoolingImpl_cpu_forward_kernel<DataType::Float64>, nullptr}); REGISTRAR(GlobalAveragePoolingImpl_cpu, {DataType::Int32}, - {ProdConso::defaultModel, Aidge::GlobalAveragePoolingImpl_cpu_forward_kernel<int32_t, int32_t>, nullptr}); + {ProdConso::defaultModel, Aidge::GlobalAveragePoolingImpl_cpu_forward_kernel<DataType::Int32>, nullptr}); } // namespace Aidge #endif /* AIDGE_CPU_OPERATOR_GLOBALAVERAGEPOOLINGIMPL_KERNELS_H_ */ diff --git a/include/aidge/backend/cpu/operator/HeavisideImpl.hpp b/include/aidge/backend/cpu/operator/HeavisideImpl.hpp index 7a3ba9add1e98580c51a8416adc0d1feb5e1317a..877fa2a9c1fbf126fee5d1f3ce4db2db808cbc92 100644 --- a/include/aidge/backend/cpu/operator/HeavisideImpl.hpp +++ b/include/aidge/backend/cpu/operator/HeavisideImpl.hpp @@ -23,7 +23,7 @@ namespace Aidge { using HeavisideImplCpu = OperatorImpl_cpu<Heaviside_Op, void(std::size_t, const void *, void *, const float), - void(const float, std::size_t, const void *, void *)>; + void(std::size_t, const void *, const void *, void *)>; // Implementation entry point registration for operator Heaviside REGISTRAR(Heaviside_Op, "cpu", HeavisideImplCpu::create); diff --git a/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp b/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp index 3fd6ca7de348ff18e75b2a88281d4db980b58774..c823b2942ea6cbc975bac71f30b3c94aca546c11 100644 --- a/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp @@ -15,32 +15,55 @@ #include "aidge/utils/Registrar.hpp" #include <cstddef> // std::size_t +#include <cmath> #include "aidge/backend/cpu/operator/HeavisideImpl.hpp" #include "aidge/utils/ErrorHandling.hpp" - namespace Aidge { template <class I, class O> -void HeavisideImplCpuForwardKernel(std::size_t inputLenght, +void HeavisideImplCpuForwardKernel(std::size_t inputLength, const void *input_, void *output_, const float value) { const I *input = static_cast<const I *>(input_); O *output = static_cast<O *>(output_); - for (std::size_t i = 0; i < inputLenght; ++i) { + for (std::size_t i = 0; i < inputLength; ++i) { output[i] = (input[i] > 0) ? 1 : (input[i] == 0 ? 
value : 0); } } + +// Surrogate Gradient +template <class O, class GO, class GI> +void HeavisideImplCpuBackwardKernel(std::size_t inputLength, + const void* output_, + const void* grad_output_, + void* grad_input_) { + + /* + * Heaviside is approximated by an arctan function for the backward pass: + * S \approx \frac{1}{\pi}\text{arctan}(\pi U \frac{\alpha}{2}) + * \frac{dS}{dU} = \frac{\alpha}{2} \frac{1}{1 + (\frac{\pi U \alpha}{2})^2} + * */ + + const O* output = static_cast<const O*>(output_); + const GO* grad_output = static_cast<const GO*>(grad_output_); + GI* grad_input = static_cast<GI*>(grad_input_); + + for (size_t i = 0; i < inputLength; ++i) { + grad_input[i] += grad_output[i] * static_cast<O>(1.0 / (1.0 + (output[i] * M_PI) * (output[i] * M_PI))); + } +} + + // Kernels registration to implementation entry point REGISTRAR(HeavisideImplCpu, {DataType::Float32}, {ProdConso::inPlaceModel, Aidge::HeavisideImplCpuForwardKernel<float, float>, - nullptr}); + Aidge::HeavisideImplCpuBackwardKernel<float,float,float>}); } // namespace Aidge #endif // AIDGE_CPU_OPERATOR_HEAVISIDEIMPL_KERNELS_H__H_ diff --git a/include/aidge/backend/cpu/operator/LeakyReLUImpl.hpp b/include/aidge/backend/cpu/operator/LeakyReLUImpl.hpp index 1e8c1a14435f53ad7a63b327944e0bb8c70c8661..d4037901a5b0c7da5396dc435e237493023fb6f2 100644 --- a/include/aidge/backend/cpu/operator/LeakyReLUImpl.hpp +++ b/include/aidge/backend/cpu/operator/LeakyReLUImpl.hpp @@ -32,6 +32,7 @@ using LeakyReLUImpl_cpu = OperatorImpl_cpu<LeakyReLU_Op, void(const float, std::size_t, const void*, + const void*, void*)>; // Implementation entry point registration to Operator diff --git a/include/aidge/backend/cpu/operator/LeakyReLUImpl_kernels.hpp b/include/aidge/backend/cpu/operator/LeakyReLUImpl_kernels.hpp index bc856f703aee8ba422887d43cb96db2132fc4603..236038c689fa2ef5005b8753f5fc0cbdbce68bd1 100644 --- a/include/aidge/backend/cpu/operator/LeakyReLUImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/LeakyReLUImpl_kernels.hpp @@ -19,7 +19,7 @@ namespace Aidge { template <class I, class O> void LeakyReLUImpl_cpu_forward_kernel(const float negativeSlope_, - std::size_t inputLenght, + std::size_t inputLength, const void* input_, void* output_) { @@ -27,23 +27,25 @@ void LeakyReLUImpl_cpu_forward_kernel(const float negativeSlope_, O* output = static_cast<O*>(output_); const I negativeSlope = static_cast<const I>(negativeSlope_); - for (std::size_t i = 0; i < inputLenght; ++i) { + for (std::size_t i = 0; i < inputLength; ++i) { output[i] = (input[i] >= 0) ? input[i] : input[i] * negativeSlope; } } template <class I, class O> void LeakyReLUImpl_cpu_backward_kernel(const float negativeSlope_, - std::size_t inputLenght, + std::size_t inputLength, const void* input_, - void* output_) { + const void* grad_output_, + void* grad_input_) { - const I* input = static_cast<const I*>(input_); - O* output = static_cast<O*>(output_); + const O* input = static_cast<const O*>(input_); + const I* grad_output = static_cast<const I*>(grad_output_); + O* grad_input = static_cast<O*>(grad_input_); const I negativeSlope = static_cast<const I>(negativeSlope_); - for (std::size_t i = 0; i < inputLenght; ++i) { - output[i] = (input[i] > 0) ? input[i] : negativeSlope*input[i]; + for (std::size_t i = 0; i < inputLength; ++i) { + grad_input[i] += (input[i] > 0) ?
grad_output[i] : negativeSlope*grad_output[i]; } } diff --git a/include/aidge/backend/cpu/operator/LnImpl_kernels.hpp b/include/aidge/backend/cpu/operator/LnImpl_kernels.hpp index b30b05bb806de08d4e70c67e66979fb3138980df..8b57b417f64b3fcdd803483f0ea69a8a57c5cc7d 100755 --- a/include/aidge/backend/cpu/operator/LnImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/LnImpl_kernels.hpp @@ -18,7 +18,7 @@ namespace Aidge { template <class I, class O> -void LnImpl_cpu_forward_kernel(std::size_t inputLenght, +void LnImpl_cpu_forward_kernel(std::size_t inputLength, const void* input_, void* output_) { @@ -26,8 +26,8 @@ void LnImpl_cpu_forward_kernel(std::size_t inputLenght, O* output = static_cast<O*>(output_); const float eps = 1.0e-20f; -//#pragma omp parallel for if (inputLenght > 1024) - for (std::size_t i = 0; i < inputLenght; ++i) { +//#pragma omp parallel for if (inputLength > 1024) + for (std::size_t i = 0; i < inputLength; ++i) { if (input[i] > I(eps)) { output[i] = std::log(input[i]); } else { @@ -37,7 +37,7 @@ void LnImpl_cpu_forward_kernel(std::size_t inputLenght, } template <class I, class GI, class GO> -void LnImpl_cpu_backward_kernel(const std::size_t inputLenght, +void LnImpl_cpu_backward_kernel(const std::size_t inputLength, const void* input_, const void* grad_output_, void* grad_input_) { @@ -46,11 +46,9 @@ void LnImpl_cpu_backward_kernel(const std::size_t inputLenght, GI* grad_input = static_cast<GI*>(grad_input_); const float eps = 1.0e-20f; - for (std::size_t i = 0; i < inputLenght; ++i) { + for (std::size_t i = 0; i < inputLength; ++i) { if (input[i] > I(eps)) { - grad_input[i] = grad_output[i] / input[i]; - } else { - grad_input[i] = GI(0); + grad_input[i] += grad_output[i] / input[i]; } } } diff --git a/include/aidge/backend/cpu/operator/MatMulImpl_kernels.hpp b/include/aidge/backend/cpu/operator/MatMulImpl_kernels.hpp index 5fc13baf49b1d0606eb4af5a54eec83fa5dce22a..adcc8ddc26a379e3a310aa1ab405841f7964037d 100644 --- a/include/aidge/backend/cpu/operator/MatMulImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/MatMulImpl_kernels.hpp @@ -26,7 +26,10 @@ void MatMulImpl_cpu_forward_kernel(const std::size_t n, const std::size_t k, con std::memset(output, O(0), n * m * sizeof(O)); - for (std::size_t i = 0; i < n; ++i) { +#ifdef _OPENMP + #pragma omp parallel for if (n >= 16) +#endif + for (int i = 0; i < static_cast<int>(n); ++i) { for (std::size_t l = 0; l < k; ++l) { for (std::size_t j = 0; j < m; ++j) { output[i*m + j] += static_cast<O>(input1[i*k + l] * input2[l*m + j]); diff --git a/include/aidge/backend/cpu/operator/MaxPoolingImpl.hpp b/include/aidge/backend/cpu/operator/MaxPoolingImpl.hpp index 68cc3621514de97d9837e10bcf90218abe559aaa..804fb33a6420111b38910b22b271c79d42e93828 100644 --- a/include/aidge/backend/cpu/operator/MaxPoolingImpl.hpp +++ b/include/aidge/backend/cpu/operator/MaxPoolingImpl.hpp @@ -28,6 +28,14 @@ namespace Aidge { using MaxPooling2D_Op = MaxPooling_Op<2>; using MaxPoolingImpl2D_cpu = OperatorImpl_cpu<MaxPooling_Op<2>, void(const std::array<DimSize_t, 2>&, + const std::array<DimSize_t, 2>&, + const std::array<DimSize_t, 2>&, + const bool, + const std::array<DimSize_t, 4> &, + const void *, + void *), + void(const std::array<DimSize_t, 2>&, + const std::array<DimSize_t, 2>&, const std::array<DimSize_t, 2>&, const bool, const std::array<DimSize_t, 4> &, diff --git a/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp index 
7b6f04f141eb701849a8d436561bcf9e37471cfa..3057878d1ff90f46f5eb5ab887065dc5b4cede3f 100644 --- a/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp @@ -14,8 +14,10 @@ #include <array> #include <cmath> +#include <cstdint> #include <tuple> + #include "aidge/backend/cpu/operator/MaxPoolingImpl.hpp" #include "aidge/backend/cpu/data/GetCPUPtr.h" #include "aidge/data/Data.hpp" @@ -33,182 +35,234 @@ namespace Aidge { * @param output_ Output Tensor. */ template <class I, class O> -void MaxPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims, - const std::array<DimSize_t, 2>& kernelDims, - const bool /*ceilMode*/, - const std::array<DimSize_t, 4> &dims, - const void *input_, - void *output_) { - // FIXME: missing convolution parameters as arguments - const I *input = static_cast<const I *>(input_); - O *output = static_cast<O *>(output_); - - // output H size - const std::size_t oxSize = - static_cast<std::size_t>(std::floor(static_cast<float>(dims[2] - kernelDims[0] + strideDims[0]) / - static_cast<float>(strideDims[0]))); - // output W size - const std::size_t oySize = - static_cast<std::size_t>(std::floor(static_cast<float>(dims[3] - kernelDims[1] + strideDims[1]) / - static_cast<float>(strideDims[1]))); - - // TODO: kernel computation - // output (batch, outCh, Xout, Yout) - // input (batch, ch, Xin, Yin) - // weight (outCh, ch, kernelX, kernelY) - // does not take Dilation parameter into account - using signedsize = std::make_signed<std::size_t>::type; - for (std::size_t batch = 0; batch < dims[0]; ++batch) { - for (std::size_t ch = 0; ch < dims[1]; ++ch) { - const std::size_t oIndex = (ch + batch*dims[1]) * oxSize * oySize; - const std::size_t iIndex = (ch + batch*dims[1]) * dims[2] * dims[3]; - for (std::size_t ox = 0; ox < oxSize; ++ox) { - const signedsize difx = static_cast<signedsize>(- ox * strideDims[0]); - const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0))); - const std::size_t sxMax = (static_cast<signedsize>(dims[2]) + difx) < 0 ? 0 : ((dims[2] + difx) > kernelDims[0] ? kernelDims[0] : dims[2] + difx); - for (std::size_t oy = 0; oy < oySize; ++oy) { - const signedsize dify = static_cast<signedsize>(- oy * strideDims[1]); - const std::size_t syMin = static_cast<std::size_t>(std::max(dify, signedsize(0))); - const std::size_t syMax = (static_cast<signedsize>(dims[3]) + dify) < 0 ? 0 : ((dims[3] + dify) > kernelDims[1] ? 
kernelDims[1] : dims[3] + dify); - const std::size_t oIndexFull = oIndex + ox*oySize + oy; - const std::size_t ix = ox * strideDims[0]; - const std::size_t iy = oy * strideDims[1]; - - I poolValue(0.0); - bool valid = false; - - for (unsigned int channel = 0; channel < dims[1]; - ++channel){ - for (unsigned int sy = syMin; sy < syMax; ++sy) { - for (unsigned int sx = sxMin; sx < sxMax; ++sx) - { - const I value = input[iIndex + (ix+sx)*dims[3] + (iy+sy)]; - - if (!valid || value > poolValue) { - poolValue = value; - valid = true; - } - } - } - } - output[oIndexFull] = poolValue; +void MaxPoolingImpl2D_cpu_forward_kernel( + const std::array<DimSize_t, 2>& strideDims, + const std::array<DimSize_t, 2>& kernelDims, + const std::array<DimSize_t, 2>& dilations, + const bool ceilMode, + const std::array<DimSize_t, 4> &dims, + const void *input_, + void *output_) +{ + const I *input = static_cast<const I *>(input_); + O *output = static_cast<O *>(output_); + + // output H size + auto hOut = static_cast<float>( + dims[2] - (kernelDims[0] - 1) * dilations[0] - 1 + strideDims[0] + ) / static_cast<float>(strideDims[0]); + const std::size_t outXSize = ceilMode + ? static_cast<std::size_t>(std::ceil(hOut)) + : static_cast<std::size_t>(std::floor(hOut)); + + // output W size + auto wOut = static_cast<float>( + dims[3] - ( kernelDims[1] - 1) * dilations[1] - 1 + strideDims[1] + ) / static_cast<float>(strideDims[1]); + + const std::size_t outYSize = ceilMode + ? static_cast<std::size_t>(std::ceil(wOut)) + : static_cast<std::size_t>(std::floor(wOut)); + + using signedsize = std::make_signed<std::size_t>::type; + +#ifdef _OPENMP + #pragma omp parallel for collapse(2) if (dims[0] * dims[1] >= 16) +#endif + for (int batch = 0; batch < static_cast<int>(dims[0]); ++batch){ + for (int channel = 0; channel < static_cast<int>(dims[1]); ++channel){ + auto batchChannelIndex = (channel + batch * dims[1]); + const std::size_t outputBaseIndex = batchChannelIndex * outXSize * outYSize; + const std::size_t inputBaseIndex = batchChannelIndex * dims[2] * dims[3]; + for (std::size_t outX = 0; outX < outXSize; ++outX) { + const signedsize negStrideX = static_cast<signedsize>( + -outX * strideDims[0] + ); + const std::size_t kernelXMin = static_cast<std::size_t>( + std::max(negStrideX, signedsize(0)) + ); + /* Compute kernelXMax */ + std::size_t kernelXMax = dims[2] + negStrideX; + if ((static_cast<signedsize>(dims[2]) + negStrideX) < 0){ + kernelXMax = 0; + } + else if (kernelXMax > kernelDims[0]){ + kernelXMax = kernelDims[0]; + } + for (std::size_t outY = 0; outY < outYSize; ++outY) { + const signedsize negStrideY = static_cast<signedsize>(-outY * strideDims[1]); + const std::size_t kernelYMin = static_cast<std::size_t>( + std::max(negStrideY, signedsize(0)) + ); + /* Compute kernelYMax */ + std::size_t kernelYMax = dims[3] + negStrideY; + const std::size_t outputIndex = outputBaseIndex + outX * outYSize + outY; + const std::size_t strideXoffset = outX * strideDims[0]; + const std::size_t strideYoffset = outY * strideDims[1]; + I poolValue(0.0); + bool valid = false; + if (static_cast<signedsize>(dims[3]) + negStrideY < 0){ + kernelYMax = 0; + } + else if(kernelYMax > kernelDims[1]){ + kernelYMax = kernelDims[1]; + } + for (unsigned int kY = kernelYMin; kY < kernelYMax ; ++kY){ + for (unsigned int kX = kernelXMin; kX < kernelXMax; ++kX){ + // Apply dilation factor to kernel indices + const std::size_t dilatedkernelX = kX * dilations[0]; + const std::size_t dilatedkernelY = kY * dilations[1]; + // Ensure indices are within 
bounds + auto inputXPostDilation = strideXoffset + dilatedkernelX; + auto inputYPostDilation = strideYoffset + dilatedkernelY; + if (inputXPostDilation < dims[2] && inputYPostDilation < dims[3]){ + const I inputValue = input[ + inputBaseIndex + inputXPostDilation * dims[3] + + inputYPostDilation + ]; + if (!valid || inputValue > poolValue) { + poolValue = inputValue; + valid = true; } + } } + } + output[outputIndex] = poolValue; } + } } -} + } +} + -//N2D2 version -/* -template <class T> -void N2D2::PoolCell_Frame_Kernels::forwardMax(const T* alpha, - const Tensor<T>& - inputs, - const Descriptor& desc, - const T* beta, - Tensor<T>& outputs, - Tensor<ArgMax>& argMax, - bool useArgMax, - const Tensor<bool>& maps) +template <class I, class O> +void MaxPoolingImpl2D_cpu_backward_kernel( + const std::array<DimSize_t, 2>& strideDims, + const std::array<DimSize_t, 2>& kernelDims, + const std::array<DimSize_t, 2>& dilations, + const bool ceilMode, + const std::array<DimSize_t, 4> &dims, + const void *input_, + void *grad_ +) { - const unsigned int size = inputs.dimB() * outputs.dimZ(); + const I *input = static_cast<const I *>(input_); + I *grad = static_cast<I *>(grad_); -#if defined(_OPENMP) && _OPENMP >= 200805 -#pragma omp parallel for collapse(2) if (size > 16) -#else -#pragma omp parallel for if (inputs.dimB() > 4 && size > 16) -#endif - for (int batchPos = 0; batchPos < (int)inputs.dimB(); ++batchPos) { - for (unsigned int output = 0; output < outputs.dimZ(); ++output) { - for (unsigned int oy = 0; oy < outputs.dimY(); ++oy) { - for (unsigned int ox = 0; ox < outputs.dimX(); ++ox) { - const unsigned int sxMin = (unsigned int)std::max( - desc.padding[0] - (int)(ox * desc.stride[0]), 0); - const unsigned int syMin = (unsigned int)std::max( - desc.padding[1] - (int)(oy * desc.stride[1]), 0); - const unsigned int sxMax = Utils::clamp - <int>(inputs.dimX() + desc.padding[0] - ox * desc.stride[0], - 0, - desc.pool[0]); - const unsigned int syMax = Utils::clamp - <int>(inputs.dimY() + desc.padding[1] - oy * desc.stride[1], - 0, - desc.pool[1]); - - const int ix = (int)(ox * desc.stride[0]) - desc.padding[0]; - const int iy = (int)(oy * desc.stride[1]) - desc.padding[1]; - - T poolValue(0.0); - - // For each output, compute the pool value - if (useArgMax) { - const ArgMax inputMax - = argMax(ox, oy, output, batchPos); - - if (inputMax.valid) { - poolValue = inputs(inputMax.ix, - inputMax.iy, - inputMax.channel, - batchPos); - } - } - else { - unsigned int ixMax = 0; - unsigned int iyMax = 0; - unsigned int channelMax = 0; - bool valid = false; - - for (unsigned int channel = 0; channel < inputs.dimZ(); - ++channel) - { - if (!maps.empty() && !maps(output, channel)) - continue; - - for (unsigned int sy = syMin; sy < syMax; ++sy) { - for (unsigned int sx = sxMin; sx < sxMax; ++sx) - { - const T value = inputs(ix + sx, - iy + sy, - channel, - batchPos); - - if (!valid || value > poolValue) { - poolValue = value; - valid = true; - - ixMax = ix + sx; - iyMax = iy + sy; - channelMax = channel; - } - } - } - } - - argMax(ox, oy, output, batchPos) - = ArgMax(ixMax, iyMax, channelMax, valid); - } - - outputs(ox, oy, output, batchPos) - = (*alpha) * poolValue - + (*beta) * outputs(ox, oy, output, batchPos); + // output H size + auto hOut = static_cast<float>( + dims[2] - (kernelDims[0] - 1) * dilations[0] - 1 + strideDims[0] + ) / static_cast<float>(strideDims[0]); + const std::size_t outXSize = ceilMode + ? 
static_cast<std::size_t>(std::ceil(hOut)) + : static_cast<std::size_t>(std::floor(hOut)); + + // output W size + auto wOut = static_cast<float>( + dims[3] - ( kernelDims[1] - 1) * dilations[1] - 1 + strideDims[1] + ) / static_cast<float>(strideDims[1]); + + const std::size_t outYSize = ceilMode + ? static_cast<std::size_t>(std::ceil(wOut)) + : static_cast<std::size_t>(std::floor(wOut)); + + using signedsize = std::make_signed<std::size_t>::type; + + for (std::size_t batch = 0; batch < dims[0]; ++batch){ + for (std::size_t channel = 0; channel < dims[1]; ++channel){ + auto batchChannelIndex = (channel + batch * dims[1]); + const std::size_t inputBaseIndex = batchChannelIndex * dims[2] * dims[3]; + for (std::size_t outX = 0; outX < outXSize; ++outX) { + const signedsize negStrideX = static_cast<signedsize>( + -outX * strideDims[0] + ); + const std::size_t kernelXMin = static_cast<std::size_t>( + std::max(negStrideX, signedsize(0)) + ); + /* Compute kernelXMax */ + std::size_t kernelXMax = dims[2] + negStrideX; + if ((static_cast<signedsize>(dims[2]) + negStrideX) < 0){ + kernelXMax = 0; + } + else if (kernelXMax > kernelDims[0]){ + kernelXMax = kernelDims[0]; + } + for (std::size_t outY = 0; outY < outYSize; ++outY) { + const signedsize negStrideY = static_cast<signedsize>(-outY * strideDims[1]); + const std::size_t kernelYMin = static_cast<std::size_t>( + std::max(negStrideY, signedsize(0)) + ); + /* Compute kernelYMax */ + std::size_t kernelYMax = dims[3] + negStrideY; + const std::size_t strideXoffset = outX * strideDims[0]; + const std::size_t strideYoffset = outY * strideDims[1]; + I poolValue(0.0); + bool valid = false; + if (static_cast<signedsize>(dims[3]) + negStrideY < 0){ + kernelYMax = 0; + } + else if(kernelYMax > kernelDims[1]){ + kernelYMax = kernelDims[1]; + } + std::size_t saveIndex = 0; + for (unsigned int kY = kernelYMin; kY < kernelYMax ; ++kY){ + for (unsigned int kX = kernelXMin; kX < kernelXMax; ++kX){ + // Apply dilation factor to kernel indices + const std::size_t dilatedkernelX = kX * dilations[0]; + const std::size_t dilatedkernelY = kY * dilations[1]; + // Ensure indices are within bounds + auto inputXPostDilation = strideXoffset + dilatedkernelX; + auto inputYPostDilation = strideYoffset + dilatedkernelY; + if (inputXPostDilation < dims[2] && inputYPostDilation < dims[3]){ + std::size_t inputIndex = + inputBaseIndex + inputXPostDilation * dims[3] + + inputYPostDilation; + const I inputValue = input[inputIndex]; + if (!valid || inputValue > poolValue) { + poolValue = inputValue; + saveIndex = inputIndex; + valid = true; } + } } + } + if (valid){ + grad[saveIndex]++; + } } + } } + } } -*/ + + // Kernels registration to implementation entry point REGISTRAR(MaxPoolingImpl2D_cpu, {DataType::Float32}, - {ProdConso::inPlaceModel, Aidge::MaxPoolingImpl2D_cpu_forward_kernel<float, float>, nullptr}); + { + ProdConso::inPlaceModel, + Aidge::MaxPoolingImpl2D_cpu_forward_kernel<float, float>, + Aidge::MaxPoolingImpl2D_cpu_backward_kernel<float, float>, + } +); REGISTRAR(MaxPoolingImpl2D_cpu, {DataType::Float64}, - {ProdConso::inPlaceModel, Aidge::MaxPoolingImpl2D_cpu_forward_kernel<double, double>, nullptr}); + { + ProdConso::inPlaceModel, + Aidge::MaxPoolingImpl2D_cpu_forward_kernel<double, double>, + Aidge::MaxPoolingImpl2D_cpu_backward_kernel<double, double>, + } +); REGISTRAR(MaxPoolingImpl2D_cpu, {DataType::Int32}, - {ProdConso::inPlaceModel, Aidge::MaxPoolingImpl2D_cpu_forward_kernel<int32_t, int32_t>, nullptr}); + { + ProdConso::inPlaceModel, + 
Aidge::MaxPoolingImpl2D_cpu_forward_kernel<int32_t, int32_t>, + Aidge::MaxPoolingImpl2D_cpu_backward_kernel<int32_t, int32_t>, + } +); } // namespace Aidge #endif /* AIDGE_CPU_OPERATOR_MaxPOOLINGIMPL_KERNELS_H_ */ diff --git a/include/aidge/backend/cpu/operator/ModImpl.hpp b/include/aidge/backend/cpu/operator/ModImpl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..96ff599b6633c66aad411b484e292b6a076e3090 --- /dev/null +++ b/include/aidge/backend/cpu/operator/ModImpl.hpp @@ -0,0 +1,33 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_CPU_OPERATOR_MODIMPL_H_ +#define AIDGE_CPU_OPERATOR_MODIMPL_H_ + +#include <memory> +#include <tuple> +#include <vector> + +#include "aidge/backend/cpu/operator/OperatorImpl.hpp" +#include "aidge/operator/Mod.hpp" +#include "aidge/utils/Registrar.hpp" +#include "aidge/utils/Types.h" + +namespace Aidge { +// Operator implementation entry point for the backend +using ModImpl_cpu = OperatorImpl_cpu<Mod_Op, + void(bool, const std::size_t, const std::size_t, const std::size_t, const void*, const void*,void*)>; + +// Implementation entry point registration to Operator +REGISTRAR(Mod_Op, "cpu", Aidge::ModImpl_cpu::create); +} // namespace Aidge + +#endif /* AIDGE_CPU_OPERATOR_MODIMPL_H_ */ diff --git a/include/aidge/backend/cpu/operator/ModImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ModImpl_kernels.hpp new file mode 100644 index 0000000000000000000000000000000000000000..15d18bf4de5cee7e7d75817a2ccf425f5ff41971 --- /dev/null +++ b/include/aidge/backend/cpu/operator/ModImpl_kernels.hpp @@ -0,0 +1,80 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
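Note on the parallel loops added in this patch (FC, GlobalAveragePooling and MaxPooling above, MatMul, Resize and Softmax further down): the std::size_t bounds are cast to int because the loop variable of an OpenMP 2.0 parallel for, the level still shipped by MSVC, must be a signed integer. A minimal self-contained sketch of the same pattern, outside any Aidge type (the function name applyReLU is illustrative only):

#include <cstddef>

// In-place ReLU; the signed loop counter keeps the pragma valid under
// OpenMP 2.0, and the if clause skips threading for tiny buffers.
void applyReLU(float* data, std::size_t n) {
#ifdef _OPENMP
    #pragma omp parallel for if (n >= 16)
#endif
    for (int i = 0; i < static_cast<int>(n); ++i) {
        data[i] = (data[i] > 0.0f) ? data[i] : 0.0f;
    }
}

The cast assumes n fits in an int, which the kernels above implicitly assume as well; larger buffers would need a wider signed loop type.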
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_CPU_OPERATOR_MODIMPL_KERNELS_H_ +#define AIDGE_CPU_OPERATOR_MODIMPL_KERNELS_H_ + +#include <numeric> // std::accumulate +#include <cstddef> // std::size_t +#include <cstdint> // std::int32_t, std::int64_t +#include <functional> // std::multiplies + +#include "aidge/utils/Registrar.hpp" + +#include "aidge/backend/cpu/data/Broadcasting.hpp" +#include "aidge/backend/cpu/operator/ModImpl.hpp" + +namespace Aidge { + +template <typename T, + typename std::enable_if<std::is_integral<T>::value>::type* = nullptr> +static inline T modulus(T a, T b) { + return a % b; +} + +template <typename T, + typename std::enable_if<!std::is_integral<T>::value>::type* = nullptr> +static inline T modulus(T /*a*/, T /*b*/) { + AIDGE_THROW_OR_ABORT(std::runtime_error, "Mod Operator with fmod attribute set to false only supports integer types."); +} + +template <class I1, class I2, class O> +constexpr void ModImpl_cpu_forward_kernel(bool fmod, + const std::size_t input1size_, + const std::size_t input2size_, + const std::size_t output1size_, + const void* input1_, + const void* input2_, + void* output_) { + + const I1* input_1 = static_cast<const I1*>(input1_); + const I2* input_2 = static_cast<const I2*>(input2_); + O* output = static_cast<O*>(output_); + +// suppose values are contiguous in memory + for (std::size_t i = 0; i < output1size_; ++i) { + const std::size_t in1_id = (input1size_ != 1) ? i : 0; + const std::size_t in2_id = (input2size_ != 1) ? i : 0; + if (fmod) { + output[i] = static_cast<O>(std::fmod(input_1[in1_id], input_2[in2_id])); + } + else { + output[i] = static_cast<O>(modulus(input_1[in1_id], input_2[in2_id])); + } + } +} + +// Kernels registration to implementation entry point +REGISTRAR(ModImpl_cpu, + {DataType::Float32}, + {ProdConso::inPlaceModel, Aidge::ModImpl_cpu_forward_kernel<float, float, float>, nullptr}); +REGISTRAR(ModImpl_cpu, + {DataType::Float64}, + {ProdConso::inPlaceModel, Aidge::ModImpl_cpu_forward_kernel<double, double, double>, nullptr}); +REGISTRAR(ModImpl_cpu, + {DataType::Int32}, + {ProdConso::inPlaceModel, Aidge::ModImpl_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t>, nullptr}); +REGISTRAR(ModImpl_cpu, + {DataType::UInt64}, + {ProdConso::inPlaceModel, Aidge::ModImpl_cpu_forward_kernel<std::uint64_t, std::uint64_t, std::uint64_t>, nullptr}); +} // namespace Aidge + +#endif /* AIDGE_CPU_OPERATOR_MODIMPL_KERNELS_H_ */ diff --git a/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp b/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp index 36acb9199c51e900287ca9b262322aa86287d838..bda28f63fc11c1234097f46b5df8c20197db4b0a 100644 --- a/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp @@ -148,8 +148,8 @@ void MulImpl_cpu_forward_kernel(std::vector<std::size_t> dims0, } template <class I1, class I2, class O> -void MulImpl_cpu_backward_kernel(const std::size_t input0Length, - const std::size_t input1Length, +void MulImpl_cpu_backward_kernel(const std::size_t /*input0Length*/, + const std::size_t /*input1Length*/, const std::size_t gradOutputLength, const std::vector<std::size_t>& dims0, const std::vector<std::size_t>& dims1, @@ -166,9 +166,6 @@ void MulImpl_cpu_backward_kernel(const std::size_t input0Length, auto* grad_input_0 = static_cast<I1*>(gradientInput0_); auto* grad_input_1 = static_cast<I2*>(gradientInput1_); - std::fill_n(grad_input_0, 
input0Length, static_cast<I1>(0)); - std::fill_n(grad_input_1, input1Length, static_cast<I2>(0)); - // Broadcast dims0 and dims1 to match the shape of outputDims auto broadcastedDims0 = getBroadcastedDims(outputDims, dims0); auto broadcastedDims1 = getBroadcastedDims(outputDims, dims1); diff --git a/include/aidge/backend/cpu/operator/PowImpl_kernels.hpp b/include/aidge/backend/cpu/operator/PowImpl_kernels.hpp index cae106632053366e1370b5ce1d3a2ee4cfd3b62b..51fd1bb62a94df5854a476f311f0e4e33a132ab0 100644 --- a/include/aidge/backend/cpu/operator/PowImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/PowImpl_kernels.hpp @@ -163,12 +163,6 @@ void PowImpl_cpu_backward_kernel(const std::vector<std::size_t>& input0Dims, I2* grad1 = static_cast<I2*>(gradientInput1_); const O* gradOut = static_cast<const O*>(gradOutput_); - // Fill input grads with zeros - std::size_t input0Elements = std::accumulate(input0Dims.cbegin(), input0Dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); - std::fill(grad0, grad0 + input0Elements, I1(0)); - std::size_t input1Elements = std::accumulate(input1Dims.cbegin(), input1Dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); - std::fill(grad1, grad1 + input1Elements, I2(0)); - std::size_t totalElements = std::accumulate(outputDims.cbegin(), outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>()); for (size_t oIndex = 0; oIndex < totalElements; ++oIndex) { diff --git a/include/aidge/backend/cpu/operator/ReLUImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ReLUImpl_kernels.hpp index e39e9b7decd91e392c5db7e9e9bc4ed0f366829d..3789052cac7b9b082cbf86cee1ce62ab486b556a 100644 --- a/include/aidge/backend/cpu/operator/ReLUImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/ReLUImpl_kernels.hpp @@ -26,28 +26,28 @@ namespace Aidge { // Kernels template <class I, class O> -void ReLUImpl_cpu_forward_kernel(std::size_t inputLenght, +void ReLUImpl_cpu_forward_kernel(std::size_t inputLength, const void* input_, void* output_) { const I* input = static_cast<const I*>(input_); O* output = static_cast<O*>(output_); -//#pragma omp parallel for if (inputLenght > 1024) - for (std::size_t i = 0; i < inputLenght; ++i) { +//#pragma omp parallel for if (inputLength > 1024) + for (std::size_t i = 0; i < inputLength; ++i) { output[i] = (input[i] > 0) ? input[i] : 0; } } template <class I, class GI, class GO> -void ReLUImpl_cpu_backward_kernel(const std::size_t inputLenght, +void ReLUImpl_cpu_backward_kernel(const std::size_t inputLength, const void* input_, const void* grad_output_, void* grad_input_) { const I* input = static_cast<const I*>(input_); const GO* grad_output = static_cast<const GO*>(grad_output_); GI* grad_input = static_cast<GI*>(grad_input_); - for (std::size_t i = 0; i < inputLenght; ++i) { - grad_input[i] = (input[i] > 0) ? grad_output[i] : 0; + for (std::size_t i = 0; i < inputLength; ++i) { + grad_input[i] += (input[i] > 0) ? 
grad_output[i] : 0; } } @@ -60,7 +60,10 @@ REGISTRAR(ReLUImpl_cpu, {ProdConso::inPlaceModel, Aidge::ReLUImpl_cpu_forward_kernel<double, double>, Aidge::ReLUImpl_cpu_backward_kernel<double, double, double>}); REGISTRAR(ReLUImpl_cpu, {DataType::Int32}, - {ProdConso::inPlaceModel, Aidge::ReLUImpl_cpu_forward_kernel<int32_t, int32_t>, Aidge::ReLUImpl_cpu_backward_kernel<int32_t, int32_t, int32_t>}); + {ProdConso::inPlaceModel, Aidge::ReLUImpl_cpu_forward_kernel<int32_t, int32_t>, nullptr}); +REGISTRAR(ReLUImpl_cpu, + {DataType::Int8}, + {ProdConso::inPlaceModel, Aidge::ReLUImpl_cpu_forward_kernel<int8_t, int8_t>, nullptr}); } // namespace Aidge #endif /* AIDGE_CPU_OPERATOR_RELUIMPL_KERNELS_H_ */ diff --git a/include/aidge/backend/cpu/operator/ReduceMeanImpl.hpp b/include/aidge/backend/cpu/operator/ReduceMeanImpl.hpp index 1c50805d5af768dfc160488fda1e8fadfa798454..d6c60c352dc862095bad9ac67ab50d05129b8dc2 100644 --- a/include/aidge/backend/cpu/operator/ReduceMeanImpl.hpp +++ b/include/aidge/backend/cpu/operator/ReduceMeanImpl.hpp @@ -12,7 +12,6 @@ #ifndef AIDGE_CPU_OPERATOR_REDUCEMEANIMPL_H_ #define AIDGE_CPU_OPERATOR_REDUCEMEANIMPL_H_ -#include <array> #include <memory> #include <tuple> #include <vector> diff --git a/include/aidge/backend/cpu/operator/ReduceMeanImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ReduceMeanImpl_kernels.hpp index 864b89c4fa4667b70e43ed7436382e30bc150745..73aa283d51d72e28d135ae5bb422f3f9f8dcd8c6 100644 --- a/include/aidge/backend/cpu/operator/ReduceMeanImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/ReduceMeanImpl_kernels.hpp @@ -26,35 +26,38 @@ namespace Aidge { +template <typename T> +using Acc_T = typename std::conditional_t<std::is_floating_point<T>::value, T, double>; + template <typename T> typename std::enable_if<std::is_floating_point<T>::value, T>::type -stableMean(const T* vec, size_t len, size_t stride) { +stableMean(const T* vec, std::size_t len, std::size_t stride) { T mean = 0; - for (size_t i = 0; i < len; ++i) { - mean = std::fma<T>(vec[i * stride] - mean, 1.0f / (i + 1), mean); + for (std::size_t i = 0; i < len; ++i) { + mean = std::fma(vec[i * stride] - mean, static_cast<T>(1) / static_cast<T>(i + 1), mean); } return mean; } // Specialization for integers: perform the mean computation in float template <typename T> -typename std::enable_if<!std::is_floating_point<T>::value, T>::type -stableMean(const T* vec, size_t len, size_t stride) { +typename std::enable_if_t<!std::is_floating_point<T>::value, double> +stableMean(const T* vec, std::size_t len, std::size_t stride) { double mean = 0; for (size_t i = 0; i < len; ++i) { - mean = std::fma<double>(vec[i * stride] - mean, 1.0f / (i + 1), mean); + mean = std::fma<double>(static_cast<double>(vec[i * stride]) - mean, 1.0 / static_cast<double>(i + 1), mean); } return mean; } template <typename T> -typename std::enable_if<std::is_floating_point<T>::value, T>::type +typename std::enable_if_t<std::is_floating_point<T>::value, T> castFromFloat(T value) { return value; } template <typename T> -typename std::enable_if<!std::is_floating_point<T>::value, T>::type +typename std::enable_if_t<!std::is_floating_point<T>::value, T> castFromFloat(double value) { return static_cast<T>(std::nearbyint(value)); } @@ -102,13 +105,13 @@ void ReduceMeanImpl_cpu_forward_kernel(const std::vector<std::int32_t>& axes, } // Type should be the return type of stableMean<I>(), which is always floating point - const decltype(stableMean<I>(input, 0, 0))* inputAccumulation = nullptr; - decltype(stableMean<I>(input, 0, 0))* 
outputAccumulation = nullptr; + const Acc_T<I>* inputAccumulation = nullptr; + Acc_T<I>* outputAccumulation = nullptr; for (const auto& axisInt : axes) { const std::size_t a = static_cast<std::size_t>(axisInt); outputElements /= inputDims[a]; - outputAccumulation = new I[outputElements]; + outputAccumulation = new Acc_T<I>[outputElements]; const std::size_t dim_i = inputDims[a]; for (std::size_t pre = 0; pre < stride_pre[a]; ++pre) { for (std::size_t post = 0; post < stride_post[a]; ++post) { @@ -118,7 +121,7 @@ void ReduceMeanImpl_cpu_forward_kernel(const std::vector<std::int32_t>& axes, outputAccumulation[idx_o] = stableMean<I>(input + idx_i, dim_i, stride_post[a]); } else { - outputAccumulation[idx_o] = stableMean<I>(inputAccumulation + idx_i, dim_i, stride_post[a]); + outputAccumulation[idx_o] = stableMean<Acc_T<I>>(inputAccumulation + idx_i, dim_i, stride_post[a]); } } } diff --git a/include/aidge/backend/cpu/operator/ResizeImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ResizeImpl_kernels.hpp index 6449417baf855620669aba11ebca16d9384c4e7c..477f18cde5737bc26851280a25b0d0967538dfd8 100644 --- a/include/aidge/backend/cpu/operator/ResizeImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/ResizeImpl_kernels.hpp @@ -50,12 +50,13 @@ void ResizeImpl_cpu_forward_kernel( outputDims.cend(), 1, std::multiplies<DimSize_t>()); - std::vector<float> coordInApprox(inputDims.size()); - std::vector<std::size_t> coordIn(inputDims.size()); - std::vector<DimSize_t> coordOut; - for (DimSize_t idxFlatOut = 0; idxFlatOut < outputLen; ++idxFlatOut) { - coordOut = Tensor::toCoord(outputDims, idxFlatOut); - coordInApprox = + +#ifdef _OPENMP + #pragma omp parallel for if (outputLen >= 16) +#endif + for (int idxFlatOut = 0; idxFlatOut < static_cast<int>(outputLen); ++idxFlatOut) { + const auto coordOut = Tensor::toCoord(outputDims, idxFlatOut); + auto coordInApprox = Interpolation::untransformCoordinates(coordOut, inputDims, outputDims, @@ -72,6 +73,7 @@ void ResizeImpl_cpu_forward_kernel( coordInApprox[i] = std::ceil(coordInApprox[i] - 0.5f); } } + std::vector<std::size_t> coordIn(inputDims.size()); if (Tensor::isInBounds<float>(inputDims, coordInApprox)) { for (std::size_t i = 0; i < coordInApprox.size(); ++i) { coordIn[i] = static_cast<std::size_t>(coordInApprox[i]); diff --git a/include/aidge/backend/cpu/operator/RoundImpl_kernels.hpp b/include/aidge/backend/cpu/operator/RoundImpl_kernels.hpp index ba9c63bc3618ba81e238d7721147c894b54cf832..7ac4319b2b10241dda2617db7df40a7947eb17ff 100644 --- a/include/aidge/backend/cpu/operator/RoundImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/RoundImpl_kernels.hpp @@ -21,14 +21,14 @@ namespace Aidge { template <class I, class O> -void RoundImpl_cpu_forward_kernel(const std::size_t inputLenght, +void RoundImpl_cpu_forward_kernel(const std::size_t inputLength, const void* input_, void* output_) { const I* input = static_cast<const I*>(input_); O* output = static_cast<O*>(output_); - for (std::size_t i = 0; i < inputLenght; ++i) { + for (std::size_t i = 0; i < inputLength; ++i) { //std::round would not work since it doesn't follow the halves rules (See ONNX Round) output[i] = static_cast<O>(std::nearbyint(static_cast<float>(input[i]))); } diff --git a/include/aidge/backend/cpu/operator/ScalingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ScalingImpl_kernels.hpp index c758c9cf39e76bb370c6d03c28e3a670c280eefc..f9ca00b73193c9dbd54d286125da1f084ae25587 100644 --- a/include/aidge/backend/cpu/operator/ScalingImpl_kernels.hpp +++ 
b/include/aidge/backend/cpu/operator/ScalingImpl_kernels.hpp @@ -76,14 +76,14 @@ template <class I, class O> void ScalingImpl_cpu_forward_kernel(const float scalingFactor, const std::size_t quantizedNbBits, const bool isOutputUnsigned, - std::size_t inputLenght, + std::size_t inputLength, const void* input_, void* output_) { const I* input = static_cast<const I*>(input_); O* output = static_cast<O*>(output_); - for (std::size_t i = 0; i < inputLenght; ++i) { + for (std::size_t i = 0; i < inputLength; ++i) { output[i] = static_cast<O>(input[i] * static_cast<I>(scalingFactor)); if(quantizedNbBits > 0) { diff --git a/include/aidge/backend/cpu/operator/SigmoidImpl_kernels.hpp b/include/aidge/backend/cpu/operator/SigmoidImpl_kernels.hpp index dfd71ce0a878efbeb779f3a67ad4ccc762bb8363..b3446dba400e8a771e163d641ee64fd092518bac 100644 --- a/include/aidge/backend/cpu/operator/SigmoidImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/SigmoidImpl_kernels.hpp @@ -18,15 +18,15 @@ namespace Aidge { template <class I, class O> -void SigmoidImpl_cpu_forward_kernel(std::size_t inputLenght, +void SigmoidImpl_cpu_forward_kernel(std::size_t inputLength, const void* input_, void* output_) { const I* input = static_cast<const I*>(input_); O* output = static_cast<O*>(output_); -//#pragma omp parallel for if (inputLenght > 1024) - for (std::size_t i = 0; i < inputLenght; ++i) { +//#pragma omp parallel for if (inputLength > 1024) + for (std::size_t i = 0; i < inputLength; ++i) { if (input[i] > I(0)) { output[i] = O(1) / (O(1) + std::exp(-input[i])); } else { @@ -36,14 +36,14 @@ void SigmoidImpl_cpu_forward_kernel(std::size_t inputLenght, } template <class O, class GI, class GO> -void SigmoidImpl_cpu_backward_kernel(const std::size_t inputLenght, +void SigmoidImpl_cpu_backward_kernel(const std::size_t inputLength, const void* output_, const void* grad_output_, void* grad_input_) { const O* output = static_cast<const O*>(output_); const GO* grad_output = static_cast<const GO*>(grad_output_); GI* grad_input = static_cast<GI*>(grad_input_); - for (std::size_t i = 0; i < inputLenght; ++i) { - grad_input[i] = output[i] * (O(1) - output[i]) * grad_output[i]; + for (std::size_t i = 0; i < inputLength; ++i) { + grad_input[i] += output[i] * (O(1) - output[i]) * grad_output[i]; } } diff --git a/include/aidge/backend/cpu/operator/SliceImpl_kernels.hpp b/include/aidge/backend/cpu/operator/SliceImpl_kernels.hpp index d290c40f26270a789c2d328f98560c65ecac1559..9ae425347b47601f7d3b1cb3e4710e9c13dd926f 100644 --- a/include/aidge/backend/cpu/operator/SliceImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/SliceImpl_kernels.hpp @@ -48,13 +48,16 @@ void SliceImpl_cpu_forward_kernel(const std::vector<std::int64_t>& starts, static_cast<DimSize_t>(starts[i]) : static_cast<DimSize_t>(starts[i] + static_cast<std::int64_t>(inputDims[axis])), dims[axis]-1); - const DimSize_t end = ends[i] >= 0 ? + const DimSize_t end = std::min(ends[i] >= 0 ? 
static_cast<DimSize_t>(ends[i]) : - static_cast<DimSize_t>(ends[i] + static_cast<std::int64_t>(inputDims[axis])); + static_cast<DimSize_t>(ends[i] + static_cast<std::int64_t>(inputDims[axis])), + dims[axis]); const std::int64_t step = steps[i]; const std::size_t sliceSize = static_cast<std::size_t>(std::ceil((static_cast<float>(end) - static_cast<float>(start)) / static_cast<float>(step))); + totalSize /= dims[axis]; + totalSize *= sliceSize; outputAccumulation = new I[totalSize]; const std::size_t stride_pre = std::accumulate(dims.cbegin(), dims.cbegin() + axis, 1, std::multiplies<std::size_t>()); const std::size_t stride_post = std::accumulate(dims.crbegin(), dims.crbegin() + nbDims -1 - axis, 1, std::multiplies<std::size_t>()); @@ -62,17 +65,13 @@ void SliceImpl_cpu_forward_kernel(const std::vector<std::int64_t>& starts, { const std::size_t idx_in = outer * stride_post * dims[axis] + start * stride_post; const std::size_t idx_out = outer * stride_post * sliceSize; - std::size_t addedSlices = 0; for (std::size_t inner = 0; inner < sliceSize; ++inner) { std::copy_n(std::next(inputAccumulation, idx_in + inner * step * stride_post), stride_post, - std::next(outputAccumulation, idx_out + addedSlices * stride_post)); - addedSlices++; + std::next(outputAccumulation, idx_out + inner * stride_post)); } } - totalSize /= dims[axis]; - totalSize *= sliceSize; dims[axis] = sliceSize; if (inputAccumulation != input) { diff --git a/include/aidge/backend/cpu/operator/SoftmaxImpl_kernels.hpp b/include/aidge/backend/cpu/operator/SoftmaxImpl_kernels.hpp index 07486a48f1b8cf29f6a6ef8aa934a9decdbafef7..0e72710cac4004876e8026ccdfbc38cb7c2618eb 100644 --- a/include/aidge/backend/cpu/operator/SoftmaxImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/SoftmaxImpl_kernels.hpp @@ -37,8 +37,11 @@ void SoftmaxImpl_cpu_forward_kernel(std::size_t axisIdx, const std::vector<DimSi preAxisElems *= inputDims[i]; } - for (std::size_t i = 0; i < preAxisElems; ++i) { - for (std::size_t j = 0; j < postAxisElems; ++j) { +#ifdef _OPENMP + #pragma omp parallel for collapse(2) if (preAxisElems * postAxisElems >= 16) +#endif + for (int i = 0; i < static_cast<int>(preAxisElems); ++i) { + for (int j = 0; j < static_cast<int>(postAxisElems); ++j) { I maxVal = input[i * inputDims[axisIdx] * postAxisElems + j]; for (std::size_t k = 1; k < inputDims[axisIdx]; ++k) { std::size_t inIdx = i * inputDims[axisIdx] * postAxisElems + k * postAxisElems + j; diff --git a/include/aidge/backend/cpu/operator/SqrtImpl.hpp b/include/aidge/backend/cpu/operator/SqrtImpl.hpp index dba75d1c58fb19ab2284ee0e98a32bff7ac58557..2f24277fe4c02d52d7260d788a5dcd92a08c4d48 100644 --- a/include/aidge/backend/cpu/operator/SqrtImpl.hpp +++ b/include/aidge/backend/cpu/operator/SqrtImpl.hpp @@ -26,7 +26,7 @@ namespace Aidge { // Operator implementation entry point for the backend using SqrtImpl_cpu = OperatorImpl_cpu<Sqrt_Op, void(const std::size_t, const void*, void*), - void(const std::size_t, const void*, void*)>; + void(const std::size_t, const void*, const void*, void*)>; // Implementation entry point registration to Operator REGISTRAR(Sqrt_Op, "cpu", Aidge::SqrtImpl_cpu::create); diff --git a/include/aidge/backend/cpu/operator/SqrtImpl_kernels.hpp b/include/aidge/backend/cpu/operator/SqrtImpl_kernels.hpp index 0464119cad60742bc58c79da984b30776bc7932f..beddc74d95b16d08074c675f189c7e30061d4e28 100644 --- a/include/aidge/backend/cpu/operator/SqrtImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/SqrtImpl_kernels.hpp @@ -21,28 +21,30 @@ namespace Aidge { 
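The backward kernels touched in this patch (ReLU, Sigmoid, Tanh, Ln, LeakyReLU, Heaviside, Mul, Pow, Sub, and Sqrt in the hunk just below) now accumulate into the gradient buffer with += instead of assigning, and the explicit zero-fills are removed. This convention assumes the caller resets gradients to zero before backpropagation; in return, a tensor consumed by several operators receives the sum of all incoming contributions. A minimal sketch of the semantics, with hypothetical helper names:

#include <cstddef>
#include <vector>

// Accumulating ReLU backward: contributions are added, never overwritten.
void reluBackwardAccumulate(const float* input, const float* gradOutput,
                            float* gradInput, std::size_t n) {
    for (std::size_t i = 0; i < n; ++i) {
        gradInput[i] += (input[i] > 0.0f) ? gradOutput[i] : 0.0f;
    }
}

int main() {
    std::vector<float> x{1.0f, -2.0f, 3.0f};
    std::vector<float> gradFromBranchA{1.0f, 1.0f, 1.0f};
    std::vector<float> gradFromBranchB{0.5f, 0.5f, 0.5f};
    std::vector<float> gradX(x.size(), 0.0f);  // caller zero-initialises the gradient

    // Two consumers of the same tensor: their contributions sum up in gradX.
    reluBackwardAccumulate(x.data(), gradFromBranchA.data(), gradX.data(), x.size());
    reluBackwardAccumulate(x.data(), gradFromBranchB.data(), gradX.data(), x.size());
    // gradX is now {1.5f, 0.0f, 1.5f}
    return 0;
}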
template <class I, class O> -void SqrtImpl_cpu_forward_kernel(const std::size_t inputLenght, +void SqrtImpl_cpu_forward_kernel(const std::size_t inputLength, const void* input_, void* output_) { const I* input = static_cast<const I*>(input_); O* output = static_cast<O*>(output_); - for (std::size_t i = 0; i < inputLenght; ++i) { + for (std::size_t i = 0; i < inputLength; ++i) { output[i] = static_cast<O>(std::sqrt(static_cast<float>(input[i]))); } } template <class I, class O> -void SqrtImpl_cpu_backward_kernel(const std::size_t inputLenght, - const void* input_, - void* output_) { +void SqrtImpl_cpu_backward_kernel(const std::size_t inputLength, + const void* output_, + const void* grad_output_, + void* grad_input_) { - const I* input = static_cast<const I*>(input_); - O* output = static_cast<O*>(output_); + const I* output = static_cast<const I*>(output_); + const I* grad_output = static_cast<const I*>(grad_output_); + O* grad_input = static_cast<O*>(grad_input_); - for (std::size_t i = 0; i < inputLenght; ++i) { - output[i] = static_cast<O>(0.5/(std::sqrt(static_cast<float>(input[i])))); + for (std::size_t i = 0; i < inputLength; ++i) { + grad_input[i] += static_cast<O>(0.5/output[i]) * grad_output[i]; } } diff --git a/include/aidge/backend/cpu/operator/SubImpl.hpp b/include/aidge/backend/cpu/operator/SubImpl.hpp index eed26ddcc9f57b3bb7796049a62f3f6be7de4eb5..1f94ff139c319916fed68120317c5f1931619495 100644 --- a/include/aidge/backend/cpu/operator/SubImpl.hpp +++ b/include/aidge/backend/cpu/operator/SubImpl.hpp @@ -15,15 +15,23 @@ #include "aidge/backend/cpu/operator/OperatorImpl.hpp" #include "aidge/operator/Sub.hpp" #include "aidge/utils/Registrar.hpp" -#include "aidge/utils/Types.h" -#include "aidge/backend/cpu/data/GetCPUPtr.h" -#include <memory> + #include <vector> namespace Aidge { // Operator implementation entry point for the backend using SubImpl_cpu = OperatorImpl_cpu<Sub_Op, - void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*,void*)>; + void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*,void*), + void(const std::size_t, + const std::size_t, + const std::size_t, + const std::vector<std::size_t>&, + const std::vector<std::size_t>&, + const std::vector<std::size_t>&, + const void*, + void*, + void*) +>; // Implementation entry point registration to Operator REGISTRAR(Sub_Op, "cpu", Aidge::SubImpl_cpu::create); diff --git a/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp b/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp index 1d789c3c8886d35ce6597d5704c76060bad196c1..751177a7c845fc55f2c2e932c0f6ba76bd0ba9ad 100644 --- a/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp @@ -42,6 +42,7 @@ void sub_contiguous_arrays(const std::size_t input1size, namespace Aidge { + template <class I1, class I2, class O> void SubImpl_cpu_forward_kernel(std::vector<std::size_t> dims0, std::vector<std::size_t> dims1, @@ -149,10 +150,52 @@ void SubImpl_cpu_forward_kernel(std::vector<std::size_t> dims0, } } +template <class I1, class I2, class O> +void SubImpl_cpu_backward_kernel(const std::size_t input0Length, + const std::size_t input1Length, + const std::size_t gradOutputLength, + const std::vector<std::size_t>& dims0, + const std::vector<std::size_t>& dims1, + const std::vector<std::size_t>& outputDims, + const void* grad_output_, + void* gradientInput0_, + void* gradientInput1_) +{ + const O* 
grad_output = static_cast<const O*>(grad_output_); + auto* grad_input_0 = static_cast<I1*>(gradientInput0_); + auto* grad_input_1 = static_cast<I2*>(gradientInput1_); + + auto broadcastedDims0 = getBroadcastedDims(outputDims, dims0); + auto broadcastedDims1 = getBroadcastedDims(outputDims, dims1); + + for (std::size_t i = 0; i < gradOutputLength; ++i) { + auto idxOutputGrad = getMultiDimIndices(outputDims, i); + std::vector<std::size_t> idxInput0(broadcastedDims0.size()); + std::vector<std::size_t> idxInput1(broadcastedDims1.size()); + + for (std::size_t dimension = 0; dimension < broadcastedDims0.size(); ++dimension) { + idxInput0[dimension] = (broadcastedDims0[dimension] == 1) ? 0 : idxOutputGrad[dimension]; + } + + for (std::size_t dimension = 0; dimension < broadcastedDims1.size(); ++dimension) { + idxInput1[dimension] = (broadcastedDims1[dimension] == 1) ? 0 : idxOutputGrad[dimension]; + } + + auto idx0 = getFlattenedIndex(broadcastedDims0, idxInput0); + auto idx1 = getFlattenedIndex(broadcastedDims1, idxInput1); + + // For subtraction: gradient of first input is 1 * grad_output + grad_input_0[idx0] += static_cast<I1>(grad_output[i]); + // For subtraction: gradient of second input is -1 * grad_output + grad_input_1[idx1] += static_cast<I2>(-grad_output[i]); + } +} + + // Kernels registration to implementation entry point REGISTRAR(SubImpl_cpu, {DataType::Float32}, - {ProdConso::inPlaceModel, Aidge::SubImpl_cpu_forward_kernel<float, float, float>, nullptr}); + {ProdConso::inPlaceModel, Aidge::SubImpl_cpu_forward_kernel<float, float, float>, Aidge::SubImpl_cpu_backward_kernel<float,float,float>}); REGISTRAR(SubImpl_cpu, {DataType::Float64}, {ProdConso::inPlaceModel, Aidge::SubImpl_cpu_forward_kernel<double, double, double>, nullptr}); diff --git a/include/aidge/backend/cpu/operator/TanhImpl_kernels.hpp b/include/aidge/backend/cpu/operator/TanhImpl_kernels.hpp index fdcac210484b11f2220dcc2a6813efed503d1913..ca4510d9353e2ce07102989577b67b81e9f1811c 100644 --- a/include/aidge/backend/cpu/operator/TanhImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/TanhImpl_kernels.hpp @@ -18,28 +18,28 @@ namespace Aidge { template <class I, class O> -void TanhImpl_cpu_forward_kernel(std::size_t inputLenght, +void TanhImpl_cpu_forward_kernel(std::size_t inputLength, const void* input_, void* output_) { const I* input = static_cast<const I*>(input_); O* output = static_cast<O*>(output_); -//#pragma omp parallel for if (inputLenght > 1024) - for (std::size_t i = 0; i < inputLenght; ++i) { +//#pragma omp parallel for if (inputLength > 1024) + for (std::size_t i = 0; i < inputLength; ++i) { output[i] = std::tanh(input[i]); } } template <class O, class GI, class GO> -void TanhImpl_cpu_backward_kernel(const std::size_t inputLenght, +void TanhImpl_cpu_backward_kernel(const std::size_t inputLength, const void* output_, const void* grad_output_, void* grad_input_) { const O* output = static_cast<const O*>(output_); const GO* grad_output = static_cast<const GO*>(grad_output_); GI* grad_input = static_cast<GI*>(grad_input_); - for (std::size_t i = 0; i < inputLenght; ++i) { - grad_input[i] = (O(1) - output[i] * output[i]) * grad_output[i]; + for (std::size_t i = 0; i < inputLength; ++i) { + grad_input[i] += (O(1) - output[i] * output[i]) * grad_output[i]; } } diff --git a/include/aidge/backend/cpu/operator/TopKImpl.hpp b/include/aidge/backend/cpu/operator/TopKImpl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..05849b7657a7290d07091ff90d4f5581bc54a130 --- /dev/null +++ 
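Aside (not part of the patch): the new SubImpl_cpu_backward_kernel above reduces the output gradient back onto broadcast input dimensions. A self-contained sketch of that idea for one concrete case, A of shape (2,3) minus B of shape (1,3), with the Aidge index helpers (getMultiDimIndices / getFlattenedIndex) replaced by explicit index arithmetic:

#include <array>
#include <cstddef>
#include <iostream>

int main() {
    // grad_output has the output shape (2,3); B was broadcast along dim 0.
    const std::array<float, 6> gradOut{1.f, 1.f, 1.f, 1.f, 1.f, 1.f};
    std::array<float, 6> gradA{}; // same shape as the output: gradient copied 1:1
    std::array<float, 3> gradB{}; // broadcast dim collapses: gradients accumulate

    for (std::size_t i = 0; i < 2; ++i) {
        for (std::size_t j = 0; j < 3; ++j) {
            const float g = gradOut[i * 3 + j];
            gradA[i * 3 + j] += g; // d(a-b)/da = +1
            gradB[j] += -g;        // d(a-b)/db = -1, summed over the broadcast dim
        }
    }
    std::cout << gradB[0] << '\n'; // prints -2: two output rows fold onto one row of B
    return 0;
}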
b/include/aidge/backend/cpu/operator/TopKImpl.hpp @@ -0,0 +1,41 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_CPU_OPERATOR_TOPKIMPL_H_ +#define AIDGE_CPU_OPERATOR_TOPKIMPL_H_ + +#include <array> +#include <memory> +#include <tuple> +#include <vector> + +#include "aidge/backend/cpu/operator/OperatorImpl.hpp" +#include "aidge/operator/TopK.hpp" +#include "aidge/utils/Registrar.hpp" +#include "aidge/utils/Types.h" + +namespace Aidge { +// Operator implementation entry point for the backend +using TopKImpl_cpu = OperatorImpl_cpu<TopK_Op, + void(int64_t, + bool, + bool, + IOIndex_t, + const std::vector<DimSize_t>&, + const void*, + void*, + void*)>; + +// Implementation entry point registration to Operator +REGISTRAR(TopK_Op, "cpu", Aidge::TopKImpl_cpu::create); +} // namespace Aidge + +#endif /* AIDGE_CPU_OPERATOR_TOPKIMPL_H_ */ diff --git a/include/aidge/backend/cpu/operator/TopKImpl_kernels.hpp b/include/aidge/backend/cpu/operator/TopKImpl_kernels.hpp new file mode 100644 index 0000000000000000000000000000000000000000..9b219deb34bc323b3310cc0d86c7bd23ccb1c1cf --- /dev/null +++ b/include/aidge/backend/cpu/operator/TopKImpl_kernels.hpp @@ -0,0 +1,101 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_CPU_OPERATOR_TOPKIMPL_KERNELS_H_ +#define AIDGE_CPU_OPERATOR_TOPKIMPL_KERNELS_H_ + +#include <algorithm> // std::for_each +#include <cstddef> // std::size_t +#include <cstdint> // std::int32_t +#include <functional> //std::multiplies +#include <numeric> //std::accumulate +#include <vector> + +#include "aidge/backend/cpu/operator/TopKImpl.hpp" +#include "aidge/data/Data.hpp" +#include "aidge/operator/TopK.hpp" +#include "aidge/utils/Registrar.hpp" + +namespace Aidge { + +template <class I, class O> +void TopKImpl_cpu_forward_kernel(int64_t axis, + bool largest, + bool /*sorted*/, + IOIndex_t k, + const std::vector<DimSize_t>& inputDims, + const void* input_, + void* output_, + void* indices_) +{ + const I* input = static_cast<const I*>(input_); + O* output = static_cast<O*>(output_); + int64_t* indices = static_cast<int64_t*>(indices_); + + const std::size_t nb_dims = inputDims.size(); + const std::size_t stride_pre = std::accumulate(inputDims.cbegin(), inputDims.cbegin() + axis, 1, std::multiplies<std::size_t>()); + const std::size_t stride_post = std::accumulate(inputDims.crbegin(), inputDims.crbegin() + nb_dims -1 - axis, 1, std::multiplies<std::size_t>()); + + const std::size_t dim_i = inputDims[axis]; + std::vector<std::pair<I, int64_t>> buffer(dim_i); + +#ifdef _OPENMP + #pragma omp parallel for collapse(2) if (stride_pre * stride_post >= 16) +#endif + for (int pre = 0; pre < static_cast<int>(stride_pre); ++pre) { + for (int post = 0; post < static_cast<int>(stride_post); ++post) { + const std::size_t idx_i = pre * dim_i * stride_post + post; + const std::size_t idx_o = pre * k * stride_post + post; + + for (size_t i = 0; i < dim_i; ++i) { + const auto idx = idx_i + i * stride_post; + buffer[i] = std::make_pair(input[idx], i); + } + + if (largest) { + std::partial_sort(buffer.begin(), buffer.begin() + k, buffer.end(), + [](const auto& lhs, const auto& rhs) { return lhs.first > rhs.first; }); + } + else { + std::partial_sort(buffer.begin(), buffer.begin() + k, buffer.end(), + [](const auto& lhs, const auto& rhs) { return lhs.first < rhs.first; }); + } + + for (size_t i = 0; i < k; ++i) { + output[idx_o + i] = buffer[i].first; + indices[idx_o + i] = buffer[i].second; + } + } + } +} + +// Kernels registration to implementation entry point +REGISTRAR(TopKImpl_cpu, + { + {{DataType::Float32}, {DataType::Any}}, + {{DataType::Float32}, {DataType::Int64}} + }, + {ProdConso::inPlaceModel, Aidge::TopKImpl_cpu_forward_kernel<float, float>, nullptr}); +REGISTRAR(TopKImpl_cpu, + { + {{DataType::Float64}, {DataType::Any}}, + {{DataType::Float64}, {DataType::Int64}} + }, + {ProdConso::inPlaceModel, Aidge::TopKImpl_cpu_forward_kernel<double, double>, nullptr}); +REGISTRAR(TopKImpl_cpu, + { + {{DataType::Int32}, {DataType::Any}}, + {{DataType::Int32}, {DataType::Int64}} + }, + {ProdConso::inPlaceModel, Aidge::TopKImpl_cpu_forward_kernel<int32_t, int32_t>, nullptr}); +} // namespace Aidge + +#endif /* AIDGE_CPU_OPERATOR_TOPKIMPL_KERNELS_H_ */ diff --git a/pyproject.toml b/pyproject.toml index 39bed4d209581b272a8491fbce6c3f28029fdd57..1e8869ad5163477af5201e53e48bd3a6ff2ffbdf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ description="CPU implementation of operators of the AIDGE framework" dependencies = [ "numpy", ] -requires-python = ">= 3.8" +requires-python = ">= 3.10" readme = "README.md" license = { file = "LICENSE" } classifiers = [ 
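Aside (not part of the patch): the TopK kernel above pairs each value along the reduced axis with its index and uses std::partial_sort so only the first k entries are ordered. The same selection step in isolation:

#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

// Returns the k largest (value, index) pairs of `values`, best first (k <= values.size()).
std::vector<std::pair<float, std::int64_t>> topK(const std::vector<float>& values, std::size_t k) {
    std::vector<std::pair<float, std::int64_t>> buffer(values.size());
    for (std::size_t i = 0; i < values.size(); ++i) {
        buffer[i] = {values[i], static_cast<std::int64_t>(i)};
    }
    // Only the first k positions end up sorted; the tail is left in unspecified order.
    std::partial_sort(buffer.begin(), buffer.begin() + k, buffer.end(),
                      [](const auto& lhs, const auto& rhs) { return lhs.first > rhs.first; });
    buffer.resize(k);
    return buffer;
}
// topK({3.f, 9.f, 1.f, 7.f}, 2) yields {(9, 1), (7, 3)}.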
diff --git a/setup.py b/setup.py index 366a4825da5d8ad369834b2231152a8c0424c9e8..ee54ca953b11b7f4d0aabe42aed33345bafbc21c 100644 --- a/setup.py +++ b/setup.py @@ -88,6 +88,7 @@ class AidgePkgBuild(build_ext): f"-DCMAKE_CXX_COMPILER={cxx_compiler}", f"-DENABLE_ASAN={asan}", "-DPYBIND=ON", + "-DPYBIND11_FINDPYTHON=ON", f"-DPYBIND_INSTALL_PREFIX:PATH={pybind_install_prefix}", "-DCMAKE_EXPORT_COMPILE_COMMANDS=1", "-DCOVERAGE=OFF", diff --git a/src/data/Interpolation.cpp b/src/data/Interpolation.cpp index fbf224d84f65c442e98967783d303605a177d390..24aeeb9f8ebd284172c82622dce7c1cde4235e8c 100644 --- a/src/data/Interpolation.cpp +++ b/src/data/Interpolation.cpp @@ -39,13 +39,14 @@ InterpolationCPU::linearRecurse(const std::vector<float> &coordToInterpolate, return points; } - auto extractPtCoords = [](std::set<Point<T>> pts) -> std::set<Coords> { - std::set<Coords> result; - for (const auto &pt : pts) { - result.insert(pt.first); - } - return result; - }; + // :!\ Warning: seems to be unused now + // auto extractPtCoords = [](std::set<Point<T>> pts) -> std::set<Coords> { + // std::set<Coords> result; + // for (const auto &pt : pts) { + // result.insert(pt.first); + // } + // return result; + // }; /////////////////// // ERROR CHECKING if (alongDim > coordToInterpolate.size() || points.size() == 0) { @@ -79,10 +80,10 @@ InterpolationCPU::linearRecurse(const std::vector<float> &coordToInterpolate, pointsCoords, alongDim); } - Log::debug("\nEntering linear recurse with {} points.", points.size()); - Log::debug("Points : {}", extractPtCoords(points)); - Log::debug("coordsToInterpolate : {}", coordToInterpolate); - Log::debug("alongDim : {}", alongDim); + //Log::debug("\nEntering linear recurse with {} points.", points.size()); + //Log::debug("Points : {}", extractPtCoords(points)); + //Log::debug("coordsToInterpolate : {}", coordToInterpolate); + //Log::debug("alongDim : {}", alongDim); /////////////////// // COMPUTATION @@ -98,9 +99,9 @@ InterpolationCPU::linearRecurse(const std::vector<float> &coordToInterpolate, upperPoints.insert(point); } } - Log::debug("alongDim : {}", alongDim); - Log::debug("lowerPoints : {}", extractPtCoords(lowerPoints)); - Log::debug("upperPoints : {}", extractPtCoords(upperPoints)); + //Log::debug("alongDim : {}", alongDim); + //Log::debug("lowerPoints : {}", extractPtCoords(lowerPoints)); + //Log::debug("upperPoints : {}", extractPtCoords(upperPoints)); // Here are 3 cases // 1. 
upper/lowerPoints.size() == 0 @@ -174,7 +175,7 @@ InterpolationCPU::linearRecurse(const std::vector<float> &coordToInterpolate, // 0 is just a sanity check to ensure later that all dims have been // interpolate interpolatedPoint.first[alongDim] = 0; - Log::debug("successfully returned from alongDim : {}", alongDim); + //Log::debug("successfully returned from alongDim : {}", alongDim); return std::set<Point<T>>({interpolatedPoint}); } diff --git a/src/operator/AddImpl.cpp b/src/operator/AddImpl.cpp index 101743eccb606c998a38f49dd9b89f5ec279bcae..cff6128741db657136aca1006c0f273ce64aa87a 100644 --- a/src/operator/AddImpl.cpp +++ b/src/operator/AddImpl.cpp @@ -55,5 +55,26 @@ void Aidge::AddImpl_cpu::forward() { template <> void Aidge::AddImpl_cpu::backward() { - AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Add_Op on backend cpu"); + const Add_Op& op_ = dynamic_cast<const Add_Op&>(mOp); + + auto in0 = op_.getInput(0); + auto in1 = op_.getInput(1); + auto in0grad = op_.getInput(0)->grad(); + auto in1grad = op_.getInput(1)->grad(); + auto out0grad = op_.getOutput(0)->grad(); + + // Find the correct kernel type + const auto impl = Registrar<AddImpl_cpu>::create(getBestMatch(getRequiredSpec())); + + // Call kernel + impl.backward(in0grad->size(), + in1grad->size(), + out0grad->size(), + in0->dims(), + in1->dims(), + out0grad->dims(), + getCPUPtr(out0grad), + getCPUPtr(in0grad), + getCPUPtr(in1grad)); + } diff --git a/src/operator/ArgMaxImpl.cpp b/src/operator/ArgMaxImpl.cpp index b8fb85a7cd86a788cda69307d5ed8f363619f9f0..5829070a4d619f370ae5ddebfddf133e6f6d9003 100644 --- a/src/operator/ArgMaxImpl.cpp +++ b/src/operator/ArgMaxImpl.cpp @@ -21,12 +21,13 @@ template <> void Aidge::ArgMaxImpl_cpu::forward() { const ArgMax_Op& op_ = dynamic_cast<const ArgMax_Op&>(mOp); + std::int32_t axis = (op_.axis() >= 0) ? 
op_.axis() : op_.getInput(0)->nbDims() + op_.axis(); // Find the correct kernel type const auto impl = Registrar<ArgMaxImpl_cpu>::create(getBestMatch(getRequiredSpec())); // Call kernel - impl.forward(op_.axis(), + impl.forward(axis, op_.selectLastIndex(), op_.getInput(0)->dims(), op_.getInput(0)->getImpl()->rawPtr(), diff --git a/src/operator/AvgPoolingImpl.cpp b/src/operator/AvgPoolingImpl.cpp index 01a5e8cf1772161f5cf98d3a8bd52f43ac7a1d0d..eb5ef87bd16620cdef33330bd7b39ece1783ecfc 100644 --- a/src/operator/AvgPoolingImpl.cpp +++ b/src/operator/AvgPoolingImpl.cpp @@ -32,7 +32,9 @@ void Aidge::AvgPoolingImpl2D_cpu::forward() { // Call kernel impl.forward(op_.strideDims(), op_.kernelDims(), + op_.dilations(), op_.getInput(0)->template dims<4>(), + op_.ceilMode(), getCPUPtr(op_.getInput(0)), getCPUPtr(op_.getOutput(0))); } diff --git a/src/operator/BitShiftImpl.cpp b/src/operator/BitShiftImpl.cpp index c6940554dd925905a18de66651707c3d58594ade..ad41cb153f82e1131b1b6ef970a362b6af957bb9 100644 --- a/src/operator/BitShiftImpl.cpp +++ b/src/operator/BitShiftImpl.cpp @@ -33,6 +33,7 @@ void Aidge::BitShiftImpl_cpu::forward() { // Call kernel impl.forward( op_.direction(), + op_.rounding(), op_.getInput(0)->dims(), op_.getInput(1)->dims(), op_.getOutput(0)->dims(), diff --git a/src/operator/ConstantOfShapeImpl.cpp b/src/operator/ConstantOfShapeImpl.cpp index 16e4b762ba04e5f01bfccf965f6de3650fa2e734..1d41160b7738f4d8d8af103f25a0f3554f1e4442 100644 --- a/src/operator/ConstantOfShapeImpl.cpp +++ b/src/operator/ConstantOfShapeImpl.cpp @@ -13,15 +13,14 @@ #include <functional> #include <memory> -#include <vector> +#include <stdexcept> // std::runtime_error #include "aidge/backend/cpu/operator/ConstantOfShapeImpl_kernels.hpp" -#include "aidge/data/Data.hpp" #include "aidge/data/Tensor.hpp" #include "aidge/operator/ConstantOfShape.hpp" +#include "aidge/backend/OperatorImpl.hpp" // Aidge::getBestMatch, Aidge::getRequiredSpec #include "aidge/utils/ErrorHandling.hpp" #include "aidge/utils/Registrar.hpp" -#include "aidge/utils/Types.h" template <> void Aidge::ConstantOfShapeImpl_cpu::forward() { @@ -33,9 +32,7 @@ void Aidge::ConstantOfShapeImpl_cpu::forward() { const auto impl = Registrar<ConstantOfShapeImpl_cpu>::create(getBestMatch(getRequiredSpec())); // Call kernel - impl.forward(op_.getOutput(0)->dims(), - op_.value(), - op_.getOutput(0)->getImpl()->rawPtr()); + impl.forward(op_.getOutput(0), op_.value()); } template <> diff --git a/src/operator/ConvImpl.cpp b/src/operator/ConvImpl.cpp index fdfe19fbf4bf3e71c86aa28b966cfb21a1b5ba40..eae5f109f6af8298b90cc8e505ff44eff51bab5c 100644 --- a/src/operator/ConvImpl.cpp +++ b/src/operator/ConvImpl.cpp @@ -12,18 +12,18 @@ #include "aidge/backend/cpu/operator/ConvImpl.hpp" #include "aidge/backend/cpu/operator/ConvImpl_kernels.hpp" -#include <cassert> -#include <chrono> // std::chrono::milliseconds -#include <numeric> // std::accumulate -#include <thread> // std::this_thread::sleep_for +#include <memory> #include <vector> #include "aidge/backend/cpu/data/GetCPUPtr.h" #include "aidge/operator/Conv.hpp" +#include "aidge/utils/ErrorHandling.hpp" #include "aidge/utils/Types.h" +namespace Aidge { + template <> -void Aidge::ConvImpl1D_cpu::forward() { +void ConvImpl1D_cpu::forward() { const auto& op_ = static_cast<const Conv_Op<1>&>(mOp); // FIXME: uncomment the following code once memory handling will work @@ -43,25 +43,65 @@ void Aidge::ConvImpl1D_cpu::forward() { const auto& input2 = (op_.getInput(2)) ? 
op_.getInput(2)->refCastFrom(input2Fallback, *op_.getOutput(0)) : Tensor(); // Call kernel - impl.forward(op_.strideDims(), - op_.dilationDims(), - op_.kernelDims(), - op_.getInput(0)->template dims<3>(), // input dimensions - dynamic_cast<const Conv_Op<1>&>(mOp).outChannels(), // outChannels - input0.getImpl()->rawPtr(), // input - input1.getImpl()->rawPtr(), // weight - op_.getInput(2) ? input2.getImpl()->rawPtr() : nullptr, // bias - getCPUPtr(mOp.getRawOutput(0)) // output - ); + impl.forward( + op_.strideDims(), + op_.dilationDims(), + op_.kernelDims(), + op_.getInput(0)->template dims<3>(), // input dimensions + dynamic_cast<const Conv_Op<1> &>(mOp).outChannels(), // outChannels + input0.getImpl()->rawPtr(), // input + input1.getImpl()->rawPtr(), // weight + op_.getInput(2) ? input2.getImpl()->rawPtr() : nullptr, // bias + getCPUPtr(mOp.getRawOutput(0)) // output + ); } template <> -void Aidge::ConvImpl1D_cpu::backward() { - AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Conv_Op<1> on backend cpu"); +void ConvImpl1D_cpu::backward() { + const auto &op = dynamic_cast<const Conv1D_Op &>(mOp); + const auto &outputGrad = op.getOutput(0)->grad(); + AIDGE_ASSERT(outputGrad, "{}: missing ouput #0 gradient", op.type()); + AIDGE_ASSERT(op.getInput(0)->grad(), + "{}: missing data input(#0) gradient", + op.type()); + AIDGE_ASSERT(op.getInput(1)->grad(), + "{}: missing weight input(#1) gradient", + op.type()); + + std::shared_ptr<Tensor> inputDataGradFallback, inputWeightGradFallback, + inputBiasGradFallback; + const auto &inputDataGrad = + op.getInput(0)->grad()->refCastFrom(inputDataGradFallback, + *(op.getOutput(0))); + const auto &inputWeightGrad = + op.getInput(1)->grad()->refCastFrom(inputWeightGradFallback, + *(op.getOutput(0))); + const auto &inputBiasGrad = + (op.getInput(2) && op.getInput(2)->grad()) + ? op.getInput(2)->grad()->refCastFrom(inputBiasGradFallback, + *(op.getOutput(0))) + : Tensor(); + + // Call kernel + const auto impl = + Registrar<ConvImpl1D_cpu>::create(getBestMatch(getRequiredSpec())); + impl.backward( + op.strideDims(), + op.dilationDims(), + op.kernelDims(), + op.getInput(0)->template dims<3>(), + op.getOutput(0)->template dims<3>(), + + getCPUPtr(op.getInput(0)), + getCPUPtr(op.getInput(1)), + getCPUPtr(outputGrad), + inputDataGrad.getImpl()->rawPtr(), + inputWeightGrad.getImpl()->rawPtr(), + op.getInput(2) ? 
inputBiasGrad.getImpl()->rawPtr() : nullptr); } template <> -void Aidge::ConvImpl2D_cpu::forward() { +void ConvImpl2D_cpu::forward() { const auto& op_ = dynamic_cast<const Conv_Op<2>&>(mOp); // FIXME: uncomment the following code once memory handling will work @@ -93,7 +133,49 @@ void Aidge::ConvImpl2D_cpu::forward() { ); } + template <> -void Aidge::ConvImpl2D_cpu::backward() { - AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Conv_Op<2> on backend cpu"); +void ConvImpl2D_cpu::backward() { + const auto &op = dynamic_cast<const Conv2D_Op &>(mOp); + const auto &outputGrad = op.getOutput(0)->grad(); + AIDGE_ASSERT(outputGrad, "{}: missing ouput #0 gradient", op.type()); + AIDGE_ASSERT(op.getInput(0)->grad(), + "{}: missing data input(#0) gradient", + op.type()); + AIDGE_ASSERT(op.getInput(1)->grad(), + "{}: missing weight input(#1) gradient", + op.type()); + + std::shared_ptr<Tensor> inputDataGradFallback, inputWeightGradFallback, + inputBiasGradFallback; + const auto &inputDataGrad = + op.getInput(0)->grad()->refCastFrom(inputDataGradFallback, + *(op.getOutput(0))); + const auto &inputWeightGrad = + op.getInput(1)->grad()->refCastFrom(inputWeightGradFallback, + *(op.getOutput(0))); + const auto &inputBiasGrad = + (op.getInput(2) && op.getInput(2)->grad()) + ? op.getInput(2)->grad()->refCastFrom(inputBiasGradFallback, + *(op.getOutput(0))) + : Tensor(); + + // Call kernel + const auto impl = + Registrar<ConvImpl2D_cpu>::create(getBestMatch(getRequiredSpec())); + impl.backward( + op.strideDims(), + op.dilationDims(), + op.kernelDims(), + op.getInput(0)->template dims<4>(), + op.getOutput(0)->template dims<4>(), + + getCPUPtr(op.getInput(0)), + getCPUPtr(op.getInput(1)), + getCPUPtr(outputGrad), + inputDataGrad.getImpl()->rawPtr(), + inputWeightGrad.getImpl()->rawPtr(), + op.getInput(2) ? inputBiasGrad.getImpl()->rawPtr() : nullptr); } + +} // namespace Aidge diff --git a/src/operator/ConvTransposeImpl.cpp b/src/operator/ConvTransposeImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d1135cc92dd3c68746b9dcf80739f4f65acdad2e --- /dev/null +++ b/src/operator/ConvTransposeImpl.cpp @@ -0,0 +1,91 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include "aidge/backend/cpu/operator/ConvTransposeImpl.hpp" +#include "aidge/backend/cpu/operator/ConvTransposeImpl_kernels.hpp" + +template <> void Aidge::ConvTransposeImpl1D_cpu::forward() { + const auto &op = static_cast<const ConvTranspose_Op<1> &>(mOp); + + AIDGE_ASSERT(op.getInput(0), "{}: missing data input (#0).", op.type()); + AIDGE_ASSERT(op.getInput(1), "{}: missing bias input (#1).", op.type()); + AIDGE_ASSERT(op.getInput(2), "{}: missing weight input (#1).", op.type()); + + std::shared_ptr<Tensor> inputDataFallback, inputWeightFallback, + inputBiasFallback; + const auto &inputData = + op.getInput(0)->refCastFrom(inputDataFallback, *op.getOutput(0)); + const auto &inputWeight = + op.getInput(1)->refCastFrom(inputWeightFallback, *op.getOutput(0)); + const auto &inputBias = + (op.getInput(2)) + ? 
op.getInput(2)->refCastFrom(inputBiasFallback, *op.getOutput(0)) + : Tensor(); + + // Call kernel + const auto impl = Registrar<ConvTransposeImpl1D_cpu>::create( + getBestMatch(getRequiredSpec())); + impl.forward(op.strideDims(), + op.dilationDims(), + op.kernelDims(), + op.getInput(0)->template dims<3>(), + op.getOutput(0)->template dims<3>(), + inputData.getImpl()->hostPtr(), + inputWeight.getImpl()->hostPtr(), + op.getInput(2) ? inputBias.getImpl()->hostPtr() : nullptr, + op.getOutput(0)->getImpl()->rawPtr()); +} + +template <> void Aidge::ConvTransposeImpl1D_cpu::backward() { + AIDGE_THROW_OR_ABORT( + std::runtime_error, + "Backward not yet implemented for Conv_Op<1> on backend cpu"); +} + +template <> void Aidge::ConvTransposeImpl2D_cpu::forward() { + const auto &op = static_cast<const ConvTranspose_Op<2> &>(mOp); + + AIDGE_ASSERT(op.getInput(0), "{}: missing data input (#0).", op.type()); + AIDGE_ASSERT(op.getInput(1), "{}: missing bias input (#1).", op.type()); + AIDGE_ASSERT(op.getInput(2), "{}: missing weight input (#1).", op.type()); + + std::shared_ptr<Tensor> inputDataFallback, inputWeightFallback, + inputBiasFallback; + const auto &inputData = + op.getInput(0)->refCastFrom(inputDataFallback, *op.getOutput(0)); + const auto &inputWeight = + op.getInput(1)->refCastFrom(inputWeightFallback, *op.getOutput(0)); + const auto &inputBias = + (op.getInput(2)) + ? op.getInput(2)->refCastFrom(inputBiasFallback, *op.getOutput(0)) + : Tensor(); + + // Call kernel + const auto impl = Registrar<ConvTransposeImpl2D_cpu>::create( + getBestMatch(getRequiredSpec())); + + impl.forward(op.strideDims(), + op.dilationDims(), + op.kernelDims(), + op.getInput(0)->template dims<4>(), + op.getOutput(0)->template dims<4>(), + inputData.getImpl()->hostPtr(), + inputWeight.getImpl()->hostPtr(), + op.getInput(2) ? inputBias.getImpl()->hostPtr() : nullptr, + op.getOutput(0)->getImpl()->rawPtr()); +} + +template <> void Aidge::ConvTransposeImpl2D_cpu::backward() { + AIDGE_THROW_OR_ABORT( + std::runtime_error, + "Backward not yet implemented for Conv_Op<2> on backend cpu"); +} + diff --git a/src/operator/CryptoHashImpl.cpp b/src/operator/CryptoHashImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..10d82dd05408733b898da0c8d3edb38df76dbe1a --- /dev/null +++ b/src/operator/CryptoHashImpl.cpp @@ -0,0 +1,46 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
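Aside (not part of the patch): the Conv backward and ConvTranspose forward paths above rely on the same scatter pattern, since the data gradient of a convolution is a transposed convolution of the output gradient. A minimal single-channel 1D sketch (sizes are assumed consistent by the caller; the actual Aidge kernels live in the *_kernels.hpp headers):

#include <cstddef>
#include <vector>

// Accumulates the input gradient of a 1D convolution (no padding, single channel).
std::vector<float> conv1dDataGrad(const std::vector<float>& weights,
                                  const std::vector<float>& gradOut,
                                  std::size_t inputSize,
                                  std::size_t stride,
                                  std::size_t dilation) {
    std::vector<float> gradIn(inputSize, 0.0f);
    for (std::size_t o = 0; o < gradOut.size(); ++o) {
        for (std::size_t k = 0; k < weights.size(); ++k) {
            // Output position o reads input position o*stride + k*dilation in the
            // forward pass, so its gradient flows back there, scaled by weights[k].
            gradIn[o * stride + k * dilation] += weights[k] * gradOut[o];
        }
    }
    return gradIn;
}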
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <cassert> +#include <chrono> // std::chrono::milliseconds +#include <numeric> // std::accumulate +#include <thread> // std::this_thread::sleep_for +#include <vector> + +#include "aidge/operator/CryptoHash.hpp" +#include "aidge/utils/Types.h" +#include "aidge/backend/cpu/data/GetCPUPtr.h" + +#include "aidge/backend/cpu/operator/CryptoHashImpl.hpp" +#include "aidge/backend/cpu/operator/CryptoHashImpl_kernels.hpp" + +#ifdef WITH_OPENSSL +template <> +void Aidge::CryptoHashImpl_cpu::forward() { + const CryptoHash_Op& op_ = dynamic_cast<const CryptoHash_Op&>(mOp); + std::shared_ptr<Tensor> in0 = op_.getInput(0); + std::shared_ptr<Tensor> out0 = op_.getOutput(0); + AIDGE_ASSERT(in0, "missing input #0"); + + // Find the correct kernel type + const auto impl = Registrar<CryptoHashImpl_cpu>::create(getBestMatch(getRequiredSpec())); + + // Call kernel + impl.forward(in0->size(), + getCPUPtr(mOp.getRawInput(0)), + getCPUPtr(mOp.getRawOutput(0))); +} + +template <> +void Aidge::CryptoHashImpl_cpu::backward() { + AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not available for CryptoHash_Op"); +} +#endif diff --git a/src/operator/DivImpl.cpp b/src/operator/DivImpl.cpp index 135b32b5005a961e55910e758f9b7102ca51b63c..67444cb88bb607dfdb3d099732a4ffaf380591ff 100644 --- a/src/operator/DivImpl.cpp +++ b/src/operator/DivImpl.cpp @@ -152,5 +152,26 @@ void Aidge::DivImpl_cpu::forward() { template <> void Aidge::DivImpl_cpu::backward() { - AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Div_Op on backend cpu"); + const Div_Op& op_ = dynamic_cast<const Div_Op&>(mOp); + + auto in0 = op_.getInput(0); + auto in1 = op_.getInput(1); + auto in0grad = op_.getInput(0)->grad(); + auto in1grad = op_.getInput(1)->grad(); + auto out0grad = op_.getOutput(0)->grad(); + + const auto impl = Registrar<DivImpl_cpu>::create(getBestMatch(getRequiredSpec())); + + impl.backward(in0grad->size(), + in1grad->size(), + out0grad->size(), + in0->dims(), + in1->dims(), + out0grad->dims(), + getCPUPtr(in0), + getCPUPtr(in1), + getCPUPtr(out0grad), + getCPUPtr(in0grad), + getCPUPtr(in1grad)); } + diff --git a/src/operator/DropoutImpl.cpp b/src/operator/DropoutImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6975ce6837cc701f64c116a6d1aaec31688ee023 --- /dev/null +++ b/src/operator/DropoutImpl.cpp @@ -0,0 +1,49 @@ +/******************************************************************************** + * Copyright (c) 2025 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include "aidge/backend/cpu/operator/DropoutImpl.hpp" + +#include <stdexcept> // std::runtime_erro +#include <random> // std::random_device + +#include "aidge/backend/cpu/data/GetCPUPtr.h" +#include "aidge/data/Tensor.hpp" +#include "aidge/operator/Dropout.hpp" +#include "aidge/utils/ErrorHandling.hpp" +#include "aidge/utils/Registrar.hpp" + + +#include "aidge/backend/cpu/operator/DropoutImpl_kernels.hpp" + +template <> +void Aidge::DropoutImpl_cpu::forward() { + const Dropout_Op& op_ = dynamic_cast<const Dropout_Op&>(mOp); + // Check if input is provided + AIDGE_ASSERT(op_.getInput(0), "missing input #0 in Dropout Operator."); + + // Get random seed + const unsigned int seed = static_cast<unsigned int>(std::random_device{}()); + + // Find the correct kernel type + const auto impl = Registrar<DropoutImpl_cpu>::create(getBestMatch(getRequiredSpec())); + + // Call kernel + impl.forward(op_.probability(), + op_.getInput(0)->size(), + seed, + std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->getImpl()->rawPtr(), + std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->rawPtr()); +} + +template <> +void Aidge::DropoutImpl_cpu::backward() { + AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Dropout_Op on backend cpu"); +} \ No newline at end of file diff --git a/src/operator/EqualImpl.cpp b/src/operator/EqualImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5926212e8453a32e54de7691343d44f9c6849a05 --- /dev/null +++ b/src/operator/EqualImpl.cpp @@ -0,0 +1,61 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <cassert> +#include <chrono> // std::chrono::milliseconds +#include <numeric> // std::accumulate +#include <thread> // std::this_thread::sleep_for +#include <vector> + +#include "aidge/operator/Equal.hpp" +#include "aidge/utils/Types.h" +#include "aidge/backend/cpu/data/Broadcasting.hpp" +#include "aidge/backend/cpu/data/GetCPUPtr.h" + +#include "aidge/backend/cpu/operator/EqualImpl.hpp" +#include "aidge/backend/cpu/operator/EqualImpl_kernels.hpp" + +template <> +void Aidge::EqualImpl_cpu::forward() { + const Equal_Op& op = static_cast<const Equal_Op&>(mOp); + // Check inputs + AIDGE_ASSERT(op.getInput(0), "missing input in Equal operator"); + AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run Equal forward because the 0-th input has no implementation."); + + AIDGE_ASSERT(op.getInput(1), "missing input in Equal operator"); + AIDGE_ASSERT(op.getInput(1)->hasImpl(), "cannot run Equal forward because the 1st input has no implementation."); + + AIDGE_ASSERT(op.getInput(1)->dataType() == op.getInput(0)->dataType(), "Cannot Equal inputs with two differents data type."); + + // Find the correct kernel type + const auto impl = Registrar<EqualImpl_cpu>::create(getBestMatch(getRequiredSpec())); + + // Convert input data (no overhead if not needed!) + // TODO: right now, if needed, memory will be allocated/deallocated at each + // call to forward(). 
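Aside (not part of the patch): the DropoutImpl_cpu::forward dispatch above passes a probability, a length and a fresh random seed to the kernel; the kernel itself (DropoutImpl_kernels.hpp) is not shown in this hunk. For illustration only, a typical inverted-dropout kernel driven by those same arguments could look like:

#include <cstddef>
#include <random>

// Inverted dropout: zero each element with probability p and rescale survivors by
// 1/(1-p), so the expected activation matches the no-dropout inference path.
void dropoutForward(float probability, std::size_t length, unsigned int seed,
                    const float* input, float* output) {
    std::mt19937 gen(seed);
    std::bernoulli_distribution drop(probability);
    const float scale = 1.0f / (1.0f - probability);
    for (std::size_t i = 0; i < length; ++i) {
        output[i] = drop(gen) ? 0.0f : input[i] * scale;
    }
}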
We might put the following shared_ptr as members of + // this class to avoid that. + std::shared_ptr<Tensor> input0Fallback, input1Fallback, input2Fallback; + const auto& input0 = op.getInput(0)->refCastFrom(input0Fallback, *op.getInput(0)); + const auto& input1 = op.getInput(1)->refCastFrom(input1Fallback, *op.getInput(1)); + + + impl.forward(op.getInput(0)->dims(), + op.getInput(1)->dims(), + op.getOutput(0)->dims(), + input0.getImpl()->rawPtr(), + input1.getImpl()->rawPtr(), + getCPUPtr(op.getRawOutput(0))); +} + +template <> +void Aidge::EqualImpl_cpu::backward() { + AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Equal_Op on backend cpu"); +} diff --git a/src/operator/GlobalAveragePoolingImpl.cpp b/src/operator/GlobalAveragePoolingImpl.cpp index c53f92e199aee30d55ddafe39b5ef121979acbf7..1b6d9a0629d856c2ad1fc3eae35db4c12058bc4f 100644 --- a/src/operator/GlobalAveragePoolingImpl.cpp +++ b/src/operator/GlobalAveragePoolingImpl.cpp @@ -30,13 +30,15 @@ void Aidge::GlobalAveragePoolingImpl_cpu::forward() const GlobalAveragePooling_Op& op_ = static_cast<const GlobalAveragePooling_Op&>(mOp); // Check if input is provided AIDGE_ASSERT(op_.getInput(0), "missing input 0"); + // error checking + AIDGE_ASSERT(op_.getInput(0)->nbDims() >= 3,"GlobalAveragePool needs at least a 3 dimensions " + "input. Got input dims {}", op_.getInput(0)->dims()); // Find the correct kernel type const auto impl = Registrar<GlobalAveragePoolingImpl_cpu>::create(getBestMatch(getRequiredSpec())); // Call kernel - impl.forward(op_.getInput(0)->dims(), - op_.getInput(0)->getImpl()->rawPtr(), + impl.forward(op_.getInput(0), op_.getOutput(0)->getImpl()->rawPtr()); } diff --git a/src/operator/HeavisideImpl.cpp b/src/operator/HeavisideImpl.cpp index 56ceb9b0b474d416f25d77b533373d4b193532b8..3932eb3341b5515c3a590d72aa538a5aeda6f423 100644 --- a/src/operator/HeavisideImpl.cpp +++ b/src/operator/HeavisideImpl.cpp @@ -13,25 +13,37 @@ #include <stdexcept> -#include "aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp" #include "aidge/backend/cpu/data/GetCPUPtr.h" +#include "aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp" #include "aidge/utils/ErrorHandling.hpp" template <> void Aidge::HeavisideImplCpu::forward() { - const Heaviside_Op &op_ = dynamic_cast<const Heaviside_Op &>(mOp); - std::shared_ptr<Tensor> input0 = op_.getInput(0); - std::shared_ptr<Tensor> output0 = op_.getOutput(0); - AIDGE_ASSERT(input0, "missing input #0"); - - const auto impl = - Registrar<HeavisideImplCpu>::create(getBestMatch(getRequiredSpec())); - - impl.forward(input0->size(), - getCPUPtr(mOp.getRawInput(0)), - getCPUPtr(mOp.getRawOutput(0)), - op_.value()); + const Heaviside_Op &op_ = dynamic_cast<const Heaviside_Op &>(mOp); + std::shared_ptr<Tensor> input0 = op_.getInput(0); + std::shared_ptr<Tensor> output0 = op_.getOutput(0); + AIDGE_ASSERT(input0, "missing input #0"); + + const auto impl = + Registrar<HeavisideImplCpu>::create(getBestMatch(getRequiredSpec())); + + impl.forward(input0->size(), getCPUPtr(mOp.getRawInput(0)), + getCPUPtr(mOp.getRawOutput(0)), op_.value()); } template <> void Aidge::HeavisideImplCpu::backward() { - AIDGE_THROW_OR_ABORT(std::runtime_error, "Heaviside backward not implemented yet"); + + // TODO: The following lines are assuming that the surrogate gradient is Atan + // remove that assumption by providing an attribute to Heaviside, + // allowing to choose between different surrogate gradients. 
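Aside (not part of the patch): the TODO above notes that this backward path assumes an Atan surrogate gradient; the exact formula lives in HeavisideImpl_kernels.hpp and is not shown in this hunk. For illustration only, a common form of that surrogate is the derivative of (1/pi)*atan(pi*x) + 1/2:

#include <cmath>

// Smooth stand-in for the (almost everywhere zero) derivative of the Heaviside step.
inline float atanSurrogateGrad(float x) {
    constexpr float pi = 3.14159265358979323846f;
    return 1.0f / (1.0f + (pi * x) * (pi * x));
}
// The backward pass then accumulates grad_input[i] += atanSurrogateGrad(in[i]) * grad_output[i].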
+ + const Heaviside_Op &op_ = dynamic_cast<const Heaviside_Op &>(mOp); + const auto impl = + Registrar<HeavisideImplCpu>::create(getBestMatch(getRequiredSpec())); + + auto in0 = op_.getInput(0); + auto gra_int0 = op_.getInput(0)->grad(); + auto gra_out0 = op_.getOutput(0)->grad(); + + impl.backward(gra_int0->size(), getCPUPtr(in0), getCPUPtr(gra_out0), + getCPUPtr(gra_int0)); } diff --git a/src/operator/LeakyReLUImpl.cpp b/src/operator/LeakyReLUImpl.cpp index 6c0802dd967d2a20b34a2f1ca91fc0640c063c83..2178ecc4f116913e5357411fb35936e52d860cb8 100644 --- a/src/operator/LeakyReLUImpl.cpp +++ b/src/operator/LeakyReLUImpl.cpp @@ -43,8 +43,9 @@ template <> void Aidge::LeakyReLUImpl_cpu::backward() { // reversing in and out Data for backprop const LeakyReLU_Op& op_ = dynamic_cast<const LeakyReLU_Op&>(mOp); - std::shared_ptr<Tensor> in0 = op_.getOutput(0)->grad(); - std::shared_ptr<Tensor> out0 = op_.getInput(0)->grad(); + std::shared_ptr<Tensor> in0 = op_.getInput(0)->grad(); + std::shared_ptr<Tensor> out0grad = op_.getOutput(0)->grad(); + std::shared_ptr<Tensor> in0grad = op_.getInput(0)->grad(); AIDGE_ASSERT(in0, "missing input #0"); // Find the correct kernel type @@ -52,7 +53,8 @@ void Aidge::LeakyReLUImpl_cpu::backward() { // Call kernel impl.backward(op_.negativeSlope(), - in0->size(), + out0grad->size(), getCPUPtr(in0), - getCPUPtr(out0)); + getCPUPtr(out0grad), + getCPUPtr(in0grad)); } \ No newline at end of file diff --git a/src/operator/MaxPoolingImpl.cpp b/src/operator/MaxPoolingImpl.cpp index 90075a397be3f082ef95fd4df074c99d926fd385..42be049dbe9a5736104d624939a6da6fc13168f0 100644 --- a/src/operator/MaxPoolingImpl.cpp +++ b/src/operator/MaxPoolingImpl.cpp @@ -25,11 +25,13 @@ void Aidge::MaxPoolingImpl2D_cpu::forward() { AIDGE_ASSERT(op_.getInput(0), "missing input #0 in MaxPooling Operator."); // Find the correct kernel type - const auto impl = Registrar<MaxPoolingImpl2D_cpu>::create(getBestMatch(getRequiredSpec())); + const auto impl = + Registrar<MaxPoolingImpl2D_cpu>::create(getBestMatch(getRequiredSpec())); // Call kernel impl.forward(op_.strideDims(), op_.kernelDims(), + op_.dilations(), op_.ceilMode(), op_.getInput(0)->template dims<4>(), getCPUPtr(mOp.getRawInput(0)), @@ -38,5 +40,19 @@ void Aidge::MaxPoolingImpl2D_cpu::forward() { template <> void Aidge::MaxPoolingImpl2D_cpu::backward() { - AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for MaxPooling_Op<2> on backend cpu"); + const auto& op_ = dynamic_cast<const MaxPooling_Op<2>&>(mOp); + AIDGE_ASSERT(op_.getInput(0), "missing input #0 in MaxPooling Operator."); + + // Find the correct kernel type + const auto impl = + Registrar<MaxPoolingImpl2D_cpu>::create(getBestMatch(getRequiredSpec())); + + // Call kernel + impl.backward(op_.strideDims(), + op_.kernelDims(), + op_.dilations(), + op_.ceilMode(), + op_.getInput(0)->template dims<4>(), + getCPUPtr(mOp.getRawInput(0)), + op_.getInput(0)->grad()->getImpl()->rawPtr()); } diff --git a/src/operator/ModImpl.cpp b/src/operator/ModImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..161f7bc114d6e8bf566b1e2739c1d057ecfdf3f7 --- /dev/null +++ b/src/operator/ModImpl.cpp @@ -0,0 +1,131 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <memory> +#include <vector> + +#include "aidge/backend/cpu/data/Broadcasting.hpp" +#include "aidge/backend/cpu/data/GetCPUPtr.h" +#include "aidge/backend/cpu/operator/ModImpl.hpp" +#include "aidge/backend/cpu/operator/ModImpl_kernels.hpp" +#include "aidge/data/Tensor.hpp" +#include "aidge/utils/Types.h" + +template <> +void Aidge::ModImpl_cpu::forward() { + // 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7] + // 2. Find the highest equal dimension -> 3 + // Exception: if the first diverging dimension is the last one, then -> 4 (dims.size()) + // 3. Compute the highest number of contiguous data -> 7 + // 4. Compute stride and offset step for the broadcast mechanism + // 5. Call a simple kernel + const auto& opTensor = static_cast<const Mod_Op&>(mOp); + + // Find the correct kernel type + const auto impl = Registrar<ModImpl_cpu>::create(getBestMatch(getRequiredSpec())); + + // Compute compatible input dimensions + std::vector<std::size_t> dims0 = opTensor.getInput(0)->dims(); + std::vector<std::size_t> dims1 = opTensor.getInput(1)->dims(); + const std::vector<std::size_t>& outDims = opTensor.getOutput(0)->dims(); + + // special case for equal dimensions, the kernel is called with the entire arrays at once + if (dims0 == dims1) { + const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); + impl.forward(opTensor.fmod(), + input0_contiguous_size, input0_contiguous_size, input0_contiguous_size, + getCPUPtr(mOp.getRawInput(0)), + getCPUPtr(mOp.getRawInput(1)), + getCPUPtr(mOp.getRawOutput(0))); + return; + } + + // set dimensions to be of equal size by filling the smallest one with ones. + if (dims0.size() > dims1.size()) { + dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1)); + } + else if (dims1.size() > dims0.size()) { + dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1)); + } + + const std::size_t nbDims = dims0.size(); + + // Find the highest equal dimension + // std::size_t contiguousIdx = nbDims - 1; + std::size_t contiguousIdx = nbDims; + while (contiguousIdx-- > 0) { + // for (; contiguousIdx+1 > 0; --contiguousIdx) { + if (dims0[contiguousIdx] != dims1[contiguousIdx]) { + if (contiguousIdx == (nbDims -1)) { // last dimensions of one of the input Tensor are of size 1 + const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? 
dims0 : dims1; + while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) { + --contiguousIdx; + } + } + break; + } + } + ++contiguousIdx; + + // Compute the highest number of contiguous data for each Tensor + const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t output_contiguous_size = std::accumulate(outDims.cbegin()+contiguousIdx, outDims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + + // initialize strides to iterate through data because of broadcasting + std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx); + if (contiguousIdx > 0) { + stride_post0[contiguousIdx - 1] = 1; + stride_post1[contiguousIdx - 1] = 1; + for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) { + stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]); + stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]); + } + for (std::size_t i = 0; i != contiguousIdx; ++i) { + stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1; + stride_step1[i] = (dims1[i] == 1) ? 1 - stride_post1[i] : 1; + } + } + + // variables for arrays offsets + std::size_t offsetIn0 = 0; + std::size_t offsetIn1 = 0; + std::size_t offsetOut = 0; + + + std::size_t dim = contiguousIdx - 1; + const std::size_t nbStacks = std::accumulate(outDims.cbegin(), outDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>()); + for (std::size_t stack = 0; stack < nbStacks;) { + impl.forward(opTensor.fmod(), input0_contiguous_size, input1_contiguous_size, output_contiguous_size, + getCPUPtr(mOp.getRawInput(0), offsetIn0*input0_contiguous_size), + getCPUPtr(mOp.getRawInput(1), offsetIn1*input1_contiguous_size), + getCPUPtr(mOp.getRawOutput(0), offsetOut*output_contiguous_size)); + if (++stack < nbStacks) { + std::size_t tmp_stack = stack; + while(tmp_stack % outDims[dim] == 0) { + tmp_stack /= outDims[dim]; + dim--; + } + offsetIn0 += stride_step0[dim]; + offsetIn1 += stride_step1[dim]; + ++offsetOut; + dim = contiguousIdx - 1; + } + } +} + +template <> +void Aidge::ModImpl_cpu::backward() { + AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Mod_Op on backend cpu"); +} diff --git a/src/operator/PadImpl.cpp b/src/operator/PadImpl.cpp index cdae21f8ed2757128f6a36b661b0897a4ba65f89..9a54437f445a1842b2f97555a0cbea8988acf50a 100644 --- a/src/operator/PadImpl.cpp +++ b/src/operator/PadImpl.cpp @@ -9,14 +9,14 @@ * ********************************************************************************/ +#include <cstddef> #include <vector> -#include "aidge/utils/Types.h" #include "aidge/backend/cpu/data/GetCPUPtr.h" -#include "aidge/operator/Conv.hpp" - #include "aidge/backend/cpu/operator/PadImpl.hpp" #include "aidge/backend/cpu/operator/PadImpl_kernels.hpp" +#include "aidge/operator/Pad.hpp" +#include "aidge/utils/Types.h" Aidge::Elts_t Aidge::Pad_ProdConso_cpu::getNbRequiredProtected(Aidge::IOIndex_t inputIdx) const { AIDGE_ASSERT(inputIdx == 0, 
"input index out of range." diff --git a/src/operator/ReduceSumImpl.cpp b/src/operator/ReduceSumImpl.cpp index aad0801835a74ecefb046f3dc64729ae1f8bd8bb..93a89a3436fb7c08489a54e94b991e4e36a0e5d4 100644 --- a/src/operator/ReduceSumImpl.cpp +++ b/src/operator/ReduceSumImpl.cpp @@ -12,11 +12,14 @@ #include "aidge/backend/cpu/operator/ReduceSumImpl.hpp" #include <memory> +#include <stdexcept> #include <vector> -#include "aidge/utils/Types.h" -#include "aidge/operator/ReduceSum.hpp" #include "aidge/backend/cpu/operator/ReduceSumImpl_kernels.hpp" +#include "aidge/data/Tensor.hpp" +#include "aidge/operator/ReduceSum.hpp" +#include "aidge/utils/ErrorHandling.hpp" +#include "aidge/utils/Types.h" template <> void Aidge::ReduceSumImpl_cpu::forward() { diff --git a/src/operator/SqrtImpl.cpp b/src/operator/SqrtImpl.cpp index 25bdb42fd5140ef4f64d704fc3a5ccf237f17f81..d93bfe1f3edadf28a68b0fa627593378306bb2cb 100644 --- a/src/operator/SqrtImpl.cpp +++ b/src/operator/SqrtImpl.cpp @@ -40,6 +40,7 @@ template <> void Aidge::SqrtImpl_cpu::backward() { // reversing in and out Data for backprop const Sqrt_Op& op_ = dynamic_cast<const Sqrt_Op&>(mOp); + std::shared_ptr<Tensor> out0 = op_.getOutput(0); std::shared_ptr<Tensor> out0grad = op_.getOutput(0)->grad(); std::shared_ptr<Tensor> in0grad = op_.getInput(0)->grad(); AIDGE_ASSERT(out0grad, "missing output #0"); @@ -49,6 +50,7 @@ void Aidge::SqrtImpl_cpu::backward() { // Call kernel impl.backward(out0grad->size(), + getCPUPtr(out0), getCPUPtr(out0grad), getCPUPtr(in0grad)); } \ No newline at end of file diff --git a/src/operator/SubImpl.cpp b/src/operator/SubImpl.cpp index e36abe2a9d68a2b56ab1777aa04b0e911df514c8..7f57bf2f1e16bbe6a6dd510f39463b611d925220 100644 --- a/src/operator/SubImpl.cpp +++ b/src/operator/SubImpl.cpp @@ -41,5 +41,27 @@ void Aidge::SubImpl_cpu::forward() { template <> void Aidge::SubImpl_cpu::backward() { - AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Sub_Op on backend cpu"); + + const Sub_Op& op_ = dynamic_cast<const Sub_Op&>(mOp); + + auto in0 = op_.getInput(0); + auto in1 = op_.getInput(1); + auto in0grad = op_.getInput(0)->grad(); + auto in1grad = op_.getInput(1)->grad(); + auto out0grad = op_.getOutput(0)->grad(); + + // Find the correct kernel type + const auto impl = Registrar<SubImpl_cpu>::create(getBestMatch(getRequiredSpec())); + + // Call kernel + impl.backward(/* input0Length */ in0grad->size(), + /* input1Length */ in1grad->size(), + /* grad0Length */ out0grad->size(), + /* input0Dims */ in0->dims(), + /* input1Dims */ in1->dims(), + /* outputDims */ out0grad->dims(), + /* gradOutput */ getCPUPtr(out0grad), + /* gradInput0 */ getCPUPtr(in0grad), + /* gradInput1 */ getCPUPtr(in1grad)); + } diff --git a/src/operator/TopKImpl.cpp b/src/operator/TopKImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b84ca9d113c5fdd16c7e7e37b69cf2eb8c43538e --- /dev/null +++ b/src/operator/TopKImpl.cpp @@ -0,0 +1,44 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <memory> +#include <vector> + +#include "aidge/utils/Types.h" +#include "aidge/operator/TopK.hpp" +#include "aidge/backend/cpu/data/GetCPUPtr.h" + +#include "aidge/backend/cpu/operator/TopKImpl.hpp" +#include "aidge/backend/cpu/operator/TopKImpl_kernels.hpp" + +template <> +void Aidge::TopKImpl_cpu::forward() { + const TopK_Op& op_ = dynamic_cast<const TopK_Op&>(mOp); + std::int32_t axis = (op_.axis() >= 0) ? op_.axis() : op_.getInput(0)->nbDims() + op_.axis(); + + // Find the correct kernel type + const auto impl = Registrar<TopKImpl_cpu>::create(getBestMatch(getRequiredSpec())); + + // Call kernel + impl.forward(axis, + op_.largest(), + op_.sorted(), + op_.k(), + op_.getInput(0)->dims(), + op_.getInput(0)->getImpl()->rawPtr(), + op_.getOutput(0)->getImpl()->rawPtr(), + op_.getOutput(1)->getImpl()->rawPtr()); +} + +template <> +void Aidge::TopKImpl_cpu::backward() { + AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for TopK_Op on backend cpu"); +} diff --git a/unit_tests/CMakeLists.txt b/unit_tests/CMakeLists.txt index 6c7af9c376a0a58a361880edad9340e3d845febc..217cf8fbcd344968064f3ca3a5ba52c5a4d56ac7 100644 --- a/unit_tests/CMakeLists.txt +++ b/unit_tests/CMakeLists.txt @@ -21,10 +21,18 @@ file(GLOB_RECURSE src_files "*.cpp") add_executable(tests${module_name} ${src_files}) +if (WIN32) + target_compile_definitions(tests${module_name} PRIVATE _USE_MATH_DEFINES) +endif() + target_link_libraries(tests${module_name} PRIVATE ${module_name}) target_link_libraries(tests${module_name} PRIVATE Catch2::Catch2WithMain) +target_compile_options(tests${module_name} PRIVATE + $<$<CXX_COMPILER_ID:MSVC>: + /bigobj>) + list(APPEND CMAKE_MODULE_PATH ${catch2_SOURCE_DIR}/extras) include(CTest) include(Catch) diff --git a/unit_tests/data/Test_Interpolation.cpp b/unit_tests/data/Test_Interpolation.cpp index 5c3b56f02ab17092a6ba238cc74e1bf75e203718..4886885d7d979c7ea4aaa70a33d75cb553b361de 100644 --- a/unit_tests/data/Test_Interpolation.cpp +++ b/unit_tests/data/Test_Interpolation.cpp @@ -9,15 +9,21 @@ * ********************************************************************************/ -#include <aidge/backend/cpu/data/Interpolation.hpp> -#include <aidge/data/Interpolation.hpp> -#include <aidge/data/Tensor.hpp> -#include <aidge/filler/Filler.hpp> -#include <aidge/utils/Types.h> -#include <catch2/catch_test_macros.hpp> +#include <cmath> // std::fabs +#include <cstdlib> // std::abs #include <limits> +#include <memory> +#include <set> +#include <vector> + +#include <catch2/catch_test_macros.hpp> #include "aidge/backend/cpu/data/Interpolation.hpp" +#include "aidge/data/Interpolation.hpp" +#include "aidge/data/Tensor.hpp" +#include "aidge/filler/Filler.hpp" +#include "aidge/utils/Types.h" +#include "aidge/utils/TensorUtils.hpp" namespace Aidge { @@ -30,12 +36,12 @@ TEST_CASE("Interpolation", "[Interpolation][Data]") { SECTION("1D") { pointsToInterpolateInt = std::set<Interpolation::Point<int>>({{{0}, 10}, {{1}, 20}}); - CHECK(abs(InterpolationCPU::linear({0.5}, pointsToInterpolateInt) - + REQUIRE(std::abs(InterpolationCPU::linear({0.5}, pointsToInterpolateInt) - 15) <= std::numeric_limits<int>::epsilon()); pointsToInterpolateFloat = std::set<Interpolation::Point<float>>( {{{0}, .0F}, {{1}, 0.2F}}); - CHECK(fabs(InterpolationCPU::linear({0.3}, + REQUIRE(std::fabs(InterpolationCPU::linear({0.3}, pointsToInterpolateFloat) - .06F) <= 1e-5); } @@ -46,21 +52,21 
@@ TEST_CASE("Interpolation", "[Interpolation][Data]") { {{14, 21}, 162.F}, {{15, 20}, 210.F}, {{15, 21}, 95.F}}; - CHECK(fabs(InterpolationCPU::linear<float>( - {14.5F, 20.2F}, - pointsToInterpolateFloat) - - 146.1) < 1e-5); + const Tensor interpolatedValue = Tensor(std::fabs(InterpolationCPU::linear<float>( + {14.5F, 20.2F}, + pointsToInterpolateFloat))); + REQUIRE(approxEq<float>(interpolatedValue, Tensor(146.1f))); // pointsToInterpolateFloat = {{{0, 0}, .10F}, // {{0, 1}, .20F}, // {{1, 0}, .30F}, // {{1, 1}, .40F}}; - // CHECK(abs(InterpolationCPU::linear<float>({1.5, 0.5}, + // REQUIRE(std::abs(InterpolationCPU::linear<float>({1.5, 0.5}, // pointsToInterpolateInt) // - // 25) < std::numeric_limits<int>::epsilon()); // pointsToInterpolateFloat = std::vector({0.1F, 0.2F, 0.3F, - // 0.4F}); CHECK(InterpolationCPU::linear(pointsToInterpolateFloat) + // 0.4F}); REQUIRE(InterpolationCPU::linear(pointsToInterpolateFloat) // == .25f); } SECTION("3D") { @@ -72,7 +78,7 @@ TEST_CASE("Interpolation", "[Interpolation][Data]") { {{1, 0, 1}, .6F}, {{1, 1, 0}, .7F}, {{1, 1, 1}, .8F}}; - CHECK(fabs(InterpolationCPU::linear({.5, .5, .5}, + REQUIRE(std::fabs(InterpolationCPU::linear({.5, .5, .5}, pointsToInterpolateFloat) - .45f) < 1e-5); } @@ -94,7 +100,7 @@ TEST_CASE("Interpolation", "[Interpolation][Data]") { {{1, 1, 0, 1}, 1.4F}, {{1, 1, 1, 0}, 1.5F}, {{1, 1, 1, 1}, 1.6F}}; - CHECK(fabs(InterpolationCPU::linear<float>( + REQUIRE(std::fabs(InterpolationCPU::linear<float>( {.5, .5, .5, .5}, pointsToInterpolateFloat) - .85f) < 0.0001); @@ -139,25 +145,25 @@ TEST_CASE("Interpolation", "[Interpolation][Data]") { {{4}, 5.0F}}; SECTION("Floor") { - CHECK(InterpolationCPU::nearest( + REQUIRE(InterpolationCPU::nearest( coordToInterpolate, pointsToInterpolate, Interpolation::Mode::Floor) == 1); } SECTION("Ceil") { - CHECK(InterpolationCPU::nearest( + REQUIRE(InterpolationCPU::nearest( coordToInterpolate, pointsToInterpolate, Interpolation::Mode::Ceil) == 2); } SECTION("RoundPreferFloor") { - CHECK(InterpolationCPU::nearest( + REQUIRE(InterpolationCPU::nearest( coordToInterpolate, pointsToInterpolate, Interpolation::Mode::RoundPreferFloor) == 1); } SECTION("RoundPreferCeil") { - CHECK(InterpolationCPU::nearest( + REQUIRE(InterpolationCPU::nearest( coordToInterpolate, pointsToInterpolate, Interpolation::Mode::RoundPreferCeil) == 2); @@ -172,26 +178,26 @@ TEST_CASE("Interpolation", "[Interpolation][Data]") { {{3, 3}, 50.0}, {{3, 4}, 60.0}}; SECTION("Floor") { - CHECK(InterpolationCPU::nearest( + REQUIRE(InterpolationCPU::nearest( coordToInterpolate, pointsToInterpolate, Interpolation::Mode::Floor) == 30.); } SECTION("Ceil") { - CHECK(InterpolationCPU::nearest( + REQUIRE(InterpolationCPU::nearest( coordToInterpolate, pointsToInterpolate, Interpolation::Mode::Ceil) == 60.); } SECTION("RoundPreferFloor") { - CHECK(InterpolationCPU::nearest( + REQUIRE(InterpolationCPU::nearest( coordToInterpolate, pointsToInterpolate, Interpolation::Mode::RoundPreferFloor) == 40.); } SECTION("RoundPreferCeil") { - CHECK(InterpolationCPU::nearest( + REQUIRE(InterpolationCPU::nearest( coordToInterpolate, pointsToInterpolate, Interpolation::Mode::RoundPreferCeil) == 60.); @@ -207,26 +213,26 @@ TEST_CASE("Interpolation", "[Interpolation][Data]") { {{2, 3, 4}, 50.0}, {{3, 3, 4}, 60.0}}; SECTION("Floor") { - CHECK(InterpolationCPU::nearest( + REQUIRE(InterpolationCPU::nearest( coordToInterpolate, pointsToInterpolate, Interpolation::Mode::Floor) == 10.); } SECTION("Ceil") { - CHECK(InterpolationCPU::nearest( + 
REQUIRE(InterpolationCPU::nearest( coordToInterpolate, pointsToInterpolate, Interpolation::Mode::Ceil) == 50.); } SECTION("RoundPreferFloor") { - CHECK(InterpolationCPU::nearest( + REQUIRE(InterpolationCPU::nearest( coordToInterpolate, pointsToInterpolate, Interpolation::Mode::RoundPreferFloor) == 30.); } SECTION("RoundPreferCeil") { - CHECK(InterpolationCPU::nearest( + REQUIRE(InterpolationCPU::nearest( coordToInterpolate, pointsToInterpolate, Interpolation::Mode::RoundPreferCeil) == 30.); diff --git a/unit_tests/operator/Test_AddImpl.cpp b/unit_tests/operator/Test_AddImpl.cpp index bff9629be152163b2aa92bdc9d0c3029d7987b9b..d9adb4848b1354afd656821f5573b116a23c5b3e 100644 --- a/unit_tests/operator/Test_AddImpl.cpp +++ b/unit_tests/operator/Test_AddImpl.cpp @@ -10,6 +10,7 @@ ********************************************************************************/ #include <memory> +#include <random> #include <catch2/catch_test_macros.hpp> @@ -19,6 +20,7 @@ #include "aidge/graph/Node.hpp" #include "aidge/operator/Add.hpp" #include "aidge/utils/ArrayHelpers.hpp" +#include "aidge/utils/TensorUtils.hpp" using namespace Aidge; @@ -139,4 +141,275 @@ TEST_CASE("[cpu/operator] Add(forward)", "[Add][CPU]") { Log::info("Expected Add_1 Tensor:\n{}", expectedOutput); REQUIRE(*op_1->getOutput(0) == expectedOutput); } -} \ No newline at end of file +} + +TEST_CASE("[cpu/operator] Add(backward)", "[Add][CPU]") { + std::shared_ptr<Add_Op> op = std::make_shared<Add_Op>(); + op->setDataType(DataType::Float32); + op->setBackend("cpu"); + + // NOTE: The first four tests use fixed values, the last one uses random values but static dimensions. + + SECTION("Case 1: 1D and 2D Tensors") { + const auto T0 = std::make_shared<Tensor>( + Array2D<cpptype_t<DataType::Float32>, 2, 3>({{{1, 2, 3}, {4, 5, 6}}})); + + const auto T1 = + std::make_shared<Tensor>(Array1D<cpptype_t<DataType::Float32>, 3>({0.1, 0.2, 0.3})); + + op->associateInput(0, T0); + op->associateInput(1, T1); + op->forwardDims(); + + op->getOutput(0)->setGrad(std::make_shared<Tensor>( + Array2D<float, 2, 3>({{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}}))); + op->backward(); + + const Tensor expectedGrad0 = + Array2D<cpptype_t<DataType::Float32>, 2, 3>({{{1, 1, 1}, {1, 1, 1}}}); + + const Tensor expectedGrad1 = Array1D<cpptype_t<DataType::Float32>, 3>({2, 2, 2}); + + + REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0)); + REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1)); + } + + SECTION("Case 2: 3D and 1D tensors") { + const auto T0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>( + {{{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}}, + {{7.0, 8.0, 9.0}, {10.0, 11.0, 12.0}}}})); + + const auto T1 = + std::make_shared<Tensor>(Array1D<float, 3>({0.3, 0.2, 0.1})); + + const auto newGrad = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>( + {{{{1, 1, 1}, {1, 1, 1}}, {{1, 1, 1}, {1, 1, 1}}}})); + + const Tensor expectedGrad0 = + Array3D<float, 2, 2, 3>({{{{1, 1, 1}, {1, 1, 1}}, + {{1, 1, 1}, {1, 1, 1}}}}); + + const Tensor expectedGrad1 = Array1D<cpptype_t<DataType::Float32>, 3>({4, 4, 4}); + + op->associateInput(0, T0); + op->associateInput(1, T1); + op->forwardDims(); + + op->getOutput(0)->setGrad(newGrad); + op->backward(); + + REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0)); + REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1)); + } + + SECTION("Case 3: 4D and 2D tensors") { + const auto T0 = 
std::make_shared<Tensor>(Array4D<cpptype_t<DataType::Float32>, 2, 2, 3, 3>( + {{{{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}, {7.0, 8.0, 9.0}}, + {{10.0, 11.0, 12.0}, {13.0, 14.0, 15.0}, {16.0, 17.0, 18.0}}}, + {{{19.0, 20.0, 21.0}, {22.0, 23.0, 24.0}, {25.0, 26.0, 27.0}}, + {{28.0, 29.0, 30.0}, + {31.0, 32.0, 33.0}, + {34.0, 35.0, 36.0}}}}})); + + const auto T1 = std::make_shared<Tensor>(Array2D<cpptype_t<DataType::Float32>, 3, 3>( + {{{0.5, 0.3, 0.1}, {0.4, 0.2, 0.6}, {0.7, 0.8, 0.9}}})); + + const auto newGrad = + std::make_shared<Tensor>(Array4D<cpptype_t<DataType::Float32>, 2, 2, 3, 3>( + {{{{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}, + {{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}}, + {{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}, + {{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}}}})); + + const Tensor expectedGrad0 = + Array4D<cpptype_t<DataType::Float32>, 2, 2, 3, 3>( + {{{{{1, 1, 1}, {1, 1, 1}, {1, 1, 1}}, + {{1, 1, 1}, {1, 1, 1}, {1, 1, 1}}}, + {{{1, 1, 1}, {1, 1, 1}, {1, 1, 1}}, + {{1, 1, 1}, {1, 1, 1}, {1, 1, 1}}}}}); + + const Tensor expectedGrad1 = + Array2D<cpptype_t<DataType::Float32>, 3, 3>({{ + {4.0, 4.0, 4.0}, + {4.0, 4.0, 4.0}, + {4.0, 4.0, 4.0}}}); + + op->associateInput(0, T0); + op->associateInput(1, T1); + op->forwardDims(); + + op->getOutput(0)->setGrad(newGrad); + op->backward(); + + REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0)); + REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1)); + } + + SECTION("Case 4: 3D and 2D tensors") { + const auto T0 = std::make_shared<Tensor>( + Array3D<float, 2, 3, 4>({{{ + {1.0, 2.0, 3.0, 4.0}, + {5.0, 6.0, 7.0, 8.0}, + {9.0, 10.0, 11.0, 12.0}, + }, + { + {13.0, 14.0, 15.0, 16.0}, + {17.0, 18.0, 19.0, 20.0}, + {21.0, 22.0, 23.0, 24.0}, + }}})); + + const auto T1 = std::make_shared<Tensor>( + Array2D<cpptype_t<DataType::Float32>, 3, 4>({{{0.1, 0.2, 0.3, 0.4}, + {0.5, 0.6, 0.7, 0.8}, + {0.9, 1.0, 1.1, 1.2}}})); + + const auto newGrad = std::make_shared<Tensor>( + Array3D<cpptype_t<DataType::Float32>, 2, 3, 4>({{{ + {1.0, 1.0, 1.0, 1.0}, + {1.0, 1.0, 1.0, 1.0}, + {1.0, 1.0, 1.0, 1.0}, + }, + { + {1.0, 1.0, 1.0, 1.0}, + {1.0, 1.0, 1.0, 1.0}, + {1.0, 1.0, 1.0, 1.0}, + }}})); + + const Tensor expectedGrad0 = + Array3D<cpptype_t<DataType::Float32>, 2, 3, 4>({{{{1, 1, 1, 1}, + {1, 1, 1, 1}, + {1, 1, 1, 1}}, + {{1, 1, 1, 1}, + {1, 1, 1, 1}, + {1, 1, 1, 1}}}}); + + const Tensor expectedGrad1 = + Array2D<cpptype_t<DataType::Float32>, 3, 4>({{{2.0, 2.0, 2.0, 2.0}, + {2.0, 2.0, 2.0, 2.0}, + {2.0, 2.0, 2.0, 2.0}}}); + + op->associateInput(0, T0); + op->associateInput(1, T1); + op->forwardDims(); + op->getOutput(0)->setGrad(newGrad); + + op->backward(); + + REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0)); + REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1)); + } + + SECTION("Case 5: Tensors with random values") { + + // Use random values + const std::vector<std::size_t> dims0 = {5, 2, 1, 7}; // First tensor + const std::vector<std::size_t> dims1 = {2, 6, 7}; // Second tensor + const std::vector<std::size_t> outputDims = {5, 2, 6, 7}; + + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<float> dist(0.1f, 1.0f); + + auto T0 = std::make_shared<Tensor>(dims0); + T0->setDataType(DataType::Float32); + T0->setBackend("cpu"); + float* input0Data = static_cast<float*>(T0->getImpl()->rawPtr()); + // Fill with random values + for (std::size_t i = 
0; i < T0->size(); ++i) { + input0Data[i] = dist(gen); + } + + auto T1 = std::make_shared<Tensor>(dims1); + T1->setDataType(DataType::Float32); + T1->setBackend("cpu"); + float* input1Data = static_cast<float*>(T1->getImpl()->rawPtr()); + // Fill with random values + for (std::size_t i = 0; i < T1->size(); ++i) { + input1Data[i] = dist(gen); + } + + op->associateInput(0, T0); + op->associateInput(1, T1); + + op->forwardDims(); + op->forward(); + + Tensor expectedOutput{outputDims}; + expectedOutput.setBackend("cpu"); + float* expectedOutputData = static_cast<float*>(expectedOutput.getImpl()->rawPtr()); + + for (std::size_t n = 0; n < 5; ++n) { + for (std::size_t c = 0; c < 2; ++c) { + for (std::size_t h = 0; h < 6; ++h) { + for (std::size_t w = 0; w < 7; ++w) { + std::size_t outIdx = w + 7 * (h + 6 * (c + 2 * n)); + std::size_t in0Idx = + w + 7 * (0 + 1 * (c + 2 * n)); // middle dim is 1 + std::size_t in1Idx = + w + 7 * (h + 6 * c); // no n dimension + + expectedOutputData[outIdx] = input0Data[in0Idx] + input1Data[in1Idx]; + } + } + } + } + + auto outputTensor = op->getOutput(0); + + REQUIRE(approxEq<float>(*outputTensor, expectedOutput)); + + // Backward pass + std::vector<float> gradOutputData(expectedOutput.size()); + for (auto &val : gradOutputData) { + val = dist(gen); + } + + op->getOutput(0)->setGrad(std::make_shared<Tensor>(outputDims)); + op->getOutput(0)->grad()->getImpl()->setRawPtr(gradOutputData.data(), + expectedOutput.size()); + + // Compute reference gradients + std::vector<float> expectedGrad0(T0->size(), 0.0f); + std::vector<float> expectedGrad1(T1->size(), 0.0f); + + for (std::size_t n = 0; n < 5; ++n) { + for (std::size_t c = 0; c < 2; ++c) { + for (std::size_t h = 0; h < 6; ++h) { + for (std::size_t w = 0; w < 7; ++w) { + std::size_t outIdx = w + 7 * (h + 6 * (c + 2 * n)); + std::size_t in0Idx = w + 7 * (0 + 1 * (c + 2 * n)); + std::size_t in1Idx = w + 7 * (h + 6 * c); + + // Gradient for input0: just accumulate grad_output + expectedGrad0[in0Idx] += gradOutputData[outIdx]; + + // Gradient for input1: just accumulate grad_output + expectedGrad1[in1Idx] += gradOutputData[outIdx]; + } + } + } + } + + // Perform backward pass + op->backward(); + + auto expectedGrad0Tensor = std::make_shared<Tensor>(); + expectedGrad0Tensor->resize(T0->dims()); + expectedGrad0Tensor->setBackend("cpu"); + expectedGrad0Tensor->setDataType(DataType::Float32); + expectedGrad0Tensor->getImpl()->setRawPtr(expectedGrad0.data(), + expectedGrad0.size()); + + auto expectedGrad1Tensor = std::make_shared<Tensor>(T1->dims()); + expectedGrad1Tensor->setBackend("cpu"); + expectedGrad1Tensor->setDataType(DataType::Float32); + expectedGrad1Tensor->getImpl()->setRawPtr(expectedGrad1.data(), + expectedGrad1.size()); + + // Verify backward pass + REQUIRE(approxEq<float>(*T0->grad(), *expectedGrad0Tensor)); + REQUIRE(approxEq<float>(*T1->grad(), *expectedGrad1Tensor)); + } +} + diff --git a/unit_tests/operator/Test_AndImpl.cpp b/unit_tests/operator/Test_AndImpl.cpp index c2309dce5f32862ad9aeceaf98430b75ab7be6ef..148298d5f44b9f7744ab18bcfc5ce675f77a784c 100644 --- a/unit_tests/operator/Test_AndImpl.cpp +++ b/unit_tests/operator/Test_AndImpl.cpp @@ -26,75 +26,92 @@ using namespace Aidge; TEST_CASE("[cpu/operator] And(forward)", "[And][CPU]") { - SECTION("ForwardDims") - { + SECTION("ForwardDims") { constexpr std::uint16_t NBTRIALS = 10; // Create a random number generator std::random_device rd; std::mt19937 gen(rd()); - std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // Random float distribution 
between 0 and 1 - std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10)); - std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5)); - std::uniform_int_distribution<int> boolDist(0,1); + std::uniform_int_distribution<int> boolDist(0, 1); // Use 0 for false, 1 for true + std::uniform_int_distribution<std::size_t> dimSizeDist(2, 10); + std::uniform_int_distribution<std::size_t> nbDimsDist(1, 5); SECTION("Same dimensions") { for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { DimSize_t nbDims = nbDimsDist(gen); std::vector<DimSize_t> dims(nbDims); - for (std::size_t i = 0; i < nbDims; i++) { + for (std::size_t i = 0; i < nbDims; ++i) { dims[i] = dimSizeDist(gen); } - + const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + float* array0 = new float[nb_elements]; + float* array1 = new float[nb_elements]; + for (std::size_t i = 0; i < nb_elements; ++i) { + array0[i] = boolDist(gen); + array1[i] = boolDist(gen); + } std::shared_ptr<Tensor> myInput1 = std::make_shared<Tensor>(dims); - myInput1->setBackend("cpu"); - myInput1->setDataType(DataType::Float32); - myInput1->zeros(); std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(dims); - myInput2->setBackend("cpu"); + myInput1->setDataType(DataType::Float32); myInput2->setDataType(DataType::Float32); - myInput2->zeros(); + myInput1->setBackend("cpu"); + myInput2->setBackend("cpu"); + + myInput1 -> getImpl() -> setRawPtr(array0, nb_elements); + myInput2 -> getImpl() -> setRawPtr(array1, nb_elements); + std::shared_ptr<Node> myAnd = And(); - auto op = std::static_pointer_cast<OperatorTensor>(myAnd -> getOperator()); - op->associateInput(0,myInput1); - op->associateInput(1,myInput2); + auto op = std::static_pointer_cast<OperatorTensor>(myAnd->getOperator()); + op->associateInput(0, myInput1); + op->associateInput(1, myInput2); op->setDataType(DataType::Float32); op->setBackend("cpu"); op->forwardDims(); const auto outputDims = op->getOutput(0)->dims(); REQUIRE(outputDims == dims); + delete[] array0; + delete[] array1; } } + SECTION("Broadcasting") { for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { DimSize_t nbDims = nbDimsDist(gen); std::vector<DimSize_t> dims1(nbDims, 1); std::vector<DimSize_t> dims2(nbDims, 1); std::vector<DimSize_t> expectedOutDims; - for (std::size_t i = 0; i < nbDims; i++) { + for (std::size_t i = 0; i < nbDims; ++i) { DimSize_t dim = dimSizeDist(gen); - if (boolDist(gen)) { - dims1[i] = dim; - } - if (boolDist(gen)) { - dims2[i] = dim; - } - expectedOutDims.push_back(std::max(dims1[i],dims2[i])); + if (boolDist(gen)) dims1[i] = dim; + if (boolDist(gen)) dims2[i] = dim; + expectedOutDims.push_back(std::max(dims1[i], dims2[i])); } + const std::size_t nb_elements0 = std::accumulate(dims1.cbegin(), dims1.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t nb_elements1 = std::accumulate(dims2.cbegin(), dims2.cend(), std::size_t(1), std::multiplies<std::size_t>()); + float* array0 = new float[nb_elements0]; + float* array1 = new float[nb_elements1]; + for (std::size_t i = 0; i < nb_elements0; ++i) { + array0[i] = boolDist(gen); + } + for (std::size_t i = 0; i < nb_elements1; ++i) { + array1[i] = boolDist(gen); + } std::shared_ptr<Tensor> myInput1 = std::make_shared<Tensor>(dims1); - myInput1->setBackend("cpu"); - myInput1->setDataType(DataType::Float32); - myInput1->zeros(); std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(dims2); - 
myInput2->setBackend("cpu"); + myInput1->setDataType(DataType::Float32); myInput2->setDataType(DataType::Float32); - myInput2->zeros(); + myInput1->setBackend("cpu"); + myInput2->setBackend("cpu"); + myInput1 -> getImpl() -> setRawPtr(array0, nb_elements0); + myInput2 -> getImpl() -> setRawPtr(array1, nb_elements1); + + std::shared_ptr<Node> myAnd = And(); - auto op = std::static_pointer_cast<OperatorTensor>(myAnd -> getOperator()); - op->associateInput(0,myInput1); - op->associateInput(1,myInput2); + auto op = std::static_pointer_cast<OperatorTensor>(myAnd->getOperator()); + op->associateInput(0, myInput1); + op->associateInput(1, myInput2); op->setDataType(DataType::Float32); op->setBackend("cpu"); @@ -102,110 +119,67 @@ TEST_CASE("[cpu/operator] And(forward)", "[And][CPU]") { const auto outputDims = op->getOutput(0)->dims(); REQUIRE(outputDims == expectedOutDims); + delete[] array0; + delete[] array1; } } } + SECTION("Same size inputs") { - std::shared_ptr<Tensor> input1 = std::make_shared<Tensor>(Array4D<int,3,3,3,2> { - { // - { // - {{20, 15},{31, 11},{22, 49}}, // - {{41, 10},{24, 51},{27, 52}}, // - {{26, 53},{27, 54},{28, 55}} // - }, // - { // - {{29, 56},{30, 57},{31, 58}}, // - {{32, 59},{33, 60},{34, 61}}, // - {{35, 62},{36, 63},{37, 64}} // - }, // - { // - {{38, 65},{39, 66},{40, 67}}, // - {{41, 68},{42, 69},{43, 70}}, // - {{44, 71},{45, 72},{46, 73}} // - } // - } // - }); // - std::shared_ptr<Tensor> input2 = std::make_shared<Tensor>(Array4D<int,3,3,3,2> { - { // - { // - {{20, 47},{21, 48},{22, 49}}, // - {{23, 50},{24, 51},{25, 52}}, // - {{17, 53},{27, 26},{14, 33}} // - }, // - { // - {{29, 56},{30, 57},{31, 58}}, // - {{72, 44},{33, 20},{27, 55}}, // - {{35, 24},{25, 63},{28, 64}} // - }, // - { // - {{32, 65},{39, 66},{40, 70}}, // - {{41, 53},{42, 60},{34, 70}}, // - {{44, 71},{30, 12},{46, 73}} // - } // - } // - }); // - std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,3,3,3,2> { + std::shared_ptr<Tensor> input1 = std::make_shared<Tensor>(Array4D<float, 2, 2, 2, 2>{ { - { - {{1, 0},{0, 0},{1, 1}}, - {{0, 0},{1, 1},{0, 1}}, - {{0, 1},{1, 0},{0, 0}} - }, - { - {{1, 1},{1, 1},{1, 1}}, - {{0, 0},{1, 0},{0, 0}}, - {{1, 0},{0, 1},{0, 1}} - }, - { - {{0, 1},{1, 1},{1, 0}}, - {{1, 0},{1, 0},{0, 1}}, - {{1, 1},{0, 0},{1, 1}} - } - } - }); + {{{1, 0}, {0, 1}}, + {{1, 1}, {0, 0}}}, + {{{0, 1}, {1, 0}}, + {{1, 0}, {0, 1}}}} + }); + std::shared_ptr<Tensor> input2 = std::make_shared<Tensor>(Array4D<float, 2, 2, 2, 2>{ + { + {{{1, 1}, {0, 0}}, + {{0, 1}, {1, 1}}}, + {{{1, 1}, {0, 0}}, + {{0, 1}, {1, 0}}}} + }); + std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<float, 2, 2, 2, 2>{ + { + {{{1, 0}, {0, 0}}, + {{0, 1}, {0, 0}}}, + {{{0, 1}, {0, 0}}, + {{0, 0}, {0, 0}}}} + }); std::shared_ptr<Node> myAnd = And(); - auto op = std::static_pointer_cast<OperatorTensor>(myAnd -> getOperator()); + auto op = std::static_pointer_cast<OperatorTensor>(myAnd->getOperator()); op->associateInput(0, input1); op->associateInput(1, input2); op->setBackend("cpu"); - op->setDataType(DataType::Int32); + op->setDataType(DataType::Float32); myAnd->forward(); - + op->getOutput(0)->print(); REQUIRE(*(op->getOutput(0)) == *expectedOutput); } SECTION("Broadcasting") { - std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array4D<int,1,3,3,2> { - { // - { // - {{10, 20},{22, 23},{20, 20}}, // - {{10, 15},{10, 29},{20, 20}}, // - {{26, 25},{33, 20},{10, 20}} // - } // - } // - }); // - - std::shared_ptr<Tensor> input_2 = 
std::make_shared<Tensor>(Array1D<int,2> {{10, 20}}); - std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,1,3,3,2> { - { // - { // - {{ 1, 1},{ 0, 0},{ 0, 1}}, // - {{ 1, 0},{ 1, 0},{ 0, 1}}, // - {{ 0, 0},{ 0, 1},{ 1, 1}} // - } // - } // - }); // + std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array4D<float, 1, 2, 2, 2>{ + { + {{{1, 0}, {1, 0}}, + {{1, 1}, {0, 0}}}} + }); + std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<float, 2>{{1, 0}}); + std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<float, 1, 2, 2, 2>{ + { + {{{1, 0}, {1, 0}}, + {{1, 0}, {0, 0}}}} + }); std::shared_ptr<Node> myAnd = And(); - auto op = std::static_pointer_cast<OperatorTensor>(myAnd -> getOperator()); + auto op = std::static_pointer_cast<OperatorTensor>(myAnd->getOperator()); op->associateInput(0, input_1); op->associateInput(1, input_2); - op->setDataType(DataType::Int32); + op->setDataType(DataType::Float32); op->setBackend("cpu"); myAnd->forward(); - op->getOutput(0)->print(); - expectedOutput->print(); - REQUIRE(*op->getOutput(0) == *expectedOutput); + + REQUIRE(*(op->getOutput(0)) == *expectedOutput); } -} \ No newline at end of file +} diff --git a/unit_tests/operator/Test_AvgPoolingImpl.cpp b/unit_tests/operator/Test_AvgPoolingImpl.cpp index 372febc61d04c2ba983dd33f009fe5bf1d2908a0..d0299ab56eac7ae19cfef6cf8c4bf9757bca4ab1 100644 --- a/unit_tests/operator/Test_AvgPoolingImpl.cpp +++ b/unit_tests/operator/Test_AvgPoolingImpl.cpp @@ -110,5 +110,116 @@ TEST_CASE("[cpu/operator] AvgPooling(forward)", "[AvgPooling][CPU]") { REQUIRE(std::abs(outPtr[i] - expectedOutPtr[i]) < 0.00001); } } - // std::cout << static_cast<Tensor>((*op)["weight"])[0][0][0][0] << std::endl; + SECTION("Dilations") { + std::shared_ptr<Tensor> myInput3 = std::make_shared<Tensor>(Array4D<float,1,1,5,5> { // NCHW + { + { + {{ 1, 2, 3, 4, 5}, + { 6, 7, 8, 9, 10}, + {11, 12, 13, 14, 15}, + {16, 17, 18, 19, 20}, + {21, 22, 23, 24, 25}} + } + } + }); + + // Dilation of 2 means we take every second element in the window + std::shared_ptr<Node> myAvgPool = AvgPooling({2,2}, "mycdw", {1,1}, {2,2}); + auto op = std::static_pointer_cast<AvgPooling_Op<2>>(myAvgPool -> getOperator()); + + std::shared_ptr<Tensor> myOutput3 = std::make_shared<Tensor>(Array4D<float,1,1,3,3> { + { + { + {{ 7, 8, 9}, + { 12, 13, 14}, + { 17, 18, 19}} + } + } + }); + + op->associateInput(0, myInput3); + op->setDataType(DataType::Float32); + op->setBackend("cpu"); + myAvgPool->forward(); + op->getOutput(0)->print(); + REQUIRE(*(op->getOutput(0)) == *myOutput3); + } + SECTION("Ceil Mode") { + std::shared_ptr<Tensor> myInput4 = std::make_shared<Tensor>(Array4D<float,1,1,5,5> { // NCHW + { + { + { + { 1, 2, 3, 4, 5}, + { 6, 7, 8, 9, 10}, + {11, 12, 13, 14, 15}, + {16, 17, 18, 19, 20}, + {21, 22, 23, 24, 25} + } + } + } + }); + + // AvgPool with ceil_mode = true + std::shared_ptr<Node> myAvgPool1 = AvgPooling({2,2}, "mycdw", {2,2}, {1,1}, true); + auto op1 = std::static_pointer_cast<AvgPooling_Op<2>>(myAvgPool1 -> getOperator()); + + std::shared_ptr<Tensor> myOutput4 = std::make_shared<Tensor>(Array4D<float,1,1,3,3> { + { + { + { + { 4.0, 6.0, 7.5 }, + { 14.0, 16.0, 17.5 }, + { 21.5, 23.5, 25.0 } + } + } + } + }); + op1->associateInput(0, myInput4); + op1->setDataType(DataType::Float32); + op1->setBackend("cpu"); + myAvgPool1->forward(); + op1->getOutput(0)->print(); + REQUIRE(*(op1->getOutput(0)) == *myOutput4); + + // AvgPool with ceil_mode = false + std::shared_ptr<Node> myAvgPool2 = 
AvgPooling({2,2}, "mycdw", {2,2}, {1,1}, false); + auto op2 = std::static_pointer_cast<AvgPooling_Op<2>>(myAvgPool2 -> getOperator()); + std::shared_ptr<Tensor> myOutput5 = std::make_shared<Tensor>(Array4D<float,1,1,2,2> { + { + { + { + { 4.0, 6.0 }, + { 14.0, 16.0 } + } + } + } + }); + op2->associateInput(0, myInput4); + op2->setDataType(DataType::Float32); + op2->setBackend("cpu"); + myAvgPool2->forward(); + op2->getOutput(0)->print(); + REQUIRE(*(op2->getOutput(0)) == *myOutput5); + } + + SECTION("Simple test") { + std::shared_ptr<Tensor> tensor = + std::make_shared<Tensor>(Array4D<int32_t, 1, 1, 7, 7>{{{{ + {0, 8, 26, 35, 49, 45, 22}, + {2, 24, 48, 66, 60, 46, 26}, + {8, 41, 64, 68, 39, 18, 9}, + {10, 48, 72, 76, 42, 14, 9}, + {6, 29, 52, 65, 27, 7, 3}, + {1, 9, 24, 31, 18, 7, 1}, + {0, 0, 4, 6, 7, 1, 1}}}}}); + + auto op = AvgPooling2D_Op({7, 7}); + op.setDataType(DataType::Int32); + op.setBackend("cpu"); + + op.associateInput(0, tensor); + op.forwardDims(); + op.forward(); + REQUIRE(op.getOutput(0)->get<int32_t>(0) == 26); + } } \ No newline at end of file diff --git a/unit_tests/operator/Test_BitShift.cpp b/unit_tests/operator/Test_BitShift.cpp index 33ab932e296be717604be42716d7abe2b61f65ee..8d69d410cf42ce8cb340a8966943ff9242ed6f54 100644 --- a/unit_tests/operator/Test_BitShift.cpp +++ b/unit_tests/operator/Test_BitShift.cpp @@ -8,7 +8,6 @@ * SPDX-License-Identifier: EPL-2.0 * ********************************************************************************/ - #include <chrono> // std::micro, std::chrono::time_point, // std::chrono::system_clock #include <cstddef> // std::size_t @@ -139,6 +138,82 @@ TEST_CASE("[cpu/operator] BitShift_TEST", "[BitShift][CPU]") { Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count())); Log::info("total time: {}μs\n", duration.count()); } + SECTION("Test Forward Kernel with same dimensions and applying rounding") { + std::shared_ptr<Node> RoundBitShift = BitShift(BitShift_Op::BitShiftDirection::right,true); + auto op_r = std::static_pointer_cast<OperatorTensor>(RoundBitShift-> getOperator()); + op_r->setDataType(DataType::Int32); + op_r->setBackend("cpu"); + + // Create 2 input Tensors + std::shared_ptr<Tensor> T0_r = std::make_shared<Tensor>(); + op_r->associateInput(0,T0_r); + T0_r->setDataType(DataType::Int32); + T0_r->setBackend("cpu"); + std::shared_ptr<Tensor> T1_r = std::make_shared<Tensor>(); + op_r -> associateInput(1,T1_r); + T1_r->setDataType(DataType::Int32); + T1_r->setBackend("cpu"); + + // Create results Tensor + std::shared_ptr<Tensor> Tres_r = std::make_shared<Tensor>(); + Tres_r->setDataType(DataType::Int32); + Tres_r->setBackend("cpu"); + std::size_t number_of_operation = 0; + + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { + // generate 2 random Tensors + const std::size_t nbDims = nbDimsDist(gen); + std::vector<std::size_t> dims; + for (std::size_t i = 0; i < nbDims; ++i) { + dims.push_back(dimSizeDist(gen)); + } + const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + number_of_operation += nb_elements; + + // without broadcasting + int* array0 = new int[nb_elements]; + int* array1 = new int[nb_elements]; + int* result = new int[nb_elements]; + for (std::size_t i = 0; i < nb_elements; ++i) + { + array0[i] = valueDist(gen); + array1[i] = std::abs(valueDist(gen)); // bitshift is impossible with negative value + result[i] = array0[i] >> array1[i]; + if(array1[i] > 0) //Cannot use rounding when shift value is 0 + 
result[i] = ((array0[i] >> (array1[i] - 1)) + 1) >> 1; + } + + // input0 + T0_r->resize(dims); + T0_r -> getImpl() -> setRawPtr(array0, nb_elements); + + // input1 + T1_r->resize(dims); + T1_r -> getImpl() -> setRawPtr(array1, nb_elements); + + // results + Tres_r->resize(dims); + Tres_r -> getImpl() -> setRawPtr(result, nb_elements); + + op_r->forwardDims(); + start = std::chrono::system_clock::now(); + RoundBitShift->forward(); + end = std::chrono::system_clock::now(); + duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + + bool is_eq_round = approxEq<int>(*(op_r->getOutput(0)), *Tres_r); + auto Output = *(op_r->getOutput(0)); + auto prt = Output.getImpl()->rawPtr(); + + REQUIRE(is_eq_round); + + delete[] array0; + delete[] array1; + delete[] result; + } + Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count())); + Log::info("total time: {}μs\n", duration.count()); + } SECTION("Test BitShift kernels with Broadcasting") { std::size_t number_of_operation = 0; diff --git a/unit_tests/operator/Test_ClipImpl.cpp b/unit_tests/operator/Test_ClipImpl.cpp index 99147ac93bd659dd91897f6b7f1f3f33e5552ef6..3d75ad78807d0e4d23ec231f5df485e8574a03ee 100644 --- a/unit_tests/operator/Test_ClipImpl.cpp +++ b/unit_tests/operator/Test_ClipImpl.cpp @@ -315,5 +315,5 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]") Log::info("total time: {}\n", duration.count()); } } -} // namespace Aidge -} \ No newline at end of file +} +} // namespace Aidge diff --git a/unit_tests/operator/Test_ConstantOfShapeImpl.cpp b/unit_tests/operator/Test_ConstantOfShapeImpl.cpp index 8ec1669b92a5116999413cf55a8c5113363ef330..9af2ca1155eb2fe1b4418276fd6f9c0079b0d73c 100644 --- a/unit_tests/operator/Test_ConstantOfShapeImpl.cpp +++ b/unit_tests/operator/Test_ConstantOfShapeImpl.cpp @@ -32,84 +32,84 @@ #include "aidge/utils/Types.h" namespace Aidge { -TEST_CASE("[cpu/operator] ConstantOfShape", "[ConstantOfShape][CPU]") { - constexpr std::uint16_t NBTRIALS = 10; - // Create a random number generator - auto random_seed = Catch::Generators::Detail::getSeed; - std::mt19937 gen(random_seed()); - std::uniform_real_distribution<float> valueDist( - 0.1f, 1.1f); // Random float distribution between 0 and 1 - std::uniform_int_distribution<DimSize_t> input_tensor_size_dist( - std::size_t(1), std::size_t(10)); - std::uniform_int_distribution<int64_t> input_tensor_values_dist( - std::size_t(1), std::size_t(7)); - std::uniform_real_distribution<double> operator_attr_value_dist(-100., 100.); - /////////////////////////////////////////////// - // SETUP FUNCTIONS - auto generate_input_tensor = - [&gen, &input_tensor_size_dist, - &input_tensor_values_dist]() -> std::shared_ptr<Tensor> { - std::vector<DimSize_t> input_dims; - input_dims.push_back(input_tensor_size_dist(gen)); +TEST_CASE("[cpu/operator] ConstantOfShape(forward)", "[ConstantOfShape][CPU][forward]") { + constexpr std::uint16_t NBTRIALS = 10; + // Create a random number generator + auto random_seed = Catch::Generators::Detail::getSeed; + std::mt19937 gen(random_seed()); + std::uniform_real_distribution<float> valueDist( + 0.1f, 1.1f); // Random float distribution between 0 and 1 + std::uniform_int_distribution<DimSize_t> input_tensor_size_dist( + std::size_t(1), std::size_t(10)); + std::uniform_int_distribution<int64_t> input_tensor_values_dist( + std::size_t(1), std::size_t(7)); + std::uniform_real_distribution<double> operator_attr_value_dist(-100., 100.); - auto result = std::make_shared<Tensor>(input_dims); - 
result->setDataType(DataType::Int64); - result->setBackend("cpu"); - for (DimSize_t i = 0; i < result->size(); ++i) { - result->set<std::int64_t>(i, input_tensor_values_dist(gen)); - } - return result; - }; + /////////////////////////////////////////////// + // SETUP FUNCTIONS + auto generate_input_tensor = + [&gen, &input_tensor_size_dist, + &input_tensor_values_dist]() -> std::shared_ptr<Tensor> { + std::vector<DimSize_t> input_dims; + input_dims.push_back(input_tensor_size_dist(gen)); - auto generate_random_operator = - [&gen, - &operator_attr_value_dist]() -> std::shared_ptr<ConstantOfShape_Op> { - auto node = ConstantOfShape(Tensor(operator_attr_value_dist(gen))); - auto op = std::static_pointer_cast<ConstantOfShape_Op>(node->getOperator()); - op->setDataType(DataType::Float64); - op->setBackend("cpu"); - return op; - }; + auto result = std::make_shared<Tensor>(input_dims); + result->setDataType(DataType::Int64); + result->setBackend("cpu"); + for (DimSize_t i = 0; i < result->size(); ++i) { + result->set<std::int64_t>(i, input_tensor_values_dist(gen)); + } + return result; + }; - auto generate_output_tensor = [](std::shared_ptr<Tensor> input_tensor, - std::shared_ptr<ConstantOfShape_Op> op) { - std::vector<DimSize_t> output_dims; - output_dims.reserve(input_tensor->size()); - for (DimSize_t i = 0; i < input_tensor->size(); ++i) { - output_dims.push_back(input_tensor->get<int64_t>(i)); - } - auto result = std::make_shared<Tensor>(output_dims); - result->setDataType(op->value().dataType()); - result->setBackend("cpu"); - constantFiller(result, op->value().get<double>(0)); - return result; - }; + auto generate_random_operator = + [&gen, + &operator_attr_value_dist]() -> std::shared_ptr<ConstantOfShape_Op> { + std::shared_ptr<ConstantOfShape_Op> op = std::make_shared<ConstantOfShape_Op>(Tensor(operator_attr_value_dist(gen))); + op->setDataType(DataType::Float64); + op->setBackend("cpu"); + return op; + }; + + auto generate_output_tensor = [](std::shared_ptr<Tensor> input_tensor, + std::shared_ptr<ConstantOfShape_Op> op) { + std::vector<DimSize_t> output_dims; + output_dims.reserve(input_tensor->size()); + for (DimSize_t i = 0; i < input_tensor->size(); ++i) { + output_dims.push_back(input_tensor->get<std::int64_t>(i)); + } + auto result = std::make_shared<Tensor>(output_dims); + result->setDataType(op->value().dataType()); + result->setBackend("cpu"); + constantFiller(result, op->value().get<double>(0)); + return result; + }; - ///////////////////////////////////// - // BENCHMARKING - std::chrono::time_point<std::chrono::system_clock> start; - std::chrono::time_point<std::chrono::system_clock> end; - std::chrono::duration<double, std::micro> duration{}; - int number_of_operation{0}; + ///////////////////////////////////// + // BENCHMARKING + std::chrono::time_point<std::chrono::system_clock> start; + std::chrono::time_point<std::chrono::system_clock> end; + std::chrono::duration<double, std::micro> duration{}; + int number_of_operation{0}; - SECTION("ConstantOfShapeImpl_cpu::forward()") { - for (int i = 0; i < NBTRIALS; ++i) { - auto input_T = generate_input_tensor(); - std::shared_ptr<ConstantOfShape_Op> op = generate_random_operator(); - auto output_T = generate_output_tensor(input_T, op); - op->associateInput(0, input_T); + SECTION("ConstantOfShapeImpl_cpu::forward()") { + for (int i = 0; i < NBTRIALS; ++i) { + auto input_T = generate_input_tensor(); + std::shared_ptr<ConstantOfShape_Op> op = generate_random_operator(); + auto output_T = generate_output_tensor(input_T, op); + 
op->associateInput(0, input_T); - REQUIRE(op->forwardDims(true)); - REQUIRE_NOTHROW(op->forward()); + REQUIRE(op->forwardDims(true)); + REQUIRE_NOTHROW(op->forward()); - CHECK(output_T->nbDims() == op->getOutput(0)->nbDims()); - for (DimIdx_t i = 0; i < output_T->nbDims(); ++i) { - CHECK(output_T->dims().at(i) == op->getOutput(0)->dims().at(i)); - } - CHECK(approxEq<double>(*output_T, *op->getOutput(0))); + CHECK(output_T->nbDims() == op->getOutput(0)->nbDims()); + for (DimIdx_t i = 0; i < output_T->nbDims(); ++i) { + CHECK(output_T->dims().at(i) == op->getOutput(0)->dims().at(i)); + } + CHECK(approxEq<double>(*output_T, *op->getOutput(0))); + } } - } } } // namespace Aidge diff --git a/unit_tests/operator/Test_ConvImpl.cpp b/unit_tests/operator/Test_ConvImpl.cpp index f7be338c0b9c5bb1d5af6bfa09ed7855c17fb6c0..c7242bbb6f0c7ba6632d1d5937b72e2a0d5cc218 100644 --- a/unit_tests/operator/Test_ConvImpl.cpp +++ b/unit_tests/operator/Test_ConvImpl.cpp @@ -17,9 +17,11 @@ #include "aidge/backend/cpu/operator/ConvImpl.hpp" #include "aidge/data/Data.hpp" // DataType #include "aidge/data/Tensor.hpp" +#include "aidge/filler/Filler.hpp" #include "aidge/graph/Node.hpp" #include "aidge/operator/Conv.hpp" #include "aidge/utils/TensorUtils.hpp" +#include "aidge/operator/Pad.hpp" using namespace Aidge; @@ -1645,4 +1647,1070 @@ TEST_CASE("[cpu/operator] Conv(forward)", "[Conv][CPU]") { REQUIRE(approxEq<float>(*(conv_op.getOutput(0)),*expectedOutput, 1e-5f, 1e-6f)); } } -} \ No newline at end of file + + SECTION("kernel size [7,7]") { + SECTION("stride [2,2], no dilation, with padding (3,3,3,3)") { + Conv_Op<2> conv_op = Conv_Op<2>({7,7}, {2,2}); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array1D<int32_t,3*4*4> { + { + 54, 46, 32, 24, 18, 13, 13, 17, 22, 8, 34, 37, + 37, 36, 30, 31, 28, 32, 32, 29, 29, 24, 18, 16, + 57, 63, 57, 42, 30, 20, 17, 30, 41, 52, 46, 38, + 65, 52, 60, 60, 59, 61, 65, 70, 69, 69, 71, 67 + } + }); + myInput->resize(std::vector<std::size_t>({1,4,4,3})); + myInput->setDataFormat(DataFormat::NHWC); + myInput->setDataFormat(DataFormat::NCHW); + std::shared_ptr<Tensor> myBiases = std::make_shared<Tensor>(Array1D<int32_t,1> { + {18300} + }); + std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array4D<int32_t,1,3,7,7> { + {{{{ 0, 0, -1, 0, 1, 0, -1}, + { 0, 0, 0, 1, 1, 0, -1}, + { 0, 0, 0, 1, 1, 1, 0}, + { 0, 1, 1, 0, 1, 1, 0}, + { 0, 1, 1, 1, 1, 1, 0}, + { 0, 1, 1, 1, 1, 0, -1}, + { -1, 0, 1, 2, 2, 0, -1}}, + + {{ 0, 0, -1, 0, 0, 0, -1}, + { 0, 0, 0, 1, 1, 0, 0}, + { 0, 0, 1, 1, 1, 1, 0}, + { 0, 1, 1, 1, 1, 1, 1}, + { 0, 1, 1, 1, 1, 1, 0}, + { 0, 1, 1, 0, 1, 0, 0}, + { -1, 0, 1, 1, 1, 0, -1}}, + + {{ 0, -1, -1, 0, 1, 0, -1}, + { 0, 1, 1, 2, 2, 1, 0}, + { 0, 1, 1, 2, 2, 1, 1}, + { 0, 1, 1, 1, 1, 1, 2}, + { -1, 1, 1, 0, 1, 1, 1}, + { -1, 1, 1, 0, 0, 0, 0}, + { -1, 0, 1, 1, 1, 0, 0}}}} + }); + std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array1D<int32_t,1> { + { + 19282 + } + }); + Pad_Op<2> pad_op = Pad_Op<2>({3,3}); + pad_op.setBackend("cpu"); + pad_op.associateInput(0,myInput); + pad_op.setDataType(DataType::Int32); + pad_op.forwardDims(); + pad_op.forward(); + + conv_op.associateInput(0, pad_op.getOutput(0)); + conv_op.associateInput(1, myWeights); + conv_op.associateInput(2, myBiases); + conv_op.setBackend("cpu"); + conv_op.setDataType(DataType::Int32); + conv_op.forwardDims(); + conv_op.forward(); + conv_op.getOutput(0)->resize(std::vector<std::size_t>({1})); + //conv_op.getOutput(0)->print(); + //fmt::print("{:.^20}\n", "truth"); + 
//(*expectedOutput).print(); + REQUIRE(*(conv_op.getOutput(0)) == *expectedOutput); + } + } + +} + +template <DimSize_t DIM> +std::shared_ptr<OperatorTensor> +setupTestConv(const DimSize_t batchSize, + const DimSize_t inChannels, + const DimSize_t outChannels, + const std::array<DimSize_t, DIM> kernelSize, + const std::array<DimSize_t, DIM> dataSize, + const std::array<DimSize_t, DIM> stride, + const std::array<DimSize_t, DIM> dilation, + const std::array<DimSize_t, 2 * DIM> padding, + const std::shared_ptr<Tensor> input, + const std::shared_ptr<Tensor> weights, + const std::shared_ptr<Tensor> biases) { + input->setBackend("cpu"); + weights->setBackend("cpu"); + biases->setBackend("cpu"); + std::shared_ptr<Node> convNode; + convNode = Conv(inChannels, + outChannels, + kernelSize, + "myconv", + std::array<DimSize_t, DIM>({stride}), + dilation); + auto op = + std::static_pointer_cast<OperatorTensor>(convNode->getOperator()); + + op->setDataType(DataType::Float32); + op->setBackend("cpu"); + + op->associateInput(0, input); + op->associateInput(1, weights); + op->associateInput(2, biases); + + REQUIRE_NOTHROW(op->forwardDims(true)); + + return op; +} + +TEST_CASE("[cpu/operator] Conv(backward)", "[Conv][CPU]") { + SECTION("1D") { + const std::size_t DIM = 1; + SECTION("no stride & no dilation, outChannels > inChannels") { + + const DimSize_t batchSize = 1; + const DimSize_t inChannels = 2; + const DimSize_t outChannels = 3; + const DimSize_t kernelSize = 4; + const DimSize_t inDataSize = 12; + + const DimSize_t stride = 1; + const DimSize_t dilation = 1; + const std::array<DimSize_t, 2 * DIM> padding({0, 0}); + + auto inputSize = + std::vector<DimSize_t>({batchSize, inChannels, inDataSize}); + + auto input = std::make_shared<Tensor>( + Array3D<float, batchSize, inChannels, inDataSize>( + {{{{1.000000, + 1.000000, + 1.000000, + 1.000000, + 1.000000, + 1.000000, + 1.000000, + 1.000000, + 1.000000, + 1.000000, + 1.000000, + 1.000000}, + {1.000000, + 1.000000, + 1.000000, + 1.000000, + 1.000000, + 1.000000, + 1.000000, + 1.000000, + 1.000000, + 1.000000, + 1.000000, + 1.000000}}}})); + + auto weights = std::make_shared<Tensor>( + Array3D<float, outChannels, inChannels, kernelSize>( + {{{{0.100000, 0.100000, 0.100000, 0.100000}, + {0.100000, 0.100000, 0.100000, 0.100000}}, + {{0.100000, 0.100000, 0.100000, 0.100000}, + {0.100000, 0.100000, 0.100000, 0.100000}}, + {{0.100000, 0.100000, 0.100000, 0.100000}, + {0.100000, 0.100000, 0.100000, 0.100000}}} + + })); + + auto biases = std::make_shared<Tensor>( + Array1D<float, outChannels>({0.010000, 0.010000, 0.010000})); + + auto op = setupTestConv<DIM>( + batchSize, + inChannels, + outChannels, + std::array<DimSize_t, DIM>({kernelSize}), + std::array<DimSize_t, DIM>({inDataSize}), + std::array<DimSize_t, DIM>({stride}), + std::array<DimSize_t, DIM>({dilation}), + padding, + input, + weights, + biases); + + //////////////////////////////////// + // setup gradients for backward + auto outputGrad = + std::make_shared<Tensor>(op->getOutput(0)->dims()); + outputGrad->setDataType(DataType::Float32); + outputGrad->setBackend("cpu"); + constantFiller(outputGrad, 1.f); + op->getOutput(0)->setGrad(outputGrad); + + //////////////////////////////////// + // setup gradients for backward + REQUIRE_NOTHROW(op->backward()); + + SECTION("Input Grad") { + auto expectedInputGrad = std::make_shared<Tensor>( + Array3D<float, batchSize, inChannels, inDataSize>( + {{{{0.3000, + 0.6000, + 0.9000, + 1.2000, + 1.2000, + 1.2000, + 1.2000, + 1.2000, + 1.2000, + 0.9000, + 0.6000, + 
0.3000}, + {0.3000, + 0.6000, + 0.9000, + 1.2000, + 1.2000, + 1.2000, + 1.2000, + 1.2000, + 1.2000, + 0.9000, + 0.6000, + 0.3000}}}})); + CHECK(approxEq<float, float>(*op->getInput(0)->grad(), + *expectedInputGrad)); + } + SECTION("Weight grad") { + std::vector<DimSize_t> weightsSize( + {outChannels, inChannels, kernelSize}); + auto expectedWeightsGrad = + std::make_shared<Tensor>(weightsSize); + expectedWeightsGrad->setBackend("cpu"); + expectedWeightsGrad->setDataType(DataType::Float32); + constantFiller<float>(expectedWeightsGrad, 9.); + + CHECK(approxEq<float, float>(*op->getInput(1)->grad(), + *expectedWeightsGrad)); + } + SECTION("Bias Grad") { + std::vector<DimSize_t> biasesSize({outChannels}); + auto expectedBiasGrad = std::make_shared<Tensor>(biasesSize); + expectedBiasGrad->setBackend("cpu"); + expectedBiasGrad->setDataType(DataType::Float32); + constantFiller<float>(expectedBiasGrad, 9.); + CHECK(approxEq<float, float>(*op->getInput(2)->grad(), + *expectedBiasGrad)); + } + } + + SECTION("stride and no dilation, inChannel > outChannels") { + const DimSize_t batchSize = 2; + const DimSize_t inChannels = 3; + const DimSize_t outChannels = 1; + const DimSize_t kernelSize = 2; + const DimSize_t inDataSize = 8; + const DimSize_t stride = 3; + const DimSize_t dilation = 1; + const std::array<DimSize_t, 2 * DIM> padding({0, 0}); + + auto inputSize = + std::vector<DimSize_t>({batchSize, inChannels, inDataSize}); + + auto input = std::make_shared<Tensor>( + Array3D<float, batchSize, inChannels, inDataSize>( + {{{{1., 1., 1., 1., 1., 1., 1., 1.}, + {1., 1., 1., 1., 1., 1., 1., 1.}, + {1., 1., 1., 1., 1., 1., 1., 1.}}, + + {{1., 1., 1., 1., 1., 1., 1., 1.}, + {1., 1., 1., 1., 1., 1., 1., 1.}, + {1., 1., 1., 1., 1., 1., 1., 1.}}}})); + auto weights = std::make_shared<Tensor>( + Array3D<float, outChannels, inChannels, kernelSize>( + {{{{0.1000, 0.1000}, + {0.1000, 0.1000}, + {0.1000, 0.1000}}}})); + + auto biases = std::make_shared<Tensor>( + Array1D<float, outChannels>({0.060000})); + + auto op = setupTestConv<DIM>( + batchSize, + inChannels, + outChannels, + std::array<DimSize_t, DIM>({kernelSize}), + std::array<DimSize_t, DIM>({inDataSize}), + std::array<DimSize_t, DIM>({stride}), + std::array<DimSize_t, DIM>({dilation}), + padding, + input, + weights, + biases); + + //////////////////////////////////// + // setup gradients for backward + auto outputGrad = + std::make_shared<Tensor>(op->getOutput(0)->dims()); + outputGrad->setDataType(DataType::Float32); + outputGrad->setBackend("cpu"); + constantFiller(outputGrad, 1.f); + op->getOutput(0)->setGrad(outputGrad); + + //////////////////////////////////// + // setup gradients for backward + REQUIRE_NOTHROW(op->backward()); + + SECTION("Input Grad") { + auto expectedInputGrad = std::make_shared<Tensor>( + Array3D<float, batchSize, inChannels, inDataSize>( + {{{{0.1000, + 0.1000, + 0.0000, + 0.1000, + 0.1000, + 0.0000, + 0.1000, + 0.1000}, + {0.1000, + 0.1000, + 0.0000, + 0.1000, + 0.1000, + 0.0000, + 0.1000, + 0.1000}, + {0.1000, + 0.1000, + 0.0000, + 0.1000, + 0.1000, + 0.0000, + 0.1000, + 0.1000}}, + + {{0.1000, + 0.1000, + 0.0000, + 0.1000, + 0.1000, + 0.0000, + 0.1000, + 0.1000}, + {0.1000, + 0.1000, + 0.0000, + 0.1000, + 0.1000, + 0.0000, + 0.1000, + 0.1000}, + {0.1000, + 0.1000, + 0.0000, + 0.1000, + 0.1000, + 0.0000, + 0.1000, + 0.1000}}}})); + CHECK(approxEq<float, float>(*op->getInput(0)->grad(), + *expectedInputGrad)); + } + SECTION("Weight grad") { + auto expectedWeightsGrad = std::make_shared<Tensor>( + Array3D<float, outChannels, 
inChannels, kernelSize>( + {{{{6., 6.}, {6., 6.}, {6., 6.}}}})); + CHECK(approxEq<float, float>(*op->getInput(1)->grad(), + *expectedWeightsGrad)); + } + SECTION("Bias Grad") { + auto expectedBiasesGrad = std::make_shared<Tensor>( + Array1D<float, outChannels>({6.})); + CHECK(approxEq<float, float>(*op->getInput(2)->grad(), + *expectedBiasesGrad)); + } + } + + SECTION("dilation, no stride") { + const DimSize_t batchSize = 2; + const DimSize_t inChannels = 3; + const DimSize_t outChannels = 1; + const DimSize_t kernelSize = 2; + const DimSize_t inDataSize = 8; + + const DimSize_t stride = 1; + const DimSize_t dilation = 2; + const std::array<DimSize_t, 2 * DIM> padding({0, 0}); + + auto inputSize = + std::vector<DimSize_t>({batchSize, inChannels, inDataSize}); + + auto input = std::make_shared<Tensor>( + Array3D<float, batchSize, inChannels, inDataSize>( + {{{{1., 1., 1., 1., 1., 1., 1., 1.}, + {1., 1., 1., 1., 1., 1., 1., 1.}, + {1., 1., 1., 1., 1., 1., 1., 1.}}, + + {{1., 1., 1., 1., 1., 1., 1., 1.}, + {1., 1., 1., 1., 1., 1., 1., 1.}, + {1., 1., 1., 1., 1., 1., 1., 1.}}}})); + auto weights = std::make_shared<Tensor>( + Array3D<float, outChannels, inChannels, kernelSize>( + {{{{0.1000, 0.1000}, + {0.1000, 0.1000}, + {0.1000, 0.1000}}}})); + + auto biases = std::make_shared<Tensor>( + Array1D<float, outChannels>({0.060000})); + + auto op = setupTestConv<DIM>( + batchSize, + inChannels, + outChannels, + std::array<DimSize_t, DIM>({kernelSize}), + std::array<DimSize_t, DIM>({inDataSize}), + std::array<DimSize_t, DIM>({stride}), + std::array<DimSize_t, DIM>({dilation}), + padding, + input, + weights, + biases); + + //////////////////////////////////// + // setup gradients for backward + auto outputGrad = + std::make_shared<Tensor>(op->getOutput(0)->dims()); + outputGrad->setDataType(DataType::Float32); + outputGrad->setBackend("cpu"); + constantFiller(outputGrad, 1.f); + op->getOutput(0)->setGrad(outputGrad); + + //////////////////////////////////// + // setup gradients for backward + REQUIRE_NOTHROW(op->backward()); + + SECTION("Input Grad") { + auto expectedInputGrad = std::make_shared<Tensor>( + Array3D<float, batchSize, inChannels, inDataSize>( + {{{{0.1000, + 0.1000, + 0.2000, + 0.2000, + 0.2000, + 0.2000, + 0.1000, + 0.1000}, + {0.1000, + 0.1000, + 0.2000, + 0.2000, + 0.2000, + 0.2000, + 0.1000, + 0.1000}, + {0.1000, + 0.1000, + 0.2000, + 0.2000, + 0.2000, + 0.2000, + 0.1000, + 0.1000}}, + + {{0.1000, + 0.1000, + 0.2000, + 0.2000, + 0.2000, + 0.2000, + 0.1000, + 0.1000}, + {0.1000, + 0.1000, + 0.2000, + 0.2000, + 0.2000, + 0.2000, + 0.1000, + 0.1000}, + {0.1000, + 0.1000, + 0.2000, + 0.2000, + 0.2000, + 0.2000, + 0.1000, + 0.1000}}}})); + CHECK(approxEq<float, float>(*op->getInput(0)->grad(), + *expectedInputGrad)); + } + SECTION("Weight grad") { + auto expectedWeightsGrad = std::make_shared<Tensor>( + Array3D<float, outChannels, inChannels, kernelSize>( + {{{{12., 12.}, {12., 12.}, {12., 12.}}}})); + CHECK(approxEq<float, float>(*op->getInput(1)->grad(), + *expectedWeightsGrad)); + } + SECTION("Bias Grad") { + auto expectedBiasesGrad = std::make_shared<Tensor>( + Array1D<float, outChannels>({12.})); + CHECK(approxEq<float, float>(*op->getInput(2)->grad(), + *expectedBiasesGrad)); + } + } + SECTION("stride & dilation") { + const DimSize_t batchSize = 1; + const DimSize_t inChannels = 4; + const DimSize_t outChannels = 4; + const DimSize_t kernelSize = 3; + const DimSize_t inDataSize = 13; + + const DimSize_t stride = 4; + const DimSize_t dilation = 3; + const std::array<DimSize_t, 2 * 
DIM> padding({0, 0}); + + auto inputSize = + std::vector<DimSize_t>({batchSize, inChannels, inDataSize}); + + auto input = std::make_shared< + Tensor>(Array3D<float, batchSize, inChannels, inDataSize>( + {{{{1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.}, + {1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.}, + {1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.}, + {1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.}}}})); + auto weights = std::make_shared<Tensor>( + Array3D<float, outChannels, inChannels, kernelSize>( + {{{{0.1000, 0.1000, 0.1000}, + {0.1000, 0.1000, 0.1000}, + {0.1000, 0.1000, 0.1000}, + {0.1000, 0.1000, 0.1000}}, + + {{0.1000, 0.1000, 0.1000}, + {0.1000, 0.1000, 0.1000}, + {0.1000, 0.1000, 0.1000}, + {0.1000, 0.1000, 0.1000}}, + + {{0.1000, 0.1000, 0.1000}, + {0.1000, 0.1000, 0.1000}, + {0.1000, 0.1000, 0.1000}, + {0.1000, 0.1000, 0.1000}}, + + {{0.1000, 0.1000, 0.1000}, + {0.1000, 0.1000, 0.1000}, + {0.1000, 0.1000, 0.1000}, + {0.1000, 0.1000, 0.1000}}}})); + + auto biases = std::make_shared<Tensor>(Array1D<float, outChannels>( + {{0.0100, 0.0100, 0.0100, 0.0100}})); + + auto op = setupTestConv<DIM>( + batchSize, + inChannels, + outChannels, + std::array<DimSize_t, DIM>({kernelSize}), + std::array<DimSize_t, DIM>({inDataSize}), + std::array<DimSize_t, DIM>({stride}), + std::array<DimSize_t, DIM>({dilation}), + padding, + input, + weights, + biases); + + //////////////////////////////////// + // setup gradients for backward + auto outputGrad = + std::make_shared<Tensor>(op->getOutput(0)->dims()); + outputGrad->setDataType(DataType::Float32); + outputGrad->setBackend("cpu"); + constantFiller(outputGrad, 1.f); + op->getOutput(0)->setGrad(outputGrad); + + //////////////////////////////////// + // setup gradients for backward + REQUIRE_NOTHROW(op->backward()); + + SECTION("Input Grad") { + auto expectedInputGrad = std::make_shared<Tensor>( + Array3D<float, batchSize, inChannels, inDataSize>( + {{{{0.4000, + 0.0000, + 0.0000, + 0.4000, + 0.4000, + 0.0000, + 0.4000, + 0.4000, + 0.0000, + 0.0000, + 0.4000, + 0.0000, + 0.0000}, + {0.4000, + 0.0000, + 0.0000, + 0.4000, + 0.4000, + 0.0000, + 0.4000, + 0.4000, + 0.0000, + 0.0000, + 0.4000, + 0.0000, + 0.0000}, + {0.4000, + 0.0000, + 0.0000, + 0.4000, + 0.4000, + 0.0000, + 0.4000, + 0.4000, + 0.0000, + 0.0000, + 0.4000, + 0.0000, + 0.0000}, + {0.4000, + 0.0000, + 0.0000, + 0.4000, + 0.4000, + 0.0000, + 0.4000, + 0.4000, + 0.0000, + 0.0000, + 0.4000, + 0.0000, + 0.0000}}}})); + CHECK(approxEq<float, float>(*op->getInput(0)->grad(), + *expectedInputGrad)); + } + SECTION("Weight grad") { + auto expectedWeightsGrad = std::make_shared<Tensor>( + Array3D<float, outChannels, inChannels, kernelSize>( + {{{{2., 2., 2.}, + {2., 2., 2.}, + {2., 2., 2.}, + {2., 2., 2.}}, + + {{2., 2., 2.}, + {2., 2., 2.}, + {2., 2., 2.}, + {2., 2., 2.}}, + + {{2., 2., 2.}, + {2., 2., 2.}, + {2., 2., 2.}, + {2., 2., 2.}}, + + {{2., 2., 2.}, + {2., 2., 2.}, + {2., 2., 2.}, + {2., 2., 2.}}}})); + CHECK(approxEq<float, float>(*op->getInput(1)->grad(), + *expectedWeightsGrad)); + } + SECTION("Bias Grad") { + auto expectedBiasesGrad = std::make_shared<Tensor>( + Array1D<float, outChannels>({{2., 2., 2., 2.}})); + CHECK(approxEq<float, float>(*op->getInput(2)->grad(), + *expectedBiasesGrad)); + } + } + + // Harder to read, look at previous tests in case of issue + SECTION("Sequential values") { + const DimSize_t batchSize = 1; + const DimSize_t inChannels = 2; + const DimSize_t outChannels = 2; + const DimSize_t kernelSize = 3; + const DimSize_t inDataSize = 
8; + + const DimSize_t stride = 2; + const DimSize_t dilation = 2; + const std::array<DimSize_t, 2 * DIM> padding({0, 0}); + + const DimSize_t outDataSize = 2; + + auto inputSize = + std::vector<DimSize_t>({batchSize, inChannels, inDataSize}); + + auto input = std::make_shared<Tensor>( + Array3D<float, batchSize, inChannels, inDataSize>( + {{{{1., 2., 3., 4., 5., 6., 7., 8.}, + {9., 10., 11., 12., 13., 14., 15., 16.}}}})); + auto weights = std::make_shared<Tensor>( + Array3D<float, outChannels, inChannels, kernelSize>( + {{{{0.1000, 0.2000, 0.3000}, {0.4000, 0.5000, 0.6000}}, + + {{0.7000, 0.8000, 0.9000}, {1.0000, 1.1000, 1.2000}}}})); + + auto biases = std::make_shared<Tensor>( + Array1D<float, outChannels>({{0.0100, 0.0200}})); + + auto outputGrad = std::make_shared<Tensor>( + Array3D<float, batchSize, outChannels, outDataSize>( + {{{{1., 2.}, {3., 4.}}}})); + + auto op = setupTestConv<DIM>( + batchSize, + inChannels, + outChannels, + std::array<DimSize_t, DIM>({kernelSize}), + std::array<DimSize_t, DIM>({inDataSize}), + std::array<DimSize_t, DIM>({stride}), + std::array<DimSize_t, DIM>({dilation}), + padding, + input, + weights, + biases); + + //////////////////////////////////// + // setup gradients for backward + op->getOutput(0)->setGrad(outputGrad); + + REQUIRE_NOTHROW(op->backward()); + + SECTION("Input Grad") { + auto expectedInputGrad = std::make_shared<Tensor>( + Array3D<float, batchSize, inChannels, inDataSize>( + {{{{2.2000, + 0.0000, + 5.6000, + 0.0000, + 6.6000, + 0.0000, + 4.2000, + 0.0000}, + {3.4000, + 0.0000, + 8.6000, + 0.0000, + 9.6000, + 0.0000, + 6.0000, + 0.0000}}}})); + CHECK(approxEq<float, float>(*op->getInput(0)->grad(), + *expectedInputGrad)); + } + SECTION("Weight grad") { + auto expectedWeightsGrad = std::make_shared<Tensor>( + Array3D<float, outChannels, inChannels, kernelSize>( + {{{{7., 13., 19.}, {31., 37., 43.}}, + + {{15., 29., 43.}, {71., 85., 99.}}}})); + CHECK(approxEq<float, float>(*op->getInput(1)->grad(), + *expectedWeightsGrad)); + } + SECTION("Bias Grad") { + auto expectedBiasesGrad = std::make_shared<Tensor>( + Array1D<float, outChannels>({{3., 7.}})); + CHECK(approxEq<float, float>(*op->getInput(2)->grad(), + *expectedBiasesGrad)); + } + } + SECTION("random values testing") { + const DimSize_t batchSize = 1; + const DimSize_t inChannels = 4; + const DimSize_t outChannels = 4; + const DimSize_t kernelSize = 3; + const DimSize_t inDataSize = 13; + const DimSize_t outDataSize = 2; + + const DimSize_t stride = 4; + const DimSize_t dilation = 3; + const std::array<DimSize_t, 2 * DIM> padding({0, 0}); + + auto inputSize = + std::vector<DimSize_t>({batchSize, inChannels, inDataSize}); + + auto input = std::make_shared<Tensor>( + Array3D<float, batchSize, inChannels, inDataSize>( + {{{{0.180772, + -0.069988, + -0.359623, + -0.915204, + 0.625765, + 0.025510, + 0.954514, + 0.064349, + 0.361151, + 1.167878, + -1.349893, + -0.510177, + 0.235958}, + {-0.239778, + -0.921115, + 1.543297, + 1.348826, + -0.139642, + 0.285797, + 0.965120, + -2.037150, + 0.493136, + 1.486999, + 0.591033, + 0.126030, + -1.562687}, + {-1.160103, + -0.334841, + 0.447772, + -0.801645, + 1.523611, + 2.508587, + -0.663096, + -0.251275, + 1.010145, + 0.121547, + -1.510835, + 2.104773, + 2.762959}, + {-1.746529, + 0.410919, + -0.242185, + 0.420812, + 0.277596, + 0.778898, + 1.533269, + 1.609736, + -0.403228, + -0.274928, + 1.473840, + 0.068826, + 1.332708}}}})); + auto weights = std::make_shared<Tensor>( + Array3D<float, outChannels, inChannels, kernelSize>( + {{{{0.587285, 0.286069, 
0.008287}, + {-0.252325, -1.324722, 0.189178}, + {0.021100, 0.940420, -0.557690}, + {-0.693927, -0.325247, 1.243933}}, + + {{-1.167186, -0.409124, 1.260062}, + {-1.563006, 1.134614, -0.082384}, + {0.289316, 0.835773, -0.244991}, + {0.271223, 0.093636, -0.883432}}, + + {{-0.327417, 0.078394, -0.380766}, + {0.377508, 0.111912, 2.314279}, + {-0.798906, -0.564303, -1.134660}, + {0.170527, 0.994665, 1.262572}}, + + {{1.621816, 1.077471, 0.594781}, + {-1.529087, 2.043707, -0.165627}, + {0.087070, -0.527656, -0.100288}, + {1.053922, -0.623074, -1.590572}}}})); + + auto biases = std::make_shared<Tensor>(Array1D<float, outChannels>( + {{1.285940, -0.051787, -0.968103, -0.586324}})); + + auto op = setupTestConv<DIM>( + batchSize, + inChannels, + outChannels, + std::array<DimSize_t, DIM>({kernelSize}), + std::array<DimSize_t, DIM>({inDataSize}), + std::array<DimSize_t, DIM>({stride}), + std::array<DimSize_t, DIM>({dilation}), + padding, + input, + weights, + biases); + + //////////////////////////////////// + // setup gradients for backward + auto outputGrad = std::make_shared<Tensor>( + Array3D<float, batchSize, outChannels, outDataSize>( + {{{{0.053156, 1.189073}, + {0.100228, 1.042344}, + {-1.468991, 0.581337}, + {1.330418, 0.487802}}}})); + op->getOutput(0)->setGrad(outputGrad); + + //////////////////////////////////// + // setup gradients for backward + REQUIRE_NOTHROW(op->backward()); + + SECTION("Input Grad") { + auto expectedInputGrad = std::make_shared<Tensor>( + Array3D<float, batchSize, inChannels, inDataSize>( + {{{{2.552898, + 0.000000, + 0.000000, + 1.292528, + 0.082501, + 0.000000, + 1.477383, + 0.484875, + 0.000000, + 0.000000, + 1.392054, + 0.000000, + 0.000000}, + {-2.758950, + 0.000000, + 0.000000, + 2.597889, + -2.455656, + 0.000000, + -3.618210, + 0.669449, + 0.000000, + 0.000000, + 1.403657, + 0.000000, + 0.000000}, + {1.319545, + 0.000000, + 0.000000, + 0.260710, + -0.095303, + 0.000000, + 1.479181, + 1.403949, + 0.000000, + 0.000000, + -1.627040, + 0.000000, + 0.000000}, + {1.141951, + 0.000000, + 0.000000, + -2.298007, + 0.070817, + 0.000000, + -3.993255, + -0.014843, + 0.000000, + 0.000000, + 0.516383, + 0.000000, + 0.000000}}}})); + CHECK(approxEq<float, float>(*op->getInput(0)->grad(), + *expectedInputGrad, + 1e-5, + 1e-6)); + } + SECTION("Weight grad") { + auto expectedWeightsGrad = std::make_shared<Tensor>( + Array3D<float, outChannels, inChannels, kernelSize>( + {{{{0.753690, 0.027866, -1.554383}, + {-0.178790, -2.350622, 0.754084}, + {1.750019, -0.341397, -1.831741}, + {0.237243, 1.936463, 1.834007}}, + + {{0.670381, -0.024656, -1.311384}, + {-0.169587, -1.988220, 0.712792}, + {1.471852, -0.342263, -1.641270}, + {0.114300, 1.720076, 1.689925}}, + + {{0.098228, 1.381835, -2.186914}, + {0.271054, -3.165683, -1.074165}, + {2.589912, 1.031534, 0.095779}, + {2.727013, 0.317630, -1.395561}}, + + {{0.545751, -1.186215, 0.611421}, + {-0.387123, 0.800776, 1.572321}, + {-0.800201, -1.189095, -1.619183}, + {-2.188202, 1.345088, 2.758830}}} + + })); + CHECK(approxEq<float, float>(*op->getInput(1)->grad(), + *expectedWeightsGrad, + 1e-5, + 1e-6)); + } + SECTION("Bias Grad") { + auto expectedBiasesGrad = + std::make_shared<Tensor>(Array1D<float, outChannels>( + {{1.242230, 1.142572, -0.887655, 1.818220}})); + CHECK(approxEq<float, float>(*op->getInput(2)->grad(), + *expectedBiasesGrad)); + } + } + } + SECTION("2D") { + const DimSize_t DIM = 2; + SECTION("Sequential values") { + constexpr DimSize_t batchSize = 1; + constexpr DimSize_t inChannels = 1; + constexpr DimSize_t outChannels 
= 2; + constexpr std::array<DimSize_t, DIM> kernelSize = {1, 2}; + constexpr std::array<DimSize_t, DIM> inDataSize = {3, 4}; + + constexpr std::array<DimSize_t, DIM> stride = {1, 2}; + constexpr std::array<DimSize_t, DIM> dilation = {1, 2}; + constexpr std::array<DimSize_t, 2 * DIM> padding({0, 0}); + + constexpr std::array<DimSize_t, DIM> outDataSize = {3, 1}; + + auto inputSize = std::vector<DimSize_t>( + {batchSize, inChannels, inDataSize[0], inDataSize[1]}); + + auto input = std::make_shared<Tensor>( + Array4D<float, + batchSize, + inChannels, + inDataSize[0], + inDataSize[1]>({{{{{1., 2., 3., 4.}, + {5., 6., 7., 8.}, + {9., 10., 11., 12.}}}}})); + auto weights = std::make_shared<Tensor>( + Array4D<float, + outChannels, + inChannels, + kernelSize[0], + kernelSize[1]>({{{{{1., 2.}}}, {{{3., 4.}}}}})); + + auto biases = std::make_shared<Tensor>( + Array1D<float, outChannels>({{1., 2.}})); + + auto outputGrad = std::make_shared<Tensor>(Array4D<float, + batchSize, + outChannels, + outDataSize[0], + outDataSize[1]>( + {{{{{1.}, {2.}, {3.}}, {{4.}, {5.}, {6.}}}}})); + + auto op = setupTestConv<DIM>(batchSize, + inChannels, + outChannels, + kernelSize, + inDataSize, + stride, + dilation, + padding, + input, + weights, + biases); + + //////////////////////////////////// + // setup gradients for backward + op->getOutput(0)->setGrad(outputGrad); + + REQUIRE_NOTHROW(op->backward()); + + SECTION("Input Grad") { + auto expectedInputGrad = std::make_shared<Tensor>( + Array4D<float, + batchSize, + inChannels, + inDataSize[0], + inDataSize[1]>({{{{{13., 0., 18., 0.}, + {17., 0., 24., 0.}, + {21., 0., 30., 0.}}}}})); + CHECK(approxEq<float, float>(*op->getInput(0)->grad(), + *expectedInputGrad)); + } + SECTION("Weight grad") { + auto expectedWeightsGrad = + std::make_shared<Tensor>(Array4D<float, + outChannels, + inChannels, + kernelSize[0], + kernelSize[1]>( + {{{{{38., 50.}}}, {{{83., 113.}}}}})); + CHECK(approxEq<float, float>(*op->getInput(1)->grad(), + *expectedWeightsGrad)); + } + SECTION("Bias Grad") { + auto expectedBiasesGrad = std::make_shared<Tensor>( + Array1D<float, outChannels>({{6., 15.}})); + CHECK(approxEq<float, float>(*op->getInput(2)->grad(), + *expectedBiasesGrad)); + } + } + } +} diff --git a/unit_tests/operator/Test_ConvTranspose.cpp b/unit_tests/operator/Test_ConvTranspose.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6e889e809e0a05d551829bd15fda9cc651068465 --- /dev/null +++ b/unit_tests/operator/Test_ConvTranspose.cpp @@ -0,0 +1,2298 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <aidge/utils/Types.h> +#include <memory> + +#include <catch2/catch_test_macros.hpp> +#include <fmt/core.h> + +#include "aidge/backend/cpu/operator/ConvTransposeImpl.hpp" +#include "aidge/data/Tensor.hpp" +#include "aidge/operator/ConvTranspose.hpp" +#include "aidge/utils/TensorUtils.hpp" + +namespace Aidge { + +template <DimSize_t DIM> +static std::shared_ptr<OperatorTensor> +setupTestConvTranspose(const DimSize_t batchSize, + const DimSize_t inChannels, + const DimSize_t outChannels, + const std::array<DimSize_t, DIM> kernelSize, + const std::array<DimSize_t, DIM> dataSize, + const std::array<DimSize_t, DIM> stride, + const std::array<DimSize_t, DIM> dilation, + const std::shared_ptr<Tensor> input, + const std::shared_ptr<Tensor> weights, + const std::shared_ptr<Tensor> biases) { + std::shared_ptr<Node> convTransposeNode; + convTransposeNode = ConvTranspose(inChannels, + outChannels, + kernelSize, + stride, + dilation, + false, + "myconv"); + auto op = std::static_pointer_cast<OperatorTensor>( + convTransposeNode->getOperator()); + + op->associateInput(0, input); + op->setDataType(DataType::Float32); + + input->setBackend("cpu"); + op->setBackend("cpu"); + + weights->setBackend("cpu"); + op->associateInput(1, weights); + + biases->setBackend("cpu"); + op->associateInput(2, biases); + + REQUIRE_NOTHROW(op->forwardDims(true)); + + return op; +} + +TEST_CASE("[cpu/operator] ConvTranspose(forward)", "[ConvTranspose][CPU]") { + constexpr DimSize_t DIM = 1; + SECTION("1D") { + SECTION("kernel = 2 , in/outChannels = 1") { + constexpr DimSize_t batchSize = 1; + constexpr DimSize_t inChannels = 1; + constexpr DimSize_t outChannels = 1; + + constexpr std::array<DimSize_t, DIM> kernelSize{2}; + + constexpr std::array<DimSize_t, DIM> inDataSize{4}; + constexpr std::array<DimSize_t, DIM> outDataSize{5}; + + constexpr std::array<DimSize_t, DIM> stride{1}; + constexpr std::array<DimSize_t, DIM> dilation{1}; + + auto input = std::make_shared<Tensor>( + Array3D<float, batchSize, inChannels, inDataSize[0]>( + {{{{1.000000, 2.000000, 3.000000, 4.000000}}}})); + + auto weights = std::make_shared<Tensor>( + Array3D<float, inChannels, outChannels, kernelSize[0]>( + {{{{0.100000, 0.200000}}}})); + + auto biases = std::make_shared<Tensor>( + Array1D<float, outChannels>({{0.010000}})); + + auto op = setupTestConvTranspose<DIM>(batchSize, + inChannels, + outChannels, + kernelSize, + inDataSize, + stride, + dilation, + input, + weights, + biases); + + REQUIRE_NOTHROW(op->forward()); + + auto expectedOutput = std::make_shared<Tensor>( + Array3D<float, batchSize, outChannels, outDataSize[0]>( + {{{{0.110000, 0.410000, 0.710000, 1.010000, 0.810000}}}})); + CHECK(approxEq<float, float>(*op->getOutput(0), *expectedOutput)); + } + SECTION("kernel = 2, inChannel = 2, outChannels = 1") { + constexpr DimSize_t batchSize = 1; + constexpr DimSize_t inChannels = 2; + constexpr DimSize_t outChannels = 1; + + constexpr std::array<DimSize_t, DIM> kernelSize{2}; + + constexpr std::array<DimSize_t, DIM> inDataSize{4}; + constexpr std::array<DimSize_t, DIM> outDataSize{5}; + + constexpr std::array<DimSize_t, DIM> stride{1}; + constexpr std::array<DimSize_t, DIM> dilation{1}; + + auto input = std::make_shared<Tensor>( + Array3D<float, batchSize, inChannels, inDataSize[0]>( + {{{{1.000000, 2.000000, 3.000000, 4.000000}, + {5.000000, 6.000000, 7.000000, 8.000000}}}})); + + auto weights = 
std::make_shared<Tensor>( + Array3D<float, inChannels, outChannels, kernelSize[0]>( + {{{{0.100000, 0.200000}}, {{0.300000, 0.400000}}}})); + + auto biases = std::make_shared<Tensor>( + Array1D<float, outChannels>({{0.010000}})); + + auto op = setupTestConvTranspose<DIM>(batchSize, + inChannels, + outChannels, + kernelSize, + inDataSize, + stride, + dilation, + input, + weights, + biases); + + REQUIRE_NOTHROW(op->forward()); + + auto expectedOutput = std::make_shared<Tensor>( + Array3D<float, batchSize, outChannels, outDataSize[0]>( + {{{{1.610000, 4.210000, 5.210000, 6.210001, 4.010000}}}})); + CHECK(approxEq<float, float>(*op->getOutput(0), *expectedOutput)); + } + SECTION("kernel = 2, inChannel = 1, outChannels = 2") { + constexpr DimSize_t batchSize = 1; + constexpr DimSize_t inChannels = 1; + constexpr DimSize_t outChannels = 2; + + constexpr std::array<DimSize_t, DIM> kernelSize{2}; + + constexpr std::array<DimSize_t, DIM> inDataSize{4}; + constexpr std::array<DimSize_t, DIM> outDataSize{5}; + + constexpr std::array<DimSize_t, DIM> stride{1}; + constexpr std::array<DimSize_t, DIM> dilation{1}; + + auto input = std::make_shared<Tensor>( + Array3D<float, batchSize, inChannels, inDataSize[0]>( + {{{{1., 2., 3., 4.}}}})); + + auto weights = std::make_shared<Tensor>( + Array3D<float, inChannels, outChannels, kernelSize[0]>( + {{{{0.1, 0.2}, {0.3, 0.4}}}})); + + auto biases = std::make_shared<Tensor>( + Array1D<float, outChannels>({{0.01, 0.02}})); + + auto op = setupTestConvTranspose<DIM>(batchSize, + inChannels, + outChannels, + kernelSize, + inDataSize, + stride, + dilation, + input, + weights, + biases); + + REQUIRE_NOTHROW(op->forward()); + + auto expectedOutput = std::make_shared<Tensor>( + Array3D<float, batchSize, outChannels, outDataSize[0]>( + {{{{0.11, 0.41, 0.71, 1.01, 0.81}, + {0.32, 1.02, 1.72, 2.42, 1.62}}}})); + + CHECK(approxEq<float, float>(*op->getOutput(0), *expectedOutput)); + } + SECTION("kernel = 1, inChannel = 2, outChannels = 2") { + constexpr DimSize_t batchSize = 1; + constexpr DimSize_t inChannels = 2; + constexpr DimSize_t outChannels = 2; + + constexpr std::array<DimSize_t, DIM> kernelSize{1}; + + constexpr std::array<DimSize_t, DIM> inDataSize{4}; + constexpr std::array<DimSize_t, DIM> outDataSize{4}; + + constexpr std::array<DimSize_t, DIM> stride{1}; + constexpr std::array<DimSize_t, DIM> dilation{1}; + + auto input = std::make_shared<Tensor>( + Array3D<float, batchSize, inChannels, inDataSize[0]>( + {{{{1.000000, 2.000000, 3.000000, 4.000000}, + {5.000000, 6.000000, 7.000000, 8.000000}}}})); + + auto weights = std::make_shared<Tensor>( + Array3D<float, inChannels, outChannels, kernelSize[0]>( + {{{{0.100000}, {0.200000}}, + + {{0.300000}, {0.400000}}}})); + + auto biases = std::make_shared<Tensor>( + Array1D<float, outChannels>({{0.010000, 0.020000}})); + + auto op = setupTestConvTranspose<DIM>(batchSize, + inChannels, + outChannels, + kernelSize, + inDataSize, + stride, + dilation, + input, + weights, + biases); + + REQUIRE_NOTHROW(op->forward()); + + auto expectedOutput = std::make_shared<Tensor>( + Array3D<float, batchSize, outChannels, outDataSize[0]>( + {{{{1.610000, 2.010000, 2.410000, 2.810000}, + {2.220000, 2.820000, 3.420000, 4.020000}}}})); + + CHECK(approxEq<float, float>(*op->getOutput(0), *expectedOutput)); + } + SECTION("kernel = 2, inChannels = 2, outChannels = 3") { + constexpr DimSize_t batchSize = 1; + constexpr DimSize_t inChannels = 2; + constexpr DimSize_t outChannels = 3; + + constexpr std::array<DimSize_t, DIM> kernelSize{2}; + + 
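+ // Note: the outDataSize declared below is assumed to follow the usual
+ // transposed-convolution output-size relation (no padding):
+ //   out = (in - 1) * stride + dilation * (kernel - 1) + 1,
+ // i.e. (4 - 1) * 1 + 1 * (2 - 1) + 1 = 5 for this section.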
constexpr std::array<DimSize_t, DIM> inDataSize{4}; + constexpr std::array<DimSize_t, DIM> outDataSize{5}; + + constexpr std::array<DimSize_t, DIM> stride{1}; + constexpr std::array<DimSize_t, DIM> dilation{1}; + + auto input = std::make_shared<Tensor>( + Array3D<float, batchSize, inChannels, inDataSize[0]>( + {{{{1., 2., 3., 4.}, {5., 6., 7., 8.}}}})); + + auto weights = std::make_shared<Tensor>( + Array3D<float, inChannels, outChannels, kernelSize[0]>( + {{{{0.10, 0.20}, {0.30, 0.40}, {0.50, 0.60}}, + + {{0.70, 0.80}, {0.90, 1.}, {1.10, 1.20}}}})); + + auto biases = std::make_shared<Tensor>( + Array1D<float, outChannels>({{0.010000, 0.020000, 0.030000}})); + + auto op = setupTestConvTranspose<DIM>(batchSize, + inChannels, + outChannels, + kernelSize, + inDataSize, + stride, + dilation, + input, + weights, + biases); + + REQUIRE_NOTHROW(op->forward()); + + auto expectedOutput = std::make_shared< + Tensor>(Array3D<float, batchSize, outChannels, outDataSize[0]>( + {{{{3.610000, 8.610001, 10.410000, 12.210001, 7.210001}, + {4.820000, 11.420000, 14.020000, 16.620001, 9.620001}, + {6.030000, 14.230000, 17.630001, 21.030001, 12.030000}}}})); + + CHECK(approxEq<float, float>(*op->getOutput(0), *expectedOutput)); + } + + SECTION("Big test to ensure kernel capabilities") { + constexpr DimSize_t batchSize = 2; + constexpr DimSize_t inChannels = 3; + constexpr DimSize_t outChannels = 4; + + constexpr std::array<DimSize_t, DIM> kernelSize{6}; + + constexpr std::array<DimSize_t, DIM> inDataSize{6}; + constexpr std::array<DimSize_t, DIM> outDataSize{11}; + + constexpr std::array<DimSize_t, DIM> stride{1}; + constexpr std::array<DimSize_t, DIM> dilation{1}; + + auto input = std::make_shared<Tensor>( + Array3D<float, batchSize, inChannels, inDataSize[0]>( + {{{{1., 2., 3., 4., 5., 6.}, + {7., 8., 9., 10., 11., 12.}, + {13., 14., 15., 16., 17., 18.}}, + + {{19., 20., 21., 22., 23., 24.}, + {25., 26., 27., 28., 29., 30.}, + {31., 32., 33., 34., 35., 36.}}}})); + + auto weights = std::make_shared<Tensor>( + Array3D<float, inChannels, outChannels, kernelSize[0]>( + {{{{0.1, 0.2, 0.3, 0.4, 0.5, 0.6}, + {0.7, 0.8, 0.9, 1., 1.1, 1.2}, + {1.3, 1.4, 1.5, 1.6, 1.7, 1.8}, + {1.9, 2., 2.1, 2.2, 2.3, 2.4}}, + + {{2.5, 2.6, 2.7, 2.8, 2.9, 3.}, + {3.1, 3.2, 3.3, 3.4, 3.5, 3.6}, + {3.7, 3.8, 3.9, 4., 4.1, 4.2}, + {4.3, 4.4, 4.5, 4.6, 4.7, 4.8}}, + + {{4.9, 5., 5.1, 5.2, 5.3, 5.4}, + {5.5, 5.6, 5.7, 5.8, 5.9, 6.}, + {6.1, 6.2, 6.3, 6.4, 6.5, 6.6}, + {6.7, 6.8, 6.9, 7., 7.1, 7.2}}}})); + + auto biases = std::make_shared<Tensor>( + Array1D<float, outChannels>({{0.01, 0.02, 0.03, 0.04}})); + + auto op = setupTestConvTranspose<DIM>(batchSize, + inChannels, + outChannels, + kernelSize, + inDataSize, + stride, + dilation, + input, + weights, + biases); + + REQUIRE_NOTHROW(op->forward()); + + auto expectedOutput = std::make_shared<Tensor>( + Array3D<float, batchSize, outChannels, outDataSize[0]>( + {{{{81.310005, + 172.210007, + 273.010010, + 384.010040, + 505.509979, + 637.810059, + 561.010010, + 472.809998, + 372.910004, + 261.010010, + 136.809998}, + {93.919998, + 199.220001, + 316.219971, + 445.220001, + 586.520081, + 740.420044, + 651.020020, + 548.420044, + 432.319977, + 302.420013, + 158.419998}, + {106.529999, + 226.230011, + 359.429993, + 506.430054, + 667.530090, + 843.030029, + 741.030029, + 624.030029, + 491.730042, + 343.829987, + 180.029999}, + {119.140007, + 253.240005, + 402.640045, + 567.640076, + 748.539978, + 945.639954, + 831.039978, + 699.640015, + 551.140015, + 385.239990, + 201.639999}}, + + {{216.309998, 
+ 447.610016, + 694.210022, + 956.410034, + 1234.510132, + 1528.810059, + 1317.010010, + 1088.410034, + 842.710022, + 579.610046, + 298.810028}, + {261.319977, + 539.420044, + 834.619995, + 1147.220093, + 1477.520142, + 1825.820068, + 1569.019897, + 1293.619995, + 999.320068, + 685.820007, + 352.819977}, + {306.329987, + 631.230042, + 975.030029, + 1338.030151, + 1720.530029, + 2122.829834, + 1821.029785, + 1498.830200, + 1155.930054, + 792.030029, + 406.830017}, + {351.340027, + 723.039978, + 1115.440063, + 1528.840210, + 1963.539917, + 2419.839844, + 2073.040283, + 1704.040039, + 1312.540039, + 898.239990, + 460.840027}}}})); + CHECK(approxEq<float, float>(*op->getOutput(0), *expectedOutput)); + } + } + + SECTION("2D") { + constexpr DimSize_t DIM = 2; + SECTION("inChannels = 1, outChannels = 2, kernelSize = {1,2}, " + "inDataSize = {2,3}") { + constexpr DimSize_t batchSize = 1; + constexpr DimSize_t inChannels = 1; + constexpr DimSize_t outChannels = 2; + + constexpr std::array<DimSize_t, DIM> kernelSize{1, 2}; + + constexpr std::array<DimSize_t, DIM> inDataSize{2, 3}; + constexpr std::array<DimSize_t, DIM> outDataSize{2, 4}; + + constexpr std::array<DimSize_t, DIM> stride{1, 1}; + constexpr std::array<DimSize_t, DIM> dilation{1, 1}; + + auto input = std::make_shared<Tensor>(Array4D<float, + batchSize, + inChannels, + inDataSize[0], + inDataSize[1]>( + {{{{{1.000000, 2.000000, 3.000000}, + {4.000000, 5.000000, 6.000000}}}}})); + + auto weights = std::make_shared<Tensor>( + Array4D<float, + inChannels, + outChannels, + kernelSize[0], + kernelSize[1]>({{{{{0.100000, 0.200000}}, + + {{0.300000, 0.400000}}}}})); + + auto biases = std::make_shared<Tensor>( + Array1D<float, outChannels>({{0.010000}})); + + auto op = setupTestConvTranspose<DIM>(batchSize, + inChannels, + outChannels, + kernelSize, + inDataSize, + stride, + dilation, + input, + weights, + biases); + + REQUIRE_NOTHROW(op->forward()); + + auto expectedOutput = + std::make_shared<Tensor>(Array4D<float, + batchSize, + outChannels, + outDataSize[0], + outDataSize[1]>( + {{{{{0.110000, 0.410000, 0.710000, 0.610000}, + {0.410000, 1.310000, 1.610000, 1.210000}}, + + {{0.320000, 1.020000, 1.720000, 1.220000}, + {1.220000, 3.120000, 3.820000, 2.420000}}}}})); + } + SECTION("inChannels = 1, outChannels = 2, kernelSize = {2,3}, " + "inDataSize = {2,3}") { + constexpr DimSize_t batchSize = 1; + constexpr DimSize_t inChannels = 1; + constexpr DimSize_t outChannels = 2; + + constexpr std::array<DimSize_t, DIM> kernelSize{2, 3}; + + constexpr std::array<DimSize_t, DIM> inDataSize{2, 3}; + constexpr std::array<DimSize_t, DIM> outDataSize{3, 5}; + + constexpr std::array<DimSize_t, DIM> stride{1, 1}; + constexpr std::array<DimSize_t, DIM> dilation{1, 1}; + + auto input = std::make_shared<Tensor>(Array4D<float, + batchSize, + inChannels, + inDataSize[0], + inDataSize[1]>( + {{{{{1.000000, 2.000000, 3.000000}, + {4.000000, 5.000000, 6.000000}}}}})); + + auto weights = std::make_shared<Tensor>(Array4D<float, + inChannels, + outChannels, + kernelSize[0], + kernelSize[1]>( + {{{{{0.100000, 0.200000, 0.300000}, + {0.400000, 0.500000, 0.600000}}, + + {{0.700000, 0.800000, 0.900000}, + {1.000000, 1.100000, 1.200000}}}}})); + + auto biases = std::make_shared<Tensor>( + Array1D<float, outChannels>({{0.010000, 0.020000}})); + + auto op = setupTestConvTranspose<DIM>(batchSize, + inChannels, + outChannels, + kernelSize, + inDataSize, + stride, + dilation, + input, + weights, + biases); + + REQUIRE_NOTHROW(op->forward()); + + auto expectedOutput = 
std::make_shared< + Tensor>(Array4D<float, + batchSize, + outChannels, + outDataSize[0], + outDataSize[1]>( + {{{{{0.110000, 0.410000, 1.010000, 1.210000, 0.910000}, + {0.810000, 2.610000, 5.610000, 5.410000, 3.610000}, + {1.610000, 4.010000, 7.310000, 6.010000, 3.610000}}, + + {{0.720000, 2.220000, 4.620000, 4.220000, 2.720000}, + {3.820000, 9.820001, 18.220001, 15.020000, 9.020000}, + {4.020000, 9.420000, 16.320000, 12.620001, 7.220000}}}}})); + } + SECTION("inChannels = 1, outChannels = 2, kernelSize = {2,3}, " + "inDataSize = {6,6}, stride = {2, 2}, dilation = {2, 2}") { + constexpr DimSize_t batchSize = 1; + constexpr DimSize_t inChannels = 1; + constexpr DimSize_t outChannels = 2; + + constexpr std::array<DimSize_t, DIM> kernelSize{2, 3}; + + constexpr std::array<DimSize_t, DIM> inDataSize{4, 4}; + constexpr std::array<DimSize_t, DIM> outDataSize{9, 11}; + + constexpr std::array<DimSize_t, DIM> stride{2, 2}; + constexpr std::array<DimSize_t, DIM> dilation{2, 2}; + + auto input = std::make_shared<Tensor>(Array4D<float, + batchSize, + inChannels, + inDataSize[0], + inDataSize[1]>( + {{{{{1.00, 2.00, 3.00, 4.000000}, + {5.00, 6.00, 7.00, 8.000000}, + {9.00, 10.00, 11.00, 12.000000}, + {13.00, 14.00, 15.00, 16.000000}}}}})); + + auto weights = std::make_shared<Tensor>(Array4D<float, + inChannels, + outChannels, + kernelSize[0], + kernelSize[1]>( + {{{{{0.10, 0.20, 0.300000}, {0.40, 0.50, 0.600000}}, + + {{0.70, 0.80, 0.900000}, {1.00, 1.10, 1.200000}}}}})); + + auto biases = std::make_shared<Tensor>( + Array1D<float, outChannels>({{0.01, 0.020000}})); + + auto op = setupTestConvTranspose<DIM>(batchSize, + inChannels, + outChannels, + kernelSize, + inDataSize, + stride, + dilation, + input, + weights, + biases); + + REQUIRE_NOTHROW(op->forward()); + + auto expectedOutput = std::make_shared<Tensor>( + Array4D<float, + batchSize, + outChannels, + outDataSize[0], + outDataSize[1]>({{{{{0.11, + 0.01, + 0.41, + 0.01, + 1.01, + 0.01, + 1.61, + 0.01, + 1.71, + 0.01, + 1.210000}, + {0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.010000}, + {0.91, + 0.01, + 2.91, + 0.01, + 6.210001, + 0.01, + 8.31, + 0.01, + 7.510001, + 0.01, + 4.810000}, + {0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.010000}, + {2.91, + 0.01, + 7.710001, + 0.01, + 14.610001, + 0.01, + 16.710001, + 0.01, + 13.910002, + 0.01, + 8.410001}, + {0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.010000}, + {4.91, + 0.01, + 12.51, + 0.01, + 23.01, + 0.01, + 25.110001, + 0.01, + 20.309999, + 0.01, + 12.010000}, + {0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.010000}, + {5.210001, + 0.01, + 12.110001, + 0.01, + 20.809999, + 0.01, + 22.309999, + 0.01, + 17.01, + 0.01, + 9.610001}}, + + {{0.72, + 0.02, + 2.22, + 0.02, + 4.62, + 0.02, + 7.02, + 0.02, + 5.92, + 0.02, + 3.620000}, + {0.02, + 0.02, + 0.02, + 0.02, + 0.02, + 0.02, + 0.02, + 0.02, + 0.02, + 0.02, + 0.020000}, + {4.52, + 0.02, + 11.320001, + 0.02, + 20.620003, + 0.02, + 26.320002, + 0.02, + 20.720001, + 0.02, + 12.020000}, + {0.02, + 0.02, + 0.02, + 0.02, + 0.02, + 0.02, + 0.02, + 0.02, + 0.02, + 0.02, + 0.020000}, + {11.32, + 0.02, + 25.720001, + 0.02, + 43.420002, + 0.02, + 49.120003, + 0.02, + 36.720001, + 0.02, + 20.420002}, + {0.02, + 0.02, + 0.02, + 0.02, + 0.02, + 0.02, + 0.02, + 0.02, + 0.02, + 0.02, + 0.020000}, + {18.119999, + 0.02, + 40.120003, + 0.02, + 66.220001, + 0.02, + 71.919998, + 0.02, + 52.720001, + 0.02, + 28.820002}, + 
{0.02, + 0.02, + 0.02, + 0.02, + 0.02, + 0.02, + 0.02, + 0.02, + 0.02, + 0.02, + 0.020000}, + {13.02, + 0.02, + 28.32, + 0.02, + 46.02, + 0.02, + 49.320004, + 0.02, + 35.619999, + 0.02, + 19.220001}}}}})); + } + SECTION("inChannels = 4, outChannels = 3, kernelSize = {2,2}, " + "inDataSize = {3,3}, stride = {2, 2}, dilation = {2, 2}") { + constexpr DimSize_t batchSize = 1; + constexpr DimSize_t inChannels = 4; + constexpr DimSize_t outChannels = 3; + + constexpr std::array<DimSize_t, DIM> kernelSize{2, 2}; + + constexpr std::array<DimSize_t, DIM> inDataSize{4, 4}; + constexpr std::array<DimSize_t, DIM> outDataSize{7, 7}; + + constexpr std::array<DimSize_t, DIM> stride{2, 2}; + constexpr std::array<DimSize_t, DIM> dilation{2, 2}; + + auto input = std::make_shared<Tensor>(Array4D<float, + batchSize, + inChannels, + inDataSize[0], + inDataSize[1]>( + {{{{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}, {7.0, 8.0, 9.0}}, + + {{10.0, 11.0, 12.0}, + {13.0, 14.0, 15.0}, + {16.0, 17.0, 18.0}}, + + {{19.0, 20.0, 21.0}, + {22.0, 23.0, 24.0}, + {25.0, 26.0, 27.0}}, + + {{28.0, 29.0, 30.0}, + {31.0, 32.0, 33.0}, + {34.0, 35.0, 36.0}}}}})); + + auto weights = std::make_shared<Tensor>( + Array4D<float, + inChannels, + outChannels, + kernelSize[0], + kernelSize[1]>({{{{{0.1, 0.2}, {0.3, 0.4}}, + + {{0.5, 0.6}, {0.7, 0.8}}, + + {{0.9, 1.0}, {1.1, 1.2}}}, + + {{{1.3, 1.4}, {1.5, 1.6}}, + + {{1.7, 1.8}, {1.9, 2.0}}, + + {{2.1, 2.2}, {2.3, 2.4}}}, + + {{{2.5, 2.6}, {2.7, 2.8}}, + + {{2.9, 3.0}, {3.1, 3.2}}, + + {{3.3, 3.4}, {3.5, 3.6}}}, + + {{{3.7, 3.8}, {3.9, 4.0}}, + + {{4.1, 4.2}, {4.3, 4.4}}, + + {{4.5, 4.6}, {4.7, 4.8}}}}})); + + auto biases = std::make_shared<Tensor>( + Array1D<float, outChannels>({{0.010000, 0.020000, 0.030000}})); + + auto op = setupTestConvTranspose<DIM>(batchSize, + inChannels, + outChannels, + kernelSize, + inDataSize, + stride, + dilation, + input, + weights, + biases); + + REQUIRE_NOTHROW(op->forward()); + + auto expectedOutput = std::make_shared<Tensor>( + Array4D<float, + batchSize, + outChannels, + outDataSize[0], + outDataSize[1]>({{{{{164.209991, + 0.010000, + 341.809998, + 0.010000, + 357.410034, + 0.010000, + 186.009995}, + {0.010000, + 0.010000, + 0.010000, + 0.010000, + 0.010000, + 0.010000, + 0.010000}, + {362.809998, + 0.010000, + 754.410034, + 0.010000, + 787.210083, + 0.010000, + 409.210022}, + {0.010000, + 0.010000, + 0.010000, + 0.010000, + 0.010000, + 0.010000, + 0.010000}, + {410.809998, + 0.010000, + 852.810059, + 0.010000, + 885.609985, + 0.010000, + 459.610016}, + {0.010000, + 0.010000, + 0.010000, + 0.010000, + 0.010000, + 0.010000, + 0.010000}, + {226.209991, + 0.010000, + 469.010010, + 0.010000, + 486.210022, + 0.010000, + 252.009995}}, + + {{187.419998, + 0.020000, + 389.820007, + 0.020000, + 408.619995, + 0.020000, + 212.420013}, + {0.020000, + 0.020000, + 0.020000, + 0.020000, + 0.020000, + 0.020000, + 0.020000}, + {414.019989, + 0.020000, + 860.020020, + 0.020000, + 899.220032, + 0.020000, + 466.820007}, + {0.020000, + 0.020000, + 0.020000, + 0.020000, + 0.020000, + 0.020000, + 0.020000}, + {471.620026, + 0.020000, + 977.619995, + 0.020000, + 1016.820068, + 0.020000, + 526.820007}, + {0.020000, + 0.020000, + 0.020000, + 0.020000, + 0.020000, + 0.020000, + 0.020000}, + {259.019989, + 0.020000, + 536.220032, + 0.020000, + 556.619995, + 0.020000, + 288.019989}}, + + {{210.630005, + 0.030000, + 437.829987, + 0.030000, + 459.829987, + 0.030000, + 238.830002}, + {0.030000, + 0.030000, + 0.030000, + 0.030000, + 0.030000, + 0.030000, + 0.030000}, + {465.230011, + 0.030000, + 
965.630005, + 0.030000, + 1011.230103, + 0.030000, + 524.430054}, + {0.030000, + 0.030000, + 0.030000, + 0.030000, + 0.030000, + 0.030000, + 0.030000}, + {532.430054, + 0.030000, + 1102.430054, + 0.030000, + 1148.030029, + 0.030000, + 594.030029}, + {0.030000, + 0.030000, + 0.030000, + 0.030000, + 0.030000, + 0.030000, + 0.030000}, + {291.830017, + 0.030000, + 603.430054, + 0.030000, + 627.030029, + 0.030000, + 324.029999}}}}})); + } + SECTION("Big test to ensure kernel capabilities 1") { + constexpr DimSize_t batchSize = 1; + constexpr DimSize_t inChannels = 3; + constexpr DimSize_t outChannels = 4; + + constexpr std::array<DimSize_t, DIM> kernelSize{2, 2}; + + constexpr std::array<DimSize_t, DIM> inDataSize{6, 5}; + constexpr std::array<DimSize_t, DIM> outDataSize{8, 17}; + + constexpr std::array<DimSize_t, DIM> stride{1, 3}; + constexpr std::array<DimSize_t, DIM> dilation{2, 4}; + + auto input = std::make_shared<Tensor>( + Array4D<float, + batchSize, + inChannels, + inDataSize[0], + inDataSize[1]>({{{{{1., 2., 3., 4., 5.}, + {6., 7., 8., 9., 10.}, + {11., 12., 13., 14., 15.}, + {16., 17., 18., 19., 20.}, + {21., 22., 23., 24., 25.}, + {26., 27., 28., 29., 30.}}, + + {{31., 32., 33., 34., 35.}, + {36., 37., 38., 39., 40.}, + {41., 42., 43., 44., 45.}, + {46., 47., 48., 49., 50.}, + {51., 52., 53., 54., 55.}, + {56., 57., 58., 59., 60.}}, + + {{61., 62., 63., 64., 65.}, + {66., 67., 68., 69., 70.}, + {71., 72., 73., 74., 75.}, + {76., 77., 78., 79., 80.}, + {81., 82., 83., 84., 85.}, + {86., 87., 88., 89., 90.}}}}})); + + auto weights = std::make_shared<Tensor>(Array4D<float, + inChannels, + outChannels, + kernelSize[0], + kernelSize[1]>( + {{{{{0.100000, 0.200000}, {0.300000, 0.400000}}, + + {{0.500000, 0.600000}, {0.700000, 0.800000}}, + + {{0.900000, 1.000000}, {1.100000, 1.200000}}, + + {{1.300000, 1.400000}, {1.500000, 1.600000}}}, + + {{{1.700000, 1.800000}, {1.900000, 2.000000}}, + + {{2.100000, 2.200000}, {2.300000, 2.400000}}, + + {{2.500000, 2.600000}, {2.700000, 2.800000}}, + + {{2.900000, 3.000000}, {3.100000, 3.200000}}}, + + {{{3.300000, 3.400000}, {3.500000, 3.600000}}, + + {{3.700000, 3.800000}, {3.900000, 4.000000}}, + + {{4.100000, 4.200000}, {4.300000, 4.400000}}, + + {{4.500000, 4.600000}, {4.700000, 4.800000}}}}})); + + auto biases = std::make_shared<Tensor>( + Array1D<float, outChannels>({{0.01, 0.02, 0.03, 0.04}})); + + auto op = setupTestConvTranspose<DIM>(batchSize, + inChannels, + outChannels, + kernelSize, + inDataSize, + stride, + dilation, + input, + weights, + biases); + + REQUIRE_NOTHROW(op->forward()); + + auto expectedOutput = std::make_shared<Tensor>( + Array4D<float, + batchSize, + outChannels, + outDataSize[0], + outDataSize[1]>({{{{{254.110001, + 0.010000, + 0.010000, + 259.210022, + 263.410034, + 0.010000, + 264.309998, + 268.810028, + 0.010000, + 269.410004, + 274.210022, + 0.010000, + 274.510010, + 279.610016, + 0.010000, + 0.010000, + 285.010010}, + {279.610016, + 0.010000, + 0.010000, + 284.710022, + 290.410004, + 0.010000, + 289.809998, + 295.810028, + 0.010000, + 294.910004, + 301.210022, + 0.010000, + 300.010010, + 306.610016, + 0.010000, + 0.010000, + 312.010010}, + {577.810059, + 0.010000, + 0.010000, + 588.609985, + 599.410034, + 0.010000, + 599.410034, + 610.810059, + 0.010000, + 610.209961, + 622.210022, + 0.010000, + 621.010010, + 633.609985, + 0.010000, + 0.010000, + 645.010010}, + {631.810059, + 0.010000, + 0.010000, + 642.609985, + 656.410034, + 0.010000, + 653.410034, + 667.810059, + 0.010000, + 664.209961, + 679.210022, + 0.010000, + 
675.010010, + 690.609985, + 0.010000, + 0.010000, + 702.010010}, + {685.810059, + 0.010000, + 0.010000, + 696.609985, + 713.410034, + 0.010000, + 707.410034, + 724.810059, + 0.010000, + 718.209961, + 736.210022, + 0.010000, + 729.010010, + 747.609985, + 0.010000, + 0.010000, + 759.010010}, + {739.810059, + 0.010000, + 0.010000, + 750.609985, + 770.410034, + 0.010000, + 761.410034, + 781.810059, + 0.010000, + 772.209961, + 793.210022, + 0.010000, + 783.010010, + 804.609985, + 0.010000, + 0.010000, + 816.010010}, + {386.710022, + 0.010000, + 0.010000, + 392.410004, + 402.010010, + 0.010000, + 398.110016, + 408.010010, + 0.010000, + 403.809998, + 414.010010, + 0.010000, + 409.510010, + 420.010010, + 0.010000, + 0.010000, + 426.010010}, + {415.210022, + 0.010000, + 0.010000, + 420.910004, + 432.010010, + 0.010000, + 426.610016, + 438.010040, + 0.010000, + 432.309998, + 444.010010, + 0.010000, + 438.010010, + 450.010040, + 0.010000, + 0.010000, + 456.010010}}, + + {{291.320007, + 0.020000, + 0.020000, + 297.619995, + 300.619995, + 0.020000, + 303.919983, + 307.219971, + 0.020000, + 310.220001, + 313.819977, + 0.020000, + 316.519989, + 320.419983, + 0.020000, + 0.020000, + 327.019989}, + {322.820007, + 0.020000, + 0.020000, + 329.119995, + 333.619995, + 0.020000, + 335.419983, + 340.219971, + 0.020000, + 341.720001, + 346.819977, + 0.020000, + 348.019989, + 353.419983, + 0.020000, + 0.020000, + 360.019989}, + {664.220032, + 0.020000, + 0.020000, + 677.420044, + 685.820068, + 0.020000, + 690.619995, + 699.619995, + 0.020000, + 703.820068, + 713.420044, + 0.020000, + 717.020020, + 727.219971, + 0.020000, + 0.020000, + 741.020020}, + {730.220032, + 0.020000, + 0.020000, + 743.420044, + 754.820068, + 0.020000, + 756.619995, + 768.619995, + 0.020000, + 769.820068, + 782.420044, + 0.020000, + 783.020020, + 796.219971, + 0.020000, + 0.020000, + 810.020020}, + {796.220032, + 0.020000, + 0.020000, + 809.420044, + 823.820068, + 0.020000, + 822.620056, + 837.619995, + 0.020000, + 835.820068, + 851.420044, + 0.020000, + 849.020020, + 865.219971, + 0.020000, + 0.020000, + 879.020020}, + {862.220032, + 0.020000, + 0.020000, + 875.420044, + 892.820068, + 0.020000, + 888.619995, + 906.619995, + 0.020000, + 901.820068, + 920.420044, + 0.020000, + 915.020020, + 934.219971, + 0.020000, + 0.020000, + 948.020020}, + {447.919983, + 0.020000, + 0.020000, + 454.820007, + 463.220001, + 0.020000, + 461.720001, + 470.420013, + 0.020000, + 468.619995, + 477.619995, + 0.020000, + 475.519989, + 484.819977, + 0.020000, + 0.020000, + 492.019989}, + {482.419983, + 0.020000, + 0.020000, + 489.320007, + 499.220001, + 0.020000, + 496.220001, + 506.420013, + 0.020000, + 503.119995, + 513.619995, + 0.020000, + 510.019989, + 520.820007, + 0.020000, + 0.020000, + 528.020020}}, + + {{328.529999, + 0.030000, + 0.030000, + 336.029999, + 337.830017, + 0.030000, + 343.529999, + 345.630035, + 0.030000, + 351.029999, + 353.430023, + 0.030000, + 358.529999, + 361.230011, + 0.030000, + 0.030000, + 369.030029}, + {366.029999, + 0.030000, + 0.030000, + 373.529999, + 376.830017, + 0.030000, + 381.029999, + 384.630035, + 0.030000, + 388.529999, + 392.430023, + 0.030000, + 396.029999, + 400.230042, + 0.030000, + 0.030000, + 408.030029}, + {750.630005, + 0.030000, + 0.030000, + 766.230042, + 772.230042, + 0.030000, + 781.830078, + 788.430054, + 0.030000, + 797.430054, + 804.630066, + 0.030000, + 813.030029, + 820.830078, + 0.030000, + 0.030000, + 837.030029}, + {828.630005, + 0.030000, + 0.030000, + 844.230042, + 853.230042, + 0.030000, + 
859.830078, + 869.430054, + 0.030000, + 875.430054, + 885.630066, + 0.030000, + 891.030029, + 901.830078, + 0.030000, + 0.030000, + 918.030029}, + {906.630005, + 0.030000, + 0.030000, + 922.230042, + 934.230042, + 0.030000, + 937.830078, + 950.430054, + 0.030000, + 953.430054, + 966.630066, + 0.030000, + 969.030029, + 982.830078, + 0.030000, + 0.030000, + 999.030090}, + {984.630005, + 0.030000, + 0.030000, + 1000.230042, + 1015.230103, + 0.030000, + 1015.830078, + 1031.430054, + 0.030000, + 1031.430054, + 1047.630127, + 0.030000, + 1047.030029, + 1063.830078, + 0.030000, + 0.030000, + 1080.030029}, + {509.130005, + 0.030000, + 0.030000, + 517.230042, + 524.430054, + 0.030000, + 525.330078, + 532.830017, + 0.030000, + 533.430054, + 541.230042, + 0.030000, + 541.530029, + 549.630066, + 0.030000, + 0.030000, + 558.030029}, + {549.630066, + 0.030000, + 0.030000, + 557.730042, + 566.430054, + 0.030000, + 565.830078, + 574.830017, + 0.030000, + 573.930054, + 583.230042, + 0.030000, + 582.030029, + 591.630066, + 0.030000, + 0.030000, + 600.030029}}, + + {{365.740021, + 0.040000, + 0.040000, + 374.440002, + 375.040009, + 0.040000, + 383.140015, + 384.040009, + 0.040000, + 391.839996, + 393.040009, + 0.040000, + 400.540009, + 402.040009, + 0.040000, + 0.040000, + 411.040009}, + {409.240021, + 0.040000, + 0.040000, + 417.940002, + 420.040009, + 0.040000, + 426.640015, + 429.040009, + 0.040000, + 435.339996, + 438.040009, + 0.040000, + 444.040009, + 447.040009, + 0.040000, + 0.040000, + 456.040009}, + {837.039978, + 0.040000, + 0.040000, + 855.040039, + 858.639954, + 0.040000, + 873.039978, + 877.239990, + 0.040000, + 891.039978, + 895.840027, + 0.040000, + 909.039978, + 914.440002, + 0.040000, + 0.040000, + 933.039978}, + {927.039978, + 0.040000, + 0.040000, + 945.040039, + 951.639954, + 0.040000, + 963.039978, + 970.239990, + 0.040000, + 981.039978, + 988.840027, + 0.040000, + 999.039978, + 1007.440002, + 0.040000, + 0.040000, + 1026.040039}, + {1017.039978, + 0.040000, + 0.040000, + 1035.040039, + 1044.640015, + 0.040000, + 1053.040039, + 1063.239990, + 0.040000, + 1071.040039, + 1081.840088, + 0.040000, + 1089.040039, + 1100.440063, + 0.040000, + 0.040000, + 1119.040039}, + {1107.040039, + 0.040000, + 0.040000, + 1125.040039, + 1137.640137, + 0.040000, + 1143.040039, + 1156.239990, + 0.040000, + 1161.040039, + 1174.840088, + 0.040000, + 1179.040039, + 1193.440063, + 0.040000, + 0.040000, + 1212.040039}, + {570.340027, + 0.040000, + 0.040000, + 579.640015, + 585.640015, + 0.040000, + 588.940002, + 595.239990, + 0.040000, + 598.239990, + 604.840027, + 0.040000, + 607.540039, + 614.440002, + 0.040000, + 0.040000, + 624.039978}, + {616.840027, + 0.040000, + 0.040000, + 626.140015, + 633.640015, + 0.040000, + 635.440002, + 643.239990, + 0.040000, + 644.739990, + 652.840027, + 0.040000, + 654.040039, + 662.440002, + 0.040000, + 0.040000, + 672.039978}}}}})); + CHECK(approxEq<float, float>(*op->getOutput(0), *expectedOutput)); + } + SECTION("Big test to ensure kernel capabilities") { + constexpr DimSize_t batchSize = 1; + constexpr DimSize_t inChannels = 3; + constexpr DimSize_t outChannels = 4; + + constexpr std::array<DimSize_t, DIM> kernelSize{6, 4}; + + constexpr std::array<DimSize_t, DIM> inDataSize{6, 5}; + constexpr std::array<DimSize_t, DIM> outDataSize{16, 25}; + + constexpr std::array<DimSize_t, DIM> stride{1, 3}; + constexpr std::array<DimSize_t, DIM> dilation{2, 4}; + + auto input = std::make_shared<Tensor>( + Array4D<float, + batchSize, + inChannels, + inDataSize[0], + 
inDataSize[1]>({{{{{1., 2., 3., 4., 5.}, + {6., 7., 8., 9., 10.}, + {11., 12., 13., 14., 15.}, + {16., 17., 18., 19., 20.}, + {21., 22., 23., 24., 25.}, + {26., 27., 28., 29., 30.}}, + + {{31., 32., 33., 34., 35.}, + {36., 37., 38., 39., 40.}, + {41., 42., 43., 44., 45.}, + {46., 47., 48., 49., 50.}, + {51., 52., 53., 54., 55.}, + {56., 57., 58., 59., 60.}}, + + {{61., 62., 63., 64., 65.}, + {66., 67., 68., 69., 70.}, + {71., 72., 73., 74., 75.}, + {76., 77., 78., 79., 80.}, + {81., 82., 83., 84., 85.}, + {86., 87., 88., 89., 90.}}}}})); + + auto weights = std::make_shared<Tensor>(Array4D<float, + inChannels, + outChannels, + kernelSize[0], + kernelSize[1]>( + {{{{{0.100000, 0.200000, 0.300000, 0.400000}, + {0.500000, 0.600000, 0.700000, 0.800000}, + {0.900000, 1.000000, 1.100000, 1.200000}, + {1.300000, 1.400000, 1.500000, 1.600000}, + {1.700000, 1.800000, 1.900000, 2.000000}, + {2.100000, 2.200000, 2.300000, 2.400000}}, + + {{2.500000, 2.600000, 2.700000, 2.800000}, + {2.900000, 3.000000, 3.100000, 3.200000}, + {3.300000, 3.400000, 3.500000, 3.600000}, + {3.700000, 3.800000, 3.900000, 4.000000}, + {4.100000, 4.200000, 4.300000, 4.400000}, + {4.500000, 4.600000, 4.700000, 4.800000}}, + + {{4.900000, 5.000000, 5.100000, 5.200000}, + {5.300000, 5.400000, 5.500000, 5.600000}, + {5.700000, 5.800000, 5.900000, 6.000000}, + {6.100000, 6.200000, 6.300000, 6.400000}, + {6.500000, 6.600000, 6.700000, 6.800000}, + {6.900000, 7.000000, 7.100000, 7.200000}}, + + {{7.300000, 7.400000, 7.500000, 7.600000}, + {7.700000, 7.800000, 7.900000, 8.000000}, + {8.100000, 8.200000, 8.300000, 8.400001}, + {8.500000, 8.600000, 8.700000, 8.800000}, + {8.900001, 9.000000, 9.100000, 9.200000}, + {9.300000, 9.400001, 9.500000, 9.600000}}}, + + {{{9.700000, 9.800000, 9.900001, 10.000000}, + {10.100000, 10.200000, 10.300000, 10.400001}, + {10.500000, 10.600000, 10.700000, 10.800000}, + {10.900001, 11.000000, 11.100000, 11.200000}, + {11.300000, 11.400001, 11.500000, 11.600000}, + {11.700000, 11.800000, 11.900001, 12.000000}}, + + {{12.100000, 12.200000, 12.300000, 12.400001}, + {12.500000, 12.600000, 12.700000, 12.800000}, + {12.900001, 13.000000, 13.100000, 13.200000}, + {13.300000, 13.400001, 13.500000, 13.600000}, + {13.700000, 13.800000, 13.900001, 14.000000}, + {14.100000, 14.200000, 14.300000, 14.400001}}, + + {{14.500000, 14.600000, 14.700000, 14.800000}, + {14.900001, 15.000000, 15.100000, 15.200000}, + {15.300000, 15.400001, 15.500000, 15.600000}, + {15.700000, 15.800000, 15.900001, 16.000000}, + {16.100000, 16.200001, 16.300001, 16.400000}, + {16.500000, 16.600000, 16.700001, 16.800001}}, + + {{16.900000, 17.000000, 17.100000, 17.200001}, + {17.300001, 17.400000, 17.500000, 17.600000}, + {17.700001, 17.800001, 17.900000, 18.000000}, + {18.100000, 18.200001, 18.300001, 18.400000}, + {18.500000, 18.600000, 18.700001, 18.800001}, + {18.900000, 19.000000, 19.100000, 19.200001}}}, + + {{{19.300001, 19.400000, 19.500000, 19.600000}, + {19.700001, 19.800001, 19.900000, 20.000000}, + {20.100000, 20.200001, 20.300001, 20.400000}, + {20.500000, 20.600000, 20.700001, 20.800001}, + {20.900000, 21.000000, 21.100000, 21.200001}, + {21.300001, 21.400000, 21.500000, 21.600000}}, + + {{21.700001, 21.800001, 21.900000, 22.000000}, + {22.100000, 22.200001, 22.300001, 22.400000}, + {22.500000, 22.600000, 22.700001, 22.800001}, + {22.900000, 23.000000, 23.100000, 23.200001}, + {23.300001, 23.400000, 23.500000, 23.600000}, + {23.700001, 23.800001, 23.900000, 24.000000}}, + + {{24.100000, 24.200001, 24.300001, 24.400000}, + 
{24.500000, 24.600000, 24.700001, 24.800001}, + {24.900000, 25.000000, 25.100000, 25.200001}, + {25.300001, 25.400000, 25.500000, 25.600000}, + {25.700001, 25.800001, 25.900000, 26.000000}, + {26.100000, 26.200001, 26.300001, 26.400000}}, + + {{26.500000, 26.600000, 26.700001, 26.800001}, + {26.900000, 27.000000, 27.100000, 27.200001}, + {27.300001, 27.400000, 27.500000, 27.600000}, + {27.700001, 27.800001, 27.900000, 28.000000}, + {28.100000, 28.200001, 28.300001, 28.400000}, + {28.500000, 28.600000, 28.700001, 28.800001}}}}})); + + auto biases = std::make_shared<Tensor>( + Array1D<float, outChannels>({{0.01, 0.02, 0.03, 0.04}})); + + auto op = setupTestConvTranspose<DIM>(batchSize, + inChannels, + outChannels, + kernelSize, + inDataSize, + stride, + dilation, + input, + weights, + biases); + + REQUIRE_NOTHROW(op->forward()); + + auto expectedOutput = + std::make_shared<Tensor>(Array4D<float, + batchSize, + outChannels, + outDataSize[0], + outDataSize[1]>( + {{{{{1478.110107, 0.010000, 0.010000, 1507.210083, + 1487.410034, 0.010000, 1536.310059, 1516.809937, + 1496.709961, 1565.410034, 1546.209961, 1526.410034, + 3100.510010, 1575.609985, 1556.109985, 1536.010010, + 1605.010010, 1585.810059, 1566.010010, 0.010000, + 1615.510010, 1596.010010, 0.010000, 0.010000, + 1626.010010}, + {1623.610107, 0.010000, 0.010000, 1652.710083, + 1634.410034, 0.010000, 1681.810059, 1663.809937, + 1645.209961, 1710.910034, 1693.209961, 1674.910034, + 3396.010010, 1722.609985, 1704.610107, 1686.010010, + 1752.010010, 1734.310059, 1716.010010, 0.010000, + 1764.010010, 1746.010010, 0.010000, 0.010000, + 1776.010010}, + {3284.410156, 0.010000, 0.010000, 3343.810303, + 3306.010010, 0.010000, 3403.210205, 3366.010010, + 3327.610107, 3462.610107, 3426.010010, 3388.209961, + 6871.209961, 3486.010010, 3448.810059, 3410.409912, + 3546.010010, 3509.409912, 3471.610107, 0.010000, + 3570.010010, 3532.810059, 0.010000, 0.010000, + 3594.010010}, + {3581.410156, 0.010000, 0.010000, 3640.810303, + 3606.010010, 0.010000, 3700.210205, 3666.010010, + 3630.610107, 3759.610107, 3726.010010, 3691.209961, + 7474.209961, 3786.010010, 3751.810059, 3716.409912, + 3846.010010, 3812.409912, 3777.610107, 0.010000, + 3873.010010, 3838.810059, 0.010000, 0.010000, + 3900.010010}, + {5430.910156, 0.010000, 0.010000, 5521.809570, + 5467.809570, 0.010000, 5612.709961, 5559.609863, + 5504.709961, 5703.609863, 5651.409668, 5597.409668, + 11336.110352, 5743.209961, 5690.109863, 5635.209473, + 5835.009766, 5782.809570, 5728.809570, 0.010000, + 5875.509766, 5822.409668, 0.010000, 0.010000, + 5916.009766}, + {5885.410156, 0.010000, 0.010000, 5976.310059, + 5926.809570, 0.010000, 6067.209961, 6018.609863, + 5968.209961, 6158.110352, 6110.409668, 6060.909668, + 12258.610352, 6202.209961, 6153.609375, 6103.209473, + 6294.009766, 6246.309570, 6196.809570, 0.010000, + 6339.009766, 6290.409668, 0.010000, 0.010000, + 6384.009766}, + {5578.509766, 0.010000, 0.010000, 5673.009766, + 5615.410156, 0.010000, 5767.510254, 5710.809570, + 5652.309570, 5862.009766, 5806.209961, 5748.609863, + 11645.710938, 5901.609863, 5844.909668, 5786.409668, + 5997.009766, 5941.209961, 5883.609375, 0.010000, + 6037.509766, 5980.809570, 0.010000, 0.010000, + 6078.009766}, + {6051.009766, 0.010000, 0.010000, 6145.509766, + 6092.410156, 0.010000, 6240.010254, 6187.810059, + 6133.809570, 6334.509766, 6283.209961, 6230.109863, + 12604.208984, 6378.610352, 6326.409668, 6272.410156, + 6474.009766, 6422.709961, 6369.609375, 0.010000, + 6519.009766, 6466.809570, 0.010000, 0.010000, + 
6564.009766}, + {5726.109863, 0.010000, 0.010000, 5824.209473, + 5763.009766, 0.010000, 5922.309570, 5862.009766, + 5799.910156, 6020.409668, 5961.010254, 5899.809570, + 11955.309570, 6060.009766, 5999.709961, 5937.609863, + 6159.009766, 6099.609863, 6038.409668, 0.010000, + 6199.509766, 6139.209961, 0.010000, 0.010000, + 6240.009766}, + {6216.609863, 0.010000, 0.010000, 6314.709473, + 6258.009766, 0.010000, 6412.809570, 6357.009766, + 6299.410156, 6510.909668, 6456.010254, 6399.310059, + 12949.809570, 6555.009766, 6499.209961, 6441.609863, + 6654.009766, 6599.110352, 6542.409668, 0.010000, + 6699.009766, 6643.209961, 0.010000, 0.010000, + 6744.009766}, + {5873.709961, 0.010000, 0.010000, 5975.409668, + 5910.609863, 0.010000, 6077.109375, 6013.209473, + 5947.509766, 6178.809570, 6115.809570, 6051.009766, + 12264.910156, 6218.409668, 6154.510254, 6088.809570, + 6321.009766, 6258.009766, 6193.209961, 0.010000, + 6361.509766, 6297.610352, 0.010000, 0.010000, + 6402.009766}, + {6382.209473, 0.010000, 0.010000, 6483.910156, + 6423.609863, 0.010000, 6585.609375, 6526.209473, + 6465.009766, 6687.309570, 6628.809570, 6568.509766, + 13295.410156, 6731.409668, 6672.010254, 6610.810059, + 6834.009766, 6775.509766, 6715.209961, 0.010000, + 6879.009766, 6819.610352, 0.010000, 0.010000, + 6924.009766}, + {4320.009766, 0.010000, 0.010000, 4389.009766, + 4347.609863, 0.010000, 4458.009766, 4417.209961, + 4375.209961, 4527.009766, 4486.809570, 4445.409668, + 8998.809570, 4556.409668, 4515.609863, 4473.609863, + 4626.009766, 4585.809570, 4544.410156, 0.010000, + 4656.009766, 4615.209961, 0.010000, 0.010000, + 4686.009766}, + {4665.009766, 0.010000, 0.010000, 4734.009766, + 4695.609375, 0.010000, 4803.009766, 4765.209961, + 4726.209961, 4872.009766, 4834.809570, 4796.409668, + 9697.809570, 4904.409668, 4866.609863, 4827.609863, + 4974.009766, 4936.809570, 4898.410156, 0.010000, + 5007.009766, 4969.209961, 0.010000, 0.010000, + 5040.009766}, + {2366.110107, 0.010000, 0.010000, 2401.209961, + 2381.409912, 0.010000, 2436.310059, 2416.810059, + 2396.709961, 2471.410156, 2452.209961, 2432.409912, + 4918.509766, 2487.609863, 2468.110107, 2448.010010, + 2523.010010, 2503.810059, 2484.010010, 0.010000, + 2539.510010, 2520.010010, 0.010000, 0.010000, + 2556.010010}, + {2541.610107, 0.010000, 0.010000, 2576.710205, + 2558.409912, 0.010000, 2611.810059, 2593.810059, + 2575.209961, 2646.910156, 2629.209961, 2610.909912, + 5274.009766, 2664.609863, 2646.610107, 2628.010010, + 2700.010010, 2682.310059, 2664.010010, 0.010000, + 2718.010010, 2700.010010, 0.010000, 0.010000, + 2736.010010}}, + + {{1701.320068, 0.020000, 0.020000, 1737.620117, + 1710.620117, 0.020000, 1773.920044, 1747.220093, + 1719.920044, 1810.220093, 1783.820068, 1756.819946, + 3575.719971, 1820.420044, 1793.719971, 1766.420044, + 1857.020142, 1830.619995, 1803.619995, 0.020000, + 1867.520020, 1840.820068, 0.020000, 0.020000, + 1878.020020}, + {1882.820068, 0.020000, 0.020000, 1919.120117, + 1893.620117, 0.020000, 1955.420044, 1930.220093, + 1904.420044, 1991.720093, 1966.820068, 1941.319946, + 3943.219971, 2003.420044, 1978.219971, 1952.420044, + 2040.020142, 2015.119995, 1989.620117, 0.020000, + 2052.020020, 2026.820068, 0.020000, 0.020000, + 2064.020020}, + {3802.820068, 0.020000, 0.020000, 3876.620117, + 3824.420166, 0.020000, 3950.420166, 3898.820068, + 3846.020020, 4024.220215, 3973.220215, 3921.020020, + 7965.620117, 4047.620117, 3996.020020, 3943.219727, + 4122.020020, 4071.020020, 4018.820068, 0.020000, + 4146.020020, 4094.419922, 0.020000, 
0.020000, + 4170.020020}, + {4171.819824, 0.020000, 0.020000, 4245.620117, + 4196.420410, 0.020000, 4319.420410, 4270.819824, + 4221.020020, 4393.220215, 4345.220215, 4296.020020, + 8712.620117, 4419.620605, 4371.020020, 4321.219727, + 4494.020020, 4446.020020, 4396.819824, 0.020000, + 4521.020020, 4472.419922, 0.020000, 0.020000, + 4548.020020}, + {6316.520020, 0.020000, 0.020000, 6429.020020, + 6353.420410, 0.020000, 6541.520508, 6466.819824, + 6390.319824, 6654.020020, 6580.220215, 6504.620117, + 13193.718750, 6693.620605, 6618.919922, 6542.420410, + 6807.020020, 6733.220215, 6657.619629, 0.020000, + 6847.520020, 6772.819824, 0.020000, 0.020000, + 6888.020020}, + {6879.020020, 0.020000, 0.020000, 6991.520020, + 6920.420410, 0.020000, 7104.020508, 7033.820312, + 6961.819824, 7216.520020, 7147.220215, 7076.120117, + 14332.218750, 7260.620605, 7190.420410, 7118.420410, + 7374.020020, 7304.720215, 7233.619629, 0.020000, + 7419.020020, 7348.819824, 0.020000, 0.020000, + 7464.020020}, + {6464.120117, 0.020000, 0.020000, 6580.219727, + 6501.020020, 0.020000, 6696.319824, 6618.020020, + 6537.920410, 6812.419922, 6735.020508, 6655.819824, + 13503.319336, 6852.020020, 6773.720215, 6693.620117, + 6969.020020, 6891.620605, 6812.419922, 0.020000, + 7009.520020, 6931.220215, 0.020000, 0.020000, + 7050.020020}, + {7044.620117, 0.020000, 0.020000, 7160.720215, + 7086.020020, 0.020000, 7276.819824, 7203.020020, + 7127.420410, 7392.919434, 7320.020508, 7245.320312, + 14677.819336, 7437.020020, 7363.220215, 7287.620117, + 7554.020020, 7481.120605, 7406.420410, 0.020000, + 7599.020020, 7525.220215, 0.020000, 0.020000, + 7644.020020}, + {6611.719727, 0.020000, 0.020000, 6731.420410, + 6648.620117, 0.020000, 6851.119629, 6769.219727, + 6685.520020, 6970.819824, 6889.819824, 6807.020020, + 13812.919922, 7010.419922, 6928.520508, 6844.819824, + 7131.020020, 7050.020020, 6967.220215, 0.020000, + 7171.520020, 7089.620605, 0.020000, 0.020000, + 7212.020020}, + {7210.219727, 0.020000, 0.020000, 7329.920410, + 7251.620117, 0.020000, 7449.619629, 7372.220215, + 7293.020020, 7569.319824, 7492.819824, 7414.520020, + 15023.418945, 7613.419434, 7536.020508, 7456.820312, + 7734.020020, 7657.520020, 7579.220215, 0.020000, + 7779.020020, 7701.620605, 0.020000, 0.020000, + 7824.020020}, + {6759.319824, 0.020000, 0.020000, 6882.620117, + 6796.219727, 0.020000, 7005.919922, 6920.420410, + 6833.120117, 7129.220215, 7044.619629, 6958.219727, + 14122.519531, 7168.819824, 7083.319824, 6996.020020, + 7293.020020, 7208.419922, 7122.020508, 0.020000, + 7333.520020, 7248.020020, 0.020000, 0.020000, + 7374.020020}, + {7375.819824, 0.020000, 0.020000, 7499.120117, + 7417.219727, 0.020000, 7622.420410, 7541.420410, + 7458.620117, 7745.720215, 7665.619629, 7583.720215, + 15369.019531, 7789.819824, 7708.819824, 7626.020020, + 7914.020020, 7833.919434, 7752.020508, 0.020000, + 7959.020020, 7878.020020, 0.020000, 0.020000, + 8004.020020}, + {4982.420410, 0.020000, 0.020000, 5065.819824, + 5010.020020, 0.020000, 5149.220215, 5094.020020, + 5037.619629, 5232.620605, 5178.020020, 5122.220215, + 10381.219727, 5262.020020, 5206.819824, 5150.419922, + 5346.020020, 5291.419922, 5235.620117, 0.020000, + 5376.020020, 5320.819824, 0.020000, 0.020000, + 5406.020020}, + {5399.420410, 0.020000, 0.020000, 5482.820312, + 5430.020020, 0.020000, 5566.220215, 5514.020020, + 5460.619629, 5649.620605, 5598.020020, 5545.220215, + 11224.219727, 5682.020020, 5629.819824, 5576.419922, + 5766.020020, 5714.419922, 5661.620117, 0.020000, + 5799.020020, 5746.819824, 
0.020000, 0.020000, + 5832.020020}, + {2733.320068, 0.020000, 0.020000, 2775.620117, + 2748.620117, 0.020000, 2817.920166, 2791.219971, + 2763.919922, 2860.220215, 2833.820068, 2806.820068, + 5681.720215, 2876.420166, 2849.719971, 2822.419922, + 2919.020020, 2892.619873, 2865.620117, 0.020000, + 2935.520020, 2908.820068, 0.020000, 0.020000, + 2952.020020}, + {2944.820068, 0.020000, 0.020000, 2987.120117, + 2961.620117, 0.020000, 3029.420166, 3004.220215, + 2978.419922, 3071.720215, 3046.820068, 3021.320068, + 6109.220215, 3089.420166, 3064.219971, 3038.419922, + 3132.020020, 3107.119873, 3081.620117, 0.020000, + 3150.020020, 3124.820068, 0.020000, 0.020000, + 3168.020020}}, + + {{1924.530029, 0.030000, 0.030000, 1968.030029, + 1933.830078, 0.030000, 2011.530029, 1977.630127, + 1943.130127, 2055.030029, 2021.430054, 1987.230103, + 4050.929932, 2065.230225, 2031.330078, 1996.829956, + 2109.030029, 2075.430176, 2041.229980, 0.030000, + 2119.530029, 2085.630127, 0.030000, 0.030000, + 2130.030029}, + {2142.030029, 0.030000, 0.030000, 2185.530029, + 2152.830078, 0.030000, 2229.030029, 2196.630127, + 2163.630127, 2272.530029, 2240.430176, 2207.729980, + 4490.429688, 2284.230225, 2251.830078, 2218.830078, + 2328.030029, 2295.930176, 2263.229980, 0.030000, + 2340.030029, 2307.629883, 0.030000, 0.030000, + 2352.030029}, + {4321.229980, 0.030000, 0.030000, 4409.429688, + 4342.829590, 0.030000, 4497.629883, 4431.629883, + 4364.430176, 4585.829590, 4520.430176, 4453.829590, + 9060.030273, 4609.229980, 4543.229980, 4476.029785, + 4698.029785, 4632.630371, 4566.029785, 0.030000, + 4722.029785, 4656.029785, 0.030000, 0.030000, + 4746.029785}, + {4762.229980, 0.030000, 0.030000, 4850.429688, + 4786.829590, 0.030000, 4938.629883, 4875.629883, + 4811.430176, 5026.829590, 4964.430176, 4900.829590, + 9951.030273, 5053.229980, 4990.229980, 4926.029785, + 5142.029785, 5079.630371, 5016.029785, 0.030000, + 5169.029785, 5106.029785, 0.030000, 0.030000, + 5196.029785}, + {7202.129883, 0.030000, 0.030000, 7336.229492, + 7239.029785, 0.030000, 7470.329590, 7374.029785, + 7275.930176, 7604.429688, 7509.030273, 7411.829590, + 15051.330078, 7644.029785, 7547.729980, 7449.629883, + 7779.029785, 7683.630371, 7586.430176, 0.030000, + 7819.529785, 7723.229980, 0.030000, 0.030000, + 7860.029785}, + {7872.629883, 0.030000, 0.030000, 8006.729980, + 7914.029785, 0.030000, 8140.829590, 8049.029785, + 7955.430176, 8274.929688, 8184.030273, 8091.330078, + 16405.830078, 8319.030273, 8227.230469, 8133.629883, + 8454.030273, 8363.130859, 8270.430664, 0.030000, + 8499.030273, 8407.230469, 0.030000, 0.030000, + 8544.030273}, + {7349.729492, 0.030000, 0.030000, 7487.430176, + 7386.629883, 0.030000, 7625.129395, 7525.229980, + 7423.529785, 7762.829590, 7663.829590, 7563.029785, + 15360.929688, 7802.429688, 7702.530273, 7600.829590, + 7941.029785, 7842.029785, 7741.229980, 0.030000, + 7981.529785, 7881.630371, 0.030000, 0.030000, + 8022.029785}, + {8038.229492, 0.030000, 0.030000, 8175.930176, + 8079.629883, 0.030000, 8313.629883, 8218.230469, + 8121.029785, 8451.330078, 8356.830078, 8260.530273, + 16751.427734, 8495.429688, 8400.030273, 8302.831055, + 8634.030273, 8539.530273, 8443.230469, 0.030000, + 8679.030273, 8583.630859, 0.030000, 0.030000, + 8724.030273}, + {7497.329590, 0.030000, 0.030000, 7638.629883, + 7534.229492, 0.030000, 7779.930176, 7676.430176, + 7571.130371, 7921.229980, 7818.629395, 7714.229980, + 15670.530273, 7960.829590, 7857.329590, 7752.029785, + 8103.029785, 8000.429688, 7896.030273, 0.030000, + 8143.529785, 
8040.029785, 0.030000, 0.030000, + 8184.029785}, + {8203.830078, 0.030000, 0.030000, 8345.129883, + 8245.229492, 0.030000, 8486.430664, 8387.430664, + 8286.630859, 8627.730469, 8529.629883, 8429.730469, + 17097.029297, 8671.830078, 8572.830078, 8472.030273, + 8814.030273, 8715.930664, 8616.030273, 0.030000, + 8859.030273, 8760.030273, 0.030000, 0.030000, + 8904.030273}, + {7644.930176, 0.030000, 0.030000, 7789.829590, + 7681.829590, 0.030000, 7934.729980, 7827.629883, + 7718.729980, 8079.630371, 7973.430176, 7865.430176, + 15980.130859, 8119.229980, 8012.129395, 7903.229980, + 8265.030273, 8158.830566, 8050.829590, 0.030000, + 8305.530273, 8198.430664, 0.030000, 0.030000, + 8346.030273}, + {8369.430664, 0.030000, 0.030000, 8514.331055, + 8410.830078, 0.030000, 8659.230469, 8556.629883, + 8452.231445, 8804.130859, 8702.430664, 8598.930664, + 17442.628906, 8848.230469, 8745.629883, 8641.230469, + 8994.030273, 8892.331055, 8788.830078, 0.030000, + 9039.030273, 8936.430664, 0.030000, 0.030000, + 9084.030273}, + {5644.829590, 0.030000, 0.030000, 5742.629883, + 5672.430176, 0.030000, 5840.430176, 5770.830078, + 5700.029785, 5938.229980, 5869.229980, 5799.029785, + 11763.630859, 5967.630371, 5898.029785, 5827.229980, + 6066.029785, 5997.029785, 5926.829590, 0.030000, + 6096.029785, 6026.430176, 0.030000, 0.030000, + 6126.029785}, + {6133.829590, 0.030000, 0.030000, 6231.629883, + 6164.430176, 0.030000, 6329.430176, 6262.830078, + 6195.029785, 6427.229980, 6361.229980, 6294.029785, + 12750.630859, 6459.630371, 6393.029785, 6325.229980, + 6558.029785, 6492.029785, 6424.829590, 0.030000, + 6591.029785, 6524.430176, 0.030000, 0.030000, + 6624.029785}, + {3100.530029, 0.030000, 0.030000, 3150.030029, + 3115.830078, 0.030000, 3199.530029, 3165.630127, + 3131.130127, 3249.030029, 3215.430176, 3181.230225, + 6444.930176, 3265.230225, 3231.330078, 3196.830078, + 3315.030029, 3281.430176, 3247.230225, 0.030000, + 3331.530029, 3297.630127, 0.030000, 0.030000, + 3348.030029}, + {3348.030029, 0.030000, 0.030000, 3397.530029, + 3364.830078, 0.030000, 3447.030029, 3414.630127, + 3381.630127, 3496.530029, 3464.430176, 3431.730225, + 6944.430176, 3514.230225, 3481.830078, 3448.830078, + 3564.030029, 3531.930176, 3499.230225, 0.030000, + 3582.030029, 3549.630127, 0.030000, 0.030000, + 3600.030029}}, + + {{2147.739990, 0.040000, 0.040000, 2198.439941, + 2157.040039, 0.040000, 2249.140137, 2208.040039, + 2166.340088, 2299.840088, 2259.040039, 2217.640137, + 4526.140137, 2310.040039, 2268.940186, 2227.240234, + 2361.040039, 2320.240234, 2278.840088, 0.040000, + 2371.540039, 2330.440186, 0.040000, 0.040000, + 2382.040039}, + {2401.239990, 0.040000, 0.040000, 2451.939941, + 2412.040039, 0.040000, 2502.640137, 2463.040039, + 2422.840088, 2553.340088, 2514.040039, 2474.140137, + 5037.640137, 2565.040039, 2525.440186, 2485.240234, + 2616.040039, 2576.740234, 2536.840088, 0.040000, + 2628.040039, 2588.440186, 0.040000, 0.040000, + 2640.040039}, + {4839.640137, 0.040000, 0.040000, 4942.240234, + 4861.240234, 0.040000, 5044.839844, 4964.439941, + 4882.839844, 5147.440430, 5067.640137, 4986.640137, + 10154.440430, 5170.839844, 5090.440430, 5008.840332, + 5274.040039, 5194.240234, 5113.240234, 0.040000, + 5298.040039, 5217.640625, 0.040000, 0.040000, + 5322.040039}, + {5352.640137, 0.040000, 0.040000, 5455.240234, + 5377.240234, 0.040000, 5557.839844, 5480.439941, + 5401.839844, 5660.440430, 5583.640137, 5505.640137, + 11189.439453, 5686.839844, 5609.440430, 5530.840332, + 5790.040039, 5713.240234, 5635.240234, 0.040000, + 
5817.040039, 5739.640625, 0.040000, 0.040000, + 5844.040039}, + {8087.740234, 0.040000, 0.040000, 8243.440430, + 8124.640625, 0.040000, 8399.139648, 8281.240234, + 8161.540039, 8554.840820, 8437.839844, 8319.040039, + 16908.937500, 8594.440430, 8476.540039, 8356.840820, + 8751.040039, 8634.040039, 8515.240234, 0.040000, + 8791.540039, 8673.640625, 0.040000, 0.040000, + 8832.040039}, + {8866.240234, 0.040000, 0.040000, 9021.940430, + 8907.640625, 0.040000, 9177.639648, 9064.240234, + 8949.040039, 9333.340820, 9220.839844, 9106.540039, + 18479.437500, 9377.440430, 9264.040039, 9148.840820, + 9534.040039, 9421.540039, 9307.240234, 0.040000, + 9579.040039, 9465.640625, 0.040000, 0.040000, + 9624.040039}, + {8235.339844, 0.040000, 0.040000, 8394.639648, + 8272.240234, 0.040000, 8553.940430, 8432.440430, + 8309.140625, 8713.240234, 8592.639648, 8470.240234, + 17218.539062, 8752.840820, 8631.339844, 8508.040039, + 8913.040039, 8792.440430, 8670.040039, 0.040000, + 8953.540039, 8832.040039, 0.040000, 0.040000, + 8994.040039}, + {9031.839844, 0.040000, 0.040000, 9191.139648, + 9073.240234, 0.040000, 9350.440430, 9233.440430, + 9114.640625, 9509.740234, 9393.639648, 9275.740234, + 18825.039062, 9553.839844, 9436.839844, 9318.040039, + 9714.040039, 9597.940430, 9480.040039, 0.040000, + 9759.040039, 9642.040039, 0.040000, 0.040000, + 9804.040039}, + {8382.940430, 0.040000, 0.040000, 8545.840820, + 8419.839844, 0.040000, 8708.740234, 8583.639648, + 8456.740234, 8871.640625, 8747.440430, 8621.440430, + 17528.138672, 8911.240234, 8786.139648, 8659.240234, + 9075.040039, 8950.840820, 8824.839844, 0.040000, + 9115.540039, 8990.440430, 0.040000, 0.040000, + 9156.040039}, + {9197.440430, 0.040000, 0.040000, 9360.340820, + 9238.839844, 0.040000, 9523.240234, 9402.639648, + 9280.240234, 9686.140625, 9566.440430, 9444.940430, + 19170.638672, 9730.240234, 9609.639648, 9487.240234, + 9894.040039, 9774.339844, 9652.839844, 0.040000, + 9939.040039, 9818.440430, 0.040000, 0.040000, + 9984.040039}, + {8530.540039, 0.040000, 0.040000, 8697.040039, + 8567.440430, 0.040000, 8863.540039, 8734.840820, + 8604.339844, 9030.040039, 8902.240234, 8772.639648, + 17837.740234, 9069.640625, 8940.940430, 8810.440430, + 9237.040039, 9109.240234, 8979.639648, 0.040000, + 9277.540039, 9148.840820, 0.040000, 0.040000, + 9318.040039}, + {9363.040039, 0.040000, 0.040000, 9529.540039, + 9404.440430, 0.040000, 9696.040039, 9571.840820, + 9445.839844, 9862.540039, 9739.240234, 9614.139648, + 19516.240234, 9906.640625, 9782.440430, 9656.440430, + 10074.040039, 9950.740234, 9825.639648, 0.040000, + 10119.040039, 9994.839844, 0.040000, 0.040000, + 10164.040039}, + {6307.240234, 0.040000, 0.040000, 6419.439941, + 6334.839844, 0.040000, 6531.640137, 6447.640137, + 6362.440430, 6643.839844, 6560.440430, 6475.840332, + 13146.040039, 6673.240234, 6589.240234, 6504.040039, + 6786.040039, 6702.640625, 6618.040039, 0.040000, + 6816.040039, 6732.040039, 0.040000, 0.040000, + 6846.040039}, + {6868.240234, 0.040000, 0.040000, 6980.439941, + 6898.839844, 0.040000, 7092.640137, 7011.640137, + 6929.440430, 7204.839844, 7124.440430, 7042.840332, + 14277.040039, 7237.240234, 7156.240234, 7074.040039, + 7350.040039, 7269.640625, 7188.040039, 0.040000, + 7383.040039, 7302.040039, 0.040000, 0.040000, + 7416.040039}, + {3467.739990, 0.040000, 0.040000, 3524.439941, + 3483.040039, 0.040000, 3581.140137, 3540.040039, + 3498.340088, 3637.840088, 3597.040039, 3555.640137, + 7208.140137, 3654.040039, 3612.940186, 3571.240234, + 3711.040039, 3670.240234, 3628.840088, 
0.040000, + 3727.540039, 3686.440186, 0.040000, 0.040000, + 3744.040039}, + {3751.239990, 0.040000, 0.040000, 3807.939941, + 3768.040039, 0.040000, 3864.640137, 3825.040039, + 3784.840088, 3921.340088, 3882.040039, 3842.140137, + 7779.640137, 3939.040039, 3899.440186, 3859.240234, + 3996.040039, 3956.740234, 3916.840088, 0.040000, + 4014.040039, 3974.440186, 0.040000, 0.040000, + 4032.040039}}}}})); + CHECK(approxEq<float, float>(*op->getOutput(0), *expectedOutput)); + } + } +} + +} // namespace Aidge diff --git a/unit_tests/operator/Test_CryptoHash.cpp b/unit_tests/operator/Test_CryptoHash.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7453ea19c765d6a2bf79a66972d120b7a0ca6de5 --- /dev/null +++ b/unit_tests/operator/Test_CryptoHash.cpp @@ -0,0 +1,56 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <cmath> // std::abs +#include <cstddef> // std::size_t +#include <memory> + +#include <catch2/catch_test_macros.hpp> + +#include "aidge/backend/cpu/operator/CryptoHashImpl.hpp" +#include "aidge/data/Data.hpp" +#include "aidge/data/Tensor.hpp" +#include "aidge/graph/Node.hpp" +#include "aidge/operator/CryptoHash.hpp" +#include "aidge/utils/ArrayHelpers.hpp" + +using namespace Aidge; + +#ifdef WITH_OPENSSL +TEST_CASE("[cpu/operator] CryptoHash(forward)") { + SECTION("1D Tensor") { + std::shared_ptr<Tensor> input0 = + std::make_shared<Tensor>(Array1D<uint8_t, 5>{ + {'a', 'b', 'c', 'd', 'e'}}); + std::shared_ptr<Tensor> expectedOutput = + std::make_shared<Tensor>(Array1D<uint8_t, 32>{ + {0x36, 0xbb, 0xe5, 0x0e, 0xd9, 0x68, 0x41, 0xd1, + 0x04, 0x43, 0xbc, 0xb6, 0x70, 0xd6, 0x55, 0x4f, + 0x0a, 0x34, 0xb7, 0x61, 0xbe, 0x67, 0xec, 0x9c, + 0x4a, 0x8a, 0xd2, 0xc0, 0xc4, 0x4c, 0xa4, 0x2c}}); + + std::shared_ptr<Node> myCryptoHash = CryptoHash(); + auto op = std::static_pointer_cast<CryptoHash_Op>(myCryptoHash->getOperator()); + op->associateInput(0, input0); + op->setDataType(DataType::UInt8); + op->setBackend("cpu"); + myCryptoHash->forward(); + + REQUIRE(op->getOutput(0)->size() == 32); + + uint8_t* resPtr = static_cast<uint8_t*>(op->getOutput(0)->getImpl()->rawPtr()); + uint8_t* expectedPtr = static_cast<uint8_t*>(expectedOutput->getImpl()->rawPtr()); + for (std::size_t i = 0; i < expectedOutput->size(); ++i) { + REQUIRE(resPtr[i] == expectedPtr[i]); + } + } +} +#endif diff --git a/unit_tests/operator/Test_DivImpl.cpp b/unit_tests/operator/Test_DivImpl.cpp index 4037b2ad4e117573279f07d0c1819d3435ee7ada..37d11599f012c68579f21c01e08e868091711127 100644 --- a/unit_tests/operator/Test_DivImpl.cpp +++ b/unit_tests/operator/Test_DivImpl.cpp @@ -322,4 +322,274 @@ TEST_CASE("[cpu/operator] Div", "[Div][CPU]") { } } } + +TEST_CASE("[CPU/Operator] Div(Backward)", "[Div][CPU][Backward]") { + std::shared_ptr<Div_Op> op = std::make_shared<Div_Op>(); + op->setDataType(DataType::Float32); + op->setBackend("cpu"); + + // NOTE: The first four tests use fixed values, the last one uses random values but static dimensions. 
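+    // Assumed reference formulas for the expected gradients below (standard quotient rule):
+    // for z = x / y, dL/dx = dL/dz / y and dL/dy = -dL/dz * x / y^2, with the result
+    // summed over broadcast dimensions.
+    // e.g. Case 1, first column: dL/dx = 1 / 0.1 = 10 and dL/dy = -(1 + 4) / 0.1^2 = -500.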
+ + SECTION("Case 1: 1D and 2D Tensors") { + const auto T0 = std::make_shared<Tensor>( + Array2D<cpptype_t<DataType::Float32>, 2, 3>({{{1, 2, 3}, {4, 5, 6}}})); + + const auto T1 = + std::make_shared<Tensor>(Array1D<cpptype_t<DataType::Float32>, 3>({0.1, 0.2, 0.3})); + + op->associateInput(0, T0); + op->associateInput(1, T1); + op->forwardDims(); + + op->getOutput(0)->setGrad(std::make_shared<Tensor>( + Array2D<float, 2, 3>({{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}}))); + op->backward(); + + const Tensor expectedGrad0 = + Array2D<cpptype_t<DataType::Float32>, 2, 3>({{{10, 5, 3.3333}, {10, 5, 3.3333}}}); + + const Tensor expectedGrad1 = Array1D<cpptype_t<DataType::Float32>, 3>({-500, -175, -100}); + + REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0)); + REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1)); + } + + SECTION("Case 2: 3D and 1D tensors") { + const auto T0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>( + {{{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}}, + {{7.0, 8.0, 9.0}, {10.0, 11.0, 12.0}}}})); + + const auto T1 = + std::make_shared<Tensor>(Array1D<float, 3>({0.3, 0.2, 0.1})); + + const auto newGrad = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>( + {{{{1, 1, 1}, {1, 1, 1}}, {{1, 1, 1}, {1, 1, 1}}}})); + + const Tensor expectedGrad0 = + Array3D<float, 2, 2, 3>({{{{3.3333, 5.0, 10}, {3.3333, 5.0, 10}}, + {{3.3333, 5.0, 10}, {3.3333, 5.0, 10}}}}); + + const Tensor expectedGrad1 = Array1D<cpptype_t<DataType::Float32>, 3>({-244.4444, -650.0, -3000.0}); + + op->associateInput(0, T0); + op->associateInput(1, T1); + op->forwardDims(); + + op->getOutput(0)->setGrad(newGrad); + op->backward(); + + REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0)); + REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1)); + } + + SECTION("Case 3: 4D and 2D tensors") { + const auto T0 = std::make_shared<Tensor>(Array4D<cpptype_t<DataType::Float32>, 2, 2, 3, 3>( + {{{{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}, {7.0, 8.0, 9.0}}, + {{10.0, 11.0, 12.0}, {13.0, 14.0, 15.0}, {16.0, 17.0, 18.0}}}, + {{{19.0, 20.0, 21.0}, {22.0, 23.0, 24.0}, {25.0, 26.0, 27.0}}, + {{28.0, 29.0, 30.0}, + {31.0, 32.0, 33.0}, + {34.0, 35.0, 36.0}}}}})); + + const auto T1 = std::make_shared<Tensor>(Array2D<cpptype_t<DataType::Float32>, 3, 3>( + {{{0.5, 0.3, 0.1}, {0.4, 0.2, 0.6}, {0.7, 0.8, 0.9}}})); + + const auto newGrad = + std::make_shared<Tensor>(Array4D<cpptype_t<DataType::Float32>, 2, 2, 3, 3>( + {{{{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}, + {{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}}, + {{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}, + {{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}}}})); + + const Tensor expectedGrad0 = + Array4D<cpptype_t<DataType::Float32>, 2, 2, 3, 3>( + {{{{{2, 3.3333, 10}, {2.5, 5.0, 1.66667}, {1.42857, 1.2500, 1.11111}}, + {{2, 3.3333, 10}, {2.5, 5.0, 1.66667}, {1.42857, 1.2500, 1.11111}}}, + {{{2, 3.3333, 10}, {2.5, 5.0, 1.66667}, {1.42857, 1.2500, 1.11111}}, + {{2, 3.3333, 10}, {2.5, 5.0, 1.66667}, {1.42857, 1.2500, 1.11111}}}}}); + + const Tensor expectedGrad1 = + Array2D<cpptype_t<DataType::Float32>, 3, 3>({{{-232.0, -688.888, -6600.0}, + {-437.5, -1850.0, -216.66667}, + {-167.3469, -134.3750, -111.111}}}); + + op->associateInput(0, T0); + op->associateInput(1, T1); + op->forwardDims(); + + op->getOutput(0)->setGrad(newGrad); + op->backward(); + + REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), 
expectedGrad0)); + REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1)); + } + + SECTION("Case 4: 3D and 2D tensors") { + const auto T0 = std::make_shared<Tensor>( + Array3D<float, 2, 3, 4>({{{ + {1.0, 2.0, 3.0, 4.0}, + {5.0, 6.0, 7.0, 8.0}, + {9.0, 10.0, 11.0, 12.0}, + }, + { + {13.0, 14.0, 15.0, 16.0}, + {17.0, 18.0, 19.0, 20.0}, + {21.0, 22.0, 23.0, 24.0}, + }}})); + + const auto T1 = std::make_shared<Tensor>( + Array2D<cpptype_t<DataType::Float32>, 3, 4>({{{0.1, 0.2, 0.3, 0.4}, + {0.5, 0.6, 0.7, 0.8}, + {0.9, 1.0, 1.1, 1.2}}})); + + const auto newGrad = std::make_shared<Tensor>( + Array3D<cpptype_t<DataType::Float32>, 2, 3, 4>({{{ + {1.0, 1.0, 1.0, 1.0}, + {1.0, 1.0, 1.0, 1.0}, + {1.0, 1.0, 1.0, 1.0}, + }, + { + {1.0, 1.0, 1.0, 1.0}, + {1.0, 1.0, 1.0, 1.0}, + {1.0, 1.0, 1.0, 1.0}, + }}})); + + const Tensor expectedGrad0 = + Array3D<cpptype_t<DataType::Float32>, 2, 3, 4>({{{ + {10, 5, 3.33333, 2.5}, + {2, 1.66667, 1.42857, 1.2500}, + {1.11111, 1.0, 0.90909, 0.83333}}, + {{10, 5, 3.33333, 2.5}, + {2, 1.66667, 1.42857, 1.2500}, + {1.11111, 1.0, 0.90909, 0.83333}}}}); + + const Tensor expectedGrad1 = + Array2D<cpptype_t<DataType::Float32>, 3, 4>({{ + {-1400.0, -400.0, -200.0, -125.0}, + {-88.0, -66.66667, -53.0612, -43.750}, + {-37.0370, -32.0, -28.0992, -25.00}}}); + + op->associateInput(0, T0); + op->associateInput(1, T1); + op->forwardDims(); + + op->getOutput(0)->setGrad(newGrad); + op->backward(); + + REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0)); + REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1)); + } + + SECTION("Case 5: Tensors with random values") { + + // Use random values + const std::vector<std::size_t> dims0 = {5, 2, 1, 7}; // First tensor + const std::vector<std::size_t> dims1 = {2, 6, 7}; // Second tensor + const std::vector<std::size_t> outputDims = {5, 2, 6, 7}; + + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<float> dist(0.1f, 1.0f); + + auto T0 = std::make_shared<Tensor>(dims0); + T0->setDataType(DataType::Float32); + T0->setBackend("cpu"); + float* input0Data = static_cast<float*>(T0->getImpl()->rawPtr()); + // Fill with random values + for (std::size_t i = 0; i < T0->size(); ++i) { + input0Data[i] = dist(gen); + } + + auto T1 = std::make_shared<Tensor>(dims1); + T1->setDataType(DataType::Float32); + T1->setBackend("cpu"); + float* input1Data = static_cast<float*>(T1->getImpl()->rawPtr()); + // Fill with random values + for (std::size_t i = 0; i < T1->size(); ++i) { + input1Data[i] = dist(gen); + } + + op->associateInput(0, T0); + op->associateInput(1, T1); + + op->forwardDims(); + op->forward(); + + Tensor expectedOutput{outputDims}; + expectedOutput.setBackend("cpu"); + float* expectedOutputData = static_cast<float*>(expectedOutput.getImpl()->rawPtr()); + + for (std::size_t n = 0; n < 5; ++n) { + for (std::size_t c = 0; c < 2; ++c) { + for (std::size_t h = 0; h < 6; ++h) { + for (std::size_t w = 0; w < 7; ++w) { + std::size_t outIdx = w + 7 * (h + 6 * (c + 2 * n)); + std::size_t in0Idx = + w + 7 * (0 + 1 * (c + 2 * n)); // middle dim is 1 + std::size_t in1Idx = + w + 7 * (h + 6 * c); // no n dimension + + expectedOutputData[outIdx] = input0Data[in0Idx] / input1Data[in1Idx]; + } + } + } + } + + auto outputTensor = op->getOutput(0); + + REQUIRE(approxEq<float>(*outputTensor, expectedOutput)); + + // Backward pass + std::vector<float> gradOutputData(expectedOutput.size()); + for (auto &val : gradOutputData) { + 
val = dist(gen); + } + + op->getOutput(0)->setGrad(std::make_shared<Tensor>(outputDims)); + op->getOutput(0)->grad()->getImpl()->setRawPtr(gradOutputData.data(), + expectedOutput.size()); + + // Compute reference gradients + std::vector<float> expectedGrad0(T0->size(), 0.0f); + std::vector<float> expectedGrad1(T1->size(), 0.0f); + + for (std::size_t n = 0; n < 5; ++n) { + for (std::size_t c = 0; c < 2; ++c) { + for (std::size_t h = 0; h < 6; ++h) { + for (std::size_t w = 0; w < 7; ++w) { + std::size_t outIdx = w + 7 * (h + 6 * (c + 2 * n)); + std::size_t in0Idx = w + 7 * (0 + 1 * (c + 2 * n)); + std::size_t in1Idx = w + 7 * (h + 6 * c); + + expectedGrad0[in0Idx] += + gradOutputData[outIdx] * (1.0f / input1Data[in1Idx]); + + expectedGrad1[in1Idx] += + gradOutputData[outIdx] * (-input0Data[in0Idx] / (input1Data[in1Idx] * input1Data[in1Idx])); + } + } + } + } + + // Perform backward pass + op->backward(); + + auto expectedGrad0Tensor = std::make_shared<Tensor>(); + expectedGrad0Tensor->resize(T0->dims()); + expectedGrad0Tensor->setBackend("cpu"); + expectedGrad0Tensor->setDataType(DataType::Float32); + expectedGrad0Tensor->getImpl()->setRawPtr(expectedGrad0.data(), + expectedGrad0.size()); + + auto expectedGrad1Tensor = std::make_shared<Tensor>(T1->dims()); + expectedGrad1Tensor->setBackend("cpu"); + expectedGrad1Tensor->setDataType(DataType::Float32); + expectedGrad1Tensor->getImpl()->setRawPtr(expectedGrad1.data(), + expectedGrad1.size()); + + // Verify backward pass + REQUIRE(approxEq<float>(*T0->grad(), *expectedGrad0Tensor)); + REQUIRE(approxEq<float>(*T1->grad(), *expectedGrad1Tensor)); + } +} } // namespace Aidge diff --git a/unit_tests/operator/Test_DropoutImpl.cpp b/unit_tests/operator/Test_DropoutImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a3c10eadc847dce5d02e2464d571ee54222d1f9a --- /dev/null +++ b/unit_tests/operator/Test_DropoutImpl.cpp @@ -0,0 +1,103 @@ +/******************************************************************************** + * Copyright (c) 2025 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <cstddef> // std::size_t +#include <memory> + +#include <catch2/catch_test_macros.hpp> + +#include "aidge/data/Tensor.hpp" +#include "aidge/operator/Dropout.hpp" +#include "aidge/utils/TensorUtils.hpp" + +using namespace Aidge; + + TEST_CASE("[cpu/operator] Dropout(forward - inference mode / MC dropout)", "[Dropout][CPU]") { + + SECTION("MC Dropout - check stochastic output and scaling") { + constexpr const std::size_t nb_elements = 6; + std::shared_ptr<Tensor> input = std::make_shared<Tensor>(Array1D<cpptype_t<DataType::Float32>,nb_elements> { + {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f} + }); + + constexpr const float dropout_prob = 0.5f; + std::shared_ptr<Node> myDropout = Dropout(dropout_prob); // assumes dropout always active + auto op = std::static_pointer_cast<OperatorTensor>(myDropout->getOperator()); + + op->associateInput(0, input); + op->setBackend("cpu"); + op->forwardDType(); + + myDropout->forward(); + auto output = op->getOutput(0); + + std::size_t num_zero = 0, num_scaled = 0; + constexpr const float scale = 1.0f / (1.0f - dropout_prob); + + for (std::size_t i = 0; i < nb_elements; ++i) { + const float out = output->get<cpptype_t<DataType::Float32>>(i); + if (out == 0.0f) + ++num_zero; + else { + REQUIRE(approxEq<cpptype_t<DataType::Float32>>(Tensor(out),Tensor(scale))); // scaled version of 1.0f + ++num_scaled; + } + } + + // Ensure dropout is working + REQUIRE(num_zero + num_scaled == nb_elements); + REQUIRE(output->dims() == input->dims()); // TODO: test this in core module + } + + SECTION("Stochasticity - multiple forward passes differ") { + // /!\ Warning: With too few elements, this test has a small + // but real chance of failing. 
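+        // With dropout_prob = 0.3 and 100 identical inputs, two independent runs agree on
+        // a given element with probability p^2 + (1-p)^2 = 0.58 (assuming each element is
+        // dropped independently), so the chance that the two masks coincide everywhere is
+        // on the order of 0.58^100 ~ 1e-24.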
+ constexpr const std::size_t nb_elements = 100; + std::shared_ptr<Tensor> input = std::make_shared<Tensor>(Array1D<cpptype_t<DataType::Float32>, nb_elements> { + {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, + 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, + 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, + 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, + 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, + 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, + 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, + 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, + 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, + 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f} + }); + + constexpr const float dropout_prob = 0.3f; + std::shared_ptr<Node> myDropout = Dropout(dropout_prob); + auto op = std::static_pointer_cast<OperatorTensor>(myDropout->getOperator()); + op->associateInput(0, input); + op->setBackend("cpu"); + op->forwardDType(); + + std::vector<cpptype_t<DataType::Float32>> run1, run2; + + myDropout->forward(); + auto out1 = op->getOutput(0); + for (std::size_t i = 0; i < nb_elements; ++i) + run1.push_back(out1->get<cpptype_t<DataType::Float32>>(i)); + + myDropout->forward(); + auto out2 = op->getOutput(0); + for (std::size_t i = 0; i < nb_elements; ++i) + run2.push_back(out2->get<cpptype_t<DataType::Float32>>(i)); + + // Not all elements should be identical between the two runs + std::size_t same_count = 0; + for (std::size_t i = 0; i < nb_elements; ++i) { + if (run1[i] == run2[i]) + same_count++; + } + } +} \ No newline at end of file diff --git a/unit_tests/operator/Test_EqualImpl.cpp b/unit_tests/operator/Test_EqualImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..bd9fa94fdd09ef70af2e218b3b33636f3d83cc97 --- /dev/null +++ b/unit_tests/operator/Test_EqualImpl.cpp @@ -0,0 +1,202 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <catch2/catch_test_macros.hpp> +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution + +#include "aidge/data/Tensor.hpp" +#include "aidge/operator/Equal.hpp" + +using namespace Aidge; + +TEST_CASE("[cpu/operator] Equal(forwardDims)", "[Equal][CPU]") { + constexpr std::uint16_t NBTRIALS = 10; + // Create a random number generator + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // Random float distribution between 0 and 1 + std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10)); + std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5)); + std::uniform_int_distribution<int> boolDist(0,1); + + SECTION("Same dimensions") { + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { + DimSize_t nbDims = nbDimsDist(gen); + std::vector<DimSize_t> dims(nbDims); + for (std::size_t i = 0; i < nbDims; i++) { + dims[i] = dimSizeDist(gen); + } + + std::shared_ptr<Tensor> myInput1 = std::make_shared<Tensor>(dims); + myInput1->setBackend("cpu"); + myInput1->setDataType(DataType::Float32); + myInput1->zeros(); + std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(dims); + myInput2->setBackend("cpu"); + myInput2->setDataType(DataType::Float32); + myInput2->zeros(); + std::shared_ptr<Node> myEqual = Equal(); + auto op = std::static_pointer_cast<OperatorTensor>(myEqual -> getOperator()); + op->associateInput(0,myInput1); + op->associateInput(1,myInput2); + op->setDataType(DataType::Float32); + op->setBackend("cpu"); + op->forwardDims(); + + const auto outputDims = op->getOutput(0)->dims(); + REQUIRE(outputDims == dims); + } + } + SECTION("Broadcasting") { + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { + DimSize_t nbDims = nbDimsDist(gen); + std::vector<DimSize_t> dims1(nbDims, 1); + std::vector<DimSize_t> dims2(nbDims, 1); + std::vector<DimSize_t> expectedOutDims; + for (std::size_t i = 0; i < nbDims; i++) { + DimSize_t dim = dimSizeDist(gen); + if (boolDist(gen)) { + dims1[i] = dim; + } + if (boolDist(gen)) { + dims2[i] = dim; + } + expectedOutDims.push_back(std::max(dims1[i],dims2[i])); + } + + + std::shared_ptr<Tensor> myInput1 = std::make_shared<Tensor>(dims1); + myInput1->setBackend("cpu"); + myInput1->setDataType(DataType::Float32); + myInput1->zeros(); + std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(dims2); + myInput2->setBackend("cpu"); + myInput2->setDataType(DataType::Float32); + myInput2->zeros(); + std::shared_ptr<Node> myEqual = Equal(); + auto op = std::static_pointer_cast<OperatorTensor>(myEqual -> getOperator()); + op->associateInput(0,myInput1); + op->associateInput(1,myInput2); + op->setDataType(DataType::Float32); + op->setBackend("cpu"); + + op->forwardDims(); + + const auto outputDims = op->getOutput(0)->dims(); + REQUIRE(outputDims == expectedOutDims); + } + } +} +TEST_CASE("[cpu/operator] Equal(forward)", "[Equal][CPU]") { + SECTION("Same size inputs") { + std::shared_ptr<Tensor> input1 = std::make_shared<Tensor>(Array4D<int,3,3,3,2> { + { // + { // + {{20, 15},{31, 11},{22, 49}}, // + {{41, 10},{24, 51},{27, 52}}, // + {{26, 53},{27, 54},{28, 55}} // + }, // + { // + {{29, 56},{30, 57},{31, 58}}, // + {{32, 59},{33, 60},{34, 61}}, // + {{35, 62},{36, 63},{37, 64}} // + }, // + { // + {{38, 65},{39, 66},{40, 67}}, // + {{41, 68},{42, 69},{43, 70}}, // + 
{{44, 71},{45, 72},{46, 73}} // + } // + } // + }); // + std::shared_ptr<Tensor> input2 = std::make_shared<Tensor>(Array4D<int,3,3,3,2> { + { // + { // + {{20, 47},{21, 48},{22, 49}}, // + {{23, 50},{24, 51},{25, 52}}, // + {{17, 53},{27, 26},{14, 33}} // + }, // + { // + {{29, 56},{30, 57},{31, 58}}, // + {{72, 44},{33, 20},{27, 55}}, // + {{35, 24},{25, 63},{28, 64}} // + }, // + { // + {{32, 65},{39, 66},{40, 70}}, // + {{41, 53},{42, 60},{34, 70}}, // + {{44, 71},{30, 12},{46, 73}} // + } // + } // + }); // + Tensor expectedOutput =Tensor(Array4D<int,3,3,3,2> { + { + { + {{1, 0},{0, 0},{1, 1}}, + {{0, 0},{1, 1},{0, 1}}, + {{0, 1},{1, 0},{0, 0}} + }, + { + {{1, 1},{1, 1},{1, 1}}, + {{0, 0},{1, 0},{0, 0}}, + {{1, 0},{0, 1},{0, 1}} + }, + { + {{0, 1},{1, 1},{1, 0}}, + {{1, 0},{1, 0},{0, 1}}, + {{1, 1},{0, 0},{1, 1}} + } + } + }); + + std::shared_ptr<Node> myEqual = Equal(); + auto op = std::static_pointer_cast<OperatorTensor>(myEqual -> getOperator()); + op->associateInput(0, input1); + op->associateInput(1, input2); + op->setBackend("cpu"); + op->setDataType(DataType::Int32); + myEqual->forward(); + + REQUIRE(*(op->getOutput(0)) == expectedOutput); + } + + SECTION("Broadcasting") { + std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array4D<int,1,3,3,2> { + { // + { // + {{10, 20},{22, 23},{20, 20}}, // + {{10, 15},{10, 29},{20, 20}}, // + {{26, 25},{33, 20},{10, 20}} // + } // + } // + }); // + + std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<int,2> {{10, 20}}); + Tensor expectedOutput = Tensor(Array4D<int,1,3,3,2> { + { // + { // + {{ 1, 1},{ 0, 0},{ 0, 1}}, // + {{ 1, 0},{ 1, 0},{ 0, 1}}, // + {{ 0, 0},{ 0, 1},{ 1, 1}} // + } // + } // + }); // + + std::shared_ptr<Node> myEqual = Equal(); + auto op = std::static_pointer_cast<OperatorTensor>(myEqual -> getOperator()); + op->associateInput(0, input_1); + op->associateInput(1, input_2); + op->setDataType(DataType::Int32); + op->setBackend("cpu"); + myEqual->forward(); + op->getOutput(0)->print(); + + REQUIRE(*op->getOutput(0) == expectedOutput); + } +} \ No newline at end of file diff --git a/unit_tests/operator/Test_ExpandImpl.cpp b/unit_tests/operator/Test_ExpandImpl.cpp index 878c608110eabb824d8a6c0d1ceb0853b3c1449d..ad30457d33307ca595ecddfd3b06d58e118a02d0 100644 --- a/unit_tests/operator/Test_ExpandImpl.cpp +++ b/unit_tests/operator/Test_ExpandImpl.cpp @@ -13,20 +13,20 @@ #include <catch2/catch_test_macros.hpp> -#include "aidge/backend/cpu/data/TensorImpl.hpp" -#include "aidge/backend/cpu/operator/ExpandImpl.hpp" #include "aidge/data/DataType.hpp" #include "aidge/data/Tensor.hpp" #include "aidge/operator/Expand.hpp" #include "aidge/utils/ArrayHelpers.hpp" -using std::shared_ptr; -using namespace Aidge; +namespace Aidge { + +using std::shared_ptr; -void setupTestExpand(shared_ptr<Tensor> inputData, - shared_ptr<Tensor> inputShape, - shared_ptr<Expand_Op> &op) { +static void setupTestExpand(shared_ptr<Tensor> inputData, + shared_ptr<Tensor> inputShape, + shared_ptr<Expand_Op> &op, + Tensor &expectedOutput) { op->getOutput(0)->setDataType(inputData->dataType()); @@ -35,6 +35,9 @@ void setupTestExpand(shared_ptr<Tensor> inputData, inputShape->setBackend("cpu"); op->associateInput(1, inputShape); + + expectedOutput.setBackend("cpu"); + expectedOutput.setDataType(DataType::Int32); } TEST_CASE("[cpu/operator] Expand(forward)", "[Expand][CPU]") { @@ -49,7 +52,7 @@ TEST_CASE("[cpu/operator] Expand(forward)", "[Expand][CPU]") { Array4D<cpptype_t<DataType::Int32>, 1, 3, 4, 2>({{{{{1, 3}, {1, 3}, {1, 3}, {1, 3}}, 
{{1, 3}, {1, 3}, {1, 3}, {1, 3}}, {{1, 3}, {1, 3}, {1, 3}, {1, 3}}}}}); - setupTestExpand(inputData, inputShape, op); + setupTestExpand(inputData, inputShape, op, expectedOutput); // forwardDims has already been tested in core CHECK(op->forwardDims(true)); @@ -63,7 +66,7 @@ TEST_CASE("[cpu/operator] Expand(forward)", "[Expand][CPU]") { std::make_shared<Tensor>(Array1D<std::int64_t, 2>({2, 3})); Tensor expectedOutput = Array3D<cpptype_t<DataType::Int32>, 2, 2, 3>( {{{{2, 1, 3}, {2, 1, 3}}, {{2, 1, 3}, {2, 1, 3}}}}); - setupTestExpand(inputData, inputShape, op); + setupTestExpand(inputData, inputShape, op,expectedOutput); // forwardDims has already been tested in core CHECK(op->forwardDims(true)); @@ -77,7 +80,7 @@ TEST_CASE("[cpu/operator] Expand(forward)", "[Expand][CPU]") { std::make_shared<Tensor>(Array1D<std::int64_t, 1>({1})); Tensor expectedOutput = Array4D<cpptype_t<DataType::Int32>, 2, 1, 3, 1>({{{2, 1, 3}, {2, 1, 3}}}); - setupTestExpand(inputData, inputShape, op); + setupTestExpand(inputData, inputShape, op, expectedOutput); // forwardDims has already been tested in core CHECK(op->forwardDims(true)); @@ -91,7 +94,7 @@ TEST_CASE("[cpu/operator] Expand(forward)", "[Expand][CPU]") { std::make_shared<Tensor>(Array1D<std::int64_t, 3>({2, 1, 1})); Tensor expectedOutput = Array4D<cpptype_t<DataType::Int32>, 1, 2, 3, 1>({{{{2, 1, 3}, {2, 1, 3}}}}); - setupTestExpand(inputData, inputShape, op); + setupTestExpand(inputData, inputShape, op,expectedOutput); // forwardDims has already been tested in core CHECK(op->forwardDims(true)); @@ -101,3 +104,4 @@ TEST_CASE("[cpu/operator] Expand(forward)", "[Expand][CPU]") { SECTION("N-Dim to N-Dim") {} auto inputData = std::shared_ptr<Tensor>(); } +} // namespace Aidge diff --git a/unit_tests/operator/Test_GlobalAveragePoolingImpl.cpp b/unit_tests/operator/Test_GlobalAveragePoolingImpl.cpp index 8e8536accadcb874f74d4d962aae435bc1351d6e..0ecb3163d80bb90f229b01598770c3a1d0b86da9 100644 --- a/unit_tests/operator/Test_GlobalAveragePoolingImpl.cpp +++ b/unit_tests/operator/Test_GlobalAveragePoolingImpl.cpp @@ -558,6 +558,27 @@ TEST_CASE("[cpu/operator] GlobalAveragePooling", Log::info("Number of operations : {}\n", number_of_operation); Log::info("Operation / µs = {}\n", number_of_operation / duration.count()); } + + SECTION("Simple test") { + std::shared_ptr<Tensor> tensor = + std::make_shared<Tensor>(Array4D<int32_t, 1, 1, 7, 7>{{{{ + {0, 8, 26, 35, 49, 45, 22}, + {2, 24, 48, 66, 60, 46, 26}, + {8, 41, 64, 68, 39, 18, 9}, + {10, 48, 72, 76, 42, 14, 9}, + {6, 29, 52, 65, 27, 7, 3}, + {1, 9, 24, 31, 18, 7, 1}, + {0, 0, 4, 6, 7, 1, 1}}}}}); + + auto op = GlobalAveragePooling_Op(); + op.setDataType(DataType::Int32); + op.setBackend("cpu"); + + op.associateInput(0, tensor); + op.forwardDims(); + op.forward(); + REQUIRE(op.getOutput(0)->get<int32_t>(0) == 26); + } } } } // namespace Aidge diff --git a/unit_tests/operator/Test_HeavisideImpl.cpp b/unit_tests/operator/Test_HeavisideImpl.cpp index 4cbdf1a0e29f8670e45897236374726dac62bb43..16fad24d78aba961f6869af5f87d06bc602d7d74 100644 --- a/unit_tests/operator/Test_HeavisideImpl.cpp +++ b/unit_tests/operator/Test_HeavisideImpl.cpp @@ -12,15 +12,21 @@ #include "aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp" #include <memory> +#include <cmath> #include <cstdlib> #include <random> #include <catch2/catch_test_macros.hpp> -#include "aidge/data/Tensor.hpp" #include "aidge/backend/cpu/operator/HeavisideImpl.hpp" +#include "aidge/data/Tensor.hpp" #include "aidge/graph/Node.hpp" +#include "aidge/operator/Atan.hpp" 
+#include "aidge/operator/Mul.hpp" +#include "aidge/operator/Producer.hpp" #include "aidge/utils/TensorUtils.hpp" +#include "aidge/utils/Types.h" + namespace Aidge { @@ -95,4 +101,92 @@ TEST_CASE("[cpu/operator] Heaviside(forward)", "[Heaviside][CPU]") { REQUIRE(approxEq<float>(*(op->getOutput(0)), *T1)); } } + +TEST_CASE("[cpu/operator] Heaviside(backward)", "[Heaviside][CPU]") { + + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<float> valueDist(-2.0f, 2.0f); + std::uniform_int_distribution<std::size_t> sizeDist(5, 100); + + const std::size_t tensorSize = sizeDist(gen); + + auto hs = Heaviside(1.0f); + auto op = std::static_pointer_cast<OperatorTensor>(hs->getOperator()); + op->setDataType(DataType::Float32); + op->setBackend("cpu"); + + + + auto inputTensor = std::make_shared<Tensor>(std::vector<std::size_t>{tensorSize}); + inputTensor->setDataType(DataType::Float32); + inputTensor->setBackend("cpu"); + auto* inputData = static_cast<float*>(inputTensor->getImpl()->rawPtr()); + + for(std::size_t i = 0; i < tensorSize; ++i) { + inputData[i] = valueDist(gen); + } + + // Compare it to the real Atan implementation + auto mul = Mul(); + auto pi = std::make_shared<Tensor>(Array1D<float,1>{M_PI}); + auto producer = Producer(pi); + auto atan = Atan(); + auto mulOp = std::static_pointer_cast<OperatorTensor>(mul->getOperator()); + auto piOp = std::static_pointer_cast<OperatorTensor>(producer->getOperator()); + auto atanOp = std::static_pointer_cast<OperatorTensor>(atan->getOperator()); + mulOp->setBackend("cpu"); + piOp->setBackend("cpu"); + atanOp->setBackend("cpu"); + mulOp->setDataType(DataType::Float32); + piOp->setDataType(DataType::Float32); + atanOp->setDataType(DataType::Float32); + + + producer->addChild(mul,0,0); + mulOp->setInput(IOIndex_t(1), inputTensor); + mulOp->forward(); + auto outmul = mulOp->getOutput(0); + atanOp->setInput(0, inputTensor); + atanOp->forward(); + + auto gradTensor = std::make_shared<Tensor>(std::vector<std::size_t>{tensorSize}); + gradTensor->setDataType(DataType::Float32); + gradTensor->setBackend("cpu"); + auto* gradData = static_cast<float*>(gradTensor->getImpl()->rawPtr()); + + for (std::size_t i = 0; i < tensorSize; ++i) { + gradData[i] = valueDist(gen); + } + + op->setInput(IOIndex_t(0), inputTensor); + op->forward(); + + auto output = op->getOutput(0); + output->setGrad(gradTensor); + + // Backward pass + op->backward(); + + atanOp->setOutput(0, outmul); + atanOp->getOutput(0)->setGrad(gradTensor); + atanOp->backward(); + + // Compute expected gradient manually + auto expectedGrad = std::make_shared<Tensor>(std::vector<std::size_t>{tensorSize}); + expectedGrad->setDataType(DataType::Float32); + expectedGrad->setBackend("cpu"); + auto* expectedGradData = static_cast<float*>(expectedGrad->getImpl()->rawPtr()); + + for (std::size_t i = 0; i < tensorSize; ++i) { + expectedGradData[i] = gradData[i] * (1.0f / (1.0f + (inputData[i] * M_PI) * (inputData[i] * M_PI))); + } + + // Compare actual gradient with expected gradient + REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad)); + + // Compare Atan(pi*input) to expected Gradient + REQUIRE(approxEq<float>(*(atanOp->getInput(0)->grad()), *expectedGrad)); +} + } diff --git a/unit_tests/operator/Test_MaxPoolingImpl.cpp b/unit_tests/operator/Test_MaxPoolingImpl.cpp index de02df2b73bc461bbd76b089cd555d7c82bd173e..57d4190ea5fbbc279f5cc86979f052dc9ada9fd0 100644 --- a/unit_tests/operator/Test_MaxPoolingImpl.cpp +++ b/unit_tests/operator/Test_MaxPoolingImpl.cpp @@ -55,7 
+55,11 @@ TEST_CASE("[cpu/operator] MaxPooling(forward)", "[MaxPooling][CPU]") { } }); SECTION("Stride") { - std::shared_ptr<MaxPooling_Op<2>> op = std::make_shared<MaxPooling_Op<2>>(std::array<std::size_t, 2>({2,2}), std::array<std::size_t, 2>({2,2})); + std::shared_ptr<MaxPooling_Op<2>> op = + std::make_shared<MaxPooling_Op<2>>( + std::array<std::size_t, 2>({2, 2}), + std::array<std::size_t, 2>({2, 2}) + ); Tensor myOutput = Array4D<float,2,2,2,2> { { @@ -80,4 +84,273 @@ TEST_CASE("[cpu/operator] MaxPooling(forward)", "[MaxPooling][CPU]") { op->getOutput(0)->print(); REQUIRE(*(op->getOutput(0)) == myOutput); } -} \ No newline at end of file + SECTION("Dilation") { + std::shared_ptr<Node> myMaxPool = MaxPooling({2,2}, "mycdw", {2,2}, {2,2}); // Dilation 2x2 + auto op = std::static_pointer_cast<OperatorTensor>(myMaxPool -> getOperator()); + + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,2,2,2> { + { + { + { + {0.71470, 0.52770}, + {0.71470, 0.48740} + }, + { + {2.23290, 0.48590}, + {2.23290, 0.07000} + } + }, + { + { + {1.76530, 1.20710}, + {1.76530, 1.20710} + }, + { + {1.04290, 0.67760}, + {1.72170, 0.67760} + } + } + } + }); + myMaxPool->getOperator()->associateInput(0,myInput); + myMaxPool->getOperator()->setDataType(DataType::Float32); + myMaxPool->getOperator()->setBackend("cpu"); + myMaxPool->forward(); + op->getOutput(0)->print(); + REQUIRE(*(op->getOutput(0)) == *myOutput); + } + SECTION("Ceil Mode") { + std::shared_ptr<Tensor> myInput4 = std::make_shared<Tensor>(Array4D<float,1,1,5,5> { // NCHW + { + { + { + { 1, 2, 3, 4, 5}, + { 6, 7, 8, 9, 10}, + {11, 12, 13, 14, 15}, + {16, 17, 18, 19, 20}, + {21, 22, 23, 24, 25} + } + } + } + }); + + // MaxPool with ceil_mode = true + std::shared_ptr<Node> myMaxPool1 = MaxPooling({2,2}, "mycdw", {2,2}, {1,1}, true); + auto op1 = std::static_pointer_cast<OperatorTensor>(myMaxPool1 -> getOperator()); + + std::shared_ptr<Tensor> myOutput4 = std::make_shared<Tensor>(Array4D<float,1,1,3,3> { + { + { + { + { 7.0, 9.0, 10.0 }, + { 17.0, 19.0, 20.0 }, + { 22.0, 24.0, 25.0 } + } + } + } + }); + op1->associateInput(0, myInput4); + op1->setDataType(DataType::Float32); + op1->setBackend("cpu"); + myMaxPool1->forward(); + op1->getOutput(0)->print(); + REQUIRE(*(op1->getOutput(0)) == *myOutput4); + + // MaxPool with ceil_mode = false + std::shared_ptr<Node> myMaxPool2 = MaxPooling({2,2}, "mycdw", {2,2}, {1,1}, false); + auto op2 = std::static_pointer_cast<OperatorTensor>(myMaxPool2 -> getOperator()); + std::shared_ptr<Tensor> myOutput5 = std::make_shared<Tensor>(Array4D<float,1,1,2,2> { + { + { + { + { 7.0, 9.0 }, + { 17.0, 19.0 } + } + } + } + }); + op2->associateInput(0, myInput4); + op2->setDataType(DataType::Float32); + op2->setBackend("cpu"); + myMaxPool2->forward(); + op2->getOutput(0)->print(); + REQUIRE(*(op2->getOutput(0)) == *myOutput5); + } +} + + + +TEST_CASE("[cpu/operator] MaxPooling(backward)", "[MaxPooling][CPU]") { + std::shared_ptr<Tensor> myInput = + std::make_shared<Tensor>(Array4D<float,2,2,5,5> { //NCHW + { + { + {{-0.3848, 0.2166, -0.4373, 0.6142, 0.5277}, + {0.7995, 0.3638, -1.4589, -1.0843, 1.0918}, + {0.7147, 0.0936, -1.2902, 1.2037, 0.4874}, + {-0.5981, 2.1184, -0.9175, 1.3859, 0.3305}, + {-1.7700, 0.0563, -0.3914, 0.0538, -0.3955}}, + + {{-3.1409, -0.4554, 0.0524, 2.2291, 0.4859}, + {-0.7465, -0.6567, -2.3703, -0.6386, -1.4152}, + { 2.2329, -0.5850, 0.0700, 1.2838, -1.7363}, + { 0.2139, 0.0624, -1.0689, -0.8221, -0.8038}, + { 0.1886, -0.7840, -0.2313, 0.2651, -1.6244}} + }, + { + {{ 0.4371, 1.6417, 
0.9129, 0.6325, 0.5438}, + {-2.3552, -0.8850, -0.0232, -0.5462, -1.2011}, + {1.7653, -1.6668, -1.0814, 0.6182, 1.2071}, + {0.9541, -0.5133, 0.8664, -0.8892, 1.4585}, + {1.0220, -0.5107, 0.1829, -0.2301, -0.4268}}, + + {{ 1.0429, 0.6279, -0.2875, 0.7187, -0.1500}, + {1.6041, 2.9635, 1.4172, -0.7517, 0.5441}, + {-0.2276, 0.0857, 0.6776, -0.1389, -0.0614}, + {-0.1547, -0.3435, 0.0650, -0.5095, -1.8073}, + {1.7217, 0.3999, -0.5953, 1.0604, -0.4126}} + } + } + }); + SECTION("Stride") { + std::shared_ptr<MaxPooling_Op<2>> op = + std::make_shared<MaxPooling_Op<2>>( + std::array<std::size_t, 2>({2,2}), + std::array<std::size_t, 2>({2,2}) + ); + + Tensor grad = Array4D<float,2,2,5,5> { + { + { + {{0, 0, 0, 1, 0}, + {1, 0, 0, 0, 0}, + {0, 0, 0, 0, 0}, + {0, 1, 0, 1, 0}, + {0, 0, 0, 0, 0}}, + + {{0, 1, 0, 1, 0}, + {0, 0, 0, 0, 0}, + {1, 0, 0, 1, 0}, + {0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0}} + }, + { + {{0, 1, 1, 0, 0}, + {0, 0, 0, 0, 0}, + {1, 0, 0, 0, 0}, + {0, 0, 1, 0, 0}, + {0, 0, 0, 0, 0}}, + + {{0, 0, 0, 0, 0}, + {0, 1, 1, 0, 0}, + {0, 1, 1, 0, 0}, + {0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0}} + } + } + }; + op->associateInput(0,myInput); + op->setDataType(DataType::Float32); + op->setBackend("cpu"); + op->backward(); + //op->getInput(0)->grad()->print(); + REQUIRE(*(op->getInput(0)->grad()) == grad); + } + SECTION("Dilation"){ + std::shared_ptr<Node> myMaxPool = MaxPooling({2,2}, "mycdw", {2,2}, {2,2}); // Dilation 2x2 + auto op = std::static_pointer_cast<OperatorTensor>(myMaxPool -> getOperator()); + + Tensor grad = Array4D<float,2,2,5,5> { + {{{{0., 0., 0., 0., 1.}, + {0., 0., 0., 0., 0.}, + {2., 0., 0., 0., 1.}, + {0., 0., 0., 0., 0.}, + {0., 0., 0., 0., 0.}}, + + {{0., 0., 0., 0., 1.}, + {0., 0., 0., 0., 0.}, + {2., 0., 1., 0., 0.}, + {0., 0., 0., 0., 0.}, + {0., 0., 0., 0., 0.}}}, + + + {{{0., 0., 0., 0., 0.}, + {0., 0., 0., 0., 0.}, + {2., 0., 0., 0., 2.}, + {0., 0., 0., 0., 0.}, + {0., 0., 0., 0., 0.}}, + + {{1., 0., 0., 0., 0.}, + {0., 0., 0., 0., 0.}, + {0., 0., 2., 0., 0.}, + {0., 0., 0., 0., 0.}, + {1., 0., 0., 0., 0.}}}} + }; + myMaxPool->getOperator()->associateInput(0,myInput); + myMaxPool->getOperator()->setDataType(DataType::Float32); + myMaxPool->getOperator()->setBackend("cpu"); + op->backward(); + //op->getInput(0)->grad()->print(); + REQUIRE(*(op->getInput(0)->grad()) == grad); + } + SECTION("Ceil mode"){ + std::shared_ptr<Tensor> myInput4 = + std::make_shared<Tensor>(Array4D<float,1,1,5,5> { // NCHW + {{{ + { 1, 2, 3, 4, 5}, + { 6, 7, 8, 9, 10}, + {11, 12, 13, 14, 15}, + {16, 17, 18, 19, 20}, + {21, 22, 23, 24, 25} + }}} + }); + + // MaxPool with ceil_mode = true + std::shared_ptr<Node> myMaxPool1 = + MaxPooling({2,2}, "mycdw", {2,2}, {1,1}, true); + auto op1 = std::static_pointer_cast<OperatorTensor>( + myMaxPool1 -> getOperator() + ); + Tensor grad = Array4D<float,1,1,5,5> { + {{{ + {0, 0, 0, 0, 0}, + {0, 1, 0, 1, 1}, + {0, 0, 0, 0, 0}, + {0, 1, 0, 1, 1}, + {0, 1, 0, 1, 1} + }}} + }; + + op1->associateInput(0, myInput4); + op1->setDataType(DataType::Float32); + op1->setBackend("cpu"); + op1->backward(); + //op1->getInput(0)->grad()->print(); + REQUIRE(*(op1->getInput(0)->grad()) == grad); + + // MaxPool with ceil_mode = false + std::shared_ptr<Node> myMaxPool2 = + MaxPooling({2,2}, "mycdw", {2,2}, {1,1}, false); + auto op2 = std::static_pointer_cast<OperatorTensor>( + myMaxPool2 -> getOperator() + ); + + Tensor grad2 = Array4D<float,1,1,5,5> { + {{{ + {0, 0, 0, 0, 0}, + {0, 1, 0, 1, 0}, + {0, 0, 0, 0, 0}, + {0, 1, 0, 1, 0}, + {0, 0, 0, 0, 0} + }}} + }; + + myInput4->setGrad(nullptr); 
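+        // Presumably resets the gradient accumulated by the ceil-mode backward pass above,
+        // so that op2's backward starts from a freshly initialized gradient instead of
+        // adding onto the previous result.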
+ op2->associateInput(0, myInput4); + op2->setDataType(DataType::Float32); + op2->setBackend("cpu"); + myMaxPool2->backward(); + op2->getInput(0)->grad()->print(); + REQUIRE(*(op2->getInput(0)->grad()) == grad2); + } +} diff --git a/unit_tests/operator/Test_MetaOperator.cpp b/unit_tests/operator/Test_MetaOperator.cpp index 23bacda590dfed82eca623016787388e56ceed79..7b0b80d816eba8000e782e0e5238c2550dd4eed9 100644 --- a/unit_tests/operator/Test_MetaOperator.cpp +++ b/unit_tests/operator/Test_MetaOperator.cpp @@ -18,6 +18,7 @@ #include "aidge/backend/cpu/operator/ConvImpl.hpp" #include "aidge/backend/cpu/operator/PadImpl.hpp" +#include "aidge/backend/cpu/operator/TanhImpl.hpp" #include "aidge/data/Tensor.hpp" #include "aidge/filler/Filler.hpp" #include "aidge/operator/Conv.hpp" @@ -278,9 +279,9 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") { REQUIRE(op->getNbConsumedData(1).data == 32768); REQUIRE(op->getNbProducedData(0).data == 34816); REQUIRE(op->getNbProducedData(1).data == 34816); - REQUIRE(microGraphScheduler->getStaticScheduling(0).size() == 26); - REQUIRE(microGraphScheduler->getStaticScheduling(1).size() == 24); - REQUIRE(microGraphScheduler->getStaticScheduling(15).size() == 24); + REQUIRE(microGraphScheduler->getSequentialStaticScheduling(0).size() == 26); + REQUIRE(microGraphScheduler->getSequentialStaticScheduling(1).size() == 24); + REQUIRE(microGraphScheduler->getSequentialStaticScheduling(15).size() == 24); } SECTION("LSTM(forward_values)") { @@ -705,7 +706,7 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") { auto fc2 = FC(outChannels, inChannels, true, "fc2"); // NOTE: Account for init step by adding 1 to the max timestep // parameter. - auto lif1 = Leaky(nbTimeSteps + 1, beta, threshold, "leaky"); + auto lif1 = Leaky(nbTimeSteps + 1, beta, threshold, LeakyReset::Subtraction, "leaky"); // associateInput() does not work fc1->input(1).first->getOperator()->setOutput(0, myWeights); @@ -744,160 +745,105 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") { REQUIRE( approxEq<float>(*(fc2Op->getOutput(0)), *(expectedOutputfc2ts2))); } +} +TEST_CASE("[cpu/operator] MetaOperator", "[Leaky][CPU]") { SECTION("Leaky(forward)") { std::random_device rd; std::mt19937 gen(rd()); - std::uniform_real_distribution<float> valueDist( - 0.1f, - 1.1f); // Random float distribution between 0 and 1 - std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), - std::size_t(4)); - std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(3), - std::size_t(3)); + std::uniform_real_distribution<float> valueDist(0.1f,1.1f); + std::uniform_int_distribution<std::size_t> dimSizeDist(2,4); + std::uniform_int_distribution<std::size_t> nbDimsDist(3,3); // fixed to 3. 
std::uniform_int_distribution<int> boolDist(0, 1); std::uniform_real_distribution<float> betaDist(0,1); + std::uniform_real_distribution<float> thresholDist(0.1,3); - const std::size_t nbDims = nbDimsDist(gen); - Log::info("Nbdims : {}", nbDims); - std::vector<std::size_t> dims; - for (std::size_t i = 0; i < nbDims; ++i) { - dims.push_back(dimSizeDist(gen)); - } - Log::info("timesteps : {}", dims[0]); - Log::info("dimensions : "); - for (auto dim : dims) { - Log::info("{}", dim); - } - + const auto beta = betaDist(gen); + const auto threshold = thresholDist(gen); + const auto nbDims = nbDimsDist(gen); + std::vector<std::size_t> dims(nbDims); + std::generate(dims.begin(), dims.end(), [&]() { return dimSizeDist(gen); }); const auto nbTimeSteps = dims[0]; - const auto beta = betaDist(gen); - - auto myLeaky = Leaky(nbTimeSteps, beta, 1.0, "leaky"); - auto op = - std::static_pointer_cast<MetaOperator_Op>(myLeaky->getOperator()); - // auto stack = Stack(2); - auto mem_rec = Stack(nbTimeSteps, "mem_rec"); - auto spk_rec = Stack(nbTimeSteps, "spk_rec"); - auto pop = Pop("popinput"); - - // Here we test LSTM as it is was flatten in the graph. - // We just borrow its micro-graph into our larger myGraph graph. - auto myGraph = std::make_shared<GraphView>(); - - pop->addChild(op->getMicroGraph()->getOrderedInputs()[0].first, 0, 0); - // 0 for mem 1 for stack - op->getMicroGraph()->getOrderedOutputs()[1].first->addChild(mem_rec, - 0, - 0); - op->getMicroGraph()->getOrderedOutputs()[0].first->addChild(spk_rec, - 0, - 0); - for (auto node : op->getMicroGraph()->getOrderedOutputs()) { - Log::info("name of output {}", node.first->name()); - } - - myGraph->add(pop); - myGraph->add(op->getMicroGraph()); - myGraph->add(mem_rec); - myGraph->add(spk_rec); - myGraph->save("mg", true, true); - - // 3 outputs - REQUIRE(myLeaky->nbInputs() == 3); - REQUIRE(myLeaky->inputCategory(0) == InputCategory::Data); - // Two spikes connected to nothing, + the Add node real output - REQUIRE(myLeaky->nbOutputs() == 4); - - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>( - Array3D<float, 2, 3, 2>{{{{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}}, - {{2.0, 3.0}, {4.0, 5.0}, {6.0, 7.0}}}}); - // std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>( - // Array3D<float, 2, 3, 2>{{{{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}}, - // {{2.0, 3.0}, {4.0, 5.0}, - // {6.0, 7.0}}}}); + auto leakyNode = Leaky(nbTimeSteps, beta, threshold, LeakyReset::Subtraction, "leaky"); + auto leakyOp = std::static_pointer_cast<MetaOperator_Op>(leakyNode->getOperator()); + auto memoryRecord = Stack(nbTimeSteps, "mem_rec"); + auto spikeRecord = Stack(nbTimeSteps, "spk_rec"); + auto popNode = Pop("input"); - // Generate input - std::shared_ptr<Tensor> T0 = std::make_shared<Tensor>(); - T0->setDataType(DataType::Float32); - T0->setBackend("cpu"); - - std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(); - expectedOutput->setDataType(DataType::Float32); - expectedOutput->setBackend("cpu"); + REQUIRE(leakyNode->nbInputs() == 3); + REQUIRE(leakyNode->inputCategory(0) == InputCategory::Data); + REQUIRE(leakyNode->nbOutputs() == 4); const auto nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); - float *input = new float[nb_elements]; - float *result = new float[nb_elements]; + const auto nbElementsPerTimeStep = nb_elements / dims[0]; - for (std::size_t i = 0; i < nb_elements; ++i) { - input[i] = valueDist(gen); - } - T0->resize(dims); - T0->getImpl()->setRawPtr(input, nb_elements); - 
T0->print(); - // Elements popped at each time step - auto nbElementsPerTimeStep = nb_elements / dims[0]; + // Compute the expected result using ad-hoc implementation // Init - for (int i = 0; i < nbElementsPerTimeStep; ++i) { - result[i] = input[i]; - } - - // Reccurence - for (int i = 1; i < dims[0]; ++i) { - auto offset = nbElementsPerTimeStep * i; - auto prev = nbElementsPerTimeStep * (i - 1); - for (int j = 0; j < nbElementsPerTimeStep; ++j) { - auto reset = (result[prev + j] > 1.0 ? 1 : 0); - result[offset + j] = - result[prev + j] * beta + input[offset + j] - reset; + auto *input = new float[nb_elements]; + std::generate_n(input, nb_elements, [&]() { return valueDist(gen); }); + auto *result = new float[nb_elements]; + std::copy(input, input + nbElementsPerTimeStep, result); + + // Recurrence calculation for each timestep + for (int timestep = 1; timestep < nbTimeSteps; ++timestep) { + const auto currentOffset = nbElementsPerTimeStep * timestep; + const auto previousOffset = nbElementsPerTimeStep * (timestep - 1); + + for (int element = 0; element < nbElementsPerTimeStep; ++element) { + const auto previousValue = result[previousOffset + element]; + const auto resetValue = (previousValue > threshold) ? threshold : 0; + + result[currentOffset + element] = + previousValue * beta + input[currentOffset + element] - resetValue; } } + auto expectedOutput = std::make_shared<Tensor>(DataType::Float32); + expectedOutput->setBackend("cpu"); expectedOutput->resize(dims); expectedOutput->getImpl()->setRawPtr(result, nb_elements); - Log::info("Expected ouptut : "); - expectedOutput->print(); - std::shared_ptr<Tensor> myInit = - std::make_shared<Tensor>(Array2D<float, 3, 3>{ - {{0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}}}); - auto initMemdims = - std::vector<std::size_t>(dims.begin() + 1, dims.end()); - Log::info("dimensions : "); - for (auto dim : initMemdims) { - Log::info("{}", dim); - } - std::shared_ptr<Tensor> myInitW = std::make_shared<Tensor>( - Array2D<float, 3, 2>{{{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}}); + // Compute the real result using our operator implemenation + auto inputTensor = std::make_shared<Tensor>(DataType::Float32); + inputTensor->setBackend("cpu"); + inputTensor->resize(dims); + inputTensor->getImpl()->setRawPtr(input, nb_elements); - std::shared_ptr<Tensor> myInitR = - std::make_shared<Tensor>(initMemdims); - myInitR->setDataType(DataType::Float32); - myInitR->setBackend("cpu"); - uniformFiller<float>(myInitR, 0, 0); + auto memoryInit = std::make_shared<Tensor>(DataType::Float32); + memoryInit->setBackend("cpu"); + memoryInit->resize(std::vector<std::size_t>(dims.begin() + 1, dims.end())); + memoryInit->zeros(); + auto memoryInitNode = Producer(memoryInit); - pop->getOperator()->associateInput(0, T0); - op->associateInput(1, myInitR); - op->associateInput(2, myInitR); + popNode->getOperator()->associateInput(0, inputTensor); + popNode->addChild(leakyNode,0, 0); + memoryInitNode->addChild(leakyNode, 0, 1); + memoryInitNode->addChild(leakyNode, 0, 2); + leakyNode->addChild(memoryRecord, 1, 0); + leakyNode->addChild(spikeRecord, 0, 0); - myGraph->compile("cpu", DataType::Float32); + auto g = std::make_shared<GraphView>(); + g->add({popNode, leakyNode, memoryRecord, spikeRecord, memoryInitNode}); + g->setDataType(DataType::Float32); + g->setBackend("cpu"); - auto scheduler = SequentialScheduler(myGraph); + auto scheduler = SequentialScheduler(g); REQUIRE_NOTHROW(scheduler.generateScheduling()); REQUIRE_NOTHROW(scheduler.forward(true)); + // Compare expected output with 
actual output auto memOp = - std::static_pointer_cast<OperatorTensor>(spk_rec->getOperator()); + std::static_pointer_cast<OperatorTensor>(spikeRecord->getOperator()); + //memOp->getOutput(0)->print(); REQUIRE(approxEq<float>(*(memOp->getOutput(0)), *(expectedOutput))); } } diff --git a/unit_tests/operator/Test_MulImpl.cpp b/unit_tests/operator/Test_MulImpl.cpp index 2937e94938c671140eeeee87d47d5c48f685203e..a8e0fbdd018f3fea906d6cf7fed2fa2f90cd6366 100644 --- a/unit_tests/operator/Test_MulImpl.cpp +++ b/unit_tests/operator/Test_MulImpl.cpp @@ -46,10 +46,10 @@ TEST_CASE("[CPU/Operator] Mul(Backward)", "[Mul][CPU][Backward]") { op->associateInput(0, T0); op->associateInput(1, T1); - op->getOutput(0)->setGrad(std::make_shared<Tensor>( - Array2D<float, 2, 3>({{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}}))); op->forwardDims(); + op->getOutput(0)->setGrad(std::make_shared<Tensor>( + Array2D<float, 2, 3>({{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}}))); op->backward(); const Tensor expectedGrad0 = @@ -80,9 +80,9 @@ TEST_CASE("[CPU/Operator] Mul(Backward)", "[Mul][CPU][Backward]") { op->associateInput(0, T0); op->associateInput(1, T1); - op->getOutput(0)->setGrad(newGrad); op->forwardDims(); + op->getOutput(0)->setGrad(newGrad); op->backward(); REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0)); @@ -122,9 +122,9 @@ TEST_CASE("[CPU/Operator] Mul(Backward)", "[Mul][CPU][Backward]") { op->associateInput(0, T0); op->associateInput(1, T1); - op->getOutput(0)->setGrad(newGrad); op->forwardDims(); + op->getOutput(0)->setGrad(newGrad); op->backward(); REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0)); @@ -176,9 +176,9 @@ TEST_CASE("[CPU/Operator] Mul(Backward)", "[Mul][CPU][Backward]") { op->associateInput(0, T0); op->associateInput(1, T1); - op->getOutput(0)->setGrad(newGrad); op->forwardDims(); + op->getOutput(0)->setGrad(newGrad); op->backward(); REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0)); @@ -250,8 +250,7 @@ TEST_CASE("[CPU/Operator] Mul(Backward)", "[Mul][CPU][Backward]") { val = dist(gen); } - op->getOutput(0)->setGrad(std::make_shared<Tensor>()); - op->getOutput(0)->grad()->resize(outputDims); + op->getOutput(0)->setGrad(std::make_shared<Tensor>(outputDims)); op->getOutput(0)->grad()->getImpl()->setRawPtr(gradOutputData.data(), expectedOutput.size()); diff --git a/unit_tests/operator/Test_PowImpl.cpp b/unit_tests/operator/Test_PowImpl.cpp index 55a416c3f404506359e06f9937dd958503236901..8f3b2c352c09c34871ea8a5b4d96bb8d4f4c378e 100644 --- a/unit_tests/operator/Test_PowImpl.cpp +++ b/unit_tests/operator/Test_PowImpl.cpp @@ -1,486 +1,488 @@ -/******************************************************************************** - * Copyright (c) 2023 CEA-List - * - * This program and the accompanying materials are made available under the - * terms of the Eclipse Public License 2.0 which is available at - * http://www.eclipse.org/legal/epl-2.0. 
- * - * SPDX-License-Identifier: EPL-2.0 - * - ********************************************************************************/ - -#include <chrono> // std::micro, std::chrono::time_point, - // std::chrono::system_clock, std::chrono::duration -#include <cstddef> // std::size_t -#include <cstdint> // std::uint16_t -#include <functional> // std::multiplies -#include <memory> -#include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937 - // std::uniform_int_distribution, std::uniform_real_distribution -#include <vector> - -#include <catch2/catch_test_macros.hpp> -#include <fmt/core.h> - -#include "aidge/backend/cpu/data/TensorImpl.hpp" -#include "aidge/backend/cpu/operator/PowImpl.hpp" -#include "aidge/data/Data.hpp" -#include "aidge/data/Tensor.hpp" -#include "aidge/operator/Pow.hpp" -#include "aidge/utils/ArrayHelpers.hpp" -#include "aidge/utils/TensorUtils.hpp" - -namespace Aidge { - -TEST_CASE("[cpu/operator] Pow", "[Pow][CPU]") { - constexpr std::uint16_t NBTRIALS = 10; - // Create a random number generator - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // Random float distribution between 0 and 1 - std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10)); - std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5)); - std::uniform_int_distribution<int> boolDist(0,1); - - // Create MatPow Operator - std::shared_ptr<Node> myPow = Pow(); - auto op = std::static_pointer_cast<OperatorTensor>(myPow-> getOperator()); - op->setDataType(DataType::Float32); - op->setBackend("cpu"); - - // Create 2 input Tensors - std::shared_ptr<Tensor> T0 = std::make_shared<Tensor>(); - op->associateInput(0,T0); - T0->setDataType(DataType::Float32); - T0->setBackend("cpu"); - std::shared_ptr<Tensor> T1 = std::make_shared<Tensor>(); - op -> associateInput(1,T1); - T1->setDataType(DataType::Float32); - T1->setBackend("cpu"); - - // Create results Tensor - std::shared_ptr<Tensor> Tres = std::make_shared<Tensor>(); - Tres->setDataType(DataType::Float32); - Tres->setBackend("cpu"); - - // To measure execution time of 'MatPow_Op::forward()' member function call - std::chrono::time_point<std::chrono::system_clock> start; - std::chrono::time_point<std::chrono::system_clock> end; - std::chrono::duration<double, std::micro> duration{}; - - SECTION("PowImpl_cpu::forward()") { - SECTION("Scalar / Scalar") { - - } - SECTION("Scalar / +1-D Tensor") { - - } - SECTION("+1-D Tensor / +1-D Tensor - same dimensions") { - std::size_t number_of_operation = 0; - - for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { - // generate 2 random Tensors - const std::size_t nbDims = nbDimsDist(gen); - std::vector<std::size_t> dims; - for (std::size_t i = 0; i < nbDims; ++i) { - dims.push_back(dimSizeDist(gen)); - } - const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); - number_of_operation += nb_elements; - - // without broadcasting - float* array0 = new float[nb_elements]; - float* array1 = new float[nb_elements]; - float* result = new float[nb_elements]; - - for (std::size_t i = 0; i < nb_elements; ++i) { - array0[i] = valueDist(gen); - array1[i] = valueDist(gen); - result[i] = std::pow(array0[i], array1[i]); - } - - // input0 - T0->resize(dims); - T0 -> getImpl() -> setRawPtr(array0, nb_elements); - - // input1 - T1->resize(dims); - T1 -> getImpl() -> setRawPtr(array1, nb_elements); - - // results - 
Tres->resize(dims); - Tres -> getImpl() -> setRawPtr(result, nb_elements); - - op->forwardDims(); - start = std::chrono::system_clock::now(); - myPow->forward(); - end = std::chrono::system_clock::now(); - duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); - - REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres)); - - delete[] array0; - delete[] array1; - delete[] result; - - // with broadcasting - } - Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count())); - Log::info("total time: {} μs\n", duration.count()); - } - - SECTION("+1-D Tensor / +1-D Tensor - broadcasting") { - std::size_t number_of_operation = 0; - - for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { - // generate 2 random Tensors - // handle dimensions, replace some dimensions with '1' to get broadcasting - constexpr std::size_t nbDims = 4; - std::vector<std::size_t> dims; - for (std::size_t i = 0; i < nbDims; ++i) { - dims.push_back(dimSizeDist(gen)); - } - std::vector<std::size_t> dims0 = dims; - std::vector<std::size_t> dims1 = dims; - std::vector<std::size_t> dimsOut = dims; - for (std::size_t i = 0; i < nbDims; ++i) { - if (boolDist(gen)) { - dims0[i] = 1; - } - if (boolDist(gen)) { - dims1[i] = 1; - } - dimsOut[i] = (dims0[i] == 1) ? dims1[i] : dims0[i]; - } - - // create arrays and fill them with random values - float* array0 = new float[dims0[0]*dims0[1]*dims0[2]*dims0[3]]; - float* array1 = new float[dims1[0]*dims1[1]*dims1[2]*dims1[3]]; - float* result = new float[dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]]; - - for (std::size_t i = 0; i < dims0[0]*dims0[1]*dims0[2]*dims0[3]; ++i) { - array0[i] = valueDist(gen); - } - for (std::size_t i = 0; i < dims1[0]*dims1[1]*dims1[2]*dims1[3]; ++i) { - array1[i] = valueDist(gen); - } - - // compute true result - const std::size_t strides0[nbDims] = {dims0[1]*dims0[2]*dims0[3], dims0[2]*dims0[3], dims0[3], 1}; - const std::size_t strides1[nbDims] = {dims1[1]*dims1[2]*dims1[3], dims1[2]*dims1[3], dims1[3], 1}; - for (std::size_t a = 0; a < dimsOut[0]; ++a) { - for (std::size_t b = 0; b < dimsOut[1]; ++b) { - const std::size_t idx0_0 = strides0[0] * ((dims0[0] > 1) ? a : 0) - + strides0[1] * ((dims0[1] > 1) ? b : 0); - const std::size_t idx1_0 = strides1[0] * ((dims1[0] > 1) ? a : 0) - + strides1[1] * ((dims1[1] > 1) ? b : 0); - for (std::size_t c = 0; c < dimsOut[2]; ++c) { - const std::size_t idx_out = dimsOut[3] * (c + dimsOut[2] * (b + dimsOut[1] * a)); - for (std::size_t d = 0; d < dimsOut[3]; ++d) { - std::size_t idx0 = idx0_0 - + strides0[2] * ((dims0[2] > 1) ? c : 0) - + ((dims0[3] > 1) ? d : 0); - std::size_t idx1 = idx1_0 - + strides1[2] * ((dims1[2] > 1) ? c : 0) - + ((dims1[3] > 1) ? 
d : 0); - result[idx_out + d] = std::pow(array0[idx0], array1[idx1]); - // std::cout << "(" << idx0 << ", " << idx1 << ") -> " << array0[idx0] << " ** " << array1[idx1] << " -> " << idx_out + d << std::endl; - } - } - } - } - - // conversion to Aidge::Tensors - // input0 - T0->resize(dims0); - T0 -> getImpl() -> setRawPtr(array0, dims0[0]*dims0[1]*dims0[2]*dims0[3]); - - // input1 - T1->resize(dims1); - T1 -> getImpl() -> setRawPtr(array1, dims1[0]*dims1[1]*dims1[2]*dims1[3]); - - // results - Tres->resize(dimsOut); - Tres -> getImpl() -> setRawPtr(result, dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]); - - // compute result - op->forwardDims(); - start = std::chrono::system_clock::now(); - myPow->forward(); - end = std::chrono::system_clock::now(); - duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); - - // comparison between truth and computed result - REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres)); - - delete[] array0; - delete[] array1; - delete[] result; - - const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>()); - number_of_operation += nb_elements; - } - Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count())); - Log::info("total time: {} μs\n", duration.count()); - } - SECTION("+1-D Tensor / 1-D Tensor") { - std::size_t number_of_operation = 0; - std::uniform_int_distribution<std::size_t> nbRemovedDimsDist(std::size_t(1), std::size_t(3)); - - for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { - // generate 2 random Tensors - // handle dimensions - constexpr std::size_t nbDims = 4; - std::vector<std::size_t> dims0(4); - for (std::size_t i = 0; i < nbDims; ++i) { - dims0[i] = dimSizeDist(gen); - } - std::vector<std::size_t> dimsOut = dims0; - std::vector<std::size_t> dims1 = dims0; - for (std::size_t i = 0; i < nbDims; ++i) { - if (boolDist(gen)) { - dims1[i] = 1; - } - } - dims1.erase(dims1.cbegin(), dims1.cbegin() + nbRemovedDimsDist(gen)); - - // create arrays and fill them with random values - float* array0 = new float[dims0[0]*dims0[1]*dims0[2]*dims0[3]]; - std::size_t array1_size = std::accumulate(dims1.cbegin(), dims1.cend(), std::size_t(1), std::multiplies<std::size_t>()); - float* array1 = new float[array1_size]; - float* result = new float[dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]]; - - for (std::size_t i = 0; i < (dims0[0]*dims0[1]*dims0[2]*dims0[3]); ++i) { - array0[i] = valueDist(gen); - } - for (std::size_t i = 0; i < array1_size; ++i) { - array1[i] = valueDist(gen); - } - - // compute true result - auto dims1_tmp = dims1; - dims1_tmp.insert(dims1_tmp.cbegin(), 4 - dims1_tmp.size(), std::size_t(1)); - - const std::size_t strides0[nbDims] = {dims0[1]*dims0[2]*dims0[3], dims0[2]*dims0[3], dims0[3], 1}; - const std::size_t strides1[nbDims] = {dims1_tmp[1]*dims1_tmp[2]*dims1_tmp[3], dims1_tmp[2]*dims1_tmp[3], dims1_tmp[3], 1}; - for (std::size_t a = 0; a < dimsOut[0]; ++a) { - for (std::size_t b = 0; b < dimsOut[1]; ++b) { - const std::size_t idx0_0 = strides0[0] * ((dims0[0] > 1) ? a : 0) - + strides0[1] * ((dims0[1] > 1) ? b : 0); - const std::size_t idx1_0 = strides1[0] * ((dims1_tmp[0] > 1) ? a : 0) - + strides1[1] * ((dims1_tmp[1] > 1) ? b : 0); - for (std::size_t c = 0; c < dimsOut[2]; ++c) { - const std::size_t idx_out = dimsOut[3] * (c + dimsOut[2] * (b + dimsOut[1] * a)); - for (std::size_t d = 0; d < dimsOut[3]; ++d) { - std::size_t idx0 = idx0_0 - + strides0[2] * ((dims0[2] > 1) ? c : 0) - + ((dims0[3] > 1) ? 
d : 0); - std::size_t idx1 = idx1_0 - + strides1[2] * ((dims1_tmp[2] > 1) ? c : 0) - + ((dims1_tmp[3] > 1) ? d : 0); - result[idx_out + d] = std::pow(array0[idx0], array1[idx1]); - // std::cout << "(" << idx0 << ", " << idx1 << ") -> " << array0[idx0] << " ** " << array1[idx1] << " -> " << idx_out + d << std::endl; - } - } - } - } - - // conversion to Aidge::Tensors - // input0 - T0->resize(dims0); - T0 -> getImpl() -> setRawPtr(array0, dims0[0]*dims0[1]*dims0[2]*dims0[3]); - - // input1 - T1->resize(dims1); - T1 -> getImpl() -> setRawPtr(array1, array1_size); - - // results - Tres->resize(dimsOut); - Tres -> getImpl() -> setRawPtr(result, dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]); - - // compute result - op->forwardDims(); - start = std::chrono::system_clock::now(); - myPow->forward(); - end = std::chrono::system_clock::now(); - duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); - - // comparison between truth and computed result - REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres)); - - delete[] array0; - delete[] array1; - delete[] result; - - const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>()); - number_of_operation += nb_elements; - } - - Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count())); - Log::info("total time: {} μs\n", duration.count()); - } - } - - - SECTION("PowImpl_cpu::backward()") { - SECTION("3D Tensors") { - const auto input0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>( - { - { - { - {2.0, 3.0}, - {4.0, 5.0} - }, - { - {6.0, 7.0}, - {8.0, 9.0} - } - } - } - )); - const auto input1 = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>( - { - { - { - {1.0, 2.0}, - {3.0, 2.0} - }, - { - {2.0, 3.0}, - {1.0, 0.5} - } - } - } - )); - const auto gradOut = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>( - { - { - { - {0.5, 1.0}, - {1.5, 2.0} - }, - { - {2.5, 3.0}, - {3.5, 4.0} - } - } - } - )); - const auto expectedGrad0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>( - { - { - { - {0.50000000, 6.00000000}, - {72.00000000, 20.00000000} - }, - { - {30.00000000, 441.00000000}, - {3.50000000, 0.66666669} - } - } - } - )); - const auto expectedGrad1 = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>( - { - { - { - { 0.693147182, 9.88751030}, - {1.33084259e+02, 8.04718933e+01} - }, - { - {1.61258362e+02, 2.00234143e+03}, - {5.82243652e+01, 2.63666954e+01} - } - } - } - )); - for(const auto T: {input0, input1, gradOut, expectedGrad0, expectedGrad1}) - { - T->setBackend("cpu") ; - T->setDataType(DataType::Float32); - } - std::shared_ptr<Node> powOp = Pow(); - auto opr = std::static_pointer_cast<OperatorTensor>(powOp-> getOperator()); - opr->setDataType(DataType::Float32); - opr->setBackend("cpu"); - opr->associateInput(0, input0); - opr->associateInput(1, input1); - opr->getOutput(0)->setGrad(gradOut); - opr->forward(); - - powOp->backward(); - REQUIRE(approxEq<float>(*(opr->getInput(0)->grad()), *expectedGrad0)); - REQUIRE(approxEq<float>(*(opr->getInput(1)->grad()), *expectedGrad1)); - } - SECTION("Broadcasting") { - const auto input0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>( - { - { - { - {1.0, 2.0, 3.0}, - {4.0, 5.0, 6.0} - }, - { - {1.5, 2.5, 3.5}, - {4.5, 5.5, 6.5} - } - } - } - )); - const auto input1 = std::make_shared<Tensor>(Array1D<float, 3>( - { - {0.1, 0.2, 0.3} - } - )); - - const auto gradOut = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>( - { - { - { - {1.0, 2.0, 3.0}, - {4.0, 5.0, 6.0} - }, - { 
- {6.0, 5.0, 4.0}, - {3.0, 2.0, 1.0} - } - } - } - )); - const Tensor expectedGrad0 = Array3D<float, 2, 2, 3>( - { - { - { - {0.10000000, 0.22973967, 0.41711676}, - {0.11486985, 0.27594593, 0.51353097} - }, - { - {0.41655189, 0.48044977, 0.49926791}, - {0.07748720, 0.10227509, 0.08092485} - } - } - } - ); - const Tensor expectedGrad1 = Array1D<float, 3>( - { - {14.14779854, 22.99299049, 33.56402588} - } - ); - - std::shared_ptr<Node> powOp = Pow(); - auto opr = std::static_pointer_cast<OperatorTensor>(powOp-> getOperator()); - opr->setDataType(DataType::Float32); - opr->setBackend("cpu"); - opr->associateInput(0, input0); - opr->associateInput(1, input1); - opr->getOutput(0)->setGrad(gradOut); - powOp->forward(); - - powOp->backward(); - REQUIRE(approxEq<float>(*(opr->getInput(0)->grad()), expectedGrad0)); - REQUIRE(approxEq<float>(*(opr->getInput(1)->grad()), expectedGrad1)); - } - } -} -} // namespace Aidge +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <chrono> // std::micro, std::chrono::time_point, + // std::chrono::system_clock, std::chrono::duration +#include <cstddef> // std::size_t +#include <cstdint> // std::uint16_t +#include <functional> // std::multiplies +#include <memory> +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937 + // std::uniform_int_distribution, std::uniform_real_distribution +#include <vector> + +#include <catch2/catch_test_macros.hpp> +#include <fmt/core.h> + +#include "aidge/backend/cpu/data/TensorImpl.hpp" +#include "aidge/backend/cpu/operator/PowImpl.hpp" +#include "aidge/data/Data.hpp" +#include "aidge/data/Tensor.hpp" +#include "aidge/operator/Pow.hpp" +#include "aidge/utils/ArrayHelpers.hpp" +#include "aidge/utils/TensorUtils.hpp" + +namespace Aidge { + +TEST_CASE("[cpu/operator] Pow", "[Pow][CPU]") { + constexpr std::uint16_t NBTRIALS = 10; + // Create a random number generator + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // Random float distribution between 0.1 and 1.1 + std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10)); + std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5)); + std::uniform_int_distribution<int> boolDist(0,1); + + // Create Pow Operator + std::shared_ptr<Node> myPow = Pow(); + auto op = std::static_pointer_cast<OperatorTensor>(myPow-> getOperator()); + op->setDataType(DataType::Float32); + op->setBackend("cpu"); + + // Create 2 input Tensors + std::shared_ptr<Tensor> T0 = std::make_shared<Tensor>(); + op->associateInput(0,T0); + T0->setDataType(DataType::Float32); + T0->setBackend("cpu"); + std::shared_ptr<Tensor> T1 = std::make_shared<Tensor>(); + op -> associateInput(1,T1); + T1->setDataType(DataType::Float32); + T1->setBackend("cpu"); + + // Create results Tensor + std::shared_ptr<Tensor> Tres = std::make_shared<Tensor>(); + Tres->setDataType(DataType::Float32); + Tres->setBackend("cpu"); + + // To measure execution time of 'Pow_Op::forward()' member function call + std::chrono::time_point<std::chrono::system_clock> start; +
std::chrono::time_point<std::chrono::system_clock> end; + std::chrono::duration<double, std::micro> duration{}; + + SECTION("PowImpl_cpu::forward()") { + SECTION("Scalar / Scalar") { + + } + SECTION("Scalar / +1-D Tensor") { + + } + SECTION("+1-D Tensor / +1-D Tensor - same dimensions") { + std::size_t number_of_operation = 0; + + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { + // generate 2 random Tensors + const std::size_t nbDims = nbDimsDist(gen); + std::vector<std::size_t> dims; + for (std::size_t i = 0; i < nbDims; ++i) { + dims.push_back(dimSizeDist(gen)); + } + const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + number_of_operation += nb_elements; + + // without broadcasting + float* array0 = new float[nb_elements]; + float* array1 = new float[nb_elements]; + float* result = new float[nb_elements]; + + for (std::size_t i = 0; i < nb_elements; ++i) { + array0[i] = valueDist(gen); + array1[i] = valueDist(gen); + result[i] = std::pow(array0[i], array1[i]); + } + + // input0 + T0->resize(dims); + T0 -> getImpl() -> setRawPtr(array0, nb_elements); + + // input1 + T1->resize(dims); + T1 -> getImpl() -> setRawPtr(array1, nb_elements); + + // results + Tres->resize(dims); + Tres -> getImpl() -> setRawPtr(result, nb_elements); + + op->forwardDims(); + start = std::chrono::system_clock::now(); + myPow->forward(); + end = std::chrono::system_clock::now(); + duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + + REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres)); + + delete[] array0; + delete[] array1; + delete[] result; + + // with broadcasting + } + Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count())); + Log::info("total time: {} μs\n", duration.count()); + } + + SECTION("+1-D Tensor / +1-D Tensor - broadcasting") { + std::size_t number_of_operation = 0; + + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { + // generate 2 random Tensors + // handle dimensions, replace some dimensions with '1' to get broadcasting + constexpr std::size_t nbDims = 4; + std::vector<std::size_t> dims; + for (std::size_t i = 0; i < nbDims; ++i) { + dims.push_back(dimSizeDist(gen)); + } + std::vector<std::size_t> dims0 = dims; + std::vector<std::size_t> dims1 = dims; + std::vector<std::size_t> dimsOut = dims; + for (std::size_t i = 0; i < nbDims; ++i) { + if (boolDist(gen)) { + dims0[i] = 1; + } + if (boolDist(gen)) { + dims1[i] = 1; + } + dimsOut[i] = (dims0[i] == 1) ? dims1[i] : dims0[i]; + } + + // create arrays and fill them with random values + float* array0 = new float[dims0[0]*dims0[1]*dims0[2]*dims0[3]]; + float* array1 = new float[dims1[0]*dims1[1]*dims1[2]*dims1[3]]; + float* result = new float[dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]]; + + for (std::size_t i = 0; i < dims0[0]*dims0[1]*dims0[2]*dims0[3]; ++i) { + array0[i] = valueDist(gen); + } + for (std::size_t i = 0; i < dims1[0]*dims1[1]*dims1[2]*dims1[3]; ++i) { + array1[i] = valueDist(gen); + } + + // compute true result + const std::size_t strides0[nbDims] = {dims0[1]*dims0[2]*dims0[3], dims0[2]*dims0[3], dims0[3], 1}; + const std::size_t strides1[nbDims] = {dims1[1]*dims1[2]*dims1[3], dims1[2]*dims1[3], dims1[3], 1}; + for (std::size_t a = 0; a < dimsOut[0]; ++a) { + for (std::size_t b = 0; b < dimsOut[1]; ++b) { + const std::size_t idx0_0 = strides0[0] * ((dims0[0] > 1) ? a : 0) + + strides0[1] * ((dims0[1] > 1) ? 
b : 0); + const std::size_t idx1_0 = strides1[0] * ((dims1[0] > 1) ? a : 0) + + strides1[1] * ((dims1[1] > 1) ? b : 0); + for (std::size_t c = 0; c < dimsOut[2]; ++c) { + const std::size_t idx_out = dimsOut[3] * (c + dimsOut[2] * (b + dimsOut[1] * a)); + for (std::size_t d = 0; d < dimsOut[3]; ++d) { + std::size_t idx0 = idx0_0 + + strides0[2] * ((dims0[2] > 1) ? c : 0) + + ((dims0[3] > 1) ? d : 0); + std::size_t idx1 = idx1_0 + + strides1[2] * ((dims1[2] > 1) ? c : 0) + + ((dims1[3] > 1) ? d : 0); + result[idx_out + d] = std::pow(array0[idx0], array1[idx1]); + // std::cout << "(" << idx0 << ", " << idx1 << ") -> " << array0[idx0] << " ** " << array1[idx1] << " -> " << idx_out + d << std::endl; + } + } + } + } + + // conversion to Aidge::Tensors + // input0 + T0->resize(dims0); + T0 -> getImpl() -> setRawPtr(array0, dims0[0]*dims0[1]*dims0[2]*dims0[3]); + + // input1 + T1->resize(dims1); + T1 -> getImpl() -> setRawPtr(array1, dims1[0]*dims1[1]*dims1[2]*dims1[3]); + + // results + Tres->resize(dimsOut); + Tres -> getImpl() -> setRawPtr(result, dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]); + + // compute result + op->forwardDims(); + start = std::chrono::system_clock::now(); + myPow->forward(); + end = std::chrono::system_clock::now(); + duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + + // comparison between truth and computed result + REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres)); + + delete[] array0; + delete[] array1; + delete[] result; + + const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>()); + number_of_operation += nb_elements; + } + Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count())); + Log::info("total time: {} μs\n", duration.count()); + } + SECTION("+1-D Tensor / 1-D Tensor") { + std::size_t number_of_operation = 0; + std::uniform_int_distribution<std::size_t> nbRemovedDimsDist(std::size_t(1), std::size_t(3)); + + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { + // generate 2 random Tensors + // handle dimensions + constexpr std::size_t nbDims = 4; + std::vector<std::size_t> dims0(4); + for (std::size_t i = 0; i < nbDims; ++i) { + dims0[i] = dimSizeDist(gen); + } + std::vector<std::size_t> dimsOut = dims0; + std::vector<std::size_t> dims1 = dims0; + for (std::size_t i = 0; i < nbDims; ++i) { + if (boolDist(gen)) { + dims1[i] = 1; + } + } + dims1.erase(dims1.cbegin(), dims1.cbegin() + nbRemovedDimsDist(gen)); + + // create arrays and fill them with random values + float* array0 = new float[dims0[0]*dims0[1]*dims0[2]*dims0[3]]; + std::size_t array1_size = std::accumulate(dims1.cbegin(), dims1.cend(), std::size_t(1), std::multiplies<std::size_t>()); + float* array1 = new float[array1_size]; + float* result = new float[dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]]; + + for (std::size_t i = 0; i < (dims0[0]*dims0[1]*dims0[2]*dims0[3]); ++i) { + array0[i] = valueDist(gen); + } + for (std::size_t i = 0; i < array1_size; ++i) { + array1[i] = valueDist(gen); + } + + // compute true result + auto dims1_tmp = dims1; + dims1_tmp.insert(dims1_tmp.cbegin(), 4 - dims1_tmp.size(), std::size_t(1)); + + const std::size_t strides0[nbDims] = {dims0[1]*dims0[2]*dims0[3], dims0[2]*dims0[3], dims0[3], 1}; + const std::size_t strides1[nbDims] = {dims1_tmp[1]*dims1_tmp[2]*dims1_tmp[3], dims1_tmp[2]*dims1_tmp[3], dims1_tmp[3], 1}; + for (std::size_t a = 0; a < dimsOut[0]; ++a) { + for (std::size_t b = 0; b < dimsOut[1]; ++b) { + const 
std::size_t idx0_0 = strides0[0] * ((dims0[0] > 1) ? a : 0) + + strides0[1] * ((dims0[1] > 1) ? b : 0); + const std::size_t idx1_0 = strides1[0] * ((dims1_tmp[0] > 1) ? a : 0) + + strides1[1] * ((dims1_tmp[1] > 1) ? b : 0); + for (std::size_t c = 0; c < dimsOut[2]; ++c) { + const std::size_t idx_out = dimsOut[3] * (c + dimsOut[2] * (b + dimsOut[1] * a)); + for (std::size_t d = 0; d < dimsOut[3]; ++d) { + std::size_t idx0 = idx0_0 + + strides0[2] * ((dims0[2] > 1) ? c : 0) + + ((dims0[3] > 1) ? d : 0); + std::size_t idx1 = idx1_0 + + strides1[2] * ((dims1_tmp[2] > 1) ? c : 0) + + ((dims1_tmp[3] > 1) ? d : 0); + result[idx_out + d] = std::pow(array0[idx0], array1[idx1]); + // std::cout << "(" << idx0 << ", " << idx1 << ") -> " << array0[idx0] << " ** " << array1[idx1] << " -> " << idx_out + d << std::endl; + } + } + } + } + + // conversion to Aidge::Tensors + // input0 + T0->resize(dims0); + T0 -> getImpl() -> setRawPtr(array0, dims0[0]*dims0[1]*dims0[2]*dims0[3]); + + // input1 + T1->resize(dims1); + T1 -> getImpl() -> setRawPtr(array1, array1_size); + + // results + Tres->resize(dimsOut); + Tres -> getImpl() -> setRawPtr(result, dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]); + + // compute result + op->forwardDims(); + start = std::chrono::system_clock::now(); + myPow->forward(); + end = std::chrono::system_clock::now(); + duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + + // comparison between truth and computed result + REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres)); + + delete[] array0; + delete[] array1; + delete[] result; + + const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>()); + number_of_operation += nb_elements; + } + + Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count())); + Log::info("total time: {} μs\n", duration.count()); + } + } + + + SECTION("PowImpl_cpu::backward()") { + SECTION("3D Tensors") { + const auto input0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>( + { + { + { + {2.0, 3.0}, + {4.0, 5.0} + }, + { + {6.0, 7.0}, + {8.0, 9.0} + } + } + } + )); + const auto input1 = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>( + { + { + { + {1.0, 2.0}, + {3.0, 2.0} + }, + { + {2.0, 3.0}, + {1.0, 0.5} + } + } + } + )); + const auto gradOut = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>( + { + { + { + {0.5, 1.0}, + {1.5, 2.0} + }, + { + {2.5, 3.0}, + {3.5, 4.0} + } + } + } + )); + const auto expectedGrad0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>( + { + { + { + {0.50000000, 6.00000000}, + {72.00000000, 20.00000000} + }, + { + {30.00000000, 441.00000000}, + {3.50000000, 0.66666669} + } + } + } + )); + const auto expectedGrad1 = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>( + { + { + { + { 0.693147182, 9.88751030}, + {1.33084259e+02, 8.04718933e+01} + }, + { + {1.61258362e+02, 2.00234143e+03}, + {5.82243652e+01, 2.63666954e+01} + } + } + } + )); + for(const auto T: {input0, input1, gradOut, expectedGrad0, expectedGrad1}) + { + T->setBackend("cpu") ; + T->setDataType(DataType::Float32); + } + std::shared_ptr<Node> powOp = Pow(); + auto opr = std::static_pointer_cast<OperatorTensor>(powOp-> getOperator()); + opr->setDataType(DataType::Float32); + opr->setBackend("cpu"); + opr->associateInput(0, input0); + opr->associateInput(1, input1); + opr->forward(); + + opr->getOutput(0)->setGrad(gradOut); + powOp->backward(); + + REQUIRE(approxEq<float>(*(opr->getInput(0)->grad()), *expectedGrad0)); + 
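+ // Note: the expected tensors above follow the analytic Pow derivatives,
+ // d(x0^x1)/dx0 = x1 * x0^(x1-1) and d(x0^x1)/dx1 = x0^x1 * ln(x0), each scaled by gradOut.
+ // E.g. for the first element (x0 = 2, x1 = 1, gradOut = 0.5): 0.5 * 1 * 2^0 = 0.5 and 0.5 * 2^1 * ln(2) ≈ 0.693147.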
REQUIRE(approxEq<float>(*(opr->getInput(1)->grad()), *expectedGrad1)); + } + SECTION("Broadcasting") { + const auto input0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>( + { + { + { + {1.0, 2.0, 3.0}, + {4.0, 5.0, 6.0} + }, + { + {1.5, 2.5, 3.5}, + {4.5, 5.5, 6.5} + } + } + } + )); + const auto input1 = std::make_shared<Tensor>(Array1D<float, 3>( + { + {0.1, 0.2, 0.3} + } + )); + + const auto gradOut = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>( + { + { + { + {1.0, 2.0, 3.0}, + {4.0, 5.0, 6.0} + }, + { + {6.0, 5.0, 4.0}, + {3.0, 2.0, 1.0} + } + } + } + )); + const Tensor expectedGrad0 = Array3D<float, 2, 2, 3>( + { + { + { + {0.10000000, 0.22973967, 0.41711676}, + {0.11486985, 0.27594593, 0.51353097} + }, + { + {0.41655189, 0.48044977, 0.49926791}, + {0.07748720, 0.10227509, 0.08092485} + } + } + } + ); + const Tensor expectedGrad1 = Array1D<float, 3>( + { + {14.14779854, 22.99299049, 33.56402588} + } + ); + + std::shared_ptr<Node> powOp = Pow(); + auto opr = std::static_pointer_cast<OperatorTensor>(powOp-> getOperator()); + opr->setDataType(DataType::Float32); + opr->setBackend("cpu"); + opr->associateInput(0, input0); + opr->associateInput(1, input1); + powOp->forward(); + + opr->getOutput(0)->setGrad(gradOut); + powOp->backward(); + + REQUIRE(approxEq<float>(*(opr->getInput(0)->grad()), expectedGrad0)); + REQUIRE(approxEq<float>(*(opr->getInput(1)->grad()), expectedGrad1)); + } + } +} +} // namespace Aidge diff --git a/unit_tests/operator/Test_ReduceMeanImpl.cpp b/unit_tests/operator/Test_ReduceMeanImpl.cpp index 30ffeb0dd0b584f50349c206863c7ab9ac776721..8841d6773dc5ce793ca75244fedc18fdf245ca26 100644 --- a/unit_tests/operator/Test_ReduceMeanImpl.cpp +++ b/unit_tests/operator/Test_ReduceMeanImpl.cpp @@ -156,7 +156,7 @@ TEST_CASE("[cpu/operator] ReduceMean(forward)", "[ReduceMean][CPU]") { } SECTION("KeepDims") { SECTION("test 1") { - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> { + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<cpptype_t<DataType::Float32>,3,2,2> { { { { 5.0, 1.0 }, @@ -172,12 +172,12 @@ TEST_CASE("[cpu/operator] ReduceMean(forward)", "[ReduceMean][CPU]") { } } }); - Tensor myOutput = Tensor(Array3D<float,3,1,2> { + Tensor myOutput = Tensor(Array3D<cpptype_t<DataType::Float32>,3,1,2> { { - {{ 12.5, 1.5 }}, - {{ 35.0, 1.5 }}, - {{ 57.5, 1.5 }} + {{ 12.5f, 1.5f }}, + {{ 35.0f, 1.5f }}, + {{ 57.5f, 1.5f }} } }); diff --git a/unit_tests/operator/Test_SubImpl.cpp b/unit_tests/operator/Test_SubImpl.cpp index 1317e88a371e9a6e7a3deae5b7f662a9cd879a60..f87f34d58731f16f1a90373a0f0cc0a9e02ea406 100644 --- a/unit_tests/operator/Test_SubImpl.cpp +++ b/unit_tests/operator/Test_SubImpl.cpp @@ -322,4 +322,163 @@ TEST_CASE("[cpu/operator] Sub", "[Sub][CPU]") { } } } + + +TEST_CASE("[CPU/Operator] Sub(Backward)", "[Sub][CPU][Backward]") { + std::shared_ptr<Node> mySub = Sub(); + auto op = std::static_pointer_cast<OperatorTensor>(mySub->getOperator()); + op->setDataType(DataType::Float32); + op->setBackend("cpu"); + + SECTION("Case 1: 1D and 2D Tensors") { + const auto T0 = std::make_shared<Tensor>( + Array2D<float, 2, 3>({{{1, 2, 3}, {4, 5, 6}}})); + + const auto T1 = + std::make_shared<Tensor>(Array1D<float, 3>({0.1, 0.2, 0.3})); + + T0->setDataType(DataType::Float32); + T0->setBackend("cpu"); + T1->setDataType(DataType::Float32); + T1->setBackend("cpu"); + + op->associateInput(0, T0); + op->associateInput(1, T1); + op->forwardDims(); + + op->getOutput(0)->setGrad(std::make_shared<Tensor>( + Array2D<float, 2, 3>({{{1.0, 
1.0, 1.0}, {1.0, 1.0, 1.0}}}))); + mySub->backward(); + + // For subtraction: grad_input0 = grad_output + const auto expectedGrad0 = std::make_shared<Tensor>( + Array2D<float, 2, 3>({{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}})); + + // For subtraction: grad_input1 = -grad_output (summed across broadcast dimensions) + const auto expectedGrad1 = + std::make_shared<Tensor>(Array1D<float, 3>({-2, -2, -2})); + + REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0)); + REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedGrad1)); + } + + SECTION("Case 2: 3D and 1D tensors") { + const auto T0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>( + {{{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}}, + {{7.0, 8.0, 9.0}, {10.0, 11.0, 12.0}}}})); + + const auto T1 = + std::make_shared<Tensor>(Array1D<float, 3>({0.3, 0.2, 0.1})); + + const auto newGrad = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>( + {{{{1, 1, 1}, {1, 1, 1}}, {{1, 1, 1}, {1, 1, 1}}}})); + + const auto expectedGrad0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>( + {{{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}, + {{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}}})); + + const auto expectedGrad1 = + std::make_shared<Tensor>(Array1D<float, 3>({-4.0, -4.0, -4.0})); + + for (auto T : {T0, T1, newGrad, expectedGrad0, expectedGrad1}) { + T->setBackend("cpu"); + T->setDataType(DataType::Float32); + } + + op->associateInput(0, T0); + op->associateInput(1, T1); + op->forwardDims(); + + op->getOutput(0)->setGrad(newGrad); + mySub->backward(); + + REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0)); + REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedGrad1)); + } + + SECTION("Case 3: Random values with broadcasting") { + // Use random values + std::vector<std::size_t> dims0 = {5, 2, 1, 7}; // First tensor + std::vector<std::size_t> dims1 = {2, 6, 7}; // Second tensor + std::vector<std::size_t> outputDims = {5, 2, 6, 7}; + + const auto input0Size = 5 * 2 * 1 * 7; + const auto input1Size = 2 * 6 * 7; + const auto outputSize = 5 * 2 * 6 * 7; + + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<float> dist(0.1f, 1.0f); + + std::vector<float> input0Data(input0Size); + std::vector<float> input1Data(input1Size); + std::vector<float> gradOutputData(outputSize); + + // Fill with random values + for (auto &val : input0Data) val = dist(gen); + for (auto &val : input1Data) val = dist(gen); + for (auto &val : gradOutputData) val = dist(gen); + + auto T0 = std::make_shared<Tensor>(); + auto T1 = std::make_shared<Tensor>(); + + T0->setDataType(DataType::Float32); + T0->setBackend("cpu"); + T0->resize(dims0); + T0->getImpl()->setRawPtr(input0Data.data(), input0Size); + + T1->setDataType(DataType::Float32); + T1->setBackend("cpu"); + T1->resize(dims1); + T1->getImpl()->setRawPtr(input1Data.data(), input1Size); + + op->associateInput(0, T0); + op->associateInput(1, T1); + op->forwardDims(); + + // Set gradient of output + op->getOutput(0)->setGrad(std::make_shared<Tensor>(outputDims)); + op->getOutput(0)->grad()->getImpl()->setRawPtr(gradOutputData.data(), outputSize); + + // Compute reference gradients + std::vector<float> expectedGrad0(input0Size, 0.0f); + std::vector<float> expectedGrad1(input1Size, 0.0f); + + for (std::size_t n = 0; n < 5; ++n) { + for (std::size_t c = 0; c < 2; ++c) { + for (std::size_t h = 0; h < 6; ++h) { + for (std::size_t w = 0; w < 7; ++w) { + std::size_t outIdx = w + 7 * (h + 6 * (c + 2 * n)); + std::size_t in0Idx = w + 7 * (0 + 1 * (c + 2 * n)); + std::size_t in1Idx = w + 7 * (h + 6 * c); 
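+ // dims0 = {5, 2, 1, 7} is broadcast along H, so in0Idx pins that index to 0,
+ // while dims1 = {2, 6, 7} has no batch dimension, so in1Idx ignores n entirely.
+ // Accumulating with += below therefore sums the gradient over the broadcast
+ // dimensions, which is the reduction the backward pass has to perform.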
+ + // Gradient for input0: grad_output + expectedGrad0[in0Idx] += gradOutputData[outIdx]; + // Gradient for input1: -grad_output + expectedGrad1[in1Idx] += -gradOutputData[outIdx]; + } + } + } + } + + // Perform backward pass + mySub->backward(); + + auto expectedGrad0Tensor = std::make_shared<Tensor>(); + expectedGrad0Tensor->resize(T0->dims()); + expectedGrad0Tensor->setBackend("cpu"); + expectedGrad0Tensor->setDataType(DataType::Float32); + expectedGrad0Tensor->getImpl()->setRawPtr(expectedGrad0.data(), expectedGrad0.size()); + + auto expectedGrad1Tensor = std::make_shared<Tensor>(); + expectedGrad1Tensor->resize(T1->dims()); + expectedGrad1Tensor->setBackend("cpu"); + expectedGrad1Tensor->setDataType(DataType::Float32); + expectedGrad1Tensor->getImpl()->setRawPtr(expectedGrad1.data(), expectedGrad1.size()); + + // Verify backward pass + REQUIRE(approxEq<float>(*T0->grad(), *expectedGrad0Tensor)); + REQUIRE(approxEq<float>(*T1->grad(), *expectedGrad1Tensor)); + } +} } // namespace Aidge diff --git a/unit_tests/operator/Test_WeightInterleavingImpl.cpp b/unit_tests/operator/Test_WeightInterleavingImpl.cpp index c95c8fca19eb79eb78fc19e93ded3383054383e7..3c111625eb4c948637dc06b9a761243ccf424d51 100644 --- a/unit_tests/operator/Test_WeightInterleavingImpl.cpp +++ b/unit_tests/operator/Test_WeightInterleavingImpl.cpp @@ -23,7 +23,6 @@ using namespace Aidge; TEST_CASE("[cpu/operator] WeightInterleaving", "[WeightInterleaving][CPU]") { - std::shared_ptr<Node> myWeightInterleaving = WeightInterleaving(); auto opWeightInterleaving = std::static_pointer_cast<WeightInterleaving_Op>(myWeightInterleaving -> getOperator()); @@ -415,7 +414,6 @@ TEST_CASE("[cpu/operator] WeightInterleaving", "[WeightInterleaving][CPU]") { // Create convolution node std::shared_ptr<Node> conv = Conv(4, 2, {3, 3}, "conv1"); - // Place the weight tensor in the weight producer of the conv auto weightProducer = conv->getParent(1); weightProducer->getOperator()->setOutput(0, weight); diff --git a/unit_tests/recipies/Test_FoldConstantOfShape.cpp b/unit_tests/recipies/Test_FoldConstantOfShape.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a1c09b151ebfdb0ab54d8430c13c3ca9d3de3459 --- /dev/null +++ b/unit_tests/recipies/Test_FoldConstantOfShape.cpp @@ -0,0 +1,50 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + + #include "aidge/graph/GraphView.hpp" + #include "aidge/operator/Identity.hpp" + #include "aidge/recipes/Recipes.hpp" + + #include <cstdint> // std::int64_t + #include <memory> + + #include <catch2/catch_test_macros.hpp> + + #include "aidge/graph/OpArgs.hpp" + #include "aidge/operator/ConstantOfShape.hpp" + #include "aidge/operator/Conv.hpp" + #include "aidge/operator/Producer.hpp" + #include "aidge/operator/ReLU.hpp" + #include "aidge/recipes/Recipes.hpp" + #include "aidge/utils/ArrayHelpers.hpp" + #include "aidge/utils/Types.h" + + namespace Aidge { + + TEST_CASE("[cpu/recipes] foldConstantOfShape", + "[ConstantOfShape][foldConstantOfShape][recipes]") { + auto input_T = std::make_shared<Tensor>(Array1D<std::int64_t, 4>({1, 1, 3, 3})); + + auto model = std::make_shared<GraphView>(); + SECTION("Sequential model") { + model = Sequential({ + Producer(input_T, "prod_0", true), + ConstantOfShape(3, "constantOfShape_0"), + Conv(1, 1, {3, 3}, "Conv_0"), + ReLU("ReLU_1") + }); + // aidge_backend_cpu loaded. Recipe should work + REQUIRE(foldConstantOfShape(model) == 1); + CHECK(model->forwardDims()); + } + } + + } // namespace Aidge diff --git a/unit_tests/scheduler/Test_Scheduler.cpp b/unit_tests/scheduler/Test_Scheduler.cpp index 956169c387c4a34f500f66b214dcf95a145feafd..0dfdbb304f6b593903165b7566c68dad71f0b8a4 100644 --- a/unit_tests/scheduler/Test_Scheduler.cpp +++ b/unit_tests/scheduler/Test_Scheduler.cpp @@ -17,19 +17,28 @@ #include "aidge/graph/Node.hpp" #include "aidge/graph/GraphView.hpp" #include "aidge/graph/OpArgs.hpp" +#include "aidge/operator/GenericOperator.hpp" #include "aidge/operator/Memorize.hpp" #include "aidge/operator/Pop.hpp" #include "aidge/operator/Stack.hpp" #include "aidge/operator/Identity.hpp" +#include "aidge/operator/CryptoHash.hpp" +#include "aidge/operator/Mod.hpp" +#include "aidge/operator/Tanh.hpp" +#include "aidge/operator/Select.hpp" #include "aidge/operator/MetaOperator.hpp" #include "aidge/scheduler/SequentialScheduler.hpp" #include "aidge/scheduler/ParallelScheduler.hpp" +#include "aidge/graph/Testing.hpp" #include "aidge/backend/cpu/operator/FCImpl.hpp" #include "aidge/backend/cpu/operator/ConvImpl.hpp" #include "aidge/backend/cpu/operator/ReLUImpl.hpp" #include "aidge/backend/cpu/operator/SqrtImpl.hpp" #include "aidge/backend/cpu/operator/AddImpl.hpp" +#include "aidge/backend/cpu/operator/CryptoHashImpl.hpp" +#include "aidge/backend/cpu/operator/ModImpl.hpp" +#include "aidge/backend/cpu/operator/TanhImpl.hpp" #include "aidge/recipes/GraphViewHelper.hpp" @@ -427,6 +436,7 @@ TEST_CASE("[cpu/scheduler] SequentialScheduler(backward)", "[scheduler][backward // implem already set to default auto myProd = Producer(inputTensor, "prod"); myProd -> addChild(gv); + gv->add(myProd); gv -> compile("cpu", DataType::Float32); SequentialScheduler scheduler(gv); @@ -473,7 +483,7 @@ TEST_CASE("[cpu/scheduler] Accumulate", "[scheduler]") { {{2.0, 3.0}, {4.0, 5.0}, {6.0, 7.0}}}}); std::shared_ptr<Tensor> MemInit = - std::make_shared<Tensor>(Array2D<float, 3, 2>{ + std::make_shared<Tensor>(Array2D<cpptype_t<DataType::Float32>, 3, 2>{ {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}}); auto meta = Accumulate(2, "accumulate"); @@ -508,8 +518,129 @@ TEST_CASE("[cpu/scheduler] Accumulate", "[scheduler]") { REQUIRE_NOTHROW(scheduler.forward(true)); std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>( - Array2D<float, 3, 2>{{{3.0, 5.0}, {7.0, 9.0}, 
{11.0, 13.0}}}); + Array2D<cpptype_t<DataType::Float32>, 3, 2>{{{3.0, 5.0}, {7.0, 9.0}, {11.0, 13.0}}}); std::shared_ptr<Tensor> output = std::static_pointer_cast<OperatorTensor>(pop_o->getOperator())->getOutput(0); REQUIRE(*output == *expectedOutput); } + +TEST_CASE("[cpu/scheduler] Branch", "[scheduler]") { + std::shared_ptr<Tensor> in = std::make_shared<Tensor>( + Array2D<cpptype_t<DataType::Float32>, 2, 3>{{{1, 2, 3}, {4, 5, 6}}}); + + std::shared_ptr<GraphView> g = Sequential({ + Producer(in, "input"), + Parallel({ + Sequential({ + GenericOperator("b0_op1", {InputCategory::Data}, 1), + GenericOperator("b0_op2", {InputCategory::Data}, 1), + GenericOperator("b0_op3", {InputCategory::Data}, 1), + GenericOperator("b0_op4", {InputCategory::Data}, 1), + GenericOperator("b0_op5", {InputCategory::Data}, 1) + }), + Sequential({ + GenericOperator("b1_op1", {InputCategory::Data}, 1), + GenericOperator("b1_op2", {InputCategory::Data}, 1), + GenericOperator("b1_op3", {InputCategory::Data}, 1) + }), + Sequential({ + GenericOperator("b2_op1", {InputCategory::Data}, 1) + }) + }), + GenericOperator("op1", {InputCategory::Data, InputCategory::Data, InputCategory::Data}, 1), + GenericOperator("op2", {InputCategory::Data}, 1), + GenericOperator("op3", {InputCategory::Data}, 1) + }); + + g->save("branch_forwarded"); + + auto scheduler = SequentialScheduler(g); + scheduler.generateScheduling(); + scheduler.saveStaticSchedulingDiagram("branch_scheduling"); + + // Default scheduling order is not necessarily deterministic, but is guaranteed to be correct in every case. + // This behavior might change in the future. + auto seqSchedule = scheduler.Scheduler::getSequentialStaticScheduling(0, Scheduler::SchedulingPolicy::Default); + fmt::println("seqSchedule = {}", seqSchedule); + + scheduler.tagForkBranches(); + g->save("branch_forwarded_tag"); + + seqSchedule = scheduler.Scheduler::getSequentialStaticScheduling(0, Scheduler::SchedulingPolicy::ShortestBranchFirst); + REQUIRE(nodePtrTo(seqSchedule, nodePtrToType) == std::vector<std::string>{ + "Producer", "b2_op1", "b1_op1", "b1_op2", "b1_op3", "b0_op1", "b0_op2", "b0_op3", "b0_op4", "b0_op5", "op1", "op2", "op3"}); + + seqSchedule = scheduler.Scheduler::getSequentialStaticScheduling(0, Scheduler::SchedulingPolicy::LonguestBranchFirst); + REQUIRE(nodePtrTo(seqSchedule, nodePtrToType) == std::vector<std::string>{ + "Producer", "b0_op1", "b0_op2", "b0_op3", "b0_op4", "b0_op5", "b1_op1", "b1_op2", "b1_op3", "b2_op1", "op1", "op2", "op3"}); +} + +#ifdef WITH_OPENSSL +TEST_CASE("[cpu/scheduler] Select", "[scheduler]") { + std::shared_ptr<Tensor> in = std::make_shared<Tensor>( + Array2D<cpptype_t<DataType::Float32>, 2, 3>{{{1, 2, 3}, {4, 5, 6}}}); + + std::shared_ptr<GraphView> g = Sequential({ + Producer(in, "input"), + Parallel({ + Sequential({ + CryptoHash("hash"), + Mod("mod") + }), + ReLU("relu"), + Tanh("tanh"), + Sqrt("sqrt") + }), + Select(3, "select") + }); + + auto modProd = Producer(std::make_shared<Tensor>(Array1D<uint64_t, 1>{{3}})); + modProd->addChild(g->getNode("mod"), 0, 1); + g->add(modProd); + + g->getNode("hash")->getOperator()->setDataType(DataType::UInt64); + g->getNode("mod")->getOperator()->setDataType(DataType::UInt64); + g->setBackend("cpu"); + g->save("select"); + + auto scheduler = SequentialScheduler(g); + scheduler.generateScheduling(); + scheduler.saveStaticSchedulingDiagram("select_scheduling"); + REQUIRE_NOTHROW(scheduler.forward(true)); + + g->save("select_forwarded"); + + auto expectedOutputHash = std::make_shared<Tensor>( +
Array1D<cpptype_t<DataType::UInt64>, 4>{{0x1b7cf58dfe2dae24, 0x3bac903def4ce580, 0x5f5a347389d97f41, 0x2c2dc759abc6b61}}); + auto outputHash = std::static_pointer_cast<OperatorTensor>(g->getNode("hash")->getOperator())->getOutput(0); + REQUIRE(*outputHash == *expectedOutputHash); + + auto expectedOutputMod = std::make_shared<Tensor>( + Array1D<cpptype_t<DataType::UInt64>, 4>{{2, 1, 1, 2}}); + auto outputMod = std::static_pointer_cast<OperatorTensor>(g->getNode("mod")->getOperator())->getOutput(0); + REQUIRE(*outputMod == *expectedOutputMod); + + auto expectedOutput = std::make_shared<Tensor>( + Array2D<cpptype_t<DataType::Float32>, 2, 3>{{{std::sqrt(1.0f), std::sqrt(2.0f), std::sqrt(3.0f)}, {std::sqrt(4.0f), std::sqrt(5.0f), std::sqrt(6.0f)}}}); + auto output = std::static_pointer_cast<OperatorTensor>(g->getNode("select")->getOperator())->getOutput(0); + REQUIRE(*output == *expectedOutput); + + scheduler.resetScheduling(); + scheduler.tagConditionalNodes(); + + REQUIRE(g->getNode("relu")->attributes()->hasAttr("schedule.cond")); + REQUIRE(g->getNode("relu")->attributes()->getAttr<std::set<std::pair<NodePtr, size_t>>>("schedule.cond") + == std::set<std::pair<NodePtr, size_t>>{{g->getNode("select"), 0}}); + REQUIRE(g->getNode("tanh")->attributes()->hasAttr("schedule.cond")); + REQUIRE(g->getNode("tanh")->attributes()->getAttr<std::set<std::pair<NodePtr, size_t>>>("schedule.cond") + == std::set<std::pair<NodePtr, size_t>>{{g->getNode("select"), 1}}); + REQUIRE(g->getNode("sqrt")->attributes()->hasAttr("schedule.cond")); + REQUIRE(g->getNode("sqrt")->attributes()->getAttr<std::set<std::pair<NodePtr, size_t>>>("schedule.cond") + == std::set<std::pair<NodePtr, size_t>>{{g->getNode("select"), 2}}); + REQUIRE(!g->getNode("input")->attributes()->hasAttr("schedule.cond")); + + scheduler.generateScheduling(); + scheduler.saveStaticSchedulingDiagram("select_scheduling_tag"); + REQUIRE_NOTHROW(scheduler.forward(true)); +} +#endif } // namespace Aidge diff --git a/version.txt b/version.txt index 8f0916f768f0487bcf8d33827ce2c8dcecb645c1..a918a2aa18d5bec6a8bb93891a7a63c243111796 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -0.5.0 +0.6.0