diff --git a/.gitignore b/.gitignore index 0e14676b900cb1418593019be70cc4d20aba2883..9877699f938bdbf94d0383b5b9db6f5f4cf1023e 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ # C++ Build build*/ install*/ +include/aidge/backend/cpu_version.h # VSCode .vscode diff --git a/CMakeLists.txt b/CMakeLists.txt index e9e191c36d5ad57a9a9dbed378154db6676ec796..a2f50c50871d0a47b884b54b353063c60260c1d9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,15 +3,18 @@ set(CXX_STANDARD 14) file(STRINGS "${CMAKE_SOURCE_DIR}/version.txt" version) +# Parse version.txt to retrieve Major, Minor and Patch +string(REGEX MATCH "([0-9]+)\\.([0-9]+)\\.([0-9]+)" _ ${version}) +set(PROJECT_VERSION_MAJOR ${CMAKE_MATCH_1}) +set(PROJECT_VERSION_MINOR ${CMAKE_MATCH_2}) +set(PROJECT_VERSION_PATCH ${CMAKE_MATCH_3}) + project(aidge_backend_cpu VERSION ${version} DESCRIPTION "CPU implementations of the operators of aidge framework." LANGUAGES CXX) -message(STATUS "Project name: ${CMAKE_PROJECT_NAME}") -message(STATUS "Project version: ${version}") -add_definitions(-DPROJECT_VERSION="${version}") - +# Retrieve latest git commit execute_process( COMMAND git rev-parse --short HEAD WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} @@ -19,8 +22,10 @@ execute_process( OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_QUIET ) + +message(STATUS "Project name: ${CMAKE_PROJECT_NAME}") +message(STATUS "Project version: ${version}") message(STATUS "Latest git commit: ${GIT_COMMIT_HASH}") -add_definitions(-DGIT_COMMIT_HASH="${GIT_COMMIT_HASH}") # helper for LSP users set(CMAKE_EXPORT_COMPILE_COMMANDS ON) @@ -64,6 +69,8 @@ file(GLOB_RECURSE inc_files "include/*.hpp") add_library(${module_name} ${src_files} ${inc_files}) target_link_libraries(${module_name} + PRIVATE + fmt::fmt PUBLIC _aidge_core # _ is added because we link the exported target and not the project ) @@ -115,6 +122,13 @@ if(CMAKE_COMPILER_IS_GNUCXX AND COVERAGE) append_coverage_compiler_flags() endif() +message(STATUS "Creating ${CMAKE_CURRENT_SOURCE_DIR}/include/aidge/backend/cpu_version.h") +# Generate version.h file from config file version.h.in +configure_file( + "${CMAKE_CURRENT_SOURCE_DIR}/include/aidge/backend/version.h.in" + "${CMAKE_CURRENT_SOURCE_DIR}/include/aidge/backend/cpu_version.h" +) + ############################################## # Installation instructions include(GNUInstallDirs) diff --git a/aidge_backend_cpu/__init__.py b/aidge_backend_cpu/__init__.py index a7fe1ea3abdea25b18af6e7e0a1958f01f928433..bb320b2fe436a3be81dde8d643728bd5a30942e7 100644 --- a/aidge_backend_cpu/__init__.py +++ b/aidge_backend_cpu/__init__.py @@ -1,3 +1,2 @@ import aidge_core from aidge_backend_cpu.aidge_backend_cpu import * # import so generated by PyBind -from ._version import * diff --git a/include/aidge/backend/cpu.hpp b/include/aidge/backend/cpu.hpp index caa75328e58f6c9581f81368a3981bb79a069d49..7553f96b9c17a930c61658e58282b8ee6ba3e015 100644 --- a/include/aidge/backend/cpu.hpp +++ b/include/aidge/backend/cpu.hpp @@ -12,6 +12,8 @@ #ifndef AIDGE_CPU_IMPORTS_H_ #define AIDGE_CPU_IMPORTS_H_ +#include "aidge/backend/cpu_version.h" + #include "aidge/backend/cpu/operator/AbsImpl.hpp" #include "aidge/backend/cpu/operator/AddImpl.hpp" #include "aidge/backend/cpu/operator/AndImpl.hpp" @@ -27,6 +29,7 @@ #include "aidge/backend/cpu/operator/ConvImpl.hpp" #include "aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp" #include "aidge/backend/cpu/operator/DivImpl.hpp" +#include "aidge/backend/cpu/operator/EqualImpl.hpp" #include "aidge/backend/cpu/operator/ErfImpl.hpp" #include 
"aidge/backend/cpu/operator/FCImpl.hpp" #include "aidge/backend/cpu/operator/FoldImpl.hpp" @@ -51,6 +54,7 @@ #include "aidge/backend/cpu/operator/SoftmaxImpl.hpp" #include "aidge/backend/cpu/operator/SubImpl.hpp" #include "aidge/backend/cpu/operator/TanhImpl.hpp" +#include "aidge/backend/cpu/operator/WeightInterleavedImpl.hpp" #include "aidge/backend/cpu/data/TensorImpl.hpp" diff --git a/include/aidge/backend/cpu/operator/AndImpl_kernels.hpp b/include/aidge/backend/cpu/operator/AndImpl_kernels.hpp index 73b710e021ac5031923eb1e9a2492502c02a3633..d7c8ebcf19f64cb60aa2b62f312f4b46351e6ec2 100644 --- a/include/aidge/backend/cpu/operator/AndImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/AndImpl_kernels.hpp @@ -20,7 +20,7 @@ namespace Aidge { namespace { // suppose values are contiguous in memory template <class I, class O> -void equal_contiguous_arrays(const std::size_t input1size, +void and_contiguous_arrays(const std::size_t input1size, const std::size_t input2size, const std::size_t output1size, const I* input1, @@ -31,14 +31,14 @@ void equal_contiguous_arrays(const std::size_t input1size, { const std::size_t in1_id = (input1size != 1) ? i : 0; const std::size_t in2_id = (input2size != 1) ? i : 0; - output[i] = static_cast<O>(input1[in1_id] == input2[in2_id]); + output[i] = static_cast<O>(input1[in1_id] && input2[in2_id]); } } } template <class I, class O> -void EqualImpl_cpu_forward_kernel(std::vector<std::size_t> dims0, +void AndImpl_cpu_forward_kernel(std::vector<std::size_t> dims0, std::vector<std::size_t> dims1, const std::vector<std::size_t>& outputDims, const void* input0_, @@ -60,9 +60,8 @@ void EqualImpl_cpu_forward_kernel(std::vector<std::size_t> dims0, // special case for equal dimensions, the kernel is called with the entire arrays at once if (dims0 == dims1) { const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); - for (std::size_t i = 0; i < input0_contiguous_size; ++i) - { - output[i] = static_cast<O>(input_0[i] == input_1[i]); + for (std::size_t i = 0; i < input0_contiguous_size; ++i) { + output[i] = static_cast<O>(input_0[i] && input_1[i]); } return; } @@ -126,7 +125,7 @@ void EqualImpl_cpu_forward_kernel(std::vector<std::size_t> dims0, std::size_t dim = contiguousIdx - 1; const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>()); for (std::size_t stack = 0; stack < nbStacks;) { - equal_contiguous_arrays<I,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size, + and_contiguous_arrays<I,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size, input_0 + offsetIn0*input0_contiguous_size, input_1 + offsetIn1*input1_contiguous_size, output + offsetOut*output_contiguous_size); @@ -146,17 +145,17 @@ void EqualImpl_cpu_forward_kernel(std::vector<std::size_t> dims0, // Kernels registration to implementation entry point REGISTRAR(AndImpl_cpu, - {DataType::Float32}, - {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<float, float>, nullptr}); + {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float32}}, + {ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<float, float>, nullptr}); REGISTRAR(AndImpl_cpu, - {DataType::Float64}, - {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<double, double>, nullptr}); + {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float64}}, + {ProdConso::inPlaceModel, 
Aidge::AndImpl_cpu_forward_kernel<double, double>, nullptr}); REGISTRAR(AndImpl_cpu, - {DataType::Int32}, - {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr}); + {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int32}}, + {ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr}); REGISTRAR(AndImpl_cpu, - {DataType::Int64}, - {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, nullptr}); + {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int64}}, + {ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, nullptr}); } // namespace Aidge diff --git a/include/aidge/backend/cpu/operator/AvgPoolingImpl.hpp b/include/aidge/backend/cpu/operator/AvgPoolingImpl.hpp index adea96ca43a1ad9d2a49777426913ca4676e4f32..7c76657f7d100255f49ab1e675672407c5fbbaf8 100644 --- a/include/aidge/backend/cpu/operator/AvgPoolingImpl.hpp +++ b/include/aidge/backend/cpu/operator/AvgPoolingImpl.hpp @@ -28,8 +28,10 @@ namespace Aidge { using AvgPooling2D_Op = AvgPooling_Op<2>; using AvgPoolingImpl2D_cpu = OperatorImpl_cpu<AvgPooling_Op<2>, void(const std::array<DimSize_t, 2>&, + const std::array<DimSize_t, 2>&, const std::array<DimSize_t, 2>&, const std::array<DimSize_t, 4>&, + bool, const void *, void *)>; diff --git a/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp index f6da9dcb026101b93de862499d42ae8734532d52..68dbfbe7302c392d87d9c895612c7ace9292ef57 100644 --- a/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp @@ -35,66 +35,54 @@ namespace Aidge { template <class I, class O> void AvgPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims, const std::array<DimSize_t, 2>& kernelDims, + const std::array<DimSize_t, 2>& dilations, const std::array<DimSize_t, 4> &dims, + bool ceilMode, const void *input_, void *output_) { - // FIXME: missing convolution attributes as arguments const I *input = static_cast<const I *>(input_); O *output = static_cast<O *>(output_); + // Calculate output dimensions based on ceilMode and dilations + auto compute_output_size = [&](DimSize_t inputDim, DimSize_t kernelDim, DimSize_t stride, DimSize_t dilation) { + DimSize_t effectiveKernelDim = (kernelDim - 1) * dilation + 1; + float result = static_cast<float>(inputDim - effectiveKernelDim + stride) / static_cast<float>(stride); + return ceilMode ? 
static_cast<DimSize_t>(std::ceil(result)) : static_cast<DimSize_t>(std::floor(result)); + }; - // output H size - const std::size_t oxSize = - static_cast<std::size_t>(std::floor(static_cast<float>(dims[2] - kernelDims[0] + strideDims[0]) / - static_cast<float>(strideDims[0]))); - // output W size - const std::size_t oySize = - static_cast<std::size_t>(std::floor(static_cast<float>(dims[3] - kernelDims[1] + strideDims[1]) / - static_cast<float>(strideDims[1]))); + const std::size_t oxSize = compute_output_size(dims[2], kernelDims[0], strideDims[0], dilations[0]); + const std::size_t oySize = compute_output_size(dims[3], kernelDims[1], strideDims[1], dilations[1]); - // TODO: kernel computation - // output (batch, outCh, Xout, Yout) - // input (batch, ch, Xin, Yin) - // weight (outCh, ch, kernelX, kernelY) - // does not take Dilation attribute into account using signedsize = std::make_signed<std::size_t>::type; + for (std::size_t batch = 0; batch < dims[0]; ++batch) { for (std::size_t ch = 0; ch < dims[1]; ++ch) { - const std::size_t oIndex = (ch + batch*dims[1]) * oxSize * oySize; - const std::size_t iIndex = (ch + batch*dims[1]) * dims[2] * dims[3]; - std::fill(output + oIndex, output+(oIndex+oxSize*oySize), 0); + const std::size_t oIndex = (ch + batch * dims[1]) * oxSize * oySize; + const std::size_t iIndex = (ch + batch * dims[1]) * dims[2] * dims[3]; + std::fill(output + oIndex, output + (oIndex + oxSize * oySize), 0); + for (std::size_t ox = 0; ox < oxSize; ++ox) { - const signedsize difx = static_cast<signedsize>(- ox * strideDims[0]); - const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0))); - const std::size_t sxMax = (static_cast<signedsize>(dims[2]) + difx) < 0 ? 0 : ((dims[2] + difx) > kernelDims[0] ? kernelDims[0] : dims[2] + difx); + const signedsize startx = static_cast<signedsize>(ox * strideDims[0]) - (dilations[0] - 1); + const std::size_t sxMin = static_cast<std::size_t>(std::max(startx, signedsize(0))); + const std::size_t sxMax = std::min(dims[2], static_cast<std::size_t>(startx + kernelDims[0] * dilations[0])); + for (std::size_t oy = 0; oy < oySize; ++oy) { - const signedsize dify = static_cast<signedsize>(- oy * strideDims[1]); - const std::size_t syMin = static_cast<std::size_t>(std::max(dify, signedsize(0))); - const std::size_t syMax = (static_cast<signedsize>(dims[3]) + dify) < 0 ? 0 : ((dims[3] + dify) > kernelDims[1] ? 
kernelDims[1] : dims[3] + dify); - const std::size_t oIndexFull = oIndex + ox*oySize + oy; - const std::size_t ix = ox * strideDims[0]; - const std::size_t iy = oy * strideDims[1]; + const signedsize starty = static_cast<signedsize>(oy * strideDims[1]) - (dilations[1] - 1); + const std::size_t syMin = static_cast<std::size_t>(std::max(starty, signedsize(0))); + const std::size_t syMax = std::min(dims[3], static_cast<std::size_t>(starty + kernelDims[1] * dilations[1])); - if (sxMin == 0 && syMin == 0 && sxMax == 3 && syMax == 3) { - output[oIndexFull] += static_cast<O>( - input[iIndex + (ix+0)*dims[3] + (iy+0)] + - input[iIndex + (ix+0)*dims[3] + (iy+1)] + - input[iIndex + (ix+0)*dims[3] + (iy+2)] + - input[iIndex + (ix+1)*dims[3] + (iy+0)] + - input[iIndex + (ix+1)*dims[3] + (iy+1)] + - input[iIndex + (ix+1)*dims[3] + (iy+2)] + - input[iIndex + (ix+2)*dims[3] + (iy+0)] + - input[iIndex + (ix+2)*dims[3] + (iy+1)] + - input[iIndex + (ix+2)*dims[3] + (iy+2)]) / O(9); - } else { - for (std::size_t sx = sxMin; sx < sxMax; ++sx) { - for (std::size_t sy = syMin; sy < syMax; ++sy) { - output[oIndexFull] += input[iIndex + (ix+sx)*dims[3] + (iy+sy)]; - } + const std::size_t oIndexFull = oIndex + ox * oySize + oy; + O sum = static_cast<O>(0); + std::size_t count = 0; + + for (std::size_t sx = sxMin; sx < sxMax; sx += dilations[0]) { + for (std::size_t sy = syMin; sy < syMax; sy += dilations[1]) { + sum += static_cast<O>(input[iIndex + sx * dims[3] + sy]); + ++count; } - // padding not used - output[oIndexFull] /= (sxMax - sxMin) * (syMax - syMin); } + + output[oIndexFull] = sum / static_cast<O>(count); } } } diff --git a/include/aidge/backend/cpu/operator/EqualImpl.hpp b/include/aidge/backend/cpu/operator/EqualImpl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..e2489096067a49139f6291898056d525a77db522 --- /dev/null +++ b/include/aidge/backend/cpu/operator/EqualImpl.hpp @@ -0,0 +1,32 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_CPU_OPERATOR_EQUALIMPL_H_ +#define AIDGE_CPU_OPERATOR_EQUALIMPL_H_ + +#include "aidge/backend/cpu/operator/OperatorImpl.hpp" +#include "aidge/operator/Equal.hpp" +#include "aidge/utils/Registrar.hpp" +#include "aidge/utils/Types.h" +#include "aidge/backend/cpu/data/GetCPUPtr.h" +#include <memory> +#include <vector> + +namespace Aidge { +// Operator implementation entry point for the backend +using EqualImpl_cpu = OperatorImpl_cpu<Equal_Op, + void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*, void*)>; + +// Implementation entry point registration to Operator +REGISTRAR(Equal_Op, "cpu", Aidge::EqualImpl_cpu::create); +} // namespace Aidge + +#endif /* AIDGE_CPU_OPERATOR_EQUALIMPL_H_ */ diff --git a/include/aidge/backend/cpu/operator/EqualImpl_kernels.hpp b/include/aidge/backend/cpu/operator/EqualImpl_kernels.hpp new file mode 100644 index 0000000000000000000000000000000000000000..3c8ff0f4742e0393efd8cbbf637822c443edffb3 --- /dev/null +++ b/include/aidge/backend/cpu/operator/EqualImpl_kernels.hpp @@ -0,0 +1,163 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_CPU_OPERATOR_EQUALIMPL_KERNELS_H_ +#define AIDGE_CPU_OPERATOR_EQUALIMPL_KERNELS_H_ + +#include "aidge/backend/cpu/operator/EqualImpl.hpp" +#include "aidge/utils/Registrar.hpp" + +namespace Aidge { + +namespace { +// suppose values are contiguous in memory +template <class I, class O> +void equal_contiguous_arrays(const std::size_t input1size, + const std::size_t input2size, + const std::size_t output1size, + const I* input1, + const I* input2, + O* output) +{ + for (std::size_t i = 0; i < output1size; ++i) + { + const std::size_t in1_id = (input1size != 1) ? i : 0; + const std::size_t in2_id = (input2size != 1) ? i : 0; + output[i] = static_cast<O>(input1[in1_id] == input2[in2_id]); + } +} +} + + +template <class I, class O> +void EqualImpl_cpu_forward_kernel(std::vector<std::size_t> dims0, + std::vector<std::size_t> dims1, + const std::vector<std::size_t>& outputDims, + const void* input0_, + const void* input1_, + void* output_) { + + const I* input_0 = static_cast<const I*>(input0_); + const I* input_1 = static_cast<const I*>(input1_); + O* output = static_cast<O*>(output_); + + // [5,2,1,7] & [2,6,7] + // 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7] + // 2. Find the highest equal dimension -> 3 + // Exception: if the first diverging dimension is the last one, then -> 4 (dims.size()) + // 3. Compute the highest number of contiguous data -> 7 + // 4. Compute stride and offset step for the broadcast mechanism + // 5. 
Call a simple kernel + + // special case for equal dimensions, the kernel is called with the entire arrays at once + if (dims0 == dims1) { + const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); + for (std::size_t i = 0; i < input0_contiguous_size; ++i) + { + output[i] = static_cast<O>(input_0[i] == input_1[i]); + } + return; + } + + // set dimensions to be of equal size by filling the smallest one with ones. + if (dims0.size() > dims1.size()) { + dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1)); + } + else if (dims1.size() > dims0.size()) { + dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1)); + } + + const std::size_t nbDims = dims0.size(); + + // Find the highest equal dimension + // std::size_t contiguousIdx = nbDims - 1; + std::size_t contiguousIdx = nbDims; + while (contiguousIdx-- > 0) { + // for (; contiguousIdx+1 > 0; --contiguousIdx) { + if (dims0[contiguousIdx] != dims1[contiguousIdx]) { + if (contiguousIdx == (nbDims -1)) { // last dimensions of one of the input Tensor are of size 1 + const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1; + while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) { + --contiguousIdx; + } + } + break; + } + } + ++contiguousIdx; + + // Compute the highest number of contiguous data for each Tensor + const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + + // initialize strides to iterate through data because of broadcasting + std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx); + if (contiguousIdx > 0) { + stride_post0[contiguousIdx - 1] = 1; + stride_post1[contiguousIdx - 1] = 1; + for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) { + stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]); + stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]); + } + for (std::size_t i = 0; i != contiguousIdx; ++i) { + stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1; + stride_step1[i] = (dims1[i] == 1) ? 
1 - stride_post1[i] : 1; + } + } + + // variables for arrays offsets + std::size_t offsetIn0 = 0; + std::size_t offsetIn1 = 0; + std::size_t offsetOut = 0; + + + std::size_t dim = contiguousIdx - 1; + const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>()); + for (std::size_t stack = 0; stack < nbStacks;) { + equal_contiguous_arrays<I,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size, + input_0 + offsetIn0*input0_contiguous_size, + input_1 + offsetIn1*input1_contiguous_size, + output + offsetOut*output_contiguous_size); + if (++stack < nbStacks) { + std::size_t tmp_stack = stack; + while(tmp_stack % outputDims[dim] == 0) { + tmp_stack /= outputDims[dim]; + dim--; + } + offsetIn0 += stride_step0[dim]; + offsetIn1 += stride_step1[dim]; + ++offsetOut; + dim = contiguousIdx - 1; + } + } +} + +// Kernels registration to implementation entry point +REGISTRAR(EqualImpl_cpu, + {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float32}}, + {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<float, float>, nullptr}); +REGISTRAR(EqualImpl_cpu, + {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float64}}, + {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<double, double>, nullptr}); +REGISTRAR(EqualImpl_cpu, + {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int32}}, + {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr}); +REGISTRAR(EqualImpl_cpu, + {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int64}}, + {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, nullptr}); + +} // namespace Aidge + +#endif /* AIDGE_CPU_OPERATOR_EQUALIMPL_KERNELS_H_ */ diff --git a/include/aidge/backend/cpu/operator/MaxPoolingImpl.hpp b/include/aidge/backend/cpu/operator/MaxPoolingImpl.hpp index 68cc3621514de97d9837e10bcf90218abe559aaa..062088a13c0fcca8116a08f71457e6b0b01d1a65 100644 --- a/include/aidge/backend/cpu/operator/MaxPoolingImpl.hpp +++ b/include/aidge/backend/cpu/operator/MaxPoolingImpl.hpp @@ -28,6 +28,7 @@ namespace Aidge { using MaxPooling2D_Op = MaxPooling_Op<2>; using MaxPoolingImpl2D_cpu = OperatorImpl_cpu<MaxPooling_Op<2>, void(const std::array<DimSize_t, 2>&, + const std::array<DimSize_t, 2>&, const std::array<DimSize_t, 2>&, const bool, const std::array<DimSize_t, 4> &, diff --git a/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp index 7b6f04f141eb701849a8d436561bcf9e37471cfa..250b11b0e0da100fc415c661ec3497d30203f293 100644 --- a/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp @@ -35,28 +35,23 @@ namespace Aidge { template <class I, class O> void MaxPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims, const std::array<DimSize_t, 2>& kernelDims, + const std::array<DimSize_t, 2>& dilations, const bool /*ceilMode*/, const std::array<DimSize_t, 4> &dims, const void *input_, void *output_) { - // FIXME: missing convolution parameters as arguments const I *input = static_cast<const I *>(input_); O *output = static_cast<O *>(output_); // output H size const std::size_t oxSize = - static_cast<std::size_t>(std::floor(static_cast<float>(dims[2] - kernelDims[0] + strideDims[0]) / + static_cast<std::size_t>(std::floor(static_cast<float>(dims[2] - (kernelDims[0] - 
1) * dilations[0] - 1 + strideDims[0]) / static_cast<float>(strideDims[0]))); // output W size const std::size_t oySize = - static_cast<std::size_t>(std::floor(static_cast<float>(dims[3] - kernelDims[1] + strideDims[1]) / + static_cast<std::size_t>(std::floor(static_cast<float>(dims[3] - (kernelDims[1] - 1) * dilations[1] - 1 + strideDims[1]) / static_cast<float>(strideDims[1]))); - // TODO: kernel computation - // output (batch, outCh, Xout, Yout) - // input (batch, ch, Xin, Yin) - // weight (outCh, ch, kernelX, kernelY) - // does not take Dilation parameter into account using signedsize = std::make_signed<std::size_t>::type; for (std::size_t batch = 0; batch < dims[0]; ++batch) { for (std::size_t ch = 0; ch < dims[1]; ++ch) { @@ -77,12 +72,15 @@ void MaxPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideD I poolValue(0.0); bool valid = false; - for (unsigned int channel = 0; channel < dims[1]; - ++channel){ - for (unsigned int sy = syMin; sy < syMax; ++sy) { - for (unsigned int sx = sxMin; sx < sxMax; ++sx) - { - const I value = input[iIndex + (ix+sx)*dims[3] + (iy+sy)]; + for (unsigned int sy = syMin; sy < syMax; ++sy) { + for (unsigned int sx = sxMin; sx < sxMax; ++sx) { + // Apply dilation factor to kernel indices + const std::size_t dilated_sx = sx * dilations[0]; + const std::size_t dilated_sy = sy * dilations[1]; + + // Ensure indices are within bounds + if ((ix + dilated_sx) < dims[2] && (iy + dilated_sy) < dims[3]) { + const I value = input[iIndex + (ix + dilated_sx) * dims[3] + (iy + dilated_sy)]; if (!valid || value > poolValue) { poolValue = value; @@ -98,106 +96,6 @@ void MaxPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideD } } -//N2D2 version -/* -template <class T> -void N2D2::PoolCell_Frame_Kernels::forwardMax(const T* alpha, - const Tensor<T>& - inputs, - const Descriptor& desc, - const T* beta, - Tensor<T>& outputs, - Tensor<ArgMax>& argMax, - bool useArgMax, - const Tensor<bool>& maps) -{ - const unsigned int size = inputs.dimB() * outputs.dimZ(); - -#if defined(_OPENMP) && _OPENMP >= 200805 -#pragma omp parallel for collapse(2) if (size > 16) -#else -#pragma omp parallel for if (inputs.dimB() > 4 && size > 16) -#endif - for (int batchPos = 0; batchPos < (int)inputs.dimB(); ++batchPos) { - for (unsigned int output = 0; output < outputs.dimZ(); ++output) { - for (unsigned int oy = 0; oy < outputs.dimY(); ++oy) { - for (unsigned int ox = 0; ox < outputs.dimX(); ++ox) { - const unsigned int sxMin = (unsigned int)std::max( - desc.padding[0] - (int)(ox * desc.stride[0]), 0); - const unsigned int syMin = (unsigned int)std::max( - desc.padding[1] - (int)(oy * desc.stride[1]), 0); - const unsigned int sxMax = Utils::clamp - <int>(inputs.dimX() + desc.padding[0] - ox * desc.stride[0], - 0, - desc.pool[0]); - const unsigned int syMax = Utils::clamp - <int>(inputs.dimY() + desc.padding[1] - oy * desc.stride[1], - 0, - desc.pool[1]); - - const int ix = (int)(ox * desc.stride[0]) - desc.padding[0]; - const int iy = (int)(oy * desc.stride[1]) - desc.padding[1]; - - T poolValue(0.0); - - // For each output, compute the pool value - if (useArgMax) { - const ArgMax inputMax - = argMax(ox, oy, output, batchPos); - - if (inputMax.valid) { - poolValue = inputs(inputMax.ix, - inputMax.iy, - inputMax.channel, - batchPos); - } - } - else { - unsigned int ixMax = 0; - unsigned int iyMax = 0; - unsigned int channelMax = 0; - bool valid = false; - - for (unsigned int channel = 0; channel < inputs.dimZ(); - ++channel) - { - if (!maps.empty() && 
!maps(output, channel)) - continue; - - for (unsigned int sy = syMin; sy < syMax; ++sy) { - for (unsigned int sx = sxMin; sx < sxMax; ++sx) - { - const T value = inputs(ix + sx, - iy + sy, - channel, - batchPos); - - if (!valid || value > poolValue) { - poolValue = value; - valid = true; - - ixMax = ix + sx; - iyMax = iy + sy; - channelMax = channel; - } - } - } - } - - argMax(ox, oy, output, batchPos) - = ArgMax(ixMax, iyMax, channelMax, valid); - } - - outputs(ox, oy, output, batchPos) - = (*alpha) * poolValue - + (*beta) * outputs(ox, oy, output, batchPos); - } - } - } - } -} - -*/ // Kernels registration to implementation entry point REGISTRAR(MaxPoolingImpl2D_cpu, diff --git a/include/aidge/backend/cpu/operator/MulImpl.hpp b/include/aidge/backend/cpu/operator/MulImpl.hpp index c927af9ebd4d658c764cc059df9778c273ba178e..eec5583bb548a3d3343b966c54cfccd1600b8f76 100644 --- a/include/aidge/backend/cpu/operator/MulImpl.hpp +++ b/include/aidge/backend/cpu/operator/MulImpl.hpp @@ -34,6 +34,7 @@ using MulImpl_cpu = OperatorImpl_cpu<Mul_Op, const std::size_t, const std::vector<std::size_t>, const std::vector<std::size_t>, + const std::vector<std::size_t>, const void*, const void*, const void*, diff --git a/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp b/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp index 556dd56cd32f28de14a43d20b97deb0083341fee..36acb9199c51e900287ca9b262322aa86287d838 100644 --- a/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp @@ -149,61 +149,53 @@ void MulImpl_cpu_forward_kernel(std::vector<std::size_t> dims0, template <class I1, class I2, class O> void MulImpl_cpu_backward_kernel(const std::size_t input0Length, - const std::size_t input1Length, - const std::size_t grad0Length, - const std::vector<std::size_t> input0Dims, - const std::vector<std::size_t> input1Dims, - const void* input0_, - const void* input1_, - const void* grad_output_, - void* gradientInput0, - void* gradientInput1) + const std::size_t input1Length, + const std::size_t gradOutputLength, + const std::vector<std::size_t>& dims0, + const std::vector<std::size_t>& dims1, + const std::vector<std::size_t>& outputDims, + const void* input0_, + const void* input1_, + const void* grad_output_, + void* gradientInput0_, + void* gradientInput1_) { - const auto* input0 = static_cast<const I1*>(input0_); - const auto* input1 = static_cast<const I1*>(input1_); - const auto* grad_output = static_cast<const O*>(grad_output_); - auto* grad_input_0 = static_cast<I1*>(gradientInput0); - auto* grad_input_1 = static_cast<I2*>(gradientInput1); - - - if(input0Dims.size() >= input1Dims.size()) - { - AIDGE_ASSERT(input0Length == grad0Length, "Incorrect dimensions between Mul input and output tensors"); - - for(auto i = 0U; i < input0Length; ++i) - { - const auto indices = getMultiDimIndices(input1Dims, i); - const auto flattenedIndex = getFlattenedIndex(input1Dims, indices); - - grad_input_0[i] = input1[flattenedIndex] * grad_output[i]; - } - - for(std::size_t i = 0 ; i < grad0Length; ++i) - { - const auto indices = getMultiDimIndices(input1Dims, i); - const auto flattenedIndex = getFlattenedIndex(input1Dims, indices); - - grad_input_1[flattenedIndex] += input0[i] * grad_output[i]; + const I1* input0 = static_cast<const I1*>(input0_); + const I2* input1 = static_cast<const I2*>(input1_); + const O* grad_output = static_cast<const O*>(grad_output_); + auto* grad_input_0 = static_cast<I1*>(gradientInput0_); + auto* grad_input_1 = 
static_cast<I2*>(gradientInput1_); + + std::fill_n(grad_input_0, input0Length, static_cast<I1>(0)); + std::fill_n(grad_input_1, input1Length, static_cast<I2>(0)); + + // Broadcast dims0 and dims1 to match the shape of outputDims + auto broadcastedDims0 = getBroadcastedDims(outputDims, dims0); + auto broadcastedDims1 = getBroadcastedDims(outputDims, dims1); + + for (std::size_t i = 0; i < gradOutputLength; ++i) { + auto idxOutputGrad = getMultiDimIndices(outputDims, i); + std::vector<std::size_t> idxInput0(broadcastedDims0.size()); + std::vector<std::size_t> idxInput1(broadcastedDims1.size()); + + // Map output indices to input0 indices, considering broadcasting + for (std::size_t dimension = 0; dimension < broadcastedDims0.size(); ++dimension) { + // If input0 is broadcasted along this dimension (== 1) or both dimensions are 1, index is 0. + // idxInput0 represent the multi dim index of input0 contributing + // to the output at index i. + idxInput0[dimension] = (broadcastedDims0[dimension] == 1) ? 0 : idxOutputGrad[dimension]; } - } else { - AIDGE_ASSERT(input1Length == grad0Length, "Incorrect dimensions between Mul input and output tensors"); - - for(auto i = 0U; i < input1Length; ++i) - { - const auto indices = getMultiDimIndices(input0Dims, i); - const auto flattenedIndex = getFlattenedIndex(input0Dims, indices); - - grad_input_1[i] = input0[flattenedIndex] * grad_output[i]; + for (std::size_t dimension = 0; dimension < broadcastedDims1.size(); ++dimension) { + idxInput1[dimension] = (broadcastedDims1[dimension] == 1) ? 0 : idxOutputGrad[dimension]; } - for(std::size_t i = 0 ; i < grad0Length; ++i) - { - const auto indices = getMultiDimIndices(input0Dims, i); - const auto flattenedIndex = getFlattenedIndex(input0Dims, indices); + // We have to access tensors with a flat index, hence the conversion + auto idx0 = getFlattenedIndex(broadcastedDims0, idxInput0); + auto idx1 = getFlattenedIndex(broadcastedDims1, idxInput1); - grad_input_0[flattenedIndex] += input1[i] * grad_output[i]; - } + grad_input_0[idx0] += static_cast<I1>(grad_output[i] * input1[idx1]); + grad_input_1[idx1] += static_cast<I2>(grad_output[i] * input0[idx0]); } } @@ -211,6 +203,9 @@ void MulImpl_cpu_backward_kernel(const std::size_t input0Length, REGISTRAR(MulImpl_cpu, {DataType::Float32}, {ProdConso::inPlaceModel, Aidge::MulImpl_cpu_forward_kernel<float, float, float>, Aidge::MulImpl_cpu_backward_kernel<float, float, float>}); +REGISTRAR(MulImpl_cpu, + {{{DataType::Float32}, {DataType::Float64}}, {DataType::Float32}}, + {ProdConso::inPlaceModel, Aidge::MulImpl_cpu_forward_kernel<float, double, float>, Aidge::MulImpl_cpu_backward_kernel<float, double, float>}); REGISTRAR(MulImpl_cpu, {DataType::Float64}, {ProdConso::inPlaceModel, Aidge::MulImpl_cpu_forward_kernel<double, double, double>, Aidge::MulImpl_cpu_backward_kernel<double, double, double>}); diff --git a/include/aidge/backend/cpu/operator/ResizeImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ResizeImpl_kernels.hpp index 6a22ff4ec9d7beaf05be3b479b43dd3ad69bc74b..6449417baf855620669aba11ebca16d9384c4e7c 100644 --- a/include/aidge/backend/cpu/operator/ResizeImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/ResizeImpl_kernels.hpp @@ -99,30 +99,31 @@ void ResizeImpl_cpu_forward_kernel( } return; } + // Kernels registration to implementation entry point REGISTRAR(ResizeImpl_cpu, {{{DataType::Int16}, - {DataType::Float32}, - {DataType::Float32}, - {DataType::UInt64}}, + {DataType::Any}, + {DataType::Any}, + {DataType::Any}}, {DataType::Int16}}, 
{ProdConso::inPlaceModel, ResizeImpl_cpu_forward_kernel<int16_t>, nullptr}); REGISTRAR(ResizeImpl_cpu, {{{DataType::Int32}, - {DataType::Float32}, - {DataType::Float32}, - {DataType::UInt64}}, + {DataType::Any}, + {DataType::Any}, + {DataType::Any}}, {DataType::Int32}}, {ProdConso::inPlaceModel, ResizeImpl_cpu_forward_kernel<int32_t>, nullptr}); REGISTRAR(ResizeImpl_cpu, {{{DataType::Int64}, - {DataType::Float32}, - {DataType::Float32}, - {DataType::Int64}}, + {DataType::Any}, + {DataType::Any}, + {DataType::Any}}, {DataType::UInt64}}, {ProdConso::inPlaceModel, ResizeImpl_cpu_forward_kernel<int64_t>, @@ -130,27 +131,27 @@ REGISTRAR(ResizeImpl_cpu, REGISTRAR(ResizeImpl_cpu, {{{DataType::Float16}, - {DataType::Float32}, - {DataType::Float32}, - {DataType::UInt64}}, + {DataType::Any}, + {DataType::Any}, + {DataType::Any}}, {DataType::Float16}}, {ProdConso::inPlaceModel, ResizeImpl_cpu_forward_kernel<half_float::half>, nullptr}); REGISTRAR(ResizeImpl_cpu, {{{DataType::Float32}, - {DataType::Float32}, - {DataType::Float32}, - {DataType::UInt64}}, + {DataType::Any}, + {DataType::Any}, + {DataType::Any}}, {DataType::Float32}}, {ProdConso::inPlaceModel, ResizeImpl_cpu_forward_kernel<float>, nullptr}); REGISTRAR(ResizeImpl_cpu, {{{DataType::Float64}, - {DataType::Float32}, - {DataType::Float32}, - {DataType::UInt64}}, + {DataType::Any}, + {DataType::Any}, + {DataType::Any}}, {DataType::Float64}}, {ProdConso::inPlaceModel, ResizeImpl_cpu_forward_kernel<double>, diff --git a/include/aidge/backend/cpu/operator/WeightInterleavedImpl.hpp b/include/aidge/backend/cpu/operator/WeightInterleavedImpl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..ff5c4778f530912e8bdf97ffadb2f546789e2c48 --- /dev/null +++ b/include/aidge/backend/cpu/operator/WeightInterleavedImpl.hpp @@ -0,0 +1,37 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_CPU_OPERATOR_WEIGHTINTERLEAVINGIMPL_H_ +#define AIDGE_CPU_OPERATOR_WEIGHTINTERLEAVINGIMPL_H_ + +#include <array> +#include <memory> +#include <vector> + +#include "aidge/backend/cpu/operator/OperatorImpl.hpp" +#include "aidge/operator/WeightInterleaving.hpp" +#include "aidge/utils/Registrar.hpp" +#include "aidge/utils/Types.h" + +namespace Aidge { +// Operator implementation entry point for the backend +using WeightInterleavedImpl_cpu = OperatorImpl_cpu<WeightInterleaving_Op, + void(const DimSize_t, + const DimSize_t, + const DimSize_t, + const void *, + void *)>; + +// Implementation entry point registration to Operator +REGISTRAR(WeightInterleaving_Op, "cpu", Aidge::WeightInterleavedImpl_cpu::create); +} // namespace Aidge + +#endif /* AIDGE_CPU_OPERATOR_WEIGHTINTERLEAVINGIMPL_H_ */ diff --git a/include/aidge/backend/cpu/operator/WeightInterleavedImpl_kernels.hpp b/include/aidge/backend/cpu/operator/WeightInterleavedImpl_kernels.hpp new file mode 100644 index 0000000000000000000000000000000000000000..18557f8fb5fcdd31476904d273d4d2d7f37a66b5 --- /dev/null +++ b/include/aidge/backend/cpu/operator/WeightInterleavedImpl_kernels.hpp @@ -0,0 +1,143 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_CPU_OPERATOR_WEIGHTINTERLEAVEDIMPL_KERNELS_H_ +#define AIDGE_CPU_OPERATOR_WEIGHTINTERLEAVEDIMPL_KERNELS_H_ + +#include <cstddef> // std::size_t +#include <cstdint> // std::int8_t, std::uint8_t + +#include "aidge/backend/cpu/operator/WeightInterleavedImpl.hpp" +#include "aidge/data/DataType.hpp" +#include "aidge/utils/Registrar.hpp" +#include "aidge/utils/ErrorHandling.hpp" + + +namespace Aidge { + + /** + * @brief Compacts 8-bit data into a smaller bit-width representation. + * + * This function takes an array of 8-bit data and compacts it into smaller chunks + * based on the specified bit-width `nb_bits`. Each element in `compactData` will + * store multiple packed `nb_bits` segments extracted from `data`. + * + * @param data The input array of 8-bit values to be compacted. + * @param dataSize The size of the input `data` array. + * @param compactData The output array storing the compacted data. + * @param nb_bits The number of bits to extract from each `data` element (must be between 1 and 4). 
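+ * Example: with nb_bits=4, two values are packed per output byte, first value in the high nibble, so {0x1, 0x2} becomes 0x12; with nb_bits=2, four values fit per byte.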
+ */ + template <typename T> + void compact_data(const T* data, std::size_t dataSize, T* compactData, std::uint8_t nb_bits) { + AIDGE_ASSERT(nb_bits > 0 && nb_bits < 5, "Cannot compact with the given nb_bits"); // Ensure valid bit width + + // Mask to extract `nb_bits` from each data element + const unsigned int mask = (1U << nb_bits) - 1; + + // Calculate the number of `nb_bits` segments that fit into an 8-bit compacted value + const unsigned int nbSlot = 8 / nb_bits; + + // Case nb_bits=3 or 4, then shift is 4 + // Case nb_bits=2, then shift is 2 + // Case nb_bits=1, then shift is 1 + std::uint8_t shift = 8 / nbSlot; + + const unsigned int nbFullCompactbytes = dataSize / nbSlot; + + // Main loop to process data in groups of `nbSlot` + for (std::size_t i = 0; i < nbFullCompactbytes; ++i) { + T compact = 0; + + for (unsigned int j = 0; j < nbSlot; ++j) { + compact |= (data[i * nbSlot + j] & mask); // Apply mask to keep `nb_bits` only + + // Shift only if not on the last slot to make room for the next `nb_bits` + if (j < nbSlot - 1) { + compact <<= shift; + } + } + // Store the compacted value in the output array + compactData[i] = compact; + } + + + // Handle any remaining data elements (if dataSize is not a multiple of nbSlot). + std::size_t remaining = dataSize % nbSlot; + if (remaining != 0) { + T compact = 0; + for (std::size_t j = 0; j < remaining; ++j) { + compact |= (data[nbFullCompactbytes*nbSlot + j] & mask); + + if (j < remaining - 1) { + compact <<= shift; + } + } + compact <<= (shift*(nbSlot - remaining)); + // Store the last compacted value + compactData[dataSize / nbSlot] = compact; + } + } + +template <class I, class O, int nb_bits> +void WeightInterleavedImpl_cpu_forward_kernel(const DimSize_t input_interleaving, + const DimSize_t nb_interleaving, + const DimSize_t output_interleaving, + const void* input_, + void* output_) { + const I* input = static_cast<const I*>(input_); + O* output = static_cast<O*>(output_); + + // Compact each of the nb_interleaving segments independently with compact_data() + for (std::size_t i=0; i<nb_interleaving; ++i){ + compact_data(input+(i*input_interleaving), input_interleaving, output+(i*output_interleaving), static_cast<std::uint8_t>(nb_bits)); + } + +} + + +REGISTRAR(WeightInterleavedImpl_cpu, + {ImplSpec::IOSpec{DataType::Int4, DataFormat::NHWC}, ImplSpec::IOSpec{WeightInterleavedType_v<DataType::Int4>, DataFormat::NHWC}}, + {ProdConso::defaultModel, Aidge::WeightInterleavedImpl_cpu_forward_kernel<int8_t, int8_t, 4>, nullptr}); +REGISTRAR(WeightInterleavedImpl_cpu, + {ImplSpec::IOSpec{DataType::Int3, DataFormat::NHWC}, ImplSpec::IOSpec{WeightInterleavedType_v<DataType::Int3>, DataFormat::NHWC}}, + {ProdConso::defaultModel, Aidge::WeightInterleavedImpl_cpu_forward_kernel<int8_t, int8_t, 3>, nullptr}); +REGISTRAR(WeightInterleavedImpl_cpu, + {ImplSpec::IOSpec{DataType::Int2, DataFormat::NHWC}, ImplSpec::IOSpec{WeightInterleavedType_v<DataType::Int2>, DataFormat::NHWC}}, + {ProdConso::defaultModel, Aidge::WeightInterleavedImpl_cpu_forward_kernel<int8_t, int8_t, 2>, nullptr}); +REGISTRAR(WeightInterleavedImpl_cpu, + {ImplSpec::IOSpec{DataType::Binary, DataFormat::NHWC}, ImplSpec::IOSpec{WeightInterleavedType_v<DataType::Binary>, DataFormat::NHWC}}, + {ProdConso::defaultModel, Aidge::WeightInterleavedImpl_cpu_forward_kernel<int8_t, int8_t, 1>, nullptr}); + +REGISTRAR(WeightInterleavedImpl_cpu, + {ImplSpec::IOSpec{DataType::UInt4, DataFormat::NHWC}, 
ImplSpec::IOSpec{WeightInterleavedType_v<DataType::UInt4>, DataFormat::NHWC}}, + {ProdConso::defaultModel, Aidge::WeightInterleavedImpl_cpu_forward_kernel<uint8_t, uint8_t, 4>, nullptr}); +REGISTRAR(WeightInterleavedImpl_cpu, + {ImplSpec::IOSpec{DataType::UInt3, DataFormat::NHWC}, ImplSpec::IOSpec{WeightInterleavedType_v<DataType::UInt3>, DataFormat::NHWC}}, + {ProdConso::defaultModel, Aidge::WeightInterleavedImpl_cpu_forward_kernel<uint8_t, uint8_t, 3>, nullptr}); +REGISTRAR(WeightInterleavedImpl_cpu, + {ImplSpec::IOSpec{DataType::UInt2, DataFormat::NHWC}, ImplSpec::IOSpec{WeightInterleavedType_v<DataType::UInt2>, DataFormat::NHWC}}, + {ProdConso::defaultModel, Aidge::WeightInterleavedImpl_cpu_forward_kernel<uint8_t, uint8_t, 2>, nullptr}); + + +// REGISTRAR(WeightInterleavedImpl_cpu, +// {ImplSpec::IOSpec{DataType::Int4, DataFormat::NHWC}}, +// {ProdConso::defaultModel, Aidge::WeightInterleavedImpl_cpu_forward_kernel<int8_t, int8_t, 4>, nullptr}); +// REGISTRAR(WeightInterleavedImpl_cpu, +// {ImplSpec::IOSpec{DataType::Int3, DataFormat::NHWC}}, +// {ProdConso::defaultModel, Aidge::WeightInterleavedImpl_cpu_forward_kernel<int8_t, int8_t, 3>, nullptr}); +// REGISTRAR(WeightInterleavedImpl_cpu, +// {ImplSpec::IOSpec{DataType::Int2, DataFormat::NHWC}}, +// {ProdConso::defaultModel, Aidge::WeightInterleavedImpl_cpu_forward_kernel<int8_t, int8_t, 2>, nullptr}); + + +} + +#endif /* AIDGE_CPU_OPERATOR_WEIGHTINTERLEAVEDIMPL_KERNELS_H_ */ \ No newline at end of file diff --git a/include/aidge/backend/version.h.in b/include/aidge/backend/version.h.in new file mode 100644 index 0000000000000000000000000000000000000000..4b876f63002972c1f8f1340b70cdecdace911012 --- /dev/null +++ b/include/aidge/backend/version.h.in @@ -0,0 +1,11 @@ +#ifndef VERSION_H +#define VERSION_H + +namespace Aidge { +static constexpr const int PROJECT_VERSION_MAJOR = @PROJECT_VERSION_MAJOR@; +static constexpr const int PROJECT_VERSION_MINOR = @PROJECT_VERSION_MINOR@; +static constexpr const int PROJECT_VERSION_PATCH = @PROJECT_VERSION_PATCH@; +static constexpr const char * PROJECT_VERSION = "@PROJECT_VERSION_MAJOR@.@PROJECT_VERSION_MINOR@.@PROJECT_VERSION_PATCH@"; +static constexpr const char * PROJECT_GIT_HASH = "@GIT_COMMIT_HASH@"; +} +#endif // VERSION_H diff --git a/include/aidge/utils/sys_info/CpuVersionInfo.hpp b/include/aidge/utils/sys_info/CpuVersionInfo.hpp index 887ce839e079349d9d64505f7184831ffc4cf1c2..3df70d139bd4ad26e0c88d2ee5192e508bc3f71a 100644 --- a/include/aidge/utils/sys_info/CpuVersionInfo.hpp +++ b/include/aidge/utils/sys_info/CpuVersionInfo.hpp @@ -2,17 +2,20 @@ #define AIDGE_UTILS_SYS_INFO_CPU_VERSION_INFO_H #include "aidge/utils/Log.hpp" +#include "aidge/backend/cpu_version.h" namespace Aidge { -#ifndef PROJECT_VERSION // Normally defined in CMakeLists.txt -#define PROJECT_VERSION "Unknown version" -#endif -#ifndef GIT_COMMIT_HASH -#define GIT_COMMIT_HASH "" -#endif -void showCpuVersion() { - Log::info("Aidge backend CPU: {} ({}), {} {}", PROJECT_VERSION, GIT_COMMIT_HASH, __DATE__, __TIME__); +constexpr inline const char * getBackendCPUProjectVersion(){ + return PROJECT_VERSION; +} + +constexpr inline const char * getBackendCPUGitHash(){ + return PROJECT_GIT_HASH; +} + +void showBackendCpuVersion() { + Log::info("Aidge backend CPU: {} ({}), {} {}", getBackendCPUProjectVersion(), getBackendCPUGitHash(), __DATE__, __TIME__); // Compiler version #if defined(__clang__) /* Clang/LLVM. 
---------------------------------------------- */ diff --git a/project_name.txt b/project_name.txt new file mode 100644 index 0000000000000000000000000000000000000000..25caafdd8ab794dbcb8c8cdef5097e2143accc6a --- /dev/null +++ b/project_name.txt @@ -0,0 +1 @@ +aidge_backend_cpu diff --git a/pyproject.toml b/pyproject.toml index 3c08302d0fcd4e77943d165fab802a22f4dc39cc..baa61de57375aa97f803efb19bf59973d39ef36f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,17 +7,25 @@ dependencies = [ requires-python = ">= 3.7" readme = "README.md" license = { file = "LICENSE" } -classifiers = [ +classifiers = [ "Development Status :: 2 - Pre-Alpha", "Programming Language :: Python :: 3" ] -dynamic = ["version"] # defined in tool.setuptools_scm +dynamic = ["version"] # defined by pbr + + +[project.urls] +Homepage = "https://www.deepgreen.ai/en/platform" +Documentation = "https://eclipse-aidge.readthedocs.io/en/latest/" +Repository = "https://gitlab.eclipse.org/eclipse/aidge/aidge_backend_cpu" +Issues = "https://gitlab.eclipse.org/eclipse/aidge/aidge_backend_cpu/-/issues" +Changelog = "https://gitlab.eclipse.org/eclipse/aidge/aidge_backend_cpu/-/releases" [build-system] requires = [ "setuptools>=64", - "setuptools_scm[toml]==7.1.0", - "cmake>=3.18.4.post1" + "cmake>=3.18.4.post1", + "pbr" ] build-backend = "setuptools.build_meta" @@ -29,9 +37,6 @@ where = ["."] # list of folders that contain the packages (["."] by default) include = ["aidge_backend_cpu*"] # package names should match these glob patterns (["*"] by default) exclude = ["aidge_backend_cpu.unit_tests*"] # exclude packages matching these glob patterns (empty by default) namespaces = false # to disable scanning PEP 420 namespaces (true by default) -# SETUPTOOLS_SCM -[tool.setuptools_scm] -write_to = "aidge_backend_cpu/_version.py" ##################################################### # CIBUILDWHEEL diff --git a/python_binding/pybind_cpu.cpp b/python_binding/pybind_cpu.cpp index d5022e1d469ae4171e796baed6c1aa061dd95765..e576de0849ec6e0739b7cad32e8e8b003c092b7b 100644 --- a/python_binding/pybind_cpu.cpp +++ b/python_binding/pybind_cpu.cpp @@ -6,10 +6,10 @@ namespace py = pybind11; namespace Aidge { -void init_cpu_sys_info(py::module& m); +void init_CpuVersionInfo(py::module& m); void init_Aidge(py::module& m){ - init_cpu_sys_info(m); + init_CpuVersionInfo(m); } diff --git a/python_binding/utils/sys_info/pybind_CpuVersionInfo.cpp b/python_binding/utils/sys_info/pybind_CpuVersionInfo.cpp index 573bee3659c65f90935e03c06eff5a2998bb9f5b..7461dd955e8b3e64b086a51664b3d73b6bfaed93 100644 --- a/python_binding/utils/sys_info/pybind_CpuVersionInfo.cpp +++ b/python_binding/utils/sys_info/pybind_CpuVersionInfo.cpp @@ -3,7 +3,9 @@ namespace py = pybind11; namespace Aidge { -void init_cpu_sys_info(py::module& m){ - m.def("show_cpu_version", &showCpuVersion); +void init_CpuVersionInfo(py::module& m){ + m.def("show_version", &showBackendCpuVersion); + m.def("get_project_version", &getBackendCPUProjectVersion); + m.def("get_git_hash", &getBackendCPUGitHash); } } diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000000000000000000000000000000000000..aa0f227f6688468a5ab93384f7b1670086000035 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,3 @@ +# pbr file +[metadata] +version = file: version.txt diff --git a/setup.py b/setup.py index 22cbd9732c8b9e1099c3e322032e8377f6d4506b..82edd09c90413f4c81ab3356d57e0380d4eab5ab 100644 --- a/setup.py +++ b/setup.py @@ -11,8 +11,10 @@ from math import ceil from setuptools import setup, Extension from 
setuptools.command.build_ext import build_ext +def get_project_name() -> str: + return open(pathlib.Path().absolute() / "project_name.txt", "r").read().strip() -PROJECT_NAME = "aidge_backend_cpu" +PROJECT_NAME = get_project_name() SETUP_DIR = pathlib.Path(__file__).parent @@ -66,7 +68,7 @@ class AidgePkgBuild(build_ext): else [] ) test_onoff = os.environ.get("AIDGE_BUILD_TEST", "OFF") - + self.spawn( [ "cmake", diff --git a/src/operator/AvgPoolingImpl.cpp b/src/operator/AvgPoolingImpl.cpp index 01a5e8cf1772161f5cf98d3a8bd52f43ac7a1d0d..eb5ef87bd16620cdef33330bd7b39ece1783ecfc 100644 --- a/src/operator/AvgPoolingImpl.cpp +++ b/src/operator/AvgPoolingImpl.cpp @@ -32,7 +32,9 @@ void Aidge::AvgPoolingImpl2D_cpu::forward() { // Call kernel impl.forward(op_.strideDims(), op_.kernelDims(), + op_.dilations(), op_.getInput(0)->template dims<4>(), + op_.ceilMode(), getCPUPtr(op_.getInput(0)), getCPUPtr(op_.getOutput(0))); } diff --git a/src/operator/EqualImpl.cpp b/src/operator/EqualImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5926212e8453a32e54de7691343d44f9c6849a05 --- /dev/null +++ b/src/operator/EqualImpl.cpp @@ -0,0 +1,61 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <cassert> +#include <chrono> // std::chrono::milliseconds +#include <numeric> // std::accumulate +#include <thread> // std::this_thread::sleep_for +#include <vector> + +#include "aidge/operator/Equal.hpp" +#include "aidge/utils/Types.h" +#include "aidge/backend/cpu/data/Broadcasting.hpp" +#include "aidge/backend/cpu/data/GetCPUPtr.h" + +#include "aidge/backend/cpu/operator/EqualImpl.hpp" +#include "aidge/backend/cpu/operator/EqualImpl_kernels.hpp" + +template <> +void Aidge::EqualImpl_cpu::forward() { + const Equal_Op& op = static_cast<const Equal_Op&>(mOp); + // Check inputs + AIDGE_ASSERT(op.getInput(0), "missing input in Equal operator"); + AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run Equal forward because the 0-th input has no implementation."); + + AIDGE_ASSERT(op.getInput(1), "missing input in Equal operator"); + AIDGE_ASSERT(op.getInput(1)->hasImpl(), "cannot run Equal forward because the 1st input has no implementation."); + + AIDGE_ASSERT(op.getInput(1)->dataType() == op.getInput(0)->dataType(), "Cannot Equal inputs with two different data types."); + + // Find the correct kernel type + const auto impl = Registrar<EqualImpl_cpu>::create(getBestMatch(getRequiredSpec())); + + // Convert input data (no overhead if not needed!) + // TODO: right now, if needed, memory will be allocated/deallocated at each + // call to forward(). We might put the following shared_ptr as members of + // this class to avoid that. 
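+ // (refCastFrom() only materializes a copy in the fallback tensor when a cast is actually needed, hence the "no overhead" note above.)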
+ std::shared_ptr<Tensor> input0Fallback, input1Fallback, input2Fallback; + const auto& input0 = op.getInput(0)->refCastFrom(input0Fallback, *op.getInput(0)); + const auto& input1 = op.getInput(1)->refCastFrom(input1Fallback, *op.getInput(1)); + + + impl.forward(op.getInput(0)->dims(), + op.getInput(1)->dims(), + op.getOutput(0)->dims(), + input0.getImpl()->rawPtr(), + input1.getImpl()->rawPtr(), + getCPUPtr(op.getRawOutput(0))); +} + +template <> +void Aidge::EqualImpl_cpu::backward() { + AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Equal_Op on backend cpu"); +} diff --git a/src/operator/MaxPoolingImpl.cpp b/src/operator/MaxPoolingImpl.cpp index 90075a397be3f082ef95fd4df074c99d926fd385..13ef75b0eb92cea700e5b487a06046fc9285d5ad 100644 --- a/src/operator/MaxPoolingImpl.cpp +++ b/src/operator/MaxPoolingImpl.cpp @@ -30,6 +30,7 @@ void Aidge::MaxPoolingImpl2D_cpu::forward() { // Call kernel impl.forward(op_.strideDims(), op_.kernelDims(), + op_.dilations(), op_.ceilMode(), op_.getInput(0)->template dims<4>(), getCPUPtr(mOp.getRawInput(0)), diff --git a/src/operator/MulImpl.cpp b/src/operator/MulImpl.cpp index 422bdd005f058fc9200cf5f7962bfc8d5877e6e1..a90d521a759f1ce6f4883bdd0bc05d84daa0f668 100644 --- a/src/operator/MulImpl.cpp +++ b/src/operator/MulImpl.cpp @@ -58,6 +58,7 @@ void Aidge::MulImpl_cpu::backward() { /* grad0Length */ out0grad->size(), /* input0Dims */ in0->dims(), /* input1Dims */ in1->dims(), + out0grad->dims(), getCPUPtr(in0), getCPUPtr(in1), getCPUPtr(out0grad), diff --git a/src/operator/WeightInterleavedImpl.cpp b/src/operator/WeightInterleavedImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2c9f3a6e8df35616a4f7ffae86cbeacd841f44bf --- /dev/null +++ b/src/operator/WeightInterleavedImpl.cpp @@ -0,0 +1,75 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include "aidge/backend/cpu/operator/WeightInterleavedImpl.hpp" + +#include <cstddef> // std::size_t +#include <functional> +#include <memory> +#include <tuple> + +#include "aidge/backend/cpu/data/GetCPUPtr.h" +#include "aidge/backend/cpu/operator/WeightInterleavedImpl_kernels.hpp" +#include "aidge/operator/WeightInterleaving.hpp" +#include "aidge/utils/ErrorHandling.hpp" +#include "aidge/utils/Types.h" + + +template <> +void Aidge::WeightInterleavedImpl_cpu::forward() +{ + const WeightInterleaving_Op& op_ = dynamic_cast<const WeightInterleaving_Op&>(mOp); + AIDGE_ASSERT(op_.getInput(0), "missing input #0"); + + const auto impl = Registrar<WeightInterleavedImpl_cpu>::create(getBestMatch(getRequiredSpec())); + + // Convert input data (no overhead if not needed!) + // TODO: right now, if needed, memory will be allocated/deallocated at each + // call to forward(). We might put the following shared_ptr as members of + // this class to avoid that. 
+ std::shared_ptr<Tensor> input0Fallback; + const auto& input0 = op_.getInput(0)->refCastFrom(input0Fallback, *(op_.getOutput(0))); + + // inputInterleaving is the number of consecutive input elements that will be compacted + // Here the interleaving is the last dimension (cf STM32 low bit kernels) + std::size_t inputInterleaving = input0.dims().back(); + + // The resulting compacted dimension was computed in forwardDims and the output tensor was resized + std::size_t outputInterleaving = op_.getOutput(0)->dims().back(); + + // nb_interleaving is the number of compacted segments + std::size_t nbInterleaving; + + // Determine the number of segment to compact + if (input0.dims().size() > 1){ + nbInterleaving = std::accumulate( + input0.dims().cbegin(), + std::prev(input0.dims().cend()), // Exclude the last element + std::size_t(1), + std::multiplies<std::size_t>()); + } else { + // Case when the weight tensor is only one dimension + nbInterleaving = 1; + } + + impl.forward(inputInterleaving, + nbInterleaving, + outputInterleaving, + input0.getImpl()->rawPtr(), + getCPUPtr(mOp.getRawOutput(0))); + + +} + +template <> +void Aidge::WeightInterleavedImpl_cpu::backward() { + AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for WeightInterleaving_Op on backend cpu"); +} \ No newline at end of file diff --git a/unit_tests/CMakeLists.txt b/unit_tests/CMakeLists.txt index 5984524fdc8c596641e505897d16e12de78024cc..7e63fb2be1a8aa9e7955b768c736302e9f9e3920 100644 --- a/unit_tests/CMakeLists.txt +++ b/unit_tests/CMakeLists.txt @@ -1,12 +1,19 @@ -Include(FetchContent) +find_package(Catch2 QUIET) -FetchContent_Declare( - Catch2 - GIT_REPOSITORY https://github.com/catchorg/Catch2.git - GIT_TAG v3.7.1 # or a later release -) +if(NOT Catch2_FOUND) + message(STATUS "Catch2 not found in system, retrieving from git") + Include(FetchContent) -FetchContent_MakeAvailable(Catch2) + FetchContent_Declare( + Catch2 + GIT_REPOSITORY https://github.com/catchorg/Catch2.git + GIT_TAG devel # or a later release + ) + + FetchContent_MakeAvailable(Catch2) +else() + message(STATUS "Found system Catch2 version ${Catch2_VERSION}") +endif() file(GLOB_RECURSE src_files "*.cpp") diff --git a/unit_tests/data/Test_TensorImpl.cpp b/unit_tests/data/Test_TensorImpl.cpp index fd938f10a947d1520600a1d00022eeb970cd76e6..2bc1e7d4c6f8a7cfbae8807e3021f9c5dd89fff6 100644 --- a/unit_tests/data/Test_TensorImpl.cpp +++ b/unit_tests/data/Test_TensorImpl.cpp @@ -9,19 +9,23 @@ * ********************************************************************************/ -#include <catch2/catch_test_macros.hpp> -#include <cstddef> // std::size_t -#include <cstdint> // std::uint16_t -#include <chrono> -#include <iostream> +#include <chrono> // std::micro, std::chrono::time_point, + // std::chrono::system_clock +#include <cstddef> // std::size_t +#include <cstdint> // std::int32_t, std::uint16_t #include <memory> -#include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <random> // std::random_device, std::mt19937 + // std::uniform_int_distribution, std::uniform_real_distribution +#include <vector> + +#include <catch2/catch_test_macros.hpp> +#include <fmt/core.h> -#include "aidge/data/Tensor.hpp" #include "aidge/backend/cpu/data/TensorImpl.hpp" -#include "aidge/operator/Add.hpp" #include "aidge/backend/cpu/operator/AddImpl.hpp" +#include "aidge/data/Data.hpp" +#include "aidge/operator/Add.hpp" +#include "aidge/utils/ArrayHelpers.hpp" namespace Aidge { @@ -35,8 +39,7 
@@ TEST_CASE("Test addition of Tensors","[TensorImpl][Add][Data]") { std::uniform_int_distribution<int> boolDist(0,1); // Create MatMul Operator - std::shared_ptr<Node> mySub = Add(); - auto op = std::static_pointer_cast<OperatorTensor>(mySub-> getOperator()); + std::shared_ptr<Add_Op> op = std::make_shared<Add_Op>(); op->setDataType(DataType::Float32); op->setBackend("cpu"); diff --git a/unit_tests/operator/Test_AddImpl.cpp b/unit_tests/operator/Test_AddImpl.cpp index bca4025705cb1c851dcf3e9accbf016c4535120a..bff9629be152163b2aa92bdc9d0c3029d7987b9b 100644 --- a/unit_tests/operator/Test_AddImpl.cpp +++ b/unit_tests/operator/Test_AddImpl.cpp @@ -9,12 +9,16 @@ * ********************************************************************************/ +#include <memory> + #include <catch2/catch_test_macros.hpp> +#include "aidge/backend/cpu/operator/AddImpl.hpp" +#include "aidge/data/Data.hpp" #include "aidge/data/Tensor.hpp" +#include "aidge/graph/Node.hpp" #include "aidge/operator/Add.hpp" - -#include "aidge/backend/cpu.hpp" +#include "aidge/utils/ArrayHelpers.hpp" using namespace Aidge; @@ -96,7 +100,7 @@ TEST_CASE("[cpu/operator] Add(forward)", "[Add][CPU]") { }); // std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<int,2> {{100,200}}); - std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,3,3,3,2> { + Tensor expectedOutput = Array4D<int,3,3,3,2> { { // { // {{ 120, 222},{ 124, 226},{ 128, 230}}, // @@ -114,7 +118,7 @@ TEST_CASE("[cpu/operator] Add(forward)", "[Add][CPU]") { {{ 144, 246},{ 148, 250},{152, 254}} // } // } // - }); // + }; // std::shared_ptr<Node> myAdd_0 = Add(); std::shared_ptr<Node> myAdd_1 = Add(); @@ -131,8 +135,8 @@ TEST_CASE("[cpu/operator] Add(forward)", "[Add][CPU]") { op_1->setBackend("cpu"); myAdd_0->forward(); myAdd_1->forward(); - op_1->getOutput(0)->print(); - expectedOutput->print(); - REQUIRE(*op_1->getOutput(0) == *expectedOutput); + Log::info("Add_1 Tensor:\n{}", *(op_1->getOutput(0))); + Log::info("Expected Add_1 Tensor:\n{}", expectedOutput); + REQUIRE(*op_1->getOutput(0) == expectedOutput); } } \ No newline at end of file diff --git a/unit_tests/operator/Test_AndImpl.cpp b/unit_tests/operator/Test_AndImpl.cpp index 053bb3ea4ed913bd388f3ae049c4d6402ad58d59..148298d5f44b9f7744ab18bcfc5ce675f77a784c 100644 --- a/unit_tests/operator/Test_AndImpl.cpp +++ b/unit_tests/operator/Test_AndImpl.cpp @@ -9,86 +9,109 @@ * ********************************************************************************/ +#include <cstddef> // std::size_t +#include <cstdint> // std::uint16_t +#include <memory> +#include <random> // std::random_device, std::mt19937, std::uniform_int_distribution, std::uniform_real_distribution + #include <catch2/catch_test_macros.hpp> -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include "aidge/backend/cpu/operator/AndImpl.hpp" +#include "aidge/data/Data.hpp" #include "aidge/data/Tensor.hpp" +#include "aidge/graph/Node.hpp" #include "aidge/operator/And.hpp" - -#include "aidge/backend/cpu.hpp" +#include "aidge/utils/ArrayHelpers.hpp" using namespace Aidge; TEST_CASE("[cpu/operator] And(forward)", "[And][CPU]") { - SECTION("ForwardDims") - { + SECTION("ForwardDims") { constexpr std::uint16_t NBTRIALS = 10; // Create a random number generator std::random_device rd; std::mt19937 gen(rd()); - std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // Random float distribution between 0 and 1 - std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), 
std::size_t(10)); - std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5)); - std::uniform_int_distribution<int> boolDist(0,1); + std::uniform_int_distribution<int> boolDist(0, 1); // Use 0 for false, 1 for true + std::uniform_int_distribution<std::size_t> dimSizeDist(2, 10); + std::uniform_int_distribution<std::size_t> nbDimsDist(1, 5); SECTION("Same dimensions") { for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { DimSize_t nbDims = nbDimsDist(gen); std::vector<DimSize_t> dims(nbDims); - for (std::size_t i = 0; i < nbDims; i++) { + for (std::size_t i = 0; i < nbDims; ++i) { dims[i] = dimSizeDist(gen); } - + const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + float* array0 = new float[nb_elements]; + float* array1 = new float[nb_elements]; + for (std::size_t i = 0; i < nb_elements; ++i) { + array0[i] = boolDist(gen); + array1[i] = boolDist(gen); + } std::shared_ptr<Tensor> myInput1 = std::make_shared<Tensor>(dims); - myInput1->setBackend("cpu"); - myInput1->setDataType(DataType::Float32); - myInput1->zeros(); std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(dims); - myInput2->setBackend("cpu"); + myInput1->setDataType(DataType::Float32); myInput2->setDataType(DataType::Float32); - myInput2->zeros(); + myInput1->setBackend("cpu"); + myInput2->setBackend("cpu"); + + myInput1->getImpl()->setRawPtr(array0, nb_elements); + myInput2->getImpl()->setRawPtr(array1, nb_elements); + + std::shared_ptr<Node> myAnd = And(); - auto op = std::static_pointer_cast<OperatorTensor>(myAnd -> getOperator()); - op->associateInput(0,myInput1); - op->associateInput(1,myInput2); + auto op = std::static_pointer_cast<OperatorTensor>(myAnd->getOperator()); + op->associateInput(0, myInput1); + op->associateInput(1, myInput2); op->setDataType(DataType::Float32); op->setBackend("cpu"); op->forwardDims(); const auto outputDims = op->getOutput(0)->dims(); REQUIRE(outputDims == dims); + delete[] array0; + delete[] array1; } } + SECTION("Broadcasting") { for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { DimSize_t nbDims = nbDimsDist(gen); std::vector<DimSize_t> dims1(nbDims, 1); std::vector<DimSize_t> dims2(nbDims, 1); std::vector<DimSize_t> expectedOutDims; - for (std::size_t i = 0; i < nbDims; i++) { + for (std::size_t i = 0; i < nbDims; ++i) { DimSize_t dim = dimSizeDist(gen); - if (boolDist(gen)) { - dims1[i] = dim; - } - if (boolDist(gen)) { - dims2[i] = dim; - } - expectedOutDims.push_back(std::max(dims1[i],dims2[i])); + if (boolDist(gen)) dims1[i] = dim; + if (boolDist(gen)) dims2[i] = dim; + expectedOutDims.push_back(std::max(dims1[i], dims2[i])); } + const std::size_t nb_elements0 = std::accumulate(dims1.cbegin(), dims1.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t nb_elements1 = std::accumulate(dims2.cbegin(), dims2.cend(), std::size_t(1), std::multiplies<std::size_t>()); + float* array0 = new float[nb_elements0]; + float* array1 = new float[nb_elements1]; + for (std::size_t i = 0; i < nb_elements0; ++i) { + array0[i] = boolDist(gen); + } + for (std::size_t i = 0; i < nb_elements1; ++i) { + array1[i] = boolDist(gen); + } std::shared_ptr<Tensor> myInput1 = std::make_shared<Tensor>(dims1); - myInput1->setBackend("cpu"); - myInput1->setDataType(DataType::Float32); - myInput1->zeros(); std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(dims2); - myInput2->setBackend("cpu"); + myInput1->setDataType(DataType::Float32);
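// Illustrative aside (assumed shapes, not part of the test): the expectedOutDims
// computed above follow the usual broadcasting rule: per axis, the output size is
// the max of the two input sizes, size-1 axes being stretched. For instance,
// dims1 = {1, 3, 1} and dims2 = {2, 1, 4} give
// expectedOutDims = {max(1,2), max(3,1), max(1,4)} = {2, 3, 4}.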
myInput2->setDataType(DataType::Float32); - myInput2->zeros(); + myInput1->setBackend("cpu"); + myInput2->setBackend("cpu"); + myInput1->getImpl()->setRawPtr(array0, nb_elements0); + myInput2->getImpl()->setRawPtr(array1, nb_elements1); + + std::shared_ptr<Node> myAnd = And(); - auto op = std::static_pointer_cast<OperatorTensor>(myAnd -> getOperator()); - op->associateInput(0,myInput1); - op->associateInput(1,myInput2); + auto op = std::static_pointer_cast<OperatorTensor>(myAnd->getOperator()); + op->associateInput(0, myInput1); + op->associateInput(1, myInput2); op->setDataType(DataType::Float32); op->setBackend("cpu"); @@ -96,110 +119,67 @@ TEST_CASE("[cpu/operator] And(forward)", "[And][CPU]") { const auto outputDims = op->getOutput(0)->dims(); REQUIRE(outputDims == expectedOutDims); + delete[] array0; + delete[] array1; } } } + SECTION("Same size inputs") { - std::shared_ptr<Tensor> input1 = std::make_shared<Tensor>(Array4D<int,3,3,3,2> { - { // - { // - {{20, 15},{31, 11},{22, 49}}, // - {{41, 10},{24, 51},{27, 52}}, // - {{26, 53},{27, 54},{28, 55}} // - }, // - { // - {{29, 56},{30, 57},{31, 58}}, // - {{32, 59},{33, 60},{34, 61}}, // - {{35, 62},{36, 63},{37, 64}} // - }, // - { // - {{38, 65},{39, 66},{40, 67}}, // - {{41, 68},{42, 69},{43, 70}}, // - {{44, 71},{45, 72},{46, 73}} // - } // - } // - }); // - std::shared_ptr<Tensor> input2 = std::make_shared<Tensor>(Array4D<int,3,3,3,2> { - { // - { // - {{20, 47},{21, 48},{22, 49}}, // - {{23, 50},{24, 51},{25, 52}}, // - {{17, 53},{27, 26},{14, 33}} // - }, // - { // - {{29, 56},{30, 57},{31, 58}}, // - {{72, 44},{33, 20},{27, 55}}, // - {{35, 24},{25, 63},{28, 64}} // - }, // - { // - {{32, 65},{39, 66},{40, 70}}, // - {{41, 53},{42, 60},{34, 70}}, // - {{44, 71},{30, 12},{46, 73}} // - } // - } // - }); // - std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,3,3,3,2> { + std::shared_ptr<Tensor> input1 = std::make_shared<Tensor>(Array4D<float, 2, 2, 2, 2>{ { - { - {{1, 0},{0, 0},{1, 1}}, - {{0, 0},{1, 1},{0, 1}}, - {{0, 1},{1, 0},{0, 0}} - }, - { - {{1, 1},{1, 1},{1, 1}}, - {{0, 0},{1, 0},{0, 0}}, - {{1, 0},{0, 1},{0, 1}} - }, - { - {{0, 1},{1, 1},{1, 0}}, - {{1, 0},{1, 0},{0, 1}}, - {{1, 1},{0, 0},{1, 1}} - } - } - }); + {{{1, 0}, {0, 1}}, + {{1, 1}, {0, 0}}}, + {{{0, 1}, {1, 0}}, + {{1, 0}, {0, 1}}}} + }); + std::shared_ptr<Tensor> input2 = std::make_shared<Tensor>(Array4D<float, 2, 2, 2, 2>{ + { + {{{1, 1}, {0, 0}}, + {{0, 1}, {1, 1}}}, + {{{1, 1}, {0, 0}}, + {{0, 1}, {1, 0}}}} + }); + std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<float, 2, 2, 2, 2>{ + { + {{{1, 0}, {0, 0}}, + {{0, 1}, {0, 0}}}, + {{{0, 1}, {0, 0}}, + {{0, 0}, {0, 0}}}} + }); std::shared_ptr<Node> myAnd = And(); - auto op = std::static_pointer_cast<OperatorTensor>(myAnd -> getOperator()); + auto op = std::static_pointer_cast<OperatorTensor>(myAnd->getOperator()); op->associateInput(0, input1); op->associateInput(1, input2); op->setBackend("cpu"); - op->setDataType(DataType::Int32); + op->setDataType(DataType::Float32); myAnd->forward(); - + op->getOutput(0)->print(); REQUIRE(*(op->getOutput(0)) == *expectedOutput); } SECTION("Broadcasting") { - std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array4D<int,1,3,3,2> { - { // - { // - {{10, 20},{22, 23},{20, 20}}, // - {{10, 15},{10, 29},{20, 20}}, // - {{26, 25},{33, 20},{10, 20}} // - } // - } // - }); // - - std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<int,2> {{10, 20}}); - std::shared_ptr<Tensor> expectedOutput = 
std::make_shared<Tensor>(Array4D<int,1,3,3,2> { - { // - { // - {{ 1, 1},{ 0, 0},{ 0, 1}}, // - {{ 1, 0},{ 1, 0},{ 0, 1}}, // - {{ 0, 0},{ 0, 1},{ 1, 1}} // - } // - } // - }); // + std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array4D<float, 1, 2, 2, 2>{ + { + {{{1, 0}, {1, 0}}, + {{1, 1}, {0, 0}}}} + }); + std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<float, 2>{{1, 0}}); + std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<float, 1, 2, 2, 2>{ + { + {{{1, 0}, {1, 0}}, + {{1, 0}, {0, 0}}}} + }); std::shared_ptr<Node> myAnd = And(); - auto op = std::static_pointer_cast<OperatorTensor>(myAnd -> getOperator()); + auto op = std::static_pointer_cast<OperatorTensor>(myAnd->getOperator()); op->associateInput(0, input_1); op->associateInput(1, input_2); - op->setDataType(DataType::Int32); + op->setDataType(DataType::Float32); op->setBackend("cpu"); myAnd->forward(); - op->getOutput(0)->print(); - expectedOutput->print(); - REQUIRE(*op->getOutput(0) == *expectedOutput); + + REQUIRE(*(op->getOutput(0)) == *expectedOutput); } -} \ No newline at end of file +} diff --git a/unit_tests/operator/Test_ArgMaxImpl.cpp b/unit_tests/operator/Test_ArgMaxImpl.cpp index 9915d90423e976db1bdd2a694a2cfd7beb380cee..894697f65a6f73af27a568b994c1dd2dc6b118f3 100644 --- a/unit_tests/operator/Test_ArgMaxImpl.cpp +++ b/unit_tests/operator/Test_ArgMaxImpl.cpp @@ -9,17 +9,20 @@ * ********************************************************************************/ -#include <catch2/catch_test_macros.hpp> +#include <cstddef> // std::size_t +#include <cstdint> // std::uint16_t #include <memory> -#include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <random> // std::random_device, std::mt19937, std::uniform_int_distribution, std::uniform_real_distribution + +#include <catch2/catch_test_macros.hpp> +#include <fmt/core.h> +#include "aidge/backend/cpu/operator/ArgMaxImpl.hpp" +#include "aidge/data/Data.hpp" #include "aidge/data/Tensor.hpp" +#include "aidge/graph/Node.hpp" #include "aidge/operator/ArgMax.hpp" -#include "aidge/operator/Conv.hpp" - -#include "aidge/backend/cpu.hpp" -#include "aidge/utils/TensorUtils.hpp" +#include "aidge/utils/ArrayHelpers.hpp" using namespace Aidge; @@ -118,8 +121,8 @@ TEST_CASE("[cpu/operator] ArgMax(forward)", "[ArgMax][CPU]") { SECTION("Axis 2") { Tensor myOutput = Tensor(Array3D<float,2,3, 1> { - { - { + { + { {3.0}, {2.0}, {1.0} @@ -144,7 +147,7 @@ TEST_CASE("[cpu/operator] ArgMax(forward)", "[ArgMax][CPU]") { SECTION("Axis 2 with keep_dims false") { Tensor myOutput = Tensor(Array2D<float,2,3> { - { + { { 3.0, 2.0, 1.0 }, { 2.0, 1.0, 0.0 } } @@ -196,10 +199,11 @@ TEST_CASE("[cpu/operator] ArgMax(forward)", "[ArgMax][CPU]") { op->associateInput(0,myInput); op->setDataType(DataType::Float32); op->setBackend("cpu"); - std::cout << " ............... 
"<< std::endl; + fmt::print("{:.^20}\n", "forward"); myArgMax->forward(); + fmt::print("{:.^20}\n", "result"); op->getOutput(0)->print(); - std::cout <<"------"<<std::endl; + fmt::print("{:.^20}\n", "truth"); myOutput.print(); REQUIRE(*(op->getOutput(0)) == myOutput); diff --git a/unit_tests/operator/Test_Atan.cpp b/unit_tests/operator/Test_Atan.cpp index 9548e35d81b0423125424a4198d82558c4e57df4..b9438db0b38642e8c49e46451544a68714ac4de6 100644 --- a/unit_tests/operator/Test_Atan.cpp +++ b/unit_tests/operator/Test_Atan.cpp @@ -9,14 +9,18 @@ * ********************************************************************************/ +#include <cmath> // std::abs +#include <cstddef> // std::size_t +#include <memory> + #include <catch2/catch_test_macros.hpp> +#include "aidge/backend/cpu/operator/AtanImpl.hpp" +#include "aidge/data/Data.hpp" #include "aidge/data/Tensor.hpp" +#include "aidge/graph/Node.hpp" #include "aidge/operator/Atan.hpp" - -#include "aidge/backend/cpu.hpp" - -#include <memory> +#include "aidge/utils/ArrayHelpers.hpp" using namespace Aidge; @@ -32,7 +36,7 @@ TEST_CASE("[cpu/operator] Atan(forward)") { 0.09486303, 0.16007232, 0.40421187, 0.4102045, 0.39055911}}); std::shared_ptr<Node> myAtan = Atan(); - auto op = std::static_pointer_cast<OperatorTensor>(myAtan->getOperator()); + auto op = std::static_pointer_cast<Atan_Op>(myAtan->getOperator()); op->associateInput(0, input0); op->setDataType(DataType::Float32); op->setBackend("cpu"); @@ -61,7 +65,7 @@ TEST_CASE("[cpu/operator] Atan(forward)") { {0.75377332, 0.77411225, 0.32928031}}}}); std::shared_ptr<Node> myAtan = Atan(); - auto op = std::static_pointer_cast<OperatorTensor>(myAtan->getOperator()); + auto op = std::static_pointer_cast<Atan_Op>(myAtan->getOperator()); op->associateInput(0, input0); op->setDataType(DataType::Float32); op->setBackend("cpu"); diff --git a/unit_tests/operator/Test_AvgPoolingImpl.cpp b/unit_tests/operator/Test_AvgPoolingImpl.cpp index aaa2757830c245275d02792a7a5a2eb1db32d7b8..372febc61d04c2ba983dd33f009fe5bf1d2908a0 100644 --- a/unit_tests/operator/Test_AvgPoolingImpl.cpp +++ b/unit_tests/operator/Test_AvgPoolingImpl.cpp @@ -9,14 +9,18 @@ * ********************************************************************************/ -#include <catch2/catch_test_macros.hpp> +#include <cmath> // std::abs +#include <cstddef> // std::size_t #include <memory> -#include <cstdlib> +#include <catch2/catch_test_macros.hpp> + +#include "aidge/backend/cpu/operator/AvgPoolingImpl.hpp" +#include "aidge/data/Data.hpp" #include "aidge/data/Tensor.hpp" +#include "aidge/graph/Node.hpp" #include "aidge/operator/AvgPooling.hpp" - -#include "aidge/backend/cpu.hpp" +#include "aidge/utils/ArrayHelpers.hpp" using namespace Aidge; @@ -53,7 +57,7 @@ TEST_CASE("[cpu/operator] AvgPooling(forward)", "[AvgPooling][CPU]") { }); SECTION("Stride") { std::shared_ptr<Node> myAvgPool = AvgPooling({2,2}, "mycdw", {2,2}); - auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool -> getOperator()); + auto op = std::static_pointer_cast<AvgPooling_Op<2>>(myAvgPool -> getOperator()); std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,2,2,2> { { @@ -90,7 +94,7 @@ TEST_CASE("[cpu/operator] AvgPooling(forward)", "[AvgPooling][CPU]") { } }); std::shared_ptr<Node> myAvgPool = AvgPooling({3,3}, "mycdw", {3,3}); - auto op = std::static_pointer_cast<OperatorTensor>(myAvgPool -> getOperator()); + auto op = std::static_pointer_cast<AvgPooling_Op<2>>(myAvgPool -> getOperator()); Tensor myOutput = Array4D<float,1,1,1,1> { {{{{(0.3745 + 
0.9507 + 0.7320 + 0.5987 + 0.1560 + 0.1560 + 0.0581 + 0.8662 + 0.6011)/9.0}}}} diff --git a/unit_tests/operator/Test_BatchNormImpl.cpp b/unit_tests/operator/Test_BatchNormImpl.cpp index 1b42c90dd09d63cd319f19bd29751da816db06c0..26e964f9386e19a6070d75a4106b6b46a29e455d 100644 --- a/unit_tests/operator/Test_BatchNormImpl.cpp +++ b/unit_tests/operator/Test_BatchNormImpl.cpp @@ -9,20 +9,24 @@ * ********************************************************************************/ -#include <catch2/catch_test_macros.hpp> +#include <cmath> // std::abs +#include <cstddef> // std::size_t #include <memory> +#include <catch2/catch_test_macros.hpp> + +#include "aidge/backend/cpu/operator/BatchNormImpl.hpp" +#include "aidge/data/Data.hpp" #include "aidge/data/Tensor.hpp" +#include "aidge/graph/Node.hpp" #include "aidge/operator/BatchNorm.hpp" -#include "aidge/scheduler/SequentialScheduler.hpp" - -#include "aidge/backend/cpu.hpp" +#include "aidge/utils/ArrayHelpers.hpp" using namespace Aidge; TEST_CASE("[cpu/operator] BatchNorm(forward)", "[BatchNorm][CPU]") { std::shared_ptr<Node> myBatchNorm = BatchNorm<2>(3, 0.00001F, 0.1F, "mybatchnorm"); - auto op = std::static_pointer_cast<OperatorTensor>(myBatchNorm -> getOperator()); + auto op = std::static_pointer_cast<BatchNorm_Op<2>>(myBatchNorm -> getOperator()); std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array1D<float,3> {{0.9044, 0.3028, 0.0218}}); std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float,3> {{0.1332, 0.7503, 0.0878}}); std::shared_ptr<Tensor> myMean = std::make_shared<Tensor>(Array1D<float,3> {{0.9931, 0.8421, 0.9936}}); diff --git a/unit_tests/operator/Test_BitShift.cpp b/unit_tests/operator/Test_BitShift.cpp index a52990bc7991a325ce151cf6634b0d5a831992c8..33ab932e296be717604be42716d7abe2b61f65ee 100644 --- a/unit_tests/operator/Test_BitShift.cpp +++ b/unit_tests/operator/Test_BitShift.cpp @@ -9,15 +9,20 @@ * ********************************************************************************/ -#include <catch2/catch_test_macros.hpp> +#include <chrono> // std::micro, std::chrono::time_point, + // std::chrono::system_clock #include <cstddef> // std::size_t #include <cstdint> // std::uint16_t #include <chrono> -#include <iostream> #include <memory> -#include <numeric> +#include <numeric> #include <random> // std::random_device, std::mt19937, std::uniform_real_distribution -#include <iomanip> + +#include <catch2/catch_test_macros.hpp> +#include <fmt/core.h> + +#include "aidge/backend/cpu/data/TensorImpl.hpp" +#include "aidge/backend/cpu/operator/BitShiftImpl.hpp" #include "aidge/data/Tensor.hpp" #include "aidge/operator/BitShift.hpp" #include "aidge/utils/TensorUtils.hpp" @@ -29,7 +34,7 @@ TEST_CASE("[cpu/operator] BitShift_TEST", "[BitShift][CPU]") { // Create a random number generator std::random_device rd; std::mt19937 gen(rd()); - std::uniform_int_distribution<int> valueDist(-15, 15); + std::uniform_int_distribution<int> valueDist(-15, 15); std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(5)); std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(3)); std::uniform_int_distribution<int> boolDist(0,1); @@ -131,8 +136,8 @@ TEST_CASE("[cpu/operator] BitShift_TEST", "[BitShift][CPU]") { } - std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl; - std::cout << "total time: " << duration.count() << "μs" << std::endl; + Log::info("number of elements over time spent: {}\n", (number_of_operation / 
duration.count())); + Log::info("total time: {}μs\n", duration.count()); } SECTION("Test BitShift kernels with Broadcasting") { std::size_t number_of_operation = 0; @@ -194,7 +199,7 @@ TEST_CASE("[cpu/operator] BitShift_TEST", "[BitShift][CPU]") { } else { - result[idx_out + d] = array0[idx0] >> array1[idx1]; + result[idx_out + d] = array0[idx0] >> array1[idx1]; } } } @@ -222,12 +227,7 @@ TEST_CASE("[cpu/operator] BitShift_TEST", "[BitShift][CPU]") { duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); // comparison between truth and computed result - bool equiv = (approxEq<int>(*(op->getOutput(0)), *Tres)); - if(equiv == false) - { - std::cout << "Problem\n"; - } - REQUIRE(equiv); + REQUIRE(approxEq<int>(*(op->getOutput(0)), *Tres)); delete[] array0; delete[] array1; @@ -236,8 +236,8 @@ TEST_CASE("[cpu/operator] BitShift_TEST", "[BitShift][CPU]") { const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>()); number_of_operation += nb_elements; } - std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl; - std::cout << "total time: " << duration.count() << "μs" << std::endl; + Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count())); + Log::info("total time: {}μs\n", duration.count()); } } diff --git a/unit_tests/operator/Test_ClipImpl.cpp b/unit_tests/operator/Test_ClipImpl.cpp index 45c8da5bf7ecc84fad6b3e694fe204540f579af3..99147ac93bd659dd91897f6b7f1f3f33e5552ef6 100644 --- a/unit_tests/operator/Test_ClipImpl.cpp +++ b/unit_tests/operator/Test_ClipImpl.cpp @@ -9,36 +9,37 @@ * ********************************************************************************/ -#include <catch2/catch_test_macros.hpp> +#include <algorithm> // std::max, std::min +#include <chrono> #include <cstddef> // std::size_t #include <cstdint> // std::uint16_t -#include <chrono> -#include <iostream> -#include <vector> -#include <algorithm> -#include <iomanip> #include <memory> -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <random> // std::random_device, std::mt19937 + // std::uniform_int_distribution, std::uniform_real_distribution +#include <vector> + +#include <catch2/catch_test_macros.hpp> +#include <fmt/core.h> +#include "aidge/backend/cpu/operator/ClipImpl.hpp" #include "aidge/data/Tensor.hpp" #include "aidge/operator/Clip.hpp" #include "aidge/operator/OperatorTensor.hpp" #include "aidge/utils/TensorUtils.hpp" -#include "aidge/backend/cpu.hpp" void ComputeClipBackward(const std::vector<float>& vec1, std::vector<float>& vec2, float min, float max) { if (vec1.size() != vec2.size()) { - std::cerr << "Vectors should have the same sizes." 
<< std::endl; + fmt::print(stderr, "Vectors should have the same sizes.\n"); return; } - for (size_t i = 0; i < vec1.size(); ++i) { + for (std::size_t i = 0; i < vec1.size(); ++i) { if (vec1[i] < min || vec1[i] > max) { vec2[i] = 0.0f; } } } -namespace Aidge +namespace Aidge { TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]") { @@ -47,8 +48,8 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]") std::random_device rd; std::mt19937 gen(rd()); std::uniform_real_distribution<float> dis(0.0, 10.0); - std::uniform_real_distribution<float> dismin(0.0, 4.5); - std::uniform_real_distribution<float> dismax(5.5, 10.0); + std::uniform_real_distribution<float> dismin(0.0, 4.5); + std::uniform_real_distribution<float> dismax(5.5, 10.0); std::uniform_int_distribution<std::size_t> distDims(5,15); std::uniform_int_distribution<std::size_t> distNbMatrix(1, 5); @@ -71,7 +72,7 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]") // Create and populate the array with random float values float* Array = new float[dim0*dim1]; - for (int i = 0; i < dim0*dim1; ++i) { + for (std::size_t i = 0; i < dim0*dim1; ++i) { Array[i] = dis(gen); // Generate random float value } @@ -80,7 +81,7 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]") TInput -> resize({dim0,dim1}); TInput -> setBackend("cpu"); TInput -> getImpl() -> setRawPtr(Array, dim0*dim1); - + float min = dismin(gen); std::shared_ptr<Tensor> Tmin = std::make_shared<Tensor>(DataType::Float32); Tmin -> resize({}); @@ -109,7 +110,7 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]") op->setDataType(DataType::Float32); op->setBackend("cpu"); op->forwardDims(true); - + start = std::chrono::system_clock::now(); myClip->forward(); end = std::chrono::system_clock::now(); @@ -118,9 +119,9 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]") REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres)); } - std::cout << "multiplications over time spent: " << totalComputation/duration.count() << std::endl; - std::cout << "total time: " << duration.count() << std::endl; - } + Log::info("multiplications over time spent: {}\n", totalComputation/duration.count()); + Log::info("total time: {}\n", duration.count()); + } SECTION("Clip test with min >= max [Forward]") { std::size_t totalComputation = 0; for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { @@ -131,7 +132,7 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]") // Create and populate the array with random float values float* Array = new float[dim0*dim1]; - for (int i = 0; i < dim0*dim1; ++i) { + for (std::size_t i = 0; i < dim0*dim1; ++i) { Array[i] = dis(gen); // Generate random float value } @@ -140,7 +141,7 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]") TInput -> resize({dim0,dim1}); TInput -> setBackend("cpu"); TInput -> getImpl() -> setRawPtr(Array, dim0*dim1); - + float min = dismax(gen); std::shared_ptr<Tensor> Tmin = std::make_shared<Tensor>(DataType::Float32); Tmin -> resize({}); @@ -169,7 +170,7 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]") op->setDataType(DataType::Float32); op->setBackend("cpu"); op->forwardDims(true); - + start = std::chrono::system_clock::now(); myClip->forward(); end = std::chrono::system_clock::now(); @@ -178,13 +179,13 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]") REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres)); } - std::cout << "multiplications over time spent: " << totalComputation/duration.count() << std::endl; - std::cout << "total time: " << duration.count() << std::endl; - } + Log::info("multiplications over time spent: {}\n", totalComputation/duration.count()); + 
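// For reference, a sketch of the two rules these Clip sections check (helper
// names are illustrative, not part of the patch): the forward pass is a
// per-element clamp, and the ComputeClipBackward helper above encodes its
// gradient, passing upstream values through where min <= x <= max and zeroing
// them elsewhere.
//   float clip_fwd(float x, float mn, float mx) { return std::max(mn, std::min(x, mx)); }
//   float clip_bwd(float x, float g, float mn, float mx) { return (x < mn || x > mx) ? 0.0f : g; }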
Log::info("total time: {}\n", duration.count()); + } SECTION("Clip with Clip Attr [Forward]") { std::size_t totalComputation = 0; - for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { float min = dismin(gen); @@ -200,7 +201,7 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]") // Create and populate the array with random float values float* Array = new float[dim0*dim1]; - for (int i = 0; i < dim0*dim1; ++i) { + for (std::size_t i = 0; i < dim0*dim1; ++i) { Array[i] = dis(gen); // Generate random float value } // Convert Input to Tensor @@ -231,8 +232,8 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]") REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres)); } - std::cout << "multiplications over time spent: " << totalComputation/duration.count() << std::endl; - std::cout << "total time: " << duration.count() << std::endl; + Log::info("multiplications over time spent: {}\n", totalComputation/duration.count()); + Log::info("total time: {}\n", duration.count()); } SECTION("Simple clip test [Backward]") { std::size_t totalComputation = 0; @@ -243,13 +244,13 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]") // generate Tensors dimensions const std::size_t dim0 = distDims(gen); const std::size_t dim1 = distDims(gen); - + totalComputation += dim0*dim1; // Create and populate the array with random float values float* Array = new float[dim0*dim1]; float* gradArray = new float[dim0*dim1]; - for (int i = 0; i < dim0*dim1; ++i) { + for (std::size_t i = 0; i < dim0*dim1; ++i) { Array[i] = dis(gen); // Generate random float value gradArray[i] = dis(gen); } @@ -264,7 +265,7 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]") TInput -> resize({dim0,dim1}); TInput -> setBackend("cpu"); TInput -> getImpl() -> setRawPtr(Array, dim0*dim1); - + float min = dismin(gen); std::shared_ptr<Tensor> Tmin = std::make_shared<Tensor>(DataType::Float32); Tmin -> resize({}); @@ -296,7 +297,7 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]") myClip->forward(); op->getOutput(0)->setGrad(TGrad); - + start = std::chrono::system_clock::now(); REQUIRE_NOTHROW(myClip->backward()); end = std::chrono::system_clock::now(); @@ -310,9 +311,9 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]") duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); REQUIRE(GT1 == BackwardTensorVec); } - std::cout << "multiplications over time spent: " << totalComputation/duration.count() << std::endl; - std::cout << "total time: " << duration.count() << std::endl; + Log::info("multiplications over time spent: {}\n", totalComputation/duration.count()); + Log::info("total time: {}\n", duration.count()); } } -} // namespace Aidge +} // namespace Aidge } \ No newline at end of file diff --git a/unit_tests/operator/Test_ConstantOfShapeImpl.cpp b/unit_tests/operator/Test_ConstantOfShapeImpl.cpp index 42505d385fde7e72e09531f1607287ffc6978f75..8ec1669b92a5116999413cf55a8c5113363ef330 100644 --- a/unit_tests/operator/Test_ConstantOfShapeImpl.cpp +++ b/unit_tests/operator/Test_ConstantOfShapeImpl.cpp @@ -9,32 +9,27 @@ * ********************************************************************************/ -#include <algorithm> -#include <chrono> -#include <cmath> -#include <cstddef> // std::size_t -#include <cstdint> // std::uint16_t -#include <iostream> +#include <chrono> // std::micro, std::chrono::time_point, + // std::chrono::system_clock +#include <cstddef> // std::size_t +#include <cstdint> // std::int64_t, std::uint16_t #include <memory> -#include <numeric> // 
std::accumulate -#include <ostream> -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <random> // std::random_device, std::mt19937 + // std::uniform_int_distribution, std::uniform_real_distribution +#include <vector> -#include "catch2/internal/catch_compiler_capabilities.hpp" -#include "catch2/internal/catch_enforce.hpp" #include <catch2/catch_test_macros.hpp> #include <catch2/generators/catch_generators_random.hpp> +#include "aidge/backend/cpu/data/TensorImpl.hpp" +#include "aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp" +#include "aidge/data/Data.hpp" #include "aidge/data/Tensor.hpp" +#include "aidge/filler/Filler.hpp" #include "aidge/operator/ConstantOfShape.hpp" +#include "aidge/operator/OperatorTensor.hpp" #include "aidge/utils/TensorUtils.hpp" -#include <aidge/data/Data.hpp> -#include <aidge/data/half.hpp> -#include <aidge/filler/Filler.hpp> -#include <aidge/operator/OperatorTensor.hpp> -#include <aidge/operator/Reshape.hpp> -#include <aidge/utils/TensorUtils.hpp> -#include <aidge/utils/Types.h> +#include "aidge/utils/Types.h" namespace Aidge { TEST_CASE("[cpu/operator] ConstantOfShape", "[ConstantOfShape][CPU]") { @@ -62,7 +57,7 @@ TEST_CASE("[cpu/operator] ConstantOfShape", "[ConstantOfShape][CPU]") { result->setDataType(DataType::Int64); result->setBackend("cpu"); for (DimSize_t i = 0; i < result->size(); ++i) { - result->set<int64_t>(i, input_tensor_values_dist(gen)); + result->set<std::int64_t>(i, input_tensor_values_dist(gen)); } return result; }; diff --git a/unit_tests/operator/Test_DivImpl.cpp b/unit_tests/operator/Test_DivImpl.cpp index 5d7dfdf12032d4c444e38cda6d2a4298fc552b14..4037b2ad4e117573279f07d0c1819d3435ee7ada 100644 --- a/unit_tests/operator/Test_DivImpl.cpp +++ b/unit_tests/operator/Test_DivImpl.cpp @@ -9,17 +9,26 @@ * ********************************************************************************/ -#include <catch2/catch_test_macros.hpp> -#include <cstddef> // std::size_t -#include <cstdint> // std::uint16_t -#include <chrono> -#include <iostream> +#include <chrono> // std::micro, std::chrono::time_point, + // std::chrono::system_clock +#include <cstddef> // std::size_t +#include <cstdint> // std::uint16_t +#include <functional> // std::multiplies #include <memory> -#include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937 + // std::uniform_int_distribution, std::uniform_real_distribution +#include <vector> + +#include <catch2/catch_test_macros.hpp> +#include <fmt/core.h> +#include "aidge/backend/cpu/data/TensorImpl.hpp" +#include "aidge/backend/cpu/operator/DivImpl.hpp" +#include "aidge/data/Data.hpp" #include "aidge/data/Tensor.hpp" #include "aidge/operator/Div.hpp" +#include "aidge/operator/OperatorTensor.hpp" #include "aidge/utils/TensorUtils.hpp" namespace Aidge { @@ -117,8 +126,8 @@ TEST_CASE("[cpu/operator] Div", "[Div][CPU]") { // with broadcasting } - std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl; - std::cout << "total time: " << duration.count() << "μs" << std::endl; + Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count())); + Log::info("total time: {} μs\n", duration.count()); } SECTION("+1-D Tensor / +1-D Tensor - broadcasting") { @@ -212,8 +221,8 @@ TEST_CASE("[cpu/operator] Div", "[Div][CPU]") { const std::size_t nb_elements = 
std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>()); number_of_operation += nb_elements; } - std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl; - std::cout << "total time: " << duration.count() << "μs" << std::endl; + Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count())); + Log::info("total time: {} μs\n", duration.count()); } SECTION("+1-D Tensor / 1-D Tensor") { std::size_t number_of_operation = 0; @@ -308,8 +317,8 @@ TEST_CASE("[cpu/operator] Div", "[Div][CPU]") { number_of_operation += nb_elements; } - std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl; - std::cout << "total time: " << duration.count() << "μs" << std::endl; + Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count())); + Log::info("total time: {} μs\n", duration.count()); } } } diff --git a/unit_tests/operator/Test_EqualImpl.cpp b/unit_tests/operator/Test_EqualImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a229b8ce3ebcd7672323f2585e3a48343f544c3d --- /dev/null +++ b/unit_tests/operator/Test_EqualImpl.cpp @@ -0,0 +1,205 @@ +/******************************************************************************** + * Copyright (c) 2024 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <catch2/catch_test_macros.hpp> +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution + +#include "aidge/data/Tensor.hpp" +#include "aidge/operator/Equal.hpp" + +#include "aidge/backend/cpu.hpp" + +using namespace Aidge; + +TEST_CASE("[cpu/operator] Equal(forward)", "[Equal][CPU]") { + SECTION("ForwardDims") + { + constexpr std::uint16_t NBTRIALS = 10; + // Create a random number generator + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // Random float distribution between 0.1 and 1.1 + std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10)); + std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5)); + std::uniform_int_distribution<int> boolDist(0,1); + + SECTION("Same dimensions") { + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { + DimSize_t nbDims = nbDimsDist(gen); + std::vector<DimSize_t> dims(nbDims); + for (std::size_t i = 0; i < nbDims; i++) { + dims[i] = dimSizeDist(gen); + } + + std::shared_ptr<Tensor> myInput1 = std::make_shared<Tensor>(dims); + myInput1->setBackend("cpu"); + myInput1->setDataType(DataType::Float32); + myInput1->zeros(); + std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(dims); + myInput2->setBackend("cpu"); + myInput2->setDataType(DataType::Float32); + myInput2->zeros(); + std::shared_ptr<Node> myEqual = Equal(); + auto op = std::static_pointer_cast<OperatorTensor>(myEqual -> getOperator()); + op->associateInput(0,myInput1); + op->associateInput(1,myInput2); + op->setDataType(DataType::Float32); + op->setBackend("cpu"); + op->forwardDims(); + + const auto outputDims = op->getOutput(0)->dims(); + REQUIRE(outputDims == dims); + } + } + SECTION("Broadcasting") { + for (std::uint16_t trial = 0; trial 
< NBTRIALS; ++trial) { + DimSize_t nbDims = nbDimsDist(gen); + std::vector<DimSize_t> dims1(nbDims, 1); + std::vector<DimSize_t> dims2(nbDims, 1); + std::vector<DimSize_t> expectedOutDims; + for (std::size_t i = 0; i < nbDims; i++) { + DimSize_t dim = dimSizeDist(gen); + if (boolDist(gen)) { + dims1[i] = dim; + } + if (boolDist(gen)) { + dims2[i] = dim; + } + expectedOutDims.push_back(std::max(dims1[i],dims2[i])); + } + + + std::shared_ptr<Tensor> myInput1 = std::make_shared<Tensor>(dims1); + myInput1->setBackend("cpu"); + myInput1->setDataType(DataType::Float32); + myInput1->zeros(); + std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(dims2); + myInput2->setBackend("cpu"); + myInput2->setDataType(DataType::Float32); + myInput2->zeros(); + std::shared_ptr<Node> myEqual = Equal(); + auto op = std::static_pointer_cast<OperatorTensor>(myEqual -> getOperator()); + op->associateInput(0,myInput1); + op->associateInput(1,myInput2); + op->setDataType(DataType::Float32); + op->setBackend("cpu"); + + op->forwardDims(); + + const auto outputDims = op->getOutput(0)->dims(); + REQUIRE(outputDims == expectedOutDims); + } + } + } + SECTION("Same size inputs") { + std::shared_ptr<Tensor> input1 = std::make_shared<Tensor>(Array4D<int,3,3,3,2> { + { // + { // + {{20, 15},{31, 11},{22, 49}}, // + {{41, 10},{24, 51},{27, 52}}, // + {{26, 53},{27, 54},{28, 55}} // + }, // + { // + {{29, 56},{30, 57},{31, 58}}, // + {{32, 59},{33, 60},{34, 61}}, // + {{35, 62},{36, 63},{37, 64}} // + }, // + { // + {{38, 65},{39, 66},{40, 67}}, // + {{41, 68},{42, 69},{43, 70}}, // + {{44, 71},{45, 72},{46, 73}} // + } // + } // + }); // + std::shared_ptr<Tensor> input2 = std::make_shared<Tensor>(Array4D<int,3,3,3,2> { + { // + { // + {{20, 47},{21, 48},{22, 49}}, // + {{23, 50},{24, 51},{25, 52}}, // + {{17, 53},{27, 26},{14, 33}} // + }, // + { // + {{29, 56},{30, 57},{31, 58}}, // + {{72, 44},{33, 20},{27, 55}}, // + {{35, 24},{25, 63},{28, 64}} // + }, // + { // + {{32, 65},{39, 66},{40, 70}}, // + {{41, 53},{42, 60},{34, 70}}, // + {{44, 71},{30, 12},{46, 73}} // + } // + } // + }); // + std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,3,3,3,2> { + { + { + {{1, 0},{0, 0},{1, 1}}, + {{0, 0},{1, 1},{0, 1}}, + {{0, 1},{1, 0},{0, 0}} + }, + { + {{1, 1},{1, 1},{1, 1}}, + {{0, 0},{1, 0},{0, 0}}, + {{1, 0},{0, 1},{0, 1}} + }, + { + {{0, 1},{1, 1},{1, 0}}, + {{1, 0},{1, 0},{0, 1}}, + {{1, 1},{0, 0},{1, 1}} + } + } + }); + + std::shared_ptr<Node> myEqual = Equal(); + auto op = std::static_pointer_cast<OperatorTensor>(myEqual -> getOperator()); + op->associateInput(0, input1); + op->associateInput(1, input2); + op->setBackend("cpu"); + op->setDataType(DataType::Int32); + myEqual->forward(); + + REQUIRE(*(op->getOutput(0)) == *expectedOutput); + } + + SECTION("Broadcasting") { + std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array4D<int,1,3,3,2> { + { // + { // + {{10, 20},{22, 23},{20, 20}}, // + {{10, 15},{10, 29},{20, 20}}, // + {{26, 25},{33, 20},{10, 20}} // + } // + } // + }); // + + std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<int,2> {{10, 20}}); + std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,1,3,3,2> { + { // + { // + {{ 1, 1},{ 0, 0},{ 0, 1}}, // + {{ 1, 0},{ 1, 0},{ 0, 1}}, // + {{ 0, 0},{ 0, 1},{ 1, 1}} // + } // + } // + }); // + + std::shared_ptr<Node> myEqual = Equal(); + auto op = std::static_pointer_cast<OperatorTensor>(myEqual -> getOperator()); + op->associateInput(0, input_1); + op->associateInput(1, input_2); 
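// Illustrative note, derived from the fixtures above: broadcasting the 1-D input
// {10, 20} against the last axis means each pair {a, b} maps to {a == 10, b == 20},
// e.g. {10, 20} -> {1, 1} and {22, 23} -> {0, 0}, as encoded in expectedOutput.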
+ op->setDataType(DataType::Int32); + op->setBackend("cpu"); + myEqual->forward(); + op->getOutput(0)->print(); + expectedOutput->print(); + REQUIRE(*op->getOutput(0) == *expectedOutput); + } +} \ No newline at end of file diff --git a/unit_tests/operator/Test_GlobalAveragePoolingImpl.cpp b/unit_tests/operator/Test_GlobalAveragePoolingImpl.cpp index 43af544871ad6c2ac319de09f3c6fce5065e60d5..8e8536accadcb874f74d4d962aae435bc1351d6e 100644 --- a/unit_tests/operator/Test_GlobalAveragePoolingImpl.cpp +++ b/unit_tests/operator/Test_GlobalAveragePoolingImpl.cpp @@ -9,34 +9,29 @@ * ********************************************************************************/ -#include <aidge/utils/Types.h> -#include <catch2/catch_test_macros.hpp> #include <chrono> -#include <cmath> #include <cstddef> // std::size_t #include <cstdint> // std::uint16_t -#include <iostream> +#include <functional> // std::multiplies #include <memory> #include <numeric> // std::accumulate -#include <ostream> -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <random> // std::random_device, std::mt19937 + // std::uniform_int_distribution, std::uniform_real_distribution +#include <vector> + +#include <catch2/catch_test_macros.hpp> +#include <fmt/core.h> +#include "aidge/backend/cpu/data/TensorImpl.hpp" +#include "aidge/backend/cpu/operator/GlobalAveragePoolingImpl.hpp" +#include "aidge/data/Data.hpp" #include "aidge/data/Tensor.hpp" #include "aidge/operator/GlobalAveragePooling.hpp" #include "aidge/utils/TensorUtils.hpp" - -// debug print function -void print_tensor(Aidge::Tensor &T) { - // Print tensors - std::cout << "Tensor : size =  ["; - for (auto &dim : T.dims()) { - std::cout << dim << " , "; - } - std::cout << "]" << std::endl; - T.print(); -} +#include "aidge/utils/Types.h" namespace Aidge { + TEST_CASE("[cpu/operator] GlobalAveragePooling", "[GlobalAveragePooling][CPU]") { constexpr std::uint16_t NBTRIALS = 10; @@ -54,9 +49,7 @@ TEST_CASE("[cpu/operator] GlobalAveragePooling", std::size_t(7)); // Create MatGlobalAveragePooling Operator - std::shared_ptr<Node> globAvgPool = GlobalAveragePooling(); - auto op = - std::static_pointer_cast<OperatorTensor>(globAvgPool->getOperator()); + std::shared_ptr<GlobalAveragePooling_Op> op = std::make_shared<GlobalAveragePooling_Op>(); op->setDataType(DataType::Float32); op->setBackend("cpu"); @@ -99,7 +92,7 @@ TEST_CASE("[cpu/operator] GlobalAveragePooling", T0->resize(dims); T0->getImpl()->setRawPtr(array0, nb_elements); - REQUIRE_THROWS(globAvgPool->forward()); + REQUIRE_THROWS(op->forward()); delete[] array0; } @@ -158,7 +151,7 @@ TEST_CASE("[cpu/operator] GlobalAveragePooling", op->forwardDims(); start = std::chrono::system_clock::now(); - REQUIRE_NOTHROW(globAvgPool->forward()); + REQUIRE_NOTHROW(op->forward()); end = std::chrono::system_clock::now(); duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); @@ -231,7 +224,7 @@ TEST_CASE("[cpu/operator] GlobalAveragePooling", op->forwardDims(); start = std::chrono::system_clock::now(); - REQUIRE_NOTHROW(globAvgPool->forward()); + REQUIRE_NOTHROW(op->forward()); end = std::chrono::system_clock::now(); duration += std::chrono::duration_cast<std::chrono::microseconds>( end - start); @@ -358,7 +351,7 @@ TEST_CASE("[cpu/operator] GlobalAveragePooling", Tres->getImpl()->setRawPtr(result, out_nb_elems); op->forwardDims(); start = std::chrono::system_clock::now(); - REQUIRE_NOTHROW(globAvgPool->forward()); + REQUIRE_NOTHROW(op->forward()); end = 
std::chrono::system_clock::now(); duration += std::chrono::duration_cast<std::chrono::microseconds>( end - start); @@ -547,7 +540,7 @@ TEST_CASE("[cpu/operator] GlobalAveragePooling", Tres->getImpl()->setRawPtr(result, out_nb_elems); op->forwardDims(); start = std::chrono::system_clock::now(); - REQUIRE_NOTHROW(globAvgPool->forward()); + REQUIRE_NOTHROW(op->forward()); end = std::chrono::system_clock::now(); duration += std::chrono::duration_cast<std::chrono::microseconds>( end - start); @@ -561,12 +554,9 @@ TEST_CASE("[cpu/operator] GlobalAveragePooling", delete[] result; } } - std::cout << "GlobalAveragePooling total execution time : " - << duration.count() << "µs" << std::endl; - std::cout << "Number of operations : " << number_of_operation - << std::endl; - std::cout << "Operation / µs = " << number_of_operation / duration.count() - << std::endl; + Log::info("GlobalAveragePooling total execution time: {}µs\n", duration.count()); + Log::info("Number of operations : {}\n", number_of_operation); + Log::info("Operation / µs = {}\n", number_of_operation / duration.count()); } } } diff --git a/unit_tests/operator/Test_MatMulImpl.cpp b/unit_tests/operator/Test_MatMulImpl.cpp index d6e934b4dc8d84e8a595eb74d1af9d2c68c892d1..f062f06cddfbd04217d63e1edcb6505914bc77e9 100644 --- a/unit_tests/operator/Test_MatMulImpl.cpp +++ b/unit_tests/operator/Test_MatMulImpl.cpp @@ -9,21 +9,26 @@ * ********************************************************************************/ -#include <catch2/catch_test_macros.hpp> -#include <cstddef> // std::size_t -#include <cstdint> // std::uint16_t -#include <chrono> -#include <iostream> +#include <chrono> // std::micro, std::chrono::time_point, + // std::chrono::system_clock, std::chrono::duration +#include <cstddef> // std::size_t +#include <cstdint> // std::uint16_t #include <memory> -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <random> // std::random_device, std::mt19937 + // std::uniform_int_distribution, std::uniform_real_distribution +#include <vector> + +#include <catch2/catch_test_macros.hpp> +#include <fmt/core.h> +#include "aidge/backend/cpu/data/TensorImpl.hpp" +#include "aidge/backend/cpu/operator/MatMulImpl.hpp" +#include "aidge/data/Data.hpp" #include "aidge/data/Tensor.hpp" #include "aidge/operator/MatMul.hpp" #include "aidge/operator/OperatorTensor.hpp" #include "aidge/utils/TensorUtils.hpp" -#include "aidge/backend/cpu/operator/MatMulImpl.hpp" - namespace Aidge { TEST_CASE("[cpu/operator] MatMul(forward)", "[MatMul][CPU]") { @@ -106,8 +111,8 @@ TEST_CASE("[cpu/operator] MatMul(forward)", "[MatMul][CPU]") { delete[] bigArray2; delete[] res; } - std::cout << "multiplications over time spent: " << totalComputation/duration.count() << std::endl; - std::cout << "total time: " << duration.count() << std::endl; + Log::info("number of multiplications over time spent: {}\n", (totalComputation / duration.count())); + Log::info("total time: {} μs\n", duration.count()); } SECTION("3-D Tensors") { @@ -174,8 +179,8 @@ TEST_CASE("[cpu/operator] MatMul(forward)", "[MatMul][CPU]") { delete[] bigArray2; delete[] res; } - std::cout << "multiplications over time spent: " << totalComputation/duration.count() << std::endl; - std::cout << "total time: " << duration.count() << std::endl; + Log::info("number of multiplications over time spent: {}\n", (totalComputation / duration.count())); + Log::info("total time: {} μs\n", duration.count()); } SECTION("4-D Tensors") { @@ -244,8 +249,8 @@ TEST_CASE("[cpu/operator] 
MatMul(forward)", "[MatMul][CPU]") { delete[] bigArray2; delete[] res; } - std::cout << "multiplications over time spent: " << totalComputation/duration.count() << std::endl; - std::cout << "total time: " << duration.count() << std::endl; + Log::info("number of multiplications over time spent: {}\n", (totalComputation / duration.count())); + Log::info("total time: {} μs\n", duration.count()); } SECTION("+2-D / 1-D") { diff --git a/unit_tests/operator/Test_MaxPoolingImpl.cpp b/unit_tests/operator/Test_MaxPoolingImpl.cpp index af04ede4e33c32ce785804e2484b6ba9ac5edc36..7db082c5c8c564648f65bedc27240e3d9b775eef 100644 --- a/unit_tests/operator/Test_MaxPoolingImpl.cpp +++ b/unit_tests/operator/Test_MaxPoolingImpl.cpp @@ -12,6 +12,7 @@ #include <catch2/catch_test_macros.hpp> #include <memory> #include <cstdlib> +#include <iostream> #include "aidge/data/Tensor.hpp" #include "aidge/operator/MaxPooling.hpp" @@ -79,4 +80,39 @@ TEST_CASE("[cpu/operator] MaxPooling(forward)", "[MaxPooling][CPU]") { op->getOutput(0)->print(); REQUIRE(*(op->getOutput(0)) == *myOutput); } + SECTION("Dilation") { + std::shared_ptr<Node> myMaxPool = MaxPooling({2,2}, "mycdw", {2,2}, {2,2}); // Dilation 2x2 + auto op = std::static_pointer_cast<OperatorTensor>(myMaxPool -> getOperator()); + + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,2,2,2> { + { + { + { + {0.71470, 0.52770}, + {0.71470, 0.48740} + }, + { + {2.23290, 0.48590}, + {2.23290, 0.07000} + } + }, + { + { + {1.76530, 1.20710}, + {1.76530, 1.20710} + }, + { + {1.04290, 0.67760}, + {1.72170, 0.67760} + } + } + } + }); + myMaxPool->getOperator()->associateInput(0,myInput); + myMaxPool->getOperator()->setDataType(DataType::Float32); + myMaxPool->getOperator()->setBackend("cpu"); + myMaxPool->forward(); + op->getOutput(0)->print(); + REQUIRE(*(op->getOutput(0)) == *myOutput); + } } \ No newline at end of file diff --git a/unit_tests/operator/Test_Memorize.cpp b/unit_tests/operator/Test_Memorize.cpp new file mode 100644 index 0000000000000000000000000000000000000000..45ab40c5315e88b2db48c1fba92da0ab9b587398 --- /dev/null +++ b/unit_tests/operator/Test_Memorize.cpp @@ -0,0 +1,65 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <catch2/catch_test_macros.hpp> +#include <memory> +#include <string> + +#include "aidge/data/Tensor.hpp" +#include "aidge/graph/Node.hpp" +#include "aidge/graph/GraphView.hpp" +#include "aidge/graph/OpArgs.hpp" +#include "aidge/operator/Memorize.hpp" +#include "aidge/operator/Producer.hpp" +#include "aidge/scheduler/SequentialScheduler.hpp" + +#include "aidge/backend/cpu.hpp" +#include "aidge/recipes/GraphViewHelper.hpp" + + +namespace Aidge { + +TEST_CASE("[cpu/operator] Memorize(forward)", "[Memorize][CPU]") { + SECTION("Test simple") { + std::shared_ptr<Tensor> inputTensor = + std::make_shared<Tensor>(Array1D<int, 1>{{1}}); + + auto input = Producer({1}, "input"); + auto init = Producer({1}, "init"); + auto add = Add("add"); + auto mem = Memorize(3, "mem"); + + input->addChild(add, 0, 0); + init->addChild(mem, 0, 1); + add->addChild(mem, 0, 0); + mem->addChild(/*otherNode=*/add, /*outId=*/1, /*otherInId=*/1); + + input->getOperator()->setOutput(0, inputTensor); + init->getOperator()->setOutput(0, inputTensor); + + auto g = getConnectedGraphView(input); + + g->setDataType(Aidge::DataType::Int32); + g->setBackend("cpu"); + g->forwardDims(); + g->save("simple_graph"); + + SequentialScheduler scheduler(g); + REQUIRE_NOTHROW(scheduler.forward()); + scheduler.saveSchedulingDiagram("simple"); + + const auto expectedOutput = std::make_shared<Tensor>(Array1D<int, 1>{{4}}); // init = 1 and input = 1, so the three scheduled steps give 1+1=2, 2+1=3, 3+1=4 + std::shared_ptr<Tensor> other = std::static_pointer_cast<OperatorTensor>(mem->getOperator())->getOutput(0); + other->print(); + REQUIRE((*other == *expectedOutput)); + } +} +} // namespace Aidge diff --git a/unit_tests/operator/Test_MulImpl.cpp b/unit_tests/operator/Test_MulImpl.cpp index 3378861d0d3d7e74e7867c2765a0b09069fa8caf..2937e94938c671140eeeee87d47d5c48f685203e 100644 --- a/unit_tests/operator/Test_MulImpl.cpp +++ b/unit_tests/operator/Test_MulImpl.cpp @@ -9,379 +9,336 @@ * ********************************************************************************/ -#include <catch2/catch_test_macros.hpp> -#include <cstddef> // std::size_t -#include <cstdint> // std::uint16_t #include <chrono> -#include <iostream> +#include <cstddef> // std::size_t +#include <cstdint> // std::uint16_t #include <memory> -#include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution, + // std::uniform_int_distribution + +#include <catch2/catch_test_macros.hpp> +#include "aidge/backend/cpu/data/TensorImpl.hpp" +#include "aidge/backend/cpu/operator/MulImpl.hpp" +#include "aidge/data/DataType.hpp" #include "aidge/data/Tensor.hpp" #include "aidge/operator/Mul.hpp" +#include "aidge/utils/ArrayHelpers.hpp" +#include "aidge/utils/Log.hpp" #include "aidge/utils/TensorUtils.hpp" namespace Aidge { - TEST_CASE("[CPU/Operator] Mul Backward", "[Mul][CPU][Backward]") - { - std::shared_ptr<Node> myMul = Mul(); - auto op = std::static_pointer_cast<OperatorTensor>(myMul->getOperator()); - op->setDataType(DataType::Float32); - op->setBackend("cpu"); - - SECTION("Case 1: 2D and 1D tensors") { - const auto T0 = std::make_shared<Tensor>(Array2D<float,2,3>( - { - { - {1,2,3},{4,5,6} - } - } - )); +TEST_CASE("[CPU/Operator] Mul(Backward)", "[Mul][CPU][Backward]") { + std::shared_ptr<Mul_Op> op = std::make_shared<Mul_Op>(); + 
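// The expectations in the sections below follow the product rule with broadcast
// reduction: for Y = A * B with upstream gradient G,
//   dL/dA = G * B (B broadcast to A's shape),
//   dL/dB = (G * A) summed over the broadcast axes.
// Worked check for Case 1, using its fixtures: A = [[1,2,3],[4,5,6]],
// B = [0.1,0.2,0.3], G = ones(2,3) give dL/dA = [[0.1,0.2,0.3],[0.1,0.2,0.3]]
// and dL/dB = [1+4, 2+5, 3+6] = [5, 7, 9].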
op->setDataType(DataType::Float32); + op->setBackend("cpu"); - const auto T1 = std::make_shared<Tensor>(Array1D<float,3>( - {0.1,0.2,0.3} - )); + // NOTE: The first four tests use fixed values, the last one uses random values but static dimensions. - T0->setDataType(DataType::Float32); - T0->setBackend("cpu"); - T1->setDataType(DataType::Float32); - T1->setBackend("cpu"); + SECTION("Case 1: 1D and 2D Tensors") { + const auto T0 = std::make_shared<Tensor>( + Array2D<cpptype_t<DataType::Float32>, 2, 3>({{{1, 2, 3}, {4, 5, 6}}})); - op->getOutput(0)->setGrad(std::make_shared<Tensor>(Array2D<float,2,3>({{{1.0,1.0,1.0},{1.0,1.0,1.0}}}))); + const auto T1 = + std::make_shared<Tensor>(Array1D<cpptype_t<DataType::Float32>, 3>({0.1, 0.2, 0.3})); - op->associateInput(0,T0); - op->associateInput(1,T1); - op->forwardDims(); + op->associateInput(0, T0); + op->associateInput(1, T1); + op->getOutput(0)->setGrad(std::make_shared<Tensor>( + Array2D<float, 2, 3>({{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}}))); + op->forwardDims(); - myMul->forward(); - myMul->backward(); + op->backward(); - auto T0Grad = std::make_shared<Tensor>(Array2D<float, 2,3>({{{0.1,0.2,0.3},{0.1, 0.2, 0.3}}})); - auto T1Grad = std::make_shared<Tensor>(Array1D<float, 3>({5,7,9})); + const Tensor expectedGrad0 = + Array2D<cpptype_t<DataType::Float32>, 2, 3>({{{0.1, 0.2, 0.3}, {0.1, 0.2, 0.3}}}); - REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *T0Grad)); - REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *T1Grad)); - } + const Tensor expectedGrad1 = Array1D<cpptype_t<DataType::Float32>, 3>({5, 7, 9}); - SECTION("Case 2: 3D and 1D tensors") { - const auto T0 = std::make_shared<Tensor>(Array3D<float,2,2,3>( - { - { - { - {1.0, 2.0, 3.0}, - {4.0, 5.0, 6.0} - }, - { - {7.0, 8.0, 9.0}, - {10.0, 11.0, 12.0} - } - } - } - )); - - const auto T1 = std::make_shared<Tensor>(Array1D<float, 3>({0.3,0.2,0.1})); - - const auto newGrad = std::make_shared<Tensor>(Array3D<float,2,2,3>( - { - { - { - {1, 1, 1}, - {1, 1, 1} - }, - { - {1, 1, 1}, - {1, 1, 1} - } - } - } - )); - - const auto expectedGrad0 = std::make_shared<Tensor>(Array3D<float,2,2,3>( - { - { - { - {0.3, 0.2, 0.1}, - {0.3, 0.2, 0.1} - }, - { - {0.3, 0.2, 0.1}, - {0.3, 0.2, 0.1} - } - } - } - )); + REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0)); + REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1)); + } - const auto expectedGrad1 = std::make_shared<Tensor>(Array1D<float,3>( - {22.0, 26.0, 30.0} - )); + SECTION("Case 2: 3D and 1D tensors") { + const auto T0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>( + {{{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}}, + {{7.0, 8.0, 9.0}, {10.0, 11.0, 12.0}}}})); - for(auto T: {T0, T1, newGrad, expectedGrad0, expectedGrad1}) - { - T->setBackend("cpu") ; - T->setDataType(DataType::Float32); - } + const auto T1 = + std::make_shared<Tensor>(Array1D<float, 3>({0.3, 0.2, 0.1})); - op->associateInput(0, T0); - op->associateInput(1, T1); - op->getOutput(0)->setGrad(newGrad); - op->forwardDims(); + const auto newGrad = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>( + {{{{1, 1, 1}, {1, 1, 1}}, {{1, 1, 1}, {1, 1, 1}}}})); - myMul->backward(); + const Tensor expectedGrad0 = + Array3D<float, 2, 2, 3>({{{{0.3, 0.2, 0.1}, {0.3, 0.2, 0.1}}, + {{0.3, 0.2, 0.1}, {0.3, 0.2, 0.1}}}}); - REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0)); - REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedGrad1)); - } + const Tensor expectedGrad1 = 
Array1D<cpptype_t<DataType::Float32>, 3>({22.0, 26.0, 30.0}); - SECTION("Case 3: 4D and 2D tensors") { - const auto T0 = std::make_shared<Tensor>(Array4D<float,2, 2, 3, 3>( - { - { - { - { - {1.0, 2.0, 3.0}, - {4.0, 5.0, 6.0}, - {7.0, 8.0, 9.0} - }, - { - {10.0, 11.0, 12.0}, - {13.0, 14.0, 15.0}, - {16.0, 17.0, 18.0} - } - }, - { - { - {19.0, 20.0, 21.0}, - {22.0, 23.0, 24.0}, - {25.0, 26.0, 27.0} - }, - { - {28.0, 29.0, 30.0}, - {31.0, 32.0, 33.0}, - {34.0, 35.0, 36.0} - } - } - } - } - )); - - const auto T1 = std::make_shared<Tensor>(Array2D<float, 3,3>( - { - { - {0.5,0.3,0.1}, - {0.4,0.2,0.6}, - {0.7,0.8,0.9} - } - } - )); - - const auto newGrad = std::make_shared<Tensor>(Array4D<float,2, 2, 3, 3>( - { - { - { - { - {1.0, 1.0, 1.0}, - {1.0, 1.0, 1.0}, - {1.0, 1.0, 1.0} - }, - { - {1.0, 1.0, 1.0}, - {1.0, 1.0, 1.0}, - {1.0, 1.0, 1.0} - } - }, - { - { - {1.0, 1.0, 1.0}, - {1.0, 1.0, 1.0}, - {1.0, 1.0, 1.0} - }, - { - {1.0, 1.0, 1.0}, - {1.0, 1.0, 1.0}, - {1.0, 1.0, 1.0} - } - } - } - } - )); - - const auto expectedGrad0 = std::make_shared<Tensor>(Array4D<float,2,2,3,3>( - { - { - { - { - {0.5, 0.3, 0.1}, - {0.4, 0.2, 0.6}, - {0.7, 0.8, 0.9} - }, - { - {0.5, 0.3, 0.1}, - {0.4, 0.2, 0.6}, - {0.7, 0.8, 0.9} - } - }, - { - { - {0.5, 0.3, 0.1}, - {0.4, 0.2, 0.6}, - {0.7, 0.8, 0.9} - }, - { - {0.5, 0.3, 0.1}, - {0.4, 0.2, 0.6}, - {0.7, 0.8, 0.9} - } - } - } - } - )); - - const auto expectedGrad1 = std::make_shared<Tensor>(Array2D<float,3, 3>( - { - { - {58.0, 62.0, 66.0}, - {70.0, 74.0, 78.0}, - {82.0, 86.0, 90.0} - } - } - )); + op->associateInput(0, T0); + op->associateInput(1, T1); + op->getOutput(0)->setGrad(newGrad); + op->forwardDims(); - for(const auto T: {T0, T1, newGrad, expectedGrad0, expectedGrad1}) - { - T->setBackend("cpu") ; - T->setDataType(DataType::Float32); - } + op->backward(); + + REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0)); + REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1)); + } - op->associateInput(0, T0); - op->associateInput(1, T1); - op->getOutput(0)->setGrad(newGrad); - op->forwardDims(); + SECTION("Case 3: 4D and 2D tensors") { + const auto T0 = std::make_shared<Tensor>(Array4D<cpptype_t<DataType::Float32>, 2, 2, 3, 3>( + {{{{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}, {7.0, 8.0, 9.0}}, + {{10.0, 11.0, 12.0}, {13.0, 14.0, 15.0}, {16.0, 17.0, 18.0}}}, + {{{19.0, 20.0, 21.0}, {22.0, 23.0, 24.0}, {25.0, 26.0, 27.0}}, + {{28.0, 29.0, 30.0}, + {31.0, 32.0, 33.0}, + {34.0, 35.0, 36.0}}}}})); + + const auto T1 = std::make_shared<Tensor>(Array2D<cpptype_t<DataType::Float32>, 3, 3>( + {{{0.5, 0.3, 0.1}, {0.4, 0.2, 0.6}, {0.7, 0.8, 0.9}}})); + + const auto newGrad = + std::make_shared<Tensor>(Array4D<cpptype_t<DataType::Float32>, 2, 2, 3, 3>( + {{{{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}, + {{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}}, + {{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}, + {{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}}}})); + + const Tensor expectedGrad0 = + Array4D<cpptype_t<DataType::Float32>, 2, 2, 3, 3>( + {{{{{0.5, 0.3, 0.1}, {0.4, 0.2, 0.6}, {0.7, 0.8, 0.9}}, + {{0.5, 0.3, 0.1}, {0.4, 0.2, 0.6}, {0.7, 0.8, 0.9}}}, + {{{0.5, 0.3, 0.1}, {0.4, 0.2, 0.6}, {0.7, 0.8, 0.9}}, + {{0.5, 0.3, 0.1}, {0.4, 0.2, 0.6}, {0.7, 0.8, 0.9}}}}}); + + const Tensor expectedGrad1 = + Array2D<cpptype_t<DataType::Float32>, 3, 3>({{{58.0, 62.0, 66.0}, + {70.0, 74.0, 78.0}, + {82.0, 86.0, 90.0}}}); + + op->associateInput(0, T0); + op->associateInput(1, T1); + 
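+        // Sanity check for expectedGrad1: with an all-ones output gradient,
+        // each entry is the sum of T0 over the two broadcast axes, e.g.
+        // expectedGrad1[0][0] = 1 + 10 + 19 + 28 = 58.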
op->getOutput(0)->setGrad(newGrad); + op->forwardDims(); + + op->backward(); + + REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0)); + REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1)); + } + + SECTION("Case 4: 3D and 2D tensors") { + const auto T0 = std::make_shared<Tensor>( + Array3D<float, 2, 3, 4>({{{ + {1.0, 2.0, 3.0, 4.0}, + {5.0, 6.0, 7.0, 8.0}, + {9.0, 10.0, 11.0, 12.0}, + }, + { + {13.0, 14.0, 15.0, 16.0}, + {17.0, 18.0, 19.0, 20.0}, + {21.0, 22.0, 23.0, 24.0}, + }}})); + + const auto T1 = std::make_shared<Tensor>( + Array2D<cpptype_t<DataType::Float32>, 3, 4>({{{0.1, 0.2, 0.3, 0.4}, + {0.5, 0.6, 0.7, 0.8}, + {0.9, 1.0, 1.1, 1.2}}})); + + const auto newGrad = std::make_shared<Tensor>( + Array3D<cpptype_t<DataType::Float32>, 2, 3, 4>({{{ + {1.0, 1.0, 1.0, 1.0}, + {1.0, 1.0, 1.0, 1.0}, + {1.0, 1.0, 1.0, 1.0}, + }, + { + {1.0, 1.0, 1.0, 1.0}, + {1.0, 1.0, 1.0, 1.0}, + {1.0, 1.0, 1.0, 1.0}, + }}})); + + const Tensor expectedGrad0 = + Array3D<cpptype_t<DataType::Float32>, 2, 3, 4>({{{{0.1, 0.2, 0.3, 0.4}, + {0.5, 0.6, 0.7, 0.8}, + {0.9, 1.0, 1.1, 1.2}}, + {{0.1, 0.2, 0.3, 0.4}, + {0.5, 0.6, 0.7, 0.8}, + {0.9, 1.0, 1.1, 1.2}}}}); + + const Tensor expectedGrad1 = + Array2D<cpptype_t<DataType::Float32>, 3, 4>({{{14.0, 16.0, 18.0, 20.0}, + {22.0, 24.0, 26.0, 28.0}, + {30.0, 32.0, 34.0, 36.0}}}); + + op->associateInput(0, T0); + op->associateInput(1, T1); + op->getOutput(0)->setGrad(newGrad); + op->forwardDims(); + + op->backward(); + + REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0)); + REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1)); + } - myMul->backward(); + SECTION("Case 5: Tensors with random values") { - REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0)); - REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedGrad1)); + // Use random values + const std::vector<std::size_t> dims0 = {5, 2, 1, 7}; // First tensor + const std::vector<std::size_t> dims1 = {2, 6, 7}; // Second tensor + const std::vector<std::size_t> outputDims = {5, 2, 6, 7}; + + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<float> dist(0.1f, 1.0f); + + auto T0 = std::make_shared<Tensor>(dims0); + T0->setDataType(DataType::Float32); + T0->setBackend("cpu"); + float* input0Data = static_cast<float*>(T0->getImpl()->rawPtr()); + // Fill with random values + for (std::size_t i = 0; i < T0->size(); ++i) { + input0Data[i] = dist(gen); } - SECTION("Case 4: 3D and 2D tensors") { - const auto T0 = std::make_shared<Tensor>(Array3D<float, 2, 3, 4>( - { - { - { - {1.0, 2.0, 3.0, 4.0}, - {5.0, 6.0, 7.0, 8.0}, - {9.0, 10.0, 11.0, 12.0}, - }, - { - {13.0, 14.0, 15.0, 16.0}, - {17.0, 18.0, 19.0, 20.0}, - {21.0, 22.0, 23.0, 24.0}, - } - } - } - )); - - const auto T1 = std::make_shared<Tensor>(Array2D<float, 3, 4>( - { - { - {0.1, 0.2, 0.3, 0.4}, - {0.5, 0.6, 0.7, 0.8}, - {0.9, 1.0, 1.1, 1.2} - } - } - )); - - const auto newGrad = std::make_shared<Tensor>(Array3D<float, 2,3,4>( - { - { - { - {1.0, 1.0, 1.0, 1.0}, - {1.0, 1.0, 1.0, 1.0}, - {1.0, 1.0, 1.0, 1.0}, - }, - { - {1.0, 1.0, 1.0, 1.0}, - {1.0, 1.0, 1.0, 1.0}, - {1.0, 1.0, 1.0, 1.0}, - } - } - } - )); - - const auto expectedGrad0 = std::make_shared<Tensor>(Array3D<float,2,3,4>( - { - { - { - {0.1, 0.2, 0.3, 0.4}, - {0.5, 0.6, 0.7, 0.8}, - {0.9, 1.0, 1.1, 1.2} - }, - { - {0.1, 0.2, 0.3, 0.4}, - {0.5, 0.6, 0.7, 0.8}, - {0.9, 1.0, 1.1, 1.2} - } - } - } - )); - - 
const auto expectedGrad1 = std::make_shared<Tensor>(Array2D<float,3, 4>( - { - { - {14.0, 16.0, 18.0, 20.0}, - {22.0, 24.0, 26.0, 28.0}, - {30.0, 32.0, 34.0, 36.0} + auto T1 = std::make_shared<Tensor>(dims1); + T1->setDataType(DataType::Float32); + T1->setBackend("cpu"); + float* input1Data = static_cast<float*>(T1->getImpl()->rawPtr()); + // Fill with random values + for (std::size_t i = 0; i < T1->size(); ++i) { + input1Data[i] = dist(gen); + } + + op->associateInput(0, T0); + op->associateInput(1, T1); + + op->forwardDims(); + op->forward(); + + Tensor expectedOutput{outputDims}; + expectedOutput.setBackend("cpu"); + float* expectedOutputData = static_cast<float*>(expectedOutput.getImpl()->rawPtr()); + + for (std::size_t n = 0; n < 5; ++n) { + for (std::size_t c = 0; c < 2; ++c) { + for (std::size_t h = 0; h < 6; ++h) { + for (std::size_t w = 0; w < 7; ++w) { + std::size_t outIdx = w + 7 * (h + 6 * (c + 2 * n)); + std::size_t in0Idx = + w + 7 * (0 + 1 * (c + 2 * n)); // middle dim is 1 + std::size_t in1Idx = + w + 7 * (h + 6 * c); // no n dimension + + expectedOutputData[outIdx] = input0Data[in0Idx] * input1Data[in1Idx]; } } - )); - - for(const auto T: {T0, T1, newGrad, expectedGrad0, expectedGrad1}) - { - T->setBackend("cpu") ; - T->setDataType(DataType::Float32); } + } - op->associateInput(0, T0); - op->associateInput(1, T1); - op->getOutput(0)->setGrad(newGrad); - op->forwardDims(); + auto outputTensor = op->getOutput(0); - myMul->backward(); + REQUIRE(approxEq<float>(*outputTensor, expectedOutput)); - REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0)); - REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedGrad1)); + // Backward pass + std::vector<float> gradOutputData(expectedOutput.size()); + for (auto &val : gradOutputData) { + val = dist(gen); } + + op->getOutput(0)->setGrad(std::make_shared<Tensor>()); + op->getOutput(0)->grad()->resize(outputDims); + op->getOutput(0)->grad()->getImpl()->setRawPtr(gradOutputData.data(), + expectedOutput.size()); + + // Compute reference gradients + std::vector<float> expectedGrad0(T0->size(), 0.0f); + std::vector<float> expectedGrad1(T1->size(), 0.0f); + + for (std::size_t n = 0; n < 5; ++n) { + for (std::size_t c = 0; c < 2; ++c) { + for (std::size_t h = 0; h < 6; ++h) { + for (std::size_t w = 0; w < 7; ++w) { + std::size_t outIdx = w + 7 * (h + 6 * (c + 2 * n)); + std::size_t in0Idx = w + 7 * (0 + 1 * (c + 2 * n)); + std::size_t in1Idx = w + 7 * (h + 6 * c); + + // Gradient for input0: grad_output * input1 + expectedGrad0[in0Idx] += + gradOutputData[outIdx] * input1Data[in1Idx]; + + // Gradient for input1: grad_output * input0 + expectedGrad1[in1Idx] += + gradOutputData[outIdx] * input0Data[in0Idx]; + } + } + } + } + + // Perform backward pass + op->backward(); + + auto expectedGrad0Tensor = std::make_shared<Tensor>(); + expectedGrad0Tensor->resize(T0->dims()); + expectedGrad0Tensor->setBackend("cpu"); + expectedGrad0Tensor->setDataType(DataType::Float32); + expectedGrad0Tensor->getImpl()->setRawPtr(expectedGrad0.data(), + expectedGrad0.size()); + + auto expectedGrad1Tensor = std::make_shared<Tensor>(T1->dims()); + expectedGrad1Tensor->setBackend("cpu"); + expectedGrad1Tensor->setDataType(DataType::Float32); + expectedGrad1Tensor->getImpl()->setRawPtr(expectedGrad1.data(), + expectedGrad1.size()); + + // Verify backward pass + REQUIRE(approxEq<float>(*T0->grad(), *expectedGrad0Tensor)); + REQUIRE(approxEq<float>(*T1->grad(), *expectedGrad1Tensor)); + + // Optional: Print some values for verification + // std::cout << 
"Input shapes: (" << dims0[0] << "," << dims0[1] << + // "," << dims0[2] << "," << dims0[3] + // << ") * (" << dims1[0] << "," << dims1[1] << "," << + // dims1[2] + // << ") -> (" << outputDims[0] << "," << outputDims[1] + // << "," << outputDims[2] << "," << outputDims[3] << + // ")\n"; + // std::cout << "Input sizes: " << input0_size << " * " << + // input1_size << " -> " << output_size << "\n"; } +} -TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") { +TEST_CASE("[cpu/operator] Mul(forward)", "[Mul][CPU]") { constexpr std::uint16_t NBTRIALS = 10; // Create a random number generator std::random_device rd; std::mt19937 gen(rd()); - std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // Random float distribution between 0 and 1 - std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10)); - std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(3)); - std::uniform_int_distribution<int> boolDist(0,1); - - // Create MatMul Operator - std::shared_ptr<Node> myMul = Mul(); - auto op = std::static_pointer_cast<OperatorTensor>(myMul-> getOperator()); + std::uniform_real_distribution<float> valueDist( + 0.1f, + 1.1f); // Random float distribution between 0 and 1 + std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), + std::size_t(10)); + std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), + std::size_t(3)); + std::uniform_int_distribution<int> boolDist(0, 1); + + std::shared_ptr<Mul_Op> op = std::make_shared<Mul_Op>(); op->setDataType(DataType::Float32); op->setBackend("cpu"); - // Create 2 input Tensors std::shared_ptr<Tensor> T0 = std::make_shared<Tensor>(); - op->associateInput(0,T0); + op->associateInput(0, T0); T0->setDataType(DataType::Float32); T0->setBackend("cpu"); std::shared_ptr<Tensor> T1 = std::make_shared<Tensor>(); - op -> associateInput(1,T1); + op->associateInput(1, T1); T1->setDataType(DataType::Float32); T1->setBackend("cpu"); - // Create results Tensor std::shared_ptr<Tensor> Tres = std::make_shared<Tensor>(); Tres->setDataType(DataType::Float32); Tres->setBackend("cpu"); @@ -391,14 +348,9 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") { std::chrono::time_point<std::chrono::system_clock> end; std::chrono::duration<double, std::micro> duration{}; - SECTION("MulImpl_cpu::forward()") { - SECTION("Scalar / Scalar") { - - } - SECTION("Scalar / +1-D Tensor") { - - } + SECTION("Scalar / Scalar") {} + SECTION("Scalar / +1-D Tensor") {} SECTION("+1-D Tensor / +1-D Tensor - same dimensions") { std::size_t number_of_operation = 0; @@ -413,13 +365,17 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") { dims.push_back(dimSizeDist(gen)); } - const auto nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const auto nb_elements = + std::accumulate(dims.cbegin(), + dims.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); number_of_operation += nb_elements; // without broadcasting - float* array0 = new float[nb_elements]; - float* array1 = new float[nb_elements]; - float* result = new float[nb_elements]; + float *array0 = new float[nb_elements]; + float *array1 = new float[nb_elements]; + float *result = new float[nb_elements]; for (std::size_t i = 0; i < nb_elements; ++i) { array0[i] = valueDist(gen); @@ -429,21 +385,23 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") { // input0 T0->resize(dims); - T0 -> getImpl() -> setRawPtr(array0, nb_elements); + T0->getImpl()->setRawPtr(array0, nb_elements); // input1 T1->resize(dims); - T1 -> 
getImpl() -> setRawPtr(array1, nb_elements); + T1->getImpl()->setRawPtr(array1, nb_elements); // results Tres->resize(dims); - Tres -> getImpl() -> setRawPtr(result, nb_elements); + Tres->getImpl()->setRawPtr(result, nb_elements); op->forwardDims(); start = std::chrono::system_clock::now(); - myMul->forward(); + op->forward(); end = std::chrono::system_clock::now(); - duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + duration += + std::chrono::duration_cast<std::chrono::microseconds>( + end - start); REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres)); @@ -451,24 +409,23 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") { delete[] array1; delete[] result; } - std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl; - std::cout << "total time: " << duration.count() << "μs" << std::endl; + Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count())); + Log::info("total time: {}μs\n", duration.count()); } - SECTION("+1-D Tensor / +1-D Tensor - broadcasting") { std::size_t number_of_operation = 0; for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { // generate 2 random Tensors - // handle dimensions, replace some dimensions with '1' to get broadcasting + // handle dimensions, replace some dimensions with '1' to get + // broadcasting constexpr std::size_t nbDims = 4; std::vector<std::size_t> dimensions; - for (std::size_t i = 0; i < nbDims; ++i) - { + for (std::size_t i = 0; i < nbDims; ++i) { dimensions.push_back(dimSizeDist(gen)); } @@ -476,77 +433,90 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") { auto dims1 = dimensions; auto dimsOut = dimensions; - for (std::size_t i = 0; i < nbDims; ++i) - { - if (boolDist(gen)) - { + for (std::size_t i = 0; i < nbDims; ++i) { + if (boolDist(gen)) { dims0[i] = 1; } - if (boolDist(gen)) - { + if (boolDist(gen)) { dims1[i] = 1; } dimsOut[i] = (dims0[i] == 1) ? dims1[i] : dims0[i]; } - for(auto dim : dims0) - { + for (auto dim : dims0) { Log::info("Dimension of input 0 : {}", dim); } - for(auto dim : dims1) - { + for (auto dim : dims1) { Log::info("Dimension of input 1 : {}", dim); } // create arrays and fill them with random values - float* array0 = new float[dims0[0]*dims0[1]*dims0[2]*dims0[3]]; - float* array1 = new float[dims1[0]*dims1[1]*dims1[2]*dims1[3]]; - float* result = new float[dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]]; - - - for (std::size_t i = 0; i < dims0[0]*dims0[1]*dims0[2]*dims0[3]; ++i) - { + float *array0 = + new float[dims0[0] * dims0[1] * dims0[2] * dims0[3]]; + float *array1 = + new float[dims1[0] * dims1[1] * dims1[2] * dims1[3]]; + float *result = new float[dimsOut[0] * dimsOut[1] * + dimsOut[2] * dimsOut[3]]; + + for (std::size_t i = 0; + i < dims0[0] * dims0[1] * dims0[2] * dims0[3]; + ++i) { array0[i] = valueDist(gen); } - for (std::size_t i = 0; i < dims1[0]*dims1[1]*dims1[2]*dims1[3]; ++i) - { + for (std::size_t i = 0; + i < dims1[0] * dims1[1] * dims1[2] * dims1[3]; + ++i) { array1[i] = valueDist(gen); } // compute true result - const std::size_t strides0[nbDims] = {dims0[1]*dims0[2]*dims0[3], dims0[2]*dims0[3], dims0[3], 1}; - const std::size_t strides1[nbDims] = {dims1[1]*dims1[2]*dims1[3], dims1[2]*dims1[3], dims1[3], 1}; - - for (std::size_t a = 0; a < dimsOut[0]; ++a) - { - for (std::size_t b = 0; b < dimsOut[1]; ++b) - { - const std::size_t idx0_0 = strides0[0] * ((dims0[0] > 1) ? a : 0) - + strides0[1] * ((dims0[1] > 1) ? 
b : 0); - - const std::size_t idx1_0 = strides1[0] * ((dims1[0] > 1) ? a : 0) - + strides1[1] * ((dims1[1] > 1) ? b : 0); - - for (std::size_t c = 0; c < dimsOut[2]; ++c) - { - const std::size_t idx_out = dimsOut[3] * (c + dimsOut[2] * (b + dimsOut[1] * a)); - - for (std::size_t d = 0; d < dimsOut[3]; ++d) - { - std::size_t idx0 = idx0_0 - + strides0[2] * ((dims0[2] > 1) ? c : 0) - + ((dims0[3] > 1) ? d : 0); - - std::size_t idx1 = idx1_0 - + strides1[2] * ((dims1[2] > 1) ? c : 0) - + ((dims1[3] > 1) ? d : 0); - - result[idx_out + d] = array0[idx0] * array1[idx1]; - // std::cout << "(" << idx0 << ", " << idx1 << ") -> " << array0[idx0] << " * " << array1[idx1] << " -> " << idx_out + d << std::endl; + const std::size_t strides0[nbDims] = { + dims0[1] * dims0[2] * dims0[3], + dims0[2] * dims0[3], + dims0[3], + 1}; + const std::size_t strides1[nbDims] = { + dims1[1] * dims1[2] * dims1[3], + dims1[2] * dims1[3], + dims1[3], + 1}; + + for (std::size_t a = 0; a < dimsOut[0]; ++a) { + for (std::size_t b = 0; b < dimsOut[1]; ++b) { + const std::size_t idx0_0 = + strides0[0] * ((dims0[0] > 1) ? a : 0) + + strides0[1] * ((dims0[1] > 1) ? b : 0); + + const std::size_t idx1_0 = + strides1[0] * ((dims1[0] > 1) ? a : 0) + + strides1[1] * ((dims1[1] > 1) ? b : 0); + + for (std::size_t c = 0; c < dimsOut[2]; ++c) { + const std::size_t idx_out = + dimsOut[3] * + (c + dimsOut[2] * (b + dimsOut[1] * a)); + + for (std::size_t d = 0; d < dimsOut[3]; ++d) { + std::size_t idx0 = + idx0_0 + + strides0[2] * ((dims0[2] > 1) ? c : 0) + + ((dims0[3] > 1) ? d : 0); + + std::size_t idx1 = + idx1_0 + + strides1[2] * ((dims1[2] > 1) ? c : 0) + + ((dims1[3] > 1) ? d : 0); + + result[idx_out + d] = + array0[idx0] * array1[idx1]; + // std::cout << "(" << idx0 << ", " << idx1 << + // ") -> " << array0[idx0] << " * " << + // array1[idx1] << " -> " << idx_out + d << + // std::endl; } } } @@ -555,22 +525,30 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") { // conversion to Aidge::Tensors // input0 T0->resize(dims0); - T0 -> getImpl() -> setRawPtr(array0, dims0[0]*dims0[1]*dims0[2]*dims0[3]); + T0->getImpl()->setRawPtr( + array0, + dims0[0] * dims0[1] * dims0[2] * dims0[3]); // input1 T1->resize(dims1); - T1 -> getImpl() -> setRawPtr(array1, dims1[0]*dims1[1]*dims1[2]*dims1[3]); + T1->getImpl()->setRawPtr( + array1, + dims1[0] * dims1[1] * dims1[2] * dims1[3]); // results Tres->resize(dimsOut); - Tres -> getImpl() -> setRawPtr(result, dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]); + Tres->getImpl()->setRawPtr( + result, + dimsOut[0] * dimsOut[1] * dimsOut[2] * dimsOut[3]); // compute result op->forwardDims(); start = std::chrono::system_clock::now(); - myMul->forward(); + op->forward(); end = std::chrono::system_clock::now(); - duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + duration += + std::chrono::duration_cast<std::chrono::microseconds>( + end - start); // comparison between truth and computed result REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres)); @@ -579,15 +557,21 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") { delete[] array1; delete[] result; - const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t nb_elements = + std::accumulate(dimsOut.cbegin(), + dimsOut.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); number_of_operation += nb_elements; } - std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl; - std::cout << 
"total time: " << duration.count() << "μs" << std::endl; + Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count())); + Log::info("total time: {}μs\n", duration.count()); } SECTION("+1-D Tensor / 1-D Tensor") { std::size_t number_of_operation = 0; - std::uniform_int_distribution<std::size_t> nbRemovedDimsDist(std::size_t(1), std::size_t(3)); + std::uniform_int_distribution<std::size_t> nbRemovedDimsDist( + std::size_t(1), + std::size_t(3)); for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { // generate 2 random Tensors @@ -604,15 +588,24 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") { dims1[i] = 1; } } - dims1.erase(dims1.cbegin(), dims1.cbegin() + nbRemovedDimsDist(gen)); + dims1.erase(dims1.cbegin(), + dims1.cbegin() + nbRemovedDimsDist(gen)); // create arrays and fill them with random values - float* array0 = new float[dims0[0]*dims0[1]*dims0[2]*dims0[3]]; - std::size_t array1_size = std::accumulate(dims1.cbegin(), dims1.cend(), std::size_t(1), std::multiplies<std::size_t>()); - float* array1 = new float[array1_size]; - float* result = new float[dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]]; - - for (std::size_t i = 0; i < (dims0[0]*dims0[1]*dims0[2]*dims0[3]); ++i) { + float *array0 = + new float[dims0[0] * dims0[1] * dims0[2] * dims0[3]]; + std::size_t array1_size = + std::accumulate(dims1.cbegin(), + dims1.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); + float *array1 = new float[array1_size]; + float *result = new float[dimsOut[0] * dimsOut[1] * + dimsOut[2] * dimsOut[3]]; + + for (std::size_t i = 0; + i < (dims0[0] * dims0[1] * dims0[2] * dims0[3]); + ++i) { array0[i] = valueDist(gen); } for (std::size_t i = 0; i < array1_size; ++i) { @@ -621,27 +614,48 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") { // compute true result auto dims1_tmp = dims1; - dims1_tmp.insert(dims1_tmp.cbegin(), 4 - dims1_tmp.size(), std::size_t(1)); - - const std::size_t strides0[nbDims] = {dims0[1]*dims0[2]*dims0[3], dims0[2]*dims0[3], dims0[3], 1}; - const std::size_t strides1[nbDims] = {dims1_tmp[1]*dims1_tmp[2]*dims1_tmp[3], dims1_tmp[2]*dims1_tmp[3], dims1_tmp[3], 1}; + dims1_tmp.insert(dims1_tmp.cbegin(), + 4 - dims1_tmp.size(), + std::size_t(1)); + + const std::size_t strides0[nbDims] = { + dims0[1] * dims0[2] * dims0[3], + dims0[2] * dims0[3], + dims0[3], + 1}; + const std::size_t strides1[nbDims] = { + dims1_tmp[1] * dims1_tmp[2] * dims1_tmp[3], + dims1_tmp[2] * dims1_tmp[3], + dims1_tmp[3], + 1}; for (std::size_t a = 0; a < dimsOut[0]; ++a) { for (std::size_t b = 0; b < dimsOut[1]; ++b) { - const std::size_t idx0_0 = strides0[0] * ((dims0[0] > 1) ? a : 0) - + strides0[1] * ((dims0[1] > 1) ? b : 0); - const std::size_t idx1_0 = strides1[0] * ((dims1_tmp[0] > 1) ? a : 0) - + strides1[1] * ((dims1_tmp[1] > 1) ? b : 0); + const std::size_t idx0_0 = + strides0[0] * ((dims0[0] > 1) ? a : 0) + + strides0[1] * ((dims0[1] > 1) ? b : 0); + const std::size_t idx1_0 = + strides1[0] * ((dims1_tmp[0] > 1) ? a : 0) + + strides1[1] * ((dims1_tmp[1] > 1) ? b : 0); for (std::size_t c = 0; c < dimsOut[2]; ++c) { - const std::size_t idx_out = dimsOut[3] * (c + dimsOut[2] * (b + dimsOut[1] * a)); + const std::size_t idx_out = + dimsOut[3] * + (c + dimsOut[2] * (b + dimsOut[1] * a)); for (std::size_t d = 0; d < dimsOut[3]; ++d) { - std::size_t idx0 = idx0_0 - + strides0[2] * ((dims0[2] > 1) ? c : 0) - + ((dims0[3] > 1) ? d : 0); - std::size_t idx1 = idx1_0 - + strides1[2] * ((dims1_tmp[2] > 1) ? c : 0) - + ((dims1_tmp[3] > 1) ? 
d : 0); - result[idx_out + d] = array0[idx0] * array1[idx1]; - // std::cout << "(" << idx0 << ", " << idx1 << ") -> " << array0[idx0] << " * " << array1[idx1] << " -> " << idx_out + d << std::endl; + std::size_t idx0 = + idx0_0 + + strides0[2] * ((dims0[2] > 1) ? c : 0) + + ((dims0[3] > 1) ? d : 0); + std::size_t idx1 = + idx1_0 + + strides1[2] * + ((dims1_tmp[2] > 1) ? c : 0) + + ((dims1_tmp[3] > 1) ? d : 0); + result[idx_out + d] = + array0[idx0] * array1[idx1]; + // std::cout << "(" << idx0 << ", " << idx1 << + // ") -> " << array0[idx0] << " * " << + // array1[idx1] << " -> " << idx_out + d << + // std::endl; } } } @@ -650,22 +664,28 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") { // conversion to Aidge::Tensors // input0 T0->resize(dims0); - T0 -> getImpl() -> setRawPtr(array0, dims0[0]*dims0[1]*dims0[2]*dims0[3]); + T0->getImpl()->setRawPtr( + array0, + dims0[0] * dims0[1] * dims0[2] * dims0[3]); // input1 T1->resize(dims1); - T1 -> getImpl() -> setRawPtr(array1, array1_size); + T1->getImpl()->setRawPtr(array1, array1_size); // results Tres->resize(dimsOut); - Tres -> getImpl() -> setRawPtr(result, dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]); + Tres->getImpl()->setRawPtr( + result, + dimsOut[0] * dimsOut[1] * dimsOut[2] * dimsOut[3]); // compute result op->forwardDims(); start = std::chrono::system_clock::now(); - myMul->forward(); + op->forward(); end = std::chrono::system_clock::now(); - duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + duration += + std::chrono::duration_cast<std::chrono::microseconds>( + end - start); // comparison between truth and computed result REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres)); @@ -674,13 +694,18 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") { delete[] array1; delete[] result; - const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t nb_elements = + std::accumulate(dimsOut.cbegin(), + dimsOut.cend(), + std::size_t(1), + std::multiplies<std::size_t>()); number_of_operation += nb_elements; } - std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl; - std::cout << "total time: " << duration.count() << "μs" << std::endl; + Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count())); + Log::info("total time: {}μs\n", duration.count()); } } } } // namespace Aidge + diff --git a/unit_tests/operator/Test_PowImpl.cpp b/unit_tests/operator/Test_PowImpl.cpp index cb5d8872c9c7242bb4aa4efca388d53b578417f9..55a416c3f404506359e06f9937dd958503236901 100644 --- a/unit_tests/operator/Test_PowImpl.cpp +++ b/unit_tests/operator/Test_PowImpl.cpp @@ -9,18 +9,26 @@ * ********************************************************************************/ -#include <catch2/catch_test_macros.hpp> -#include <cmath> -#include <cstddef> // std::size_t -#include <cstdint> // std::uint16_t -#include <chrono> -#include <iostream> +#include <chrono> // std::micro, std::chrono::time_point, + // std::chrono::system_clock, std::chrono::duration +#include <cstddef> // std::size_t +#include <cstdint> // std::uint16_t +#include <functional> // std::multiplies #include <memory> -#include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937 + // std::uniform_int_distribution, std::uniform_real_distribution 
+#include <vector> + +#include <catch2/catch_test_macros.hpp> +#include <fmt/core.h> +#include "aidge/backend/cpu/data/TensorImpl.hpp" +#include "aidge/backend/cpu/operator/PowImpl.hpp" +#include "aidge/data/Data.hpp" #include "aidge/data/Tensor.hpp" #include "aidge/operator/Pow.hpp" +#include "aidge/utils/ArrayHelpers.hpp" #include "aidge/utils/TensorUtils.hpp" namespace Aidge { @@ -118,8 +126,8 @@ TEST_CASE("[cpu/operator] Pow", "[Pow][CPU]") { // with broadcasting } - std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl; - std::cout << "total time: " << duration.count() << "μs" << std::endl; + Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count())); + Log::info("total time: {} μs\n", duration.count()); } SECTION("+1-D Tensor / +1-D Tensor - broadcasting") { @@ -213,8 +221,8 @@ TEST_CASE("[cpu/operator] Pow", "[Pow][CPU]") { const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>()); number_of_operation += nb_elements; } - std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl; - std::cout << "total time: " << duration.count() << "μs" << std::endl; + Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count())); + Log::info("total time: {} μs\n", duration.count()); } SECTION("+1-D Tensor / 1-D Tensor") { std::size_t number_of_operation = 0; @@ -309,8 +317,8 @@ TEST_CASE("[cpu/operator] Pow", "[Pow][CPU]") { number_of_operation += nb_elements; } - std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl; - std::cout << "total time: " << duration.count() << "μs" << std::endl; + Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count())); + Log::info("total time: {} μs\n", duration.count()); } } @@ -440,7 +448,7 @@ TEST_CASE("[cpu/operator] Pow", "[Pow][CPU]") { } } )); - const auto expectedGrad0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>( + const Tensor expectedGrad0 = Array3D<float, 2, 2, 3>( { { { @@ -453,18 +461,13 @@ TEST_CASE("[cpu/operator] Pow", "[Pow][CPU]") { } } } - )); - const auto expectedGrad1 = std::make_shared<Tensor>(Array1D<float, 3>( + ); + const Tensor expectedGrad1 = Array1D<float, 3>( { {14.14779854, 22.99299049, 33.56402588} } - )); + ); - for(const auto T: {input0, input1, gradOut, expectedGrad0, expectedGrad1}) - { - T->setBackend("cpu") ; - T->setDataType(DataType::Float32); - } std::shared_ptr<Node> powOp = Pow(); auto opr = std::static_pointer_cast<OperatorTensor>(powOp-> getOperator()); opr->setDataType(DataType::Float32); @@ -475,8 +478,8 @@ TEST_CASE("[cpu/operator] Pow", "[Pow][CPU]") { powOp->forward(); powOp->backward(); - REQUIRE(approxEq<float>(*(opr->getInput(0)->grad()), *expectedGrad0)); - REQUIRE(approxEq<float>(*(opr->getInput(1)->grad()), *expectedGrad1)); + REQUIRE(approxEq<float>(*(opr->getInput(0)->grad()), expectedGrad0)); + REQUIRE(approxEq<float>(*(opr->getInput(1)->grad()), expectedGrad1)); } } } diff --git a/unit_tests/operator/Test_ReduceSumImpl.cpp b/unit_tests/operator/Test_ReduceSumImpl.cpp index 49569d1f65ff6c51f9681632b16375605ab326e7..0aa543da4f9d55dfa672790a5d99467cd794001a 100644 --- a/unit_tests/operator/Test_ReduceSumImpl.cpp +++ b/unit_tests/operator/Test_ReduceSumImpl.cpp @@ -9,17 +9,22 @@ * ********************************************************************************/ -#include 
<catch2/catch_test_macros.hpp> +#include <cstddef> // std::size_t +#include <cstdint> // std::uint16_t, std::int32_t #include <memory> -#include <numeric> // std::accumulate #include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <vector> + +#include <catch2/catch_test_macros.hpp> +#include "aidge/backend/cpu/data/TensorImpl.hpp" +#include "aidge/data/Data.hpp" // DataType #include "aidge/data/Tensor.hpp" +#include "aidge/graph/Node.hpp" +#include "aidge/operator/OperatorTensor.hpp" #include "aidge/operator/ReduceSum.hpp" -#include "aidge/operator/Conv.hpp" - -#include "aidge/backend/cpu.hpp" #include "aidge/utils/TensorUtils.hpp" +#include "aidge/utils/Types.h" using namespace Aidge; @@ -112,7 +117,7 @@ TEST_CASE("[cpu/operator] ReduceSum(forward)", "[ReduceSum][CPU]") { std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(dims); myInput->setBackend("cpu"); myInput->setDataType(DataType::Float32); - std::shared_ptr<Node> myReduceSum = ReduceSum(std::vector<int32_t>{}, false, true); + std::shared_ptr<Node> myReduceSum = ReduceSum(std::vector<std::int32_t>{}, false, true); auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator()); op->associateInput(0,myInput); op->setDataType(DataType::Float32); diff --git a/unit_tests/operator/Test_RoundImpl.cpp b/unit_tests/operator/Test_RoundImpl.cpp index b4cf9ffbedc18b35b42ebbc05971f86e0fa584e3..e658b0616683633ce19b2284abb9d4fae7942a23 100644 --- a/unit_tests/operator/Test_RoundImpl.cpp +++ b/unit_tests/operator/Test_RoundImpl.cpp @@ -9,15 +9,23 @@ * ********************************************************************************/ -#include <catch2/catch_test_macros.hpp> -#include <cstddef> // std::size_t -#include <cstdint> // std::uint16_t -#include <chrono> -#include <iostream> +#include <chrono> // std::micro, std::chrono::time_point, + // std::chrono::system_clock, std::chrono::duration +#include <cstddef> // std::size_t +#include <cstdint> // std::uint16_t +#include <functional> // std::multiplies #include <memory> -#include <numeric> -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution -#include <iomanip> +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937 + // std::uniform_int_distribution, std::uniform_real_distribution +#include <vector> + +#include <catch2/catch_test_macros.hpp> +#include <fmt/core.h> + +#include "aidge/backend/cpu/data/TensorImpl.hpp" +#include "aidge/backend/cpu/operator/RoundImpl.hpp" +#include "aidge/data/Data.hpp" #include "aidge/data/Tensor.hpp" #include "aidge/operator/Round.hpp" #include "aidge/utils/TensorUtils.hpp" @@ -29,7 +37,7 @@ TEST_CASE("[cpu/operator] Round_Test", "[Round][CPU]") { // Create a random number generator std::random_device rd; std::mt19937 gen(rd()); - std::uniform_real_distribution<float> valueDist(-15, 15); + std::uniform_real_distribution<float> valueDist(-15, 15); std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(5)); std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(3)); @@ -59,7 +67,7 @@ TEST_CASE("[cpu/operator] Round_Test", "[Round][CPU]") { std::size_t number_of_operation = 0; for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { - + // generate 2 random Tensors const std::size_t nbDims = nbDimsDist(gen); std::vector<std::size_t> dims; @@ -72,7 +80,7 @@ TEST_CASE("[cpu/operator] Round_Test", "[Round][CPU]") { // without broadcasting float* array0 = new float[nb_elements]; 
float* result = new float[nb_elements]; - + for (std::size_t i = 0; i < nb_elements; ++i) { array0[i] = valueDist(gen); result[i] = std::nearbyint(array0[i]); @@ -86,29 +94,22 @@ TEST_CASE("[cpu/operator] Round_Test", "[Round][CPU]") { // results Tres->resize(dims); Tres -> getImpl() -> setRawPtr(result, nb_elements); - + op->forwardDims(); start = std::chrono::system_clock::now(); myRound->forward(); end = std::chrono::system_clock::now(); duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); - bool is_eq = approxEq<float>(*(op->getOutput(0)), *Tres); - - auto Output = *(op->getOutput(0)); - - auto prt = Output.getImpl()->rawPtr(); - - REQUIRE(is_eq); - + REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres)); delete[] array0; delete[] result; } - std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl; - std::cout << "total time: " << duration.count() << "μs" << std::endl; + Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count())); + Log::info("total time: {} μs\n", duration.count()); } } } // namespace Aidge diff --git a/unit_tests/operator/Test_SubImpl.cpp b/unit_tests/operator/Test_SubImpl.cpp index 44666ae631152c8898e24f7003b0c2ede8c67b84..1317e88a371e9a6e7a3deae5b7f662a9cd879a60 100644 --- a/unit_tests/operator/Test_SubImpl.cpp +++ b/unit_tests/operator/Test_SubImpl.cpp @@ -9,17 +9,26 @@ * ********************************************************************************/ -#include <catch2/catch_test_macros.hpp> -#include <cstddef> // std::size_t -#include <cstdint> // std::uint16_t -#include <chrono> -#include <iostream> +#include <chrono> // std::micro, std::chrono::time_point, + // std::chrono::system_clock +#include <cstddef> // std::size_t +#include <cstdint> // std::uint16_t +#include <functional> // std::multiplies #include <memory> -#include <numeric> // std::accumulate -#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution +#include <numeric> // std::accumulate +#include <random> // std::random_device, std::mt19937 + // std::uniform_int_distribution, std::uniform_real_distribution +#include <vector> + +#include <catch2/catch_test_macros.hpp> +#include <fmt/core.h> +#include "aidge/backend/cpu/data/TensorImpl.hpp" +#include "aidge/backend/cpu/operator/SubImpl.hpp" +#include "aidge/data/Data.hpp" #include "aidge/data/Tensor.hpp" #include "aidge/operator/Sub.hpp" +#include "aidge/operator/OperatorTensor.hpp" #include "aidge/utils/TensorUtils.hpp" namespace Aidge { @@ -117,8 +126,8 @@ TEST_CASE("[cpu/operator] Sub", "[Sub][CPU]") { // with broadcasting } - std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl; - std::cout << "total time: " << duration.count() << "μs" << std::endl; + Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count())); + Log::info("total time: {}μs\n", duration.count()); } SECTION("+1-D Tensor / +1-D Tensor - broadcasting") { @@ -212,8 +221,8 @@ TEST_CASE("[cpu/operator] Sub", "[Sub][CPU]") { const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>()); number_of_operation += nb_elements; } - std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl; - std::cout << "total time: " << duration.count() << "μs" << std::endl; + Log::info("number of elements over time spent: {}\n", (number_of_operation / 
duration.count()));
+            Log::info("total time: {}μs\n", duration.count());
        }

        SECTION("+1-D Tensor / 1-D Tensor") {
            std::size_t number_of_operation = 0;
@@ -308,8 +317,8 @@ TEST_CASE("[cpu/operator] Sub", "[Sub][CPU]") {
                number_of_operation += nb_elements;
            }

-            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
-            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {}μs\n", duration.count());
        }
    }
}
diff --git a/unit_tests/operator/Test_WeightInterleavingImpl.cpp b/unit_tests/operator/Test_WeightInterleavingImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c95c8fca19eb79eb78fc19e93ded3383054383e7
--- /dev/null
+++ b/unit_tests/operator/Test_WeightInterleavingImpl.cpp
@@ -0,0 +1,436 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <catch2/catch_test_macros.hpp>
+
+#include "aidge/data/Tensor.hpp"
+#include "aidge/operator/WeightInterleaving.hpp"
+#include "aidge/recipes/Recipes.hpp"
+#include "aidge/utils/TensorUtils.hpp"
+
+#include "aidge/backend/cpu.hpp"
+
+#include <memory>
+
+using namespace Aidge;
+
+TEST_CASE("[cpu/operator] WeightInterleaving", "[WeightInterleaving][CPU]") {
+
+    std::shared_ptr<Node> myWeightInterleaving = WeightInterleaving();
+    auto opWeightInterleaving = std::static_pointer_cast<WeightInterleaving_Op>(myWeightInterleaving -> getOperator());
+
+    // The expected sizes below are consistent with packing each value into a
+    // slot whose width is nb_bits rounded up to a power of two (so 3-bit
+    // values occupy 4-bit slots), i.e.
+    // compactDataSize(n, nb_bits) == ceil(n / (8 / slotWidth)).
+
+    SECTION("CompactDataSize - Single element cases") {
+        REQUIRE(opWeightInterleaving->compactDataSize(1, 1) == 1); // 1 bit, needs 1 byte
+        REQUIRE(opWeightInterleaving->compactDataSize(1, 7) == 1); // 7 bits, needs 1 byte
+    }
+
+    SECTION("CompactDataSize - Boundary cases for different nb_bits values") {
+        REQUIRE(opWeightInterleaving->compactDataSize(8, 1) == 1); // 8 elements at 1 bit each, fits in 1 byte
+        REQUIRE(opWeightInterleaving->compactDataSize(8, 2) == 2); // 8 elements at 2 bits each, needs 2 bytes
+        REQUIRE(opWeightInterleaving->compactDataSize(8, 3) == 4); // 8 elements at 3 bits each, needs 4 bytes
+        REQUIRE(opWeightInterleaving->compactDataSize(8, 4) == 4); // 8 elements at 4 bits each, needs 4 bytes
+    }
+
+    SECTION("CompactDataSize - Larger dataSize values") {
+        REQUIRE(opWeightInterleaving->compactDataSize(16, 1) == 2); // 16 elements at 1 bit each, fits in 2 bytes
+        REQUIRE(opWeightInterleaving->compactDataSize(16, 2) == 4); // 16 elements at 2 bits each, needs 4 bytes
+        REQUIRE(opWeightInterleaving->compactDataSize(16, 3) == 8); // 16 elements at 3 bits each, needs 8 bytes (3-bit values occupy 4-bit slots)
+        REQUIRE(opWeightInterleaving->compactDataSize(16, 4) == 8); // 16 elements at 4 bits each, needs 8 bytes
+    }
+
+    SECTION("CompactDataSize - Odd dataSize values with varying nb_bits") {
+        REQUIRE(opWeightInterleaving->compactDataSize(7, 1) == 1); // 7 elements at 1 bit each, fits in 1 byte
+        REQUIRE(opWeightInterleaving->compactDataSize(7, 2) == 2); // 7 elements at 2 bits each, needs 2 bytes
+        REQUIRE(opWeightInterleaving->compactDataSize(7, 3) == 4); // 7 elements at 3 bits each, needs 4 bytes
+        
REQUIRE(opWeightInterleaving->compactDataSize(7, 4) == 4); // 7 elements at 4 bits each, needs 4 bytes + } + + SECTION("CompactDataSize - Minimum and maximum values for nb_bits") { + REQUIRE(opWeightInterleaving->compactDataSize(5, 1) == 1); // 5 elements at 1 bit each, fits in 1 byte + } + + SECTION("CompactDataSize - Edge Case - dataSize of 0 should result in 0 required size") { + REQUIRE(opWeightInterleaving->compactDataSize(0, 1) == 0); // No data elements + } + + + SECTION("CompactData - 4-bit compaction") { + std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array1D<std::int8_t, 4>{ + {static_cast<std::int8_t>(0x0F), + static_cast<std::int8_t>(0xF5), + static_cast<std::int8_t>(0xB3), + static_cast<std::int8_t>(0x9C)} + }); + + weight->setDataFormat(Aidge::DataFormat::NHWC); + weight->setDataType(Aidge::DataType::Int4); + + std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array1D<std::int8_t, 2>{ + {static_cast<int8_t>(0xF5), + static_cast<int8_t>(0x3C)} + }); + + expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC); + expectedWeightInterleaving->setDataType(WeightInterleavedType_v<Aidge::DataType::Int4>); + + std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving(); + auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator()); + op->associateInput(0,weight); + op->setDataType(WeightInterleavedType_v<Aidge::DataType::Int4>); + op->setDataFormat(DataFormat::NHWC); + op->setBackend("cpu"); + myWeightInterleavingNode->forward(); + REQUIRE(*(op->getOutput(0)) == *expectedWeightInterleaving); + } + + SECTION("CompactData - 3-bit compaction") { + std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array1D<std::int8_t, 4>{ + {static_cast<int8_t>(0x0F), + static_cast<int8_t>(0x05), + static_cast<int8_t>(0x04), + static_cast<int8_t>(0xD3)} + }); + + weight->setDataFormat(Aidge::DataFormat::NHWC); + weight->setDataType(Aidge::DataType::Int3); + + std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array1D<std::int8_t, 2>{ + {static_cast<int8_t>(0x75), + static_cast<int8_t>(0x43)} + }); + + expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC); + expectedWeightInterleaving->setDataType(WeightInterleavedType_v<Aidge::DataType::Int3>); + + std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving(); + auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator()); + op->associateInput(0,weight); + op->setDataType(WeightInterleavedType_v<Aidge::DataType::Int3>); + op->setDataFormat(DataFormat::NHWC); + op->setBackend("cpu"); + myWeightInterleavingNode->forward(); + REQUIRE(*(op->getOutput(0)) == *expectedWeightInterleaving); + } + + SECTION("CompactData - 2-bit compaction") { + std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array1D<std::int8_t, 4>{ + {static_cast<std::int8_t>(0x03), + static_cast<std::int8_t>(0x02), + static_cast<std::int8_t>(0x01), + static_cast<std::int8_t>(0x00)} + }); + + weight->setDataFormat(Aidge::DataFormat::NHWC); + weight->setDataType(Aidge::DataType::Int2); + + std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array1D<std::int8_t, 1>{ + {static_cast<int8_t>(0xE4)} + }); + + expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC); + expectedWeightInterleaving->setDataType(WeightInterleavedType_v<Aidge::DataType::Int2>); + + std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving(); + auto op = 
std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator()); + op->associateInput(0,weight); + op->setDataType(WeightInterleavedType_v<Aidge::DataType::Int2>); + op->setDataFormat(DataFormat::NHWC); + op->setBackend("cpu"); + myWeightInterleavingNode->forward(); + REQUIRE(*(op->getOutput(0)) == *expectedWeightInterleaving); + } + + SECTION("CompactData - Edge Cases - Single element data") { + std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array1D<std::int8_t, 1>{ + {static_cast<int8_t>(0x0F)} + }); + + weight->setDataFormat(Aidge::DataFormat::NHWC); + weight->setDataType(Aidge::DataType::Int4); + + std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array1D<std::int8_t, 1>{ + {static_cast<int8_t>(0xF0)} + }); + + expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC); + expectedWeightInterleaving->setDataType(WeightInterleavedType_v<Aidge::DataType::Int4>); + + std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving(); + auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator()); + op->associateInput(0,weight); + op->setDataType(WeightInterleavedType_v<Aidge::DataType::Int4>); + op->setDataFormat(DataFormat::NHWC); + op->setBackend("cpu"); + myWeightInterleavingNode->forward(); + REQUIRE(*(op->getOutput(0)) == *expectedWeightInterleaving); + } + + SECTION("CompactData - Edge Cases - Non-divisible dataSize for nbSlot with nbbits=4") { + std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array1D<std::int8_t, 3>{ + {static_cast<int8_t>(0x0F), + static_cast<int8_t>(0xA5), + static_cast<int8_t>(0x34)} + }); + + weight->setDataFormat(Aidge::DataFormat::NHWC); + weight->setDataType(Aidge::DataType::Int4); + + std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array1D<std::int8_t, 2>{ + {static_cast<int8_t>(0xF5), + static_cast<int8_t>(0x40)} + }); + + expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC); + expectedWeightInterleaving->setDataType(WeightInterleavedType_v<Aidge::DataType::Int4>); + + std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving(); + auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator()); + op->associateInput(0,weight); + op->setDataType(WeightInterleavedType_v<Aidge::DataType::Int4>); + op->setDataFormat(DataFormat::NHWC); + op->setBackend("cpu"); + myWeightInterleavingNode->forward(); + REQUIRE(*(op->getOutput(0)) == *expectedWeightInterleaving); + + } + + SECTION("CompactData - Edge Cases - Non-divisible dataSize for nbSlot with nbbits=3") { + + std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array1D<std::int8_t, 3>{ + {static_cast<int8_t>(0x0F), + static_cast<int8_t>(0x05), + static_cast<int8_t>(0x04)} + }); + + weight->setDataFormat(Aidge::DataFormat::NHWC); + weight->setDataType(Aidge::DataType::Int3); + + std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array1D<std::int8_t, 2>{ + {static_cast<int8_t>(0x75), + static_cast<int8_t>(0x40)} + }); + + expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC); + expectedWeightInterleaving->setDataType(WeightInterleavedType_v<Aidge::DataType::Int3>); + + std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving(); + auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator()); + op->associateInput(0,weight); + op->setDataType(WeightInterleavedType_v<Aidge::DataType::Int3>); + op->setDataFormat(DataFormat::NHWC); + 
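+        // 3-bit values still occupy 4-bit slots: the low bits of
+        // {0x0F, 0x05, 0x04} are {7, 5, 4}, which pack into 0x75 and a
+        // zero-padded 0x40, matching expectedWeightInterleaving above.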
op->setBackend("cpu"); + myWeightInterleavingNode->forward(); + REQUIRE(*(op->getOutput(0)) == *expectedWeightInterleaving); + + } + + SECTION("Forward Op - Convolution weight interleaving") { + + // Weight [Cout = 2, H = 3, W = 3, Cin = 4]: + std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array4D<std::int8_t,2,3,3,4> { + { + { + { + {-6, 0, 5, -8}, // 'A' '0' '5' '8' in hexadecimal format + { 5, 5, 4, -5}, // '5' '5' '4' 'B' in hexadecimal format + {-7, -1, 4, -7} // '9' 'F' '4' '9' in hexadecimal format + }, + { + { 3, -3, -3, -3}, // '3' 'D' 'D' 'D' in hexadecimal format + { 1, 3, 1, -1}, // '1' '3' '1' 'F' in hexadecimal format + { 7, -3, -1, 4} // '7' 'D' 'F' '4' in hexadecimal format + }, + { + {-1, 3, 5, 6}, // 'F' '3' '5' '6' in hexadecimal format + {-8, 4, 7, 1}, // '8' '4' '7' '1' in hexadecimal format + {-5, 0, -1, -2} // 'B' '0' 'F' 'E' in hexadecimal format + } + }, + { + { + { 2, -7, 7, -4}, // '2' '9' '7' 'C' in hexadecimal format + {-7, 3, 0, 2}, // '9' '3' '0' '2' in hexadecimal format + { 1, -1, 2, 3} // '1' 'F' '2' '3' in hexadecimal format + }, + { + {-1, -5, -3, -7}, // 'F' 'B' 'D' '9' in hexadecimal format + {-8, 3, 5, -1}, // '8' '3' '5' 'F' in hexadecimal format + {-7, -4, -6, -1} // '9' 'C' 'A' 'F' in hexadecimal format + }, + { + { 1, 7, 5, -1}, // '1' '7' '5' 'F' in hexadecimal format + { 1, -8, 1, 2}, // '1' '8' '1' '2' in hexadecimal format + {-1, -6, -3, 0} // 'F' 'A' 'D' '0' in hexadecimal format + } + } + } + }); + + std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array4D<std::int8_t,2,3,3,2> { + { + { + { + {static_cast<int8_t>(0xA0), static_cast<int8_t>(0x58)}, // 'A' '0' '5' '8' in hexadecimal format + {static_cast<int8_t>(0x55), static_cast<int8_t>(0x4B)}, // '5' '5' '4' 'B' in hexadecimal format + {static_cast<int8_t>(0x9F), static_cast<int8_t>(0x49)} // '9' 'F' '4' '9' in hexadecimal format + }, + { + {static_cast<int8_t>(0x3D), static_cast<int8_t>(0xDD)}, // '3' 'D' 'D' 'D' in hexadecimal format + {static_cast<int8_t>(0x13), static_cast<int8_t>(0x1F)}, // '1' '3' '1' 'F' in hexadecimal format + {static_cast<int8_t>(0x7D), static_cast<int8_t>(0xF4)} // '7' 'D' 'F' '4' in hexadecimal format + }, + { + {static_cast<int8_t>(0xF3), static_cast<int8_t>(0x56)}, // 'F' '3' '5' '6' in hexadecimal format + {static_cast<int8_t>(0x84), static_cast<int8_t>(0x71)}, // '8' '4' '7' '1' in hexadecimal format + {static_cast<int8_t>(0xB0), static_cast<int8_t>(0xFE)} // 'B' '0' 'F' 'E' in hexadecimal format + } + }, + { + { + {static_cast<int8_t>(0x29), static_cast<int8_t>(0x7C)}, // '2' '9' '7' 'C' in hexadecimal format + {static_cast<int8_t>(0x93), static_cast<int8_t>(0x02)}, // '9' '3' '0' '2' in hexadecimal format + {static_cast<int8_t>(0x1F), static_cast<int8_t>(0x23)} // '1' 'F' '2' '3' in hexadecimal format + }, + { + {static_cast<int8_t>(0xFB), static_cast<int8_t>(0xD9)}, // 'F' 'B' 'D' '9' in hexadecimal format + {static_cast<int8_t>(0x83), static_cast<int8_t>(0x5F)}, // '8' '3' '5' 'F' in hexadecimal format + {static_cast<int8_t>(0x9C), static_cast<int8_t>(0xAF)} // '9' 'C' 'A' 'F' in hexadecimal format + }, + { + {static_cast<int8_t>(0x17), static_cast<int8_t>(0x5F)}, // '1' '7' '5' 'F' in hexadecimal format + {static_cast<int8_t>(0x18), static_cast<int8_t>(0x12)}, // '1' '8' '1' '2' in hexadecimal format + {static_cast<int8_t>(0xFA), static_cast<int8_t>(0xD0)} // 'F' 'A' 'D' '0' in hexadecimal format + } + } + } + }); + + weight->setDataFormat(Aidge::DataFormat::NHWC); + weight->setDataType(Aidge::DataType::Int4); + + 
expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
+        expectedWeightInterleaving->setDataType(WeightInterleavedType_v<Aidge::DataType::Int4>);
+
+        std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving();
+        auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator());
+        op->associateInput(0,weight);
+        op->setDataType(WeightInterleavedType_v<Aidge::DataType::Int4>);
+        op->setDataFormat(DataFormat::NHWC);
+        op->setBackend("cpu");
+        myWeightInterleavingNode->forward();
+        REQUIRE(*(op->getOutput(0)) == *expectedWeightInterleaving);
+    }
+
+    SECTION("Recipe ApplyWeightInterleaving") {
+
+        // Weight [Cout = 2, H = 3, W = 3, Cin = 4]:
+        std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array4D<std::int8_t,2,3,3,4> {
+            {
+                {
+                    {
+                        {-6, 0, 5, -8}, // 'A' '0' '5' '8' in hexadecimal format
+                        { 5, 5, 4, -5}, // '5' '5' '4' 'B' in hexadecimal format
+                        {-7, -1, 4, -7} // '9' 'F' '4' '9' in hexadecimal format
+                    },
+                    {
+                        { 3, -3, -3, -3}, // '3' 'D' 'D' 'D' in hexadecimal format
+                        { 1, 3, 1, -1}, // '1' '3' '1' 'F' in hexadecimal format
+                        { 7, -3, -1, 4} // '7' 'D' 'F' '4' in hexadecimal format
+                    },
+                    {
+                        {-1, 3, 5, 6}, // 'F' '3' '5' '6' in hexadecimal format
+                        {-8, 4, 7, 1}, // '8' '4' '7' '1' in hexadecimal format
+                        {-5, 0, -1, -2} // 'B' '0' 'F' 'E' in hexadecimal format
+                    }
+                },
+                {
+                    {
+                        { 2, -7, 7, -4}, // '2' '9' '7' 'C' in hexadecimal format
+                        {-7, 3, 0, 2}, // '9' '3' '0' '2' in hexadecimal format
+                        { 1, -1, 2, 3} // '1' 'F' '2' '3' in hexadecimal format
+                    },
+                    {
+                        {-1, -5, -3, -7}, // 'F' 'B' 'D' '9' in hexadecimal format
+                        {-8, 3, 5, -1}, // '8' '3' '5' 'F' in hexadecimal format
+                        {-7, -4, -6, -1} // '9' 'C' 'A' 'F' in hexadecimal format
+                    },
+                    {
+                        { 1, 7, 5, -1}, // '1' '7' '5' 'F' in hexadecimal format
+                        { 1, -8, 1, 2}, // '1' '8' '1' '2' in hexadecimal format
+                        {-1, -6, -3, 0} // 'F' 'A' 'D' '0' in hexadecimal format
+                    }
+                }
+            }
+        });
+
+        std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array4D<std::int8_t,2,3,3,2> {
+            {
+                {
+                    {
+                        {static_cast<int8_t>(0xA0), static_cast<int8_t>(0x58)}, // 'A' '0' '5' '8' in hexadecimal format
+                        {static_cast<int8_t>(0x55), static_cast<int8_t>(0x4B)}, // '5' '5' '4' 'B' in hexadecimal format
+                        {static_cast<int8_t>(0x9F), static_cast<int8_t>(0x49)} // '9' 'F' '4' '9' in hexadecimal format
+                    },
+                    {
+                        {static_cast<int8_t>(0x3D), static_cast<int8_t>(0xDD)}, // '3' 'D' 'D' 'D' in hexadecimal format
+                        {static_cast<int8_t>(0x13), static_cast<int8_t>(0x1F)}, // '1' '3' '1' 'F' in hexadecimal format
+                        {static_cast<int8_t>(0x7D), static_cast<int8_t>(0xF4)} // '7' 'D' 'F' '4' in hexadecimal format
+                    },
+                    {
+                        {static_cast<int8_t>(0xF3), static_cast<int8_t>(0x56)}, // 'F' '3' '5' '6' in hexadecimal format
+                        {static_cast<int8_t>(0x84), static_cast<int8_t>(0x71)}, // '8' '4' '7' '1' in hexadecimal format
+                        {static_cast<int8_t>(0xB0), static_cast<int8_t>(0xFE)} // 'B' '0' 'F' 'E' in hexadecimal format
+                    }
+                },
+                {
+                    {
+                        {static_cast<int8_t>(0x29), static_cast<int8_t>(0x7C)}, // '2' '9' '7' 'C' in hexadecimal format
+                        {static_cast<int8_t>(0x93), static_cast<int8_t>(0x02)}, // '9' '3' '0' '2' in hexadecimal format
+                        {static_cast<int8_t>(0x1F), static_cast<int8_t>(0x23)} // '1' 'F' '2' '3' in hexadecimal format
+                    },
+                    {
+                        {static_cast<int8_t>(0xFB), static_cast<int8_t>(0xD9)}, // 'F' 'B' 'D' '9' in hexadecimal format
+                        {static_cast<int8_t>(0x83), static_cast<int8_t>(0x5F)}, // '8' '3' '5' 'F' in hexadecimal format
+                        {static_cast<int8_t>(0x9C), static_cast<int8_t>(0xAF)} // '9' 'C' 'A' 'F' in hexadecimal format
+                    },
+                    {
+                        {static_cast<int8_t>(0x17), static_cast<int8_t>(0x5F)}, // '1' '7' '5' 'F' in hexadecimal format
+                        {static_cast<int8_t>(0x18), static_cast<int8_t>(0x12)}, // '1' '8' '1' '2' in hexadecimal format
+                        {static_cast<int8_t>(0xFA), static_cast<int8_t>(0xD0)} // 'F' 'A' 'D' '0' in hexadecimal format
+                    }
+                }
+            }
+        });
+
+        expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
+        expectedWeightInterleaving->setDataType(Aidge::DataType::Dual_Int4);
+
+        // Create convolution node
+        std::shared_ptr<Node> conv = Conv(4, 2, {3, 3}, "conv1");
+
+        // Place the weight tensor in the weight producer of the conv
+        auto weightProducer = conv->getParent(1);
+        weightProducer->getOperator()->setOutput(0, weight);
+
+        // Set dataType, dataFormat and backend of convolution
+        conv->getOperator()->setDataFormat(Aidge::DataFormat::NHWC);
+        conv->getOperator()->setDataType(Aidge::DataType::Int4);
+        conv->getOperator()->setBackend("cpu");
+
+        // Apply recipe
+        applyWeightInterleaving(conv);
+
+        // Compare the weight producer output tensor with the expected weights with interleaving
+        auto newProdOp = std::static_pointer_cast<OperatorTensor>(conv->getParent(1)->getOperator());
+        REQUIRE(*(newProdOp->getOutput(0)) == *expectedWeightInterleaving);
+    }
+
+}
diff --git a/unit_tests/scheduler/Test_Scheduler.cpp b/unit_tests/scheduler/Test_Scheduler.cpp
index 78a10c308a60f026b83ea64cfbd25a848099eb90..9224d6f99ce6af17441c9dac85549ad9f544cb89 100644
--- a/unit_tests/scheduler/Test_Scheduler.cpp
+++ b/unit_tests/scheduler/Test_Scheduler.cpp
@@ -18,6 +18,10 @@
 #include "aidge/graph/GraphView.hpp"
 #include "aidge/graph/OpArgs.hpp"
 #include "aidge/operator/Memorize.hpp"
+#include "aidge/operator/Pop.hpp"
+#include "aidge/operator/Stack.hpp"
+#include "aidge/operator/Identity.hpp"
+#include "aidge/operator/MetaOperator.hpp"
 #include "aidge/scheduler/SequentialScheduler.hpp"
 #include "aidge/scheduler/ParallelScheduler.hpp"
@@ -438,4 +442,69 @@ TEST_CASE("[cpu/scheduler] SequentialScheduler(backward)", "[scheduler][backward
     predictedOutput->setGrad(targetOutput);
     REQUIRE_NOTHROW(scheduler.backward());
 }
+
+std::shared_ptr<Node> Accumulate(int seqLength, const std::string& name) {
+    auto input = Identity((!name.empty()) ? name + "_input" : "");
+    auto hiddenState = Memorize(seqLength, (!name.empty()) ? name + "_hidden_state" : "");
+    auto add = Add((!name.empty()) ? name + "_add" : "");
name + "_add" : ""); + + input->addChild(add, 0, 0); + add->addChild(hiddenState, 0,0); + hiddenState->addChild(/*otherNode=*/add, /*outId=*/1, /*otherInId=*/1); + + std::shared_ptr<GraphView> microGraph = std::make_shared<GraphView>(); + microGraph->add(input); + microGraph->add({hiddenState, add}); + microGraph->setOrderedInputs({{input, 0}, {hiddenState, 1}}); + microGraph->setOrderedOutputs({{hiddenState, 0}}); + + auto metaOp = MetaOperator("Accumulate", microGraph, {}, name); + return metaOp; +} + +TEST_CASE("[cpu/scheduler] Accumulate", "[scheduler]") { + std::shared_ptr<Tensor> Input = std::make_shared<Tensor>( + Array3D<float, 2, 3, 2>{{{{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}}, + {{2.0, 3.0}, {4.0, 5.0}, {6.0, 7.0}}}}); + + std::shared_ptr<Tensor> MemInit = + std::make_shared<Tensor>(Array2D<float, 3, 2>{ + {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}}); + + auto meta = Accumulate(2, "accumulate"); + auto op = std::static_pointer_cast<MetaOperator_Op>(meta->getOperator()); + auto pop_i = Pop("pop_input"); + auto pop_o = Identity("pop_output"); // NOTE: Could be Identity/Stack/Whatever node you want, this is is not the problem here + + pop_i->getOperator()->associateInput(0, Input); + pop_i->addChild(op->getMicroGraph()->getOrderedInputs()[0].first, 0, 0); + op->getMicroGraph()->getOrderedOutputs()[0].first->addChild(pop_o, 0, 0); + + //pop_i->addChild(meta, 0, 0); + //meta->addChild(pop_o, 0, 0); + + //op->associateInput(1, MemInit); + op->getMicroGraph()->getNode("accumulate_hidden_state")->getOperator()->associateInput(1, MemInit); + + // Build the graph. + auto myGraph = std::make_shared<GraphView>(); + myGraph->add(pop_i); + myGraph->add(op->getMicroGraph()); + //myGraph->add(meta); + myGraph->add(pop_o); + myGraph->compile("cpu", DataType::Float32); + + myGraph->save("accumulate_graph", true); + + // Schedule and run + auto scheduler = SequentialScheduler(myGraph); + scheduler.generateScheduling(); + scheduler.saveStaticSchedulingDiagram("accumulate_scheduling"); + REQUIRE_NOTHROW(scheduler.forward(true)); + + std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>( + Array2D<float, 3, 2>{{{3.0, 5.0}, {7.0, 9.0}, {11.0, 13.0}}}); + std::shared_ptr<Tensor> output = std::static_pointer_cast<OperatorTensor>(pop_o->getOperator())->getOutput(0); + REQUIRE(*output == *expectedOutput); +} } // namespace Aidge