diff --git a/include/aidge/backend/cpu/operator/AddImpl.hpp b/include/aidge/backend/cpu/operator/AddImpl.hpp index 5e795922a67be178dde588e8e5e346ec268efe86..e39c35b42fdb6065aa72aee092cd1cd23b2b1011 100644 --- a/include/aidge/backend/cpu/operator/AddImpl.hpp +++ b/include/aidge/backend/cpu/operator/AddImpl.hpp @@ -25,7 +25,7 @@ namespace Aidge { // Operator implementation entry point for the backend using AddImpl_cpu = OperatorImpl_cpu<Add_Op, - void(const std::vector<const void*>, const std::vector<std::vector<std::size_t>>&, const std::size_t, const std::vector<std::size_t>&, void*)>; + void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*, void*)>; // Implementation entry point registration to Operator REGISTRAR(Add_Op, "cpu", Aidge::AddImpl_cpu::create); diff --git a/include/aidge/backend/cpu/operator/AddImpl_kernels.hpp b/include/aidge/backend/cpu/operator/AddImpl_kernels.hpp index fd1a0305cf734e556a4217e9303c011cb0d25a3a..e6d13fcf3699824a8410015d35ff766adf617c11 100644 --- a/include/aidge/backend/cpu/operator/AddImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/AddImpl_kernels.hpp @@ -14,31 +14,137 @@ #include "aidge/utils/Registrar.hpp" -#include <cstdint> // std::int32_t, std::int64_t +#include <cstddef> // std::size_t #include "aidge/backend/cpu/data/Broadcasting.hpp" #include "aidge/backend/cpu/operator/AddImpl.hpp" namespace Aidge { +namespace { +// suppose values are contiguous in memory template <class I, class O> -void AddImpl_cpu_forward_kernel(const std::vector<const void*> inputs_, const std::vector<std::vector<std::size_t>>& inputDims, const std::size_t outputLength, const std::vector<std::size_t>& outDims, void* output_) { - // FIXME: missing Add attributes as arguments - std::vector<const I*> inputs; - for (const auto& input_ : inputs_) { - inputs.push_back(static_cast<const I*>(input_)); +void add_contiguous_arrays(const std::size_t input1size, + const std::size_t input2size, + const std::size_t output1size, + const I* input1, + const I* input2, + O* output) +{ + for (std::size_t i = 0; i < output1size; ++i) + { + const std::size_t in1_id = (input1size != 1) ? i : 0; + const std::size_t in2_id = (input2size != 1) ? i : 0; + output[i] = static_cast<O>(input1[in1_id] + input2[in2_id]); } +} +} + +template <class I, class O> +void AddImpl_cpu_forward_kernel(std::vector<std::size_t> dims0, + std::vector<std::size_t> dims1, + const std::vector<std::size_t>& outputDims, + const void* input0_, + const void* input1_, + void* output_) { + + const I* input_0 = static_cast<const I*>(input0_); + const I* input_1 = static_cast<const I*>(input1_); O* output = static_cast<O*>(output_); - for (std::size_t oIndex = 0; oIndex < outputLength; ++oIndex) - { - output[oIndex] = 0; - std::vector<size_t> indexes = getMultiDimIndices(outDims, oIndex); - for(std::size_t iIndex = 0; iIndex < inputs.size(); ++iIndex) { - std::size_t idx = getFlattenedIndex(inputDims[iIndex], indexes); - output[oIndex] += inputs[iIndex][idx]; - } - } + // [5,2,1,7] & [2,6,7] + // 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7] + // 2. Find the highest equal dimension -> 3 + // Exception: if the first diverging dimension is the last one, then -> 4 (dims.size()) + // 3. Compute the highest number of contiguous data -> 7 + // 4. Compute stride and offset step for the broadcast mechanism + // 5. Call a simple kernel + + // special case for equal dimensions, the kernel is called with the entire arrays at once + if (dims0 == dims1) { + const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); + for (std::size_t i = 0; i < input0_contiguous_size; ++i) + { + output[i] = static_cast<O>(input_0[i] + input_1[i]); + } + return; + } + + // set dimensions to be of equal size by filling the smallest one with ones. + if (dims0.size() > dims1.size()) { + dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1)); + } + else if (dims1.size() > dims0.size()) { + dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1)); + } + + const std::size_t nbDims = dims0.size(); + + // Find the highest equal dimension + // std::size_t contiguousIdx = nbDims - 1; + std::size_t contiguousIdx = nbDims; + while (contiguousIdx-- > 0) { + // for (; contiguousIdx+1 > 0; --contiguousIdx) { + if (dims0[contiguousIdx] != dims1[contiguousIdx]) { + if (contiguousIdx == (nbDims -1)) { // last dimensions of one of the input Tensor are of size 1 + const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1; + while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) { + --contiguousIdx; + } + } + break; + } + } + ++contiguousIdx; + + // Compute the highest number of contiguous data for each Tensor + const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + + // initialize strides to iterate through data because of broadcasting + std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx); + if (contiguousIdx > 0) { + stride_post0[contiguousIdx - 1] = 1; + stride_post1[contiguousIdx - 1] = 1; + for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) { + stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]); + stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]); + } + for (std::size_t i = 0; i != contiguousIdx; ++i) { + stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1; + stride_step1[i] = (dims1[i] == 1) ? 1 - stride_post1[i] : 1; + } + } + + // variables for arrays offsets + std::size_t offsetIn0 = 0; + std::size_t offsetIn1 = 0; + std::size_t offsetOut = 0; + + + std::size_t dim = contiguousIdx - 1; + const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>()); + for (std::size_t stack = 0; stack < nbStacks;) { + add_contiguous_arrays<I,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size, + input_0 + offsetIn0*input0_contiguous_size, + input_1 + offsetIn1*input1_contiguous_size, + output + offsetOut*output_contiguous_size); + if (++stack < nbStacks) { + std::size_t tmp_stack = stack; + while(tmp_stack % outputDims[dim] == 0) { + tmp_stack /= outputDims[dim]; + dim--; + } + offsetIn0 += stride_step0[dim]; + offsetIn1 += stride_step1[dim]; + ++offsetOut; + dim = contiguousIdx - 1; + } + } } // Kernels registration to implementation entry point diff --git a/include/aidge/backend/cpu/operator/AndImpl.hpp b/include/aidge/backend/cpu/operator/AndImpl.hpp index 316a2fb922596642088d133a7fec49c988739bb7..8814df2fac36be56332035731679b724b169efe7 100644 --- a/include/aidge/backend/cpu/operator/AndImpl.hpp +++ b/include/aidge/backend/cpu/operator/AndImpl.hpp @@ -23,7 +23,7 @@ namespace Aidge { // Operator implementation entry point for the backend using AndImpl_cpu = OperatorImpl_cpu<And_Op, - void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*,void*)>; + void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*, void*)>; // Implementation entry point registration to Operator REGISTRAR(And_Op, "cpu", Aidge::AndImpl_cpu::create); diff --git a/include/aidge/backend/cpu/operator/AndImpl_kernels.hpp b/include/aidge/backend/cpu/operator/AndImpl_kernels.hpp index 197e829f3527ce2f36c3ef5ee812a26477633703..73b710e021ac5031923eb1e9a2492502c02a3633 100644 --- a/include/aidge/backend/cpu/operator/AndImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/AndImpl_kernels.hpp @@ -12,52 +12,152 @@ #ifndef AIDGE_CPU_OPERATOR_ANDIMPL_KERNELS_H_ #define AIDGE_CPU_OPERATOR_ANDIMPL_KERNELS_H_ -#include "aidge/backend/cpu/data/Broadcasting.hpp" #include "aidge/backend/cpu/operator/AndImpl.hpp" #include "aidge/utils/Registrar.hpp" namespace Aidge { -template <class I1, class I2, class O> -void AndImpl_cpu_forward_kernel(const std::vector<std::size_t>& input1Dims, - const std::vector<std::size_t>& input2Dims, + +namespace { +// suppose values are contiguous in memory +template <class I, class O> +void equal_contiguous_arrays(const std::size_t input1size, + const std::size_t input2size, + const std::size_t output1size, + const I* input1, + const I* input2, + O* output) +{ + for (std::size_t i = 0; i < output1size; ++i) + { + const std::size_t in1_id = (input1size != 1) ? i : 0; + const std::size_t in2_id = (input2size != 1) ? i : 0; + output[i] = static_cast<O>(input1[in1_id] == input2[in2_id]); + } +} +} + + +template <class I, class O> +void EqualImpl_cpu_forward_kernel(std::vector<std::size_t> dims0, + std::vector<std::size_t> dims1, const std::vector<std::size_t>& outputDims, + const void* input0_, const void* input1_, - const void* input2_, void* output_) { - const I1* input_1 = static_cast<const I1*>(input1_); - const I2* input_2 = static_cast<const I2*>(input2_); + const I* input_0 = static_cast<const I*>(input0_); + const I* input_1 = static_cast<const I*>(input1_); O* output = static_cast<O*>(output_); - size_t totalElements = 1; - for (size_t dimSize : outputDims) { - totalElements *= dimSize; + // [5,2,1,7] & [2,6,7] + // 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7] + // 2. Find the highest equal dimension -> 3 + // Exception: if the first diverging dimension is the last one, then -> 4 (dims.size()) + // 3. Compute the highest number of contiguous data -> 7 + // 4. Compute stride and offset step for the broadcast mechanism + // 5. Call a simple kernel + + // special case for equal dimensions, the kernel is called with the entire arrays at once + if (dims0 == dims1) { + const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); + for (std::size_t i = 0; i < input0_contiguous_size; ++i) + { + output[i] = static_cast<O>(input_0[i] == input_1[i]); + } + return; } - for (std::size_t oIndex = 0; oIndex < totalElements; ++oIndex) - { - std::vector<size_t> indexes = getMultiDimIndices(outputDims, oIndex); + // set dimensions to be of equal size by filling the smallest one with ones. + if (dims0.size() > dims1.size()) { + dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1)); + } + else if (dims1.size() > dims0.size()) { + dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1)); + } - std::size_t idx1 = getFlattenedIndex(input1Dims, indexes); - std::size_t idx2 = getFlattenedIndex(input2Dims, indexes); + const std::size_t nbDims = dims0.size(); - output[oIndex] = static_cast<O>(input_1[idx1] == input_2[idx2]); + // Find the highest equal dimension + // std::size_t contiguousIdx = nbDims - 1; + std::size_t contiguousIdx = nbDims; + while (contiguousIdx-- > 0) { + // for (; contiguousIdx+1 > 0; --contiguousIdx) { + if (dims0[contiguousIdx] != dims1[contiguousIdx]) { + if (contiguousIdx == (nbDims -1)) { // last dimensions of one of the input Tensor are of size 1 + const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1; + while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) { + --contiguousIdx; + } + } + break; + } + } + ++contiguousIdx; + + // Compute the highest number of contiguous data for each Tensor + const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + + // initialize strides to iterate through data because of broadcasting + std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx); + if (contiguousIdx > 0) { + stride_post0[contiguousIdx - 1] = 1; + stride_post1[contiguousIdx - 1] = 1; + for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) { + stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]); + stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]); + } + for (std::size_t i = 0; i != contiguousIdx; ++i) { + stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1; + stride_step1[i] = (dims1[i] == 1) ? 1 - stride_post1[i] : 1; + } + } + + // variables for arrays offsets + std::size_t offsetIn0 = 0; + std::size_t offsetIn1 = 0; + std::size_t offsetOut = 0; + + + std::size_t dim = contiguousIdx - 1; + const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>()); + for (std::size_t stack = 0; stack < nbStacks;) { + equal_contiguous_arrays<I,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size, + input_0 + offsetIn0*input0_contiguous_size, + input_1 + offsetIn1*input1_contiguous_size, + output + offsetOut*output_contiguous_size); + if (++stack < nbStacks) { + std::size_t tmp_stack = stack; + while(tmp_stack % outputDims[dim] == 0) { + tmp_stack /= outputDims[dim]; + dim--; + } + offsetIn0 += stride_step0[dim]; + offsetIn1 += stride_step1[dim]; + ++offsetOut; + dim = contiguousIdx - 1; + } } } // Kernels registration to implementation entry point REGISTRAR(AndImpl_cpu, {DataType::Float32}, - {ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<float, float, float>, nullptr}); + {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<float, float>, nullptr}); REGISTRAR(AndImpl_cpu, {DataType::Float64}, - {ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<double, double, double>, nullptr}); + {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<double, double>, nullptr}); REGISTRAR(AndImpl_cpu, {DataType::Int32}, - {ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t>, nullptr}); + {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr}); REGISTRAR(AndImpl_cpu, {DataType::Int64}, - {ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<std::int64_t, std::int64_t, std::int64_t>, nullptr}); + {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, nullptr}); + } // namespace Aidge #endif /* AIDGE_CPU_OPERATOR_ANDIMPL_KERNELS_H_ */ diff --git a/include/aidge/backend/cpu/operator/BitShiftImpl.hpp b/include/aidge/backend/cpu/operator/BitShiftImpl.hpp index 6da67bb7dd4469b6ca609c5aea1ae70dfca3f939..807d2b972ba385f9382d4121173a75207600d098 100644 --- a/include/aidge/backend/cpu/operator/BitShiftImpl.hpp +++ b/include/aidge/backend/cpu/operator/BitShiftImpl.hpp @@ -24,13 +24,13 @@ namespace Aidge { // Operator implementation entry point for the backend using BitShiftImpl_cpu = OperatorImpl_cpu<BitShift_Op, void(const BitShift_Op::BitShiftDirection, - const std::vector<std::size_t>&, - const std::vector<std::size_t>&, - const std::vector<std::size_t>&, - const void*, + std::vector<std::size_t>, + std::vector<std::size_t>, + const std::vector<std::size_t>&, + const void*, const void*, void*)>; - + // Implementation entry point registration to Operator REGISTRAR(BitShift_Op,"cpu",Aidge::BitShiftImpl_cpu::create); } // namespace Aidge diff --git a/include/aidge/backend/cpu/operator/BitShiftImpl_kernels.hpp b/include/aidge/backend/cpu/operator/BitShiftImpl_kernels.hpp index f815e946ea2e4abaff48a6e5155368d564e88e8c..1f2561afe0be9997116cbd82f754c485a1760090 100644 --- a/include/aidge/backend/cpu/operator/BitShiftImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/BitShiftImpl_kernels.hpp @@ -12,47 +12,150 @@ #ifndef AIDGE_CPU_OPERATOR_BITSHIFTIMPL_KERNELS_H_ #define AIDGE_CPU_OPERATOR_BITSHIFTIMPL_KERNELS_H_ -#include "aidge/utils/Registrar.hpp" -#include <cstdint> // std::int32_t, std::int64_t -#include "aidge/operator/BitShift.hpp" +#include <cstdint> // std::int32_t, std::int64_t +#include <cstddef> // std::size_t #include "aidge/backend/cpu/data/Broadcasting.hpp" #include "aidge/backend/cpu/operator/BitShiftImpl.hpp" +#include "aidge/operator/BitShift.hpp" +#include "aidge/utils/Registrar.hpp" +namespace { +// suppose values are contiguous in memory +template <class I1, class I2, class O> +void bitshift_contiguous_arrays( + const Aidge::BitShift_Op::BitShiftDirection direction, + const std::size_t input1size, + const std::size_t input2size, + const std::size_t output1size, + const I1* input_1, + const I2* input_2, + O* output) +{ + if(direction == Aidge::BitShift_Op::BitShiftDirection::right) { + for (std::size_t i = 0; i < output1size; ++i) { + const std::size_t idx1 = (input1size != 1) ? i : 0; + const std::size_t idx2 = (input2size != 1) ? i : 0; + output[i]= input_1[idx1] >> input_2[idx2]; + } + + } else { + for (std::size_t i = 0; i < output1size; ++i) { + const std::size_t idx1 = (input1size != 1) ? i : 0; + const std::size_t idx2 = (input2size != 1) ? i : 0; + output[i] = input_1[idx1] << input_2[idx2]; + } + } +} +} namespace Aidge { template <class I1, class I2, class O> void BitShiftImpl_cpu_forward_kernel( const BitShift_Op::BitShiftDirection direction, - const std::vector<std::size_t>& input1Dims, - const std::vector<std::size_t>& input2Dims, + std::vector<std::size_t> dims0, + std::vector<std::size_t> dims1, const std::vector<std::size_t>& outputDims, + const void* input0_, const void* input1_, - const void* input2_, void* output_ ) { - const I1* input_1 = static_cast<const I1*>(input1_); - const I2* input_2 = static_cast<const I2*>(input2_); + const I1* input_0 = static_cast<const I1*>(input0_); + const I2* input_1 = static_cast<const I2*>(input1_); O* output = static_cast<O*>(output_); - const size_t totalElements = std::accumulate(outputDims.begin(), outputDims.end(), std::size_t(1), std::multiplies<std::size_t>()); - - for (std::size_t oIndex = 0; oIndex < totalElements; ++oIndex) - { - std::vector<size_t> indexes = getMultiDimIndices(outputDims, oIndex); - std::size_t idx1 = getFlattenedIndex(input1Dims, indexes); - std::size_t idx2 = getFlattenedIndex(input2Dims, indexes); - if(direction == BitShift_Op::BitShiftDirection::right) - - { - output[oIndex]= input_1[idx1] >> input_2[idx2]; + // [5,2,1,7] & [2,6,7] + // 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7] + // 2. Find the highest equal dimension -> 3 + // Exception: if the first diverging dimension is the last one, then -> 4 (dims.size()) + // 3. Compute the highest number of contiguous data -> 7 + // 4. Compute stride and offset step for the broadcast mechanism + // 5. Call a simple kernel + + // ## Compute compatible input dimensions + // special case for equal dimensions, the kernel is called with the entire arrays at once + if (dims0 == dims1) { + const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); + bitshift_contiguous_arrays(direction, input0_contiguous_size, input0_contiguous_size, input0_contiguous_size, input_0, input_1, output); + return; + } + + // set dimensions to be of equal size by filling the smallest one with ones. + if (dims0.size() > dims1.size()) { + dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1)); + } + else if (dims1.size() > dims0.size()) { + dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1)); + } + + const std::size_t nbDims = dims0.size(); + + // Find the highest equal dimension + // std::size_t contiguousIdx = nbDims - 1; + std::size_t contiguousIdx = nbDims; + while (contiguousIdx-- > 0) { + // for (; contiguousIdx+1 > 0; --contiguousIdx) { + if (dims0[contiguousIdx] != dims1[contiguousIdx]) { + if (contiguousIdx == (nbDims -1)) { // last dimensions of one of the input Tensor are of size 1 + const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1; + while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) { + --contiguousIdx; + } + } + break; } - else - { - output[oIndex] = input_1[idx1] << input_2[idx2]; + } + ++contiguousIdx; + + // Compute the highest number of contiguous data for each Tensor + const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + + // initialize strides to iterate through data because of broadcasting + std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx); + if (contiguousIdx > 0) { + stride_post0[contiguousIdx - 1] = 1; + stride_post1[contiguousIdx - 1] = 1; + for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) { + stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]); + stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]); + } + for (std::size_t i = 0; i != contiguousIdx; ++i) { + stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1; + stride_step1[i] = (dims1[i] == 1) ? 1 - stride_post1[i] : 1; + } + } + + // variables for arrays offsets + std::size_t offsetIn0 = 0; + std::size_t offsetIn1 = 0; + std::size_t offsetOut = 0; + + + std::size_t dim = contiguousIdx - 1; + const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>()); + for (std::size_t stack = 0; stack < nbStacks;) { + bitshift_contiguous_arrays<I1,I2,O>(direction, input0_contiguous_size, input1_contiguous_size, output_contiguous_size, + input_0 + offsetIn0*input0_contiguous_size, + input_1 + offsetIn1*input1_contiguous_size, + output + offsetOut*output_contiguous_size); + if (++stack < nbStacks) { + std::size_t tmp_stack = stack; + while(tmp_stack % outputDims[dim] == 0) { + tmp_stack /= outputDims[dim]; + dim--; + } + offsetIn0 += stride_step0[dim]; + offsetIn1 += stride_step1[dim]; + ++offsetOut; + dim = contiguousIdx - 1; } } } diff --git a/include/aidge/backend/cpu/operator/MulImpl.hpp b/include/aidge/backend/cpu/operator/MulImpl.hpp index 05fceba17471229d83d9f8738614b2e747121b49..c927af9ebd4d658c764cc059df9778c273ba178e 100644 --- a/include/aidge/backend/cpu/operator/MulImpl.hpp +++ b/include/aidge/backend/cpu/operator/MulImpl.hpp @@ -23,21 +23,21 @@ namespace Aidge { // Operator implementation entry point for the backend using MulImpl_cpu = OperatorImpl_cpu<Mul_Op, - void(const std::vector<std::size_t>&, - const std::vector<std::size_t>&, - const std::vector<std::size_t>&, - const void*, + void(std::vector<std::size_t>, + std::vector<std::size_t>, + const std::vector<std::size_t>&, + const void*, const void*, void*), - void(const std::size_t, - const std::size_t, + void(const std::size_t, + const std::size_t, const std::size_t, const std::vector<std::size_t>, const std::vector<std::size_t>, - const void*, - const void*, - const void*, - void*, + const void*, + const void*, + const void*, + void*, void*)>; // Implementation entry point registration to Operator diff --git a/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp b/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp index c015b8f0182608fecd3da94220e9411decfd186c..556dd56cd32f28de14a43d20b97deb0083341fee 100644 --- a/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp @@ -19,44 +19,143 @@ #include "aidge/backend/cpu/data/Broadcasting.hpp" #include "aidge/backend/cpu/operator/MulImpl.hpp" +namespace { +// suppose values are contiguous in memory +template <class I1, class I2, class O> +void mul_contiguous_arrays(const std::size_t input1size, + const std::size_t input2size, + const std::size_t output1size, + const I1* input1, + const I2* input2, + O* output) +{ + for (std::size_t i = 0; i < output1size; ++i) + { + const std::size_t in1_id = (input1size != 1) ? i : 0; + const std::size_t in2_id = (input2size != 1) ? i : 0; + output[i] = static_cast<O>(input1[in1_id] * input2[in2_id]); + } +} +} + namespace Aidge { + template <class I1, class I2, class O> -void MulImpl_cpu_forward_kernel(const std::vector<std::size_t>& input1Dims, - const std::vector<std::size_t>& input2Dims, +void MulImpl_cpu_forward_kernel(std::vector<std::size_t> dims0, + std::vector<std::size_t> dims1, const std::vector<std::size_t>& outputDims, + const void* input0_, const void* input1_, - const void* input2_, void* output_) { - - const I1* input_1 = static_cast<const I1*>(input1_); - const I2* input_2 = static_cast<const I2*>(input2_); + const I1* input_0 = static_cast<const I1*>(input0_); + const I2* input_1 = static_cast<const I2*>(input1_); O* output = static_cast<O*>(output_); - size_t totalElements = 1; - for (size_t dimSize : outputDims) { - totalElements *= dimSize; + // [5,2,1,7] & [2,6,7] + // 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7] + // 2. Find the highest equal dimension -> 3 + // Exception: if the first diverging dimension is the last one, then -> 4 (dims.size()) + // 3. Compute the highest number of contiguous data -> 7 + // 4. Compute stride and offset step for the broadcast mechanism + // 5. Call a simple kernel + + // ## Compute compatible input dimensions + // special case for equal dimensions, the kernel is called with the entire arrays at once + if (dims0 == dims1) { + const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); + for (std::size_t i = 0; i < input0_contiguous_size; ++i) + { + output[i] = static_cast<O>(input_0[i] * input_1[i]); + } + return; } - for (std::size_t oIndex = 0; oIndex < totalElements; ++oIndex) - { - std::vector<size_t> indexes = getMultiDimIndices(outputDims, oIndex); + // set dimensions to be of equal size by filling the smallest one with ones. + if (dims0.size() > dims1.size()) { + dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1)); + } + else if (dims1.size() > dims0.size()) { + dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1)); + } - std::size_t idx1 = getFlattenedIndex(input1Dims, indexes); - std::size_t idx2 = getFlattenedIndex(input2Dims, indexes); + const std::size_t nbDims = dims0.size(); + + // Find the highest equal dimension + // std::size_t contiguousIdx = nbDims - 1; + std::size_t contiguousIdx = nbDims; + while (contiguousIdx-- > 0) { + // for (; contiguousIdx+1 > 0; --contiguousIdx) { + if (dims0[contiguousIdx] != dims1[contiguousIdx]) { + if (contiguousIdx == (nbDims -1)) { // last dimensions of one of the input Tensor are of size 1 + const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1; + while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) { + --contiguousIdx; + } + } + break; + } + } + ++contiguousIdx; + + // Compute the highest number of contiguous data for each Tensor + const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + + // initialize strides to iterate through data because of broadcasting + std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx); + if (contiguousIdx > 0) { + stride_post0[contiguousIdx - 1] = 1; + stride_post1[contiguousIdx - 1] = 1; + for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) { + stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]); + stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]); + } + for (std::size_t i = 0; i != contiguousIdx; ++i) { + stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1; + stride_step1[i] = (dims1[i] == 1) ? 1 - stride_post1[i] : 1; + } + } - output[oIndex] = input_1[idx1] * input_2[idx2]; + // variables for arrays offsets + std::size_t offsetIn0 = 0; + std::size_t offsetIn1 = 0; + std::size_t offsetOut = 0; + + + std::size_t dim = contiguousIdx - 1; + const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>()); + for (std::size_t stack = 0; stack < nbStacks;) { + mul_contiguous_arrays<I1,I2,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size, + input_0 + offsetIn0*input0_contiguous_size, + input_1 + offsetIn1*input1_contiguous_size, + output + offsetOut*output_contiguous_size); + if (++stack < nbStacks) { + std::size_t tmp_stack = stack; + while(tmp_stack % outputDims[dim] == 0) { + tmp_stack /= outputDims[dim]; + dim--; + } + offsetIn0 += stride_step0[dim]; + offsetIn1 += stride_step1[dim]; + ++offsetOut; + dim = contiguousIdx - 1; + } } } template <class I1, class I2, class O> -void MulImpl_cpu_backward_kernel(const std::size_t input0Length, +void MulImpl_cpu_backward_kernel(const std::size_t input0Length, const std::size_t input1Length, const std::size_t grad0Length, const std::vector<std::size_t> input0Dims, const std::vector<std::size_t> input1Dims, - const void* input0_, - const void* input1_, - const void* grad_output_, + const void* input0_, + const void* input1_, + const void* grad_output_, void* gradientInput0, void* gradientInput1) { diff --git a/include/aidge/backend/cpu/operator/PowImpl.hpp b/include/aidge/backend/cpu/operator/PowImpl.hpp index cfbb8173d1f83162519016a8f2b3c3166977a5b7..b31ce08c9089df05bd2e711fd87f09690fd2df23 100644 --- a/include/aidge/backend/cpu/operator/PowImpl.hpp +++ b/include/aidge/backend/cpu/operator/PowImpl.hpp @@ -12,18 +12,21 @@ #ifndef AIDGE_CPU_OPERATOR_POWIMPL_H_ #define AIDGE_CPU_OPERATOR_POWIMPL_H_ +#include <cstddef> // std::size_t +#include <memory> // std::unique_ptr, std::make_unique +#include <string> +#include <vector> + #include "aidge/backend/cpu/operator/OperatorImpl.hpp" #include "aidge/operator/Pow.hpp" #include "aidge/utils/Registrar.hpp" #include "aidge/utils/Types.h" -#include "aidge/backend/cpu/data/GetCPUPtr.h" -#include <memory> -#include <vector> + namespace Aidge { // Operator implementation entry point for the backend using PowImpl_cpu = OperatorImpl_cpu<Pow_Op, - void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*,void*), + void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*, void*), void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*, const void*, void*, void*)>; diff --git a/include/aidge/backend/cpu/operator/PowImpl_kernels.hpp b/include/aidge/backend/cpu/operator/PowImpl_kernels.hpp index ab9b2ccc7b823842decd044b90a5c6364cedc9c9..cae106632053366e1370b5ce1d3a2ee4cfd3b62b 100644 --- a/include/aidge/backend/cpu/operator/PowImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/PowImpl_kernels.hpp @@ -13,36 +13,141 @@ #define AIDGE_CPU_OPERATOR_POWIMPL_KERNELS_H_ #include "aidge/utils/Registrar.hpp" -#include <cmath> + +#include <cstddef> // std::size_t #include "aidge/backend/cpu/data/Broadcasting.hpp" #include "aidge/backend/cpu/operator/PowImpl.hpp" namespace Aidge { -template <class I1, class I2, class O> -void PowImpl_cpu_forward_kernel(const std::vector<std::size_t>& input1Dims, - const std::vector<std::size_t>& input2Dims, + +namespace { +// suppose values are contiguous in memory +template <class I, class O> +void pow_contiguous_arrays(const std::size_t input1size, + const std::size_t input2size, + const std::size_t output1size, + const I* input1, + const I* input2, + O* output) +{ + for (std::size_t i = 0; i < output1size; ++i) + { + const std::size_t in1_id = (input1size != 1) ? i : 0; + const std::size_t in2_id = (input2size != 1) ? i : 0; + output[i] = static_cast<O>(std::pow(input1[in1_id], input2[in2_id])); + } +} +} + +template <class I, class O> +void PowImpl_cpu_forward_kernel(std::vector<std::size_t> dims0, + std::vector<std::size_t> dims1, const std::vector<std::size_t>& outputDims, + const void* input0_, const void* input1_, - const void* input2_, void* output_) { - const I1* input_1 = static_cast<const I1*>(input1_); - const I2* input_2 = static_cast<const I2*>(input2_); + const I* input_0 = static_cast<const I*>(input0_); + const I* input_1 = static_cast<const I*>(input1_); O* output = static_cast<O*>(output_); - std::size_t totalElements = std::accumulate(outputDims.cbegin(), outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>()); - for (std::size_t oIndex = 0; oIndex < totalElements; ++oIndex) - { - std::vector<std::size_t> indexes = getMultiDimIndices(outputDims, oIndex); + // [5,2,1,7] & [2,6,7] + // 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7] + // 2. Find the highest equal dimension -> 3 + // Exception: if the first diverging dimension is the last one, then -> 4 (dims.size()) + // 3. Compute the highest number of contiguous data -> 7 + // 4. Compute stride and offset step for the broadcast mechanism + // 5. Call a simple kernel + + // special case for equal dimensions, the kernel is called with the entire arrays at once + if (dims0 == dims1) { + const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); + for (std::size_t i = 0; i < input0_contiguous_size; ++i) + { + output[i] = static_cast<O>(std::pow(input_0[i], input_1[i])); + } + return; + } + + // set dimensions to be of equal size by filling the smallest one with ones. + if (dims0.size() > dims1.size()) { + dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1)); + } + else if (dims1.size() > dims0.size()) { + dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1)); + } - std::size_t idx1 = getFlattenedIndex(input1Dims, indexes); - std::size_t idx2 = getFlattenedIndex(input2Dims, indexes); - - output[oIndex] = std::pow(input_1[idx1], input_2[idx2]); - } + const std::size_t nbDims = dims0.size(); + + // Find the highest equal dimension + // std::size_t contiguousIdx = nbDims - 1; + std::size_t contiguousIdx = nbDims; + while (contiguousIdx-- > 0) { + // for (; contiguousIdx+1 > 0; --contiguousIdx) { + if (dims0[contiguousIdx] != dims1[contiguousIdx]) { + if (contiguousIdx == (nbDims -1)) { // last dimensions of one of the input Tensor are of size 1 + const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1; + while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) { + --contiguousIdx; + } + } + break; + } + } + ++contiguousIdx; + + // Compute the highest number of contiguous data for each Tensor + const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + + // initialize strides to iterate through data because of broadcasting + std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx); + if (contiguousIdx > 0) { + stride_post0[contiguousIdx - 1] = 1; + stride_post1[contiguousIdx - 1] = 1; + for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) { + stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]); + stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]); + } + for (std::size_t i = 0; i != contiguousIdx; ++i) { + stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1; + stride_step1[i] = (dims1[i] == 1) ? 1 - stride_post1[i] : 1; + } + } + + // variables for arrays offsets + std::size_t offsetIn0 = 0; + std::size_t offsetIn1 = 0; + std::size_t offsetOut = 0; + + + std::size_t dim = contiguousIdx - 1; + const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>()); + for (std::size_t stack = 0; stack < nbStacks;) { + pow_contiguous_arrays<I,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size, + input_0 + offsetIn0*input0_contiguous_size, + input_1 + offsetIn1*input1_contiguous_size, + output + offsetOut*output_contiguous_size); + if (++stack < nbStacks) { + std::size_t tmp_stack = stack; + while(tmp_stack % outputDims[dim] == 0) { + tmp_stack /= outputDims[dim]; + dim--; + } + offsetIn0 += stride_step0[dim]; + offsetIn1 += stride_step1[dim]; + ++offsetOut; + dim = contiguousIdx - 1; + } + } } + template <class I1, class I2, class O> void PowImpl_cpu_backward_kernel(const std::vector<std::size_t>& input0Dims, const std::vector<std::size_t>& input1Dims, @@ -82,14 +187,23 @@ void PowImpl_cpu_backward_kernel(const std::vector<std::size_t>& input0Dims, // Kernels registration to implementation entry point REGISTRAR(PowImpl_cpu, - {DataType::Float32}, - {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<float, float, float>, Aidge::PowImpl_cpu_backward_kernel<float, float, float>}); + {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float32}}, + {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<float, float>, Aidge::PowImpl_cpu_backward_kernel<float, float, float>}); +REGISTRAR(PowImpl_cpu, + {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float64}}, + {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<double, double>, Aidge::PowImpl_cpu_backward_kernel<double, double, double>}); +REGISTRAR(PowImpl_cpu, + {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int32}}, + {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<int32_t, int32_t>, Aidge::PowImpl_cpu_backward_kernel<int32_t, int32_t, int32_t>}); +REGISTRAR(PowImpl_cpu, + {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int64}}, + {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, Aidge::PowImpl_cpu_backward_kernel<std::int64_t, std::int64_t, std::int64_t>}); REGISTRAR(PowImpl_cpu, - {DataType::Float64}, - {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<double, double, double>, Aidge::PowImpl_cpu_backward_kernel<double, double, double>}); + {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int8}}, + {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<std::int8_t, std::int8_t>, Aidge::PowImpl_cpu_backward_kernel<std::int8_t, std::int8_t, std::int8_t>}); REGISTRAR(PowImpl_cpu, - {DataType::Int32}, - {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<int32_t, int32_t, int32_t>, Aidge::PowImpl_cpu_backward_kernel<int32_t, int32_t, int32_t>}); + {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::UInt8}}, + {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<std::uint8_t, std::uint8_t>, Aidge::PowImpl_cpu_backward_kernel<std::uint8_t, std::uint8_t, std::uint8_t>}); } // namespace Aidge #endif /* AIDGE_CPU_OPERATOR_POWIMPL_KERNELS_H_ */ diff --git a/include/aidge/backend/cpu/operator/SubImpl.hpp b/include/aidge/backend/cpu/operator/SubImpl.hpp index 2bb22bda74edf7db09404fd5613b6714ddcdf513..eed26ddcc9f57b3bb7796049a62f3f6be7de4eb5 100644 --- a/include/aidge/backend/cpu/operator/SubImpl.hpp +++ b/include/aidge/backend/cpu/operator/SubImpl.hpp @@ -23,7 +23,7 @@ namespace Aidge { // Operator implementation entry point for the backend using SubImpl_cpu = OperatorImpl_cpu<Sub_Op, - void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*,void*)>; + void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*,void*)>; // Implementation entry point registration to Operator REGISTRAR(Sub_Op, "cpu", Aidge::SubImpl_cpu::create); diff --git a/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp b/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp index 30ef10191937a03a83a682542d62a6ffbd97c19b..1d789c3c8886d35ce6597d5704c76060bad196c1 100644 --- a/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp @@ -21,32 +21,132 @@ #include "aidge/backend/cpu/data/Broadcasting.hpp" #include "aidge/backend/cpu/operator/SubImpl.hpp" +namespace { +// suppose values are contiguous in memory +template <class I1, class I2, class O> +void sub_contiguous_arrays(const std::size_t input1size, + const std::size_t input2size, + const std::size_t output1size, + const I1* input1, + const I2* input2, + O* output) +{ + for (std::size_t i = 0; i < output1size; ++i) + { + const std::size_t in1_id = (input1size != 1) ? i : 0; + const std::size_t in2_id = (input2size != 1) ? i : 0; + output[i] = static_cast<O>(input1[in1_id] - input2[in2_id]); + } +} +} + namespace Aidge { template <class I1, class I2, class O> -void SubImpl_cpu_forward_kernel(const std::vector<std::size_t>& input1Dims, - const std::vector<std::size_t>& input2Dims, +void SubImpl_cpu_forward_kernel(std::vector<std::size_t> dims0, + std::vector<std::size_t> dims1, const std::vector<std::size_t>& outputDims, + const void* input0_, const void* input1_, - const void* input2_, void* output_) { - const I1* input_1 = static_cast<const I1*>(input1_); - const I2* input_2 = static_cast<const I2*>(input2_); + const I1* input_0 = static_cast<const I1*>(input0_); + const I2* input_1 = static_cast<const I2*>(input1_); O* output = static_cast<O*>(output_); - size_t totalElements = 1; - for (size_t dimSize : outputDims) { - totalElements *= dimSize; + // [5,2,1,7] & [2,6,7] + // 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7] + // 2. Find the highest equal dimension -> 3 + // Exception: if the first diverging dimension is the last one, then -> 4 (dims.size()) + // 3. Compute the highest number of contiguous data -> 7 + // 4. Compute stride and offset step for the broadcast mechanism + // 5. Call a simple kernel + + // special case for equal dimensions, the kernel is called with the entire arrays at once + if (dims0 == dims1) { + const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); + for (std::size_t i = 0; i < input0_contiguous_size; ++i) + { + output[i] = static_cast<O>(input_0[i] - input_1[i]); + } + return; + } + + // set dimensions to be of equal size by filling the smallest one with ones. + if (dims0.size() > dims1.size()) { + dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1)); + } + else if (dims1.size() > dims0.size()) { + dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1)); + } + + const std::size_t nbDims = dims0.size(); + + // Find the highest equal dimension + // std::size_t contiguousIdx = nbDims - 1; + std::size_t contiguousIdx = nbDims; + while (contiguousIdx-- > 0) { + // for (; contiguousIdx+1 > 0; --contiguousIdx) { + if (dims0[contiguousIdx] != dims1[contiguousIdx]) { + if (contiguousIdx == (nbDims -1)) { // last dimensions of one of the input Tensor are of size 1 + const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1; + while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) { + --contiguousIdx; + } + } + break; + } } + ++contiguousIdx; - for (std::size_t oIndex = 0; oIndex < totalElements; ++oIndex) - { - std::vector<size_t> indexes = getMultiDimIndices(outputDims, oIndex); - std::size_t idx1 = getFlattenedIndex(input1Dims, indexes); - std::size_t idx2 = getFlattenedIndex(input2Dims, indexes); - output[oIndex] = input_1[idx1] - input_2[idx2]; - } + // Compute the highest number of contiguous data for each Tensor + const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + + // initialize strides to iterate through data because of broadcasting + std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx); + if (contiguousIdx > 0) { + stride_post0[contiguousIdx - 1] = 1; + stride_post1[contiguousIdx - 1] = 1; + for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) { + stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]); + stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]); + } + for (std::size_t i = 0; i != contiguousIdx; ++i) { + stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1; + stride_step1[i] = (dims1[i] == 1) ? 1 - stride_post1[i] : 1; + } + } + + // variables for arrays offsets + std::size_t offsetIn0 = 0; + std::size_t offsetIn1 = 0; + std::size_t offsetOut = 0; + + + std::size_t dim = contiguousIdx - 1; + const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>()); + for (std::size_t stack = 0; stack < nbStacks;) { + sub_contiguous_arrays<I1,I2,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size, + input_0 + offsetIn0*input0_contiguous_size, + input_1 + offsetIn1*input1_contiguous_size, + output + offsetOut*output_contiguous_size); + if (++stack < nbStacks) { + std::size_t tmp_stack = stack; + while(tmp_stack % outputDims[dim] == 0) { + tmp_stack /= outputDims[dim]; + dim--; + } + offsetIn0 += stride_step0[dim]; + offsetIn1 += stride_step1[dim]; + ++offsetOut; + dim = contiguousIdx - 1; + } + } } // Kernels registration to implementation entry point diff --git a/src/operator/AddImpl.cpp b/src/operator/AddImpl.cpp index 457a0b17e531fac35ff873f9eedca7bbbe82d459..101743eccb606c998a38f49dd9b89f5ec279bcae 100644 --- a/src/operator/AddImpl.cpp +++ b/src/operator/AddImpl.cpp @@ -12,7 +12,6 @@ #include "aidge/backend/cpu/operator/AddImpl.hpp" #include <cassert> -#include <numeric> // std::accumulate #include <vector> #include "aidge/backend/cpu/data/GetCPUPtr.h" @@ -28,12 +27,11 @@ void Aidge::AddImpl_cpu::forward() { // Check inputs AIDGE_ASSERT(op.getInput(0), "missing input in Add operator"); AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run Add forward because the 0-th input has no implementation."); - DataType datatypeFirstInput = op.getInput(0)->dataType(); - for (IOIndex_t i = 1; i < op.nbInputs(); ++i) { - AIDGE_ASSERT(op.getInput(i), "missing input in Add operator"); - AIDGE_ASSERT(op.getInput(i)->hasImpl(), "cannot run Add forward because the {}-th input has no implementation.", i); - AIDGE_ASSERT(op.getInput(i)->dataType() == datatypeFirstInput, "Cannot add inputs with two differents data type."); - } + + AIDGE_ASSERT(op.getInput(1), "missing input in Add operator"); + AIDGE_ASSERT(op.getInput(1)->hasImpl(), "cannot run Add forward because the 1st input has no implementation."); + + AIDGE_ASSERT(op.getInput(1)->dataType() == op.getInput(0)->dataType(), "Cannot add inputs with two differents data type."); // Find the correct kernel type const auto impl = Registrar<AddImpl_cpu>::create(getBestMatch(getRequiredSpec())); @@ -42,28 +40,17 @@ void Aidge::AddImpl_cpu::forward() { // TODO: right now, if needed, memory will be allocated/deallocated at each // call to forward(). We might put the following shared_ptr as members of // this class to avoid that. - const std::size_t nbDims = op.getOutput(0)->nbDims(); - std::vector<std::vector<std::size_t>> inputsDims; - std::vector<const void*> opInputs; - std::vector<std::shared_ptr<Tensor>> inputsFallback(op.nbInputs()); - for (IOIndex_t i = 0; i < op.nbInputs(); ++i) { - std::vector<std::size_t> inputDims(nbDims, 1); - auto dims = op.getInput(i)->dims(); - for(std::size_t j=dims.size()-1; j+1>0; --j) - { - std::size_t idx = nbDims - (dims.size()-j); - inputDims[idx] = dims[j]; - } - inputsDims.push_back(inputDims); - const auto& input = op.getInput(i)->refCastFrom(inputsFallback[i], *op.getOutput(0)); - opInputs.push_back(input.getImpl()->rawPtr()); - } + std::shared_ptr<Tensor> input0Fallback, input1Fallback, input2Fallback; + const auto& input0 = op.getInput(0)->refCastFrom(input0Fallback, *op.getInput(0)); + const auto& input1 = op.getInput(1)->refCastFrom(input1Fallback, *op.getInput(1)); + - impl.forward(opInputs, - inputsDims, - op.getOutput(0)->size(), - op.getOutput(0)->dims(), - getCPUPtr(op.getRawOutput(0))); + impl.forward(op.getInput(0)->dims(), + op.getInput(1)->dims(), + op.getOutput(0)->dims(), + input0.getImpl()->rawPtr(), + input1.getImpl()->rawPtr(), + getCPUPtr(op.getRawOutput(0))); } template <> diff --git a/src/operator/AndImpl.cpp b/src/operator/AndImpl.cpp index 2e0f59769ad86f6e4143ab59d089706e34792244..0cff914a4d03f6ef1ef339d7c7b46e48b6f4c293 100644 --- a/src/operator/AndImpl.cpp +++ b/src/operator/AndImpl.cpp @@ -25,22 +25,34 @@ template <> void Aidge::AndImpl_cpu::forward() { - const std::vector<std::size_t> inputDims0 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(), - std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims()); - const std::vector<std::size_t> inputDims1 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(), - std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dims()); + const And_Op& op = static_cast<const And_Op&>(mOp); + // Check inputs + AIDGE_ASSERT(op.getInput(0), "missing input in And operator"); + AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run And forward because the 0-th input has no implementation."); + AIDGE_ASSERT(op.getInput(1), "missing input in And operator"); + AIDGE_ASSERT(op.getInput(1)->hasImpl(), "cannot run And forward because the 1st input has no implementation."); + + AIDGE_ASSERT(op.getInput(1)->dataType() == op.getInput(0)->dataType(), "Cannot And inputs with two differents data type."); // Find the correct kernel type const auto impl = Registrar<AndImpl_cpu>::create(getBestMatch(getRequiredSpec())); - // Call kernel - impl.forward(inputDims0, - inputDims1, - std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(), - getCPUPtr(mOp.getRawInput(0)), - getCPUPtr(mOp.getRawInput(1)), - getCPUPtr(mOp.getRawOutput(0))); + // Convert input data (no overhead if not needed!) + // TODO: right now, if needed, memory will be allocated/deallocated at each + // call to forward(). We might put the following shared_ptr as members of + // this class to avoid that. + std::shared_ptr<Tensor> input0Fallback, input1Fallback, input2Fallback; + const auto& input0 = op.getInput(0)->refCastFrom(input0Fallback, *op.getInput(0)); + const auto& input1 = op.getInput(1)->refCastFrom(input1Fallback, *op.getInput(1)); + + + impl.forward(op.getInput(0)->dims(), + op.getInput(1)->dims(), + op.getOutput(0)->dims(), + input0.getImpl()->rawPtr(), + input1.getImpl()->rawPtr(), + getCPUPtr(op.getRawOutput(0))); } template <> diff --git a/src/operator/BitShiftImpl.cpp b/src/operator/BitShiftImpl.cpp index 1e0f79fd29fd140f0b41c64d245b9b240da80028..c6940554dd925905a18de66651707c3d58594ade 100644 --- a/src/operator/BitShiftImpl.cpp +++ b/src/operator/BitShiftImpl.cpp @@ -28,27 +28,18 @@ void Aidge::BitShiftImpl_cpu::forward() { const auto& op_ = dynamic_cast<const BitShift_Op&>(mOp); - const auto impl = Registrar<BitShiftImpl_cpu>::create(getBestMatch(getRequiredSpec())); - - const std::vector<std::size_t> inputDims0 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(), - std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims()); - const std::vector<std::size_t> inputDims1 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(), - std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dims()); - - BitShift_Op::BitShiftDirection direction = op_.direction(); - // Call kernel impl.forward( - direction, - inputDims0, - inputDims1, - std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(), + op_.direction(), + op_.getInput(0)->dims(), + op_.getInput(1)->dims(), + op_.getOutput(0)->dims(), getCPUPtr(mOp.getRawInput(0)), getCPUPtr(mOp.getRawInput(1)), getCPUPtr(mOp.getRawOutput(0))); - + } template <> diff --git a/src/operator/MulImpl.cpp b/src/operator/MulImpl.cpp index ea5e3d3ab8ac24934a0cb6f9042858fa094700af..422bdd005f058fc9200cf5f7962bfc8d5877e6e1 100644 --- a/src/operator/MulImpl.cpp +++ b/src/operator/MulImpl.cpp @@ -25,18 +25,15 @@ template <> void Aidge::MulImpl_cpu::forward() { - const std::vector<std::size_t> inputDims0 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(), - std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims()); - const std::vector<std::size_t> inputDims1 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(), - std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dims()); + const Mul_Op& op_ = dynamic_cast<const Mul_Op&>(mOp); // Find the correct kernel type const auto impl = Registrar<MulImpl_cpu>::create(getBestMatch(getRequiredSpec())); // Call kernel - impl.forward(inputDims0, - inputDims1, - std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(), + impl.forward(op_.getInput(0)->dims(), + op_.getInput(1)->dims(), + op_.getOutput(0)->dims(), getCPUPtr(mOp.getRawInput(0)), getCPUPtr(mOp.getRawInput(1)), getCPUPtr(mOp.getRawOutput(0))); @@ -45,7 +42,7 @@ void Aidge::MulImpl_cpu::forward() { template <> void Aidge::MulImpl_cpu::backward() { const Mul_Op& op_ = dynamic_cast<const Mul_Op&>(mOp); - + auto in0 = op_.getInput(0); auto in1 = op_.getInput(1); auto in0grad = op_.getInput(0)->grad(); @@ -56,14 +53,14 @@ void Aidge::MulImpl_cpu::backward() { const auto impl = Registrar<MulImpl_cpu>::create(getBestMatch(getRequiredSpec())); // Call kernel - impl.backward(/* input0Length */ in0grad->size(), + impl.backward(/* input0Length */ in0grad->size(), /* input1Length */ in1grad->size(), /* grad0Length */ out0grad->size(), /* input0Dims */ in0->dims(), /* input1Dims */ in1->dims(), - getCPUPtr(in0), - getCPUPtr(in1), - getCPUPtr(out0grad), - getCPUPtr(in0grad), + getCPUPtr(in0), + getCPUPtr(in1), + getCPUPtr(out0grad), + getCPUPtr(in0grad), getCPUPtr(in1grad)); } diff --git a/src/operator/PowImpl.cpp b/src/operator/PowImpl.cpp index 74a7be71e176ba8e1cb8851050e575d6aa7465df..4448c8e9c455e59b584b084d32a8b17e8ae03453 100644 --- a/src/operator/PowImpl.cpp +++ b/src/operator/PowImpl.cpp @@ -25,21 +25,36 @@ template <> void Aidge::PowImpl_cpu::forward() { - const std::vector<std::size_t> inputDims0 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(), - std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims()); - const std::vector<std::size_t> inputDims1 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(), - std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dims()); + + const Pow_Op& op = static_cast<const Pow_Op&>(mOp); + // Check inputs + AIDGE_ASSERT(op.getInput(0), "missing input in Pow operator"); + AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run Pow forward because the 0-th input has no implementation."); + + AIDGE_ASSERT(op.getInput(1), "missing input in Pow operator"); + AIDGE_ASSERT(op.getInput(1)->hasImpl(), "cannot run Pow forward because the 1st input has no implementation."); + + AIDGE_ASSERT(op.getInput(1)->dataType() == op.getInput(0)->dataType(), "Cannot compute Pow with inputs of two differents data type."); // Find the correct kernel type const auto impl = Registrar<PowImpl_cpu>::create(getBestMatch(getRequiredSpec())); - // Call kernel - impl.forward(inputDims0, - inputDims1, - std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(), - getCPUPtr(mOp.getRawInput(0)), - getCPUPtr(mOp.getRawInput(1)), - getCPUPtr(mOp.getRawOutput(0))); + // Convert input data (no overhead if not needed!) + // TODO: right now, if needed, memory will be allocated/deallocated at each + // call to forward(). We might put the following shared_ptr as members of + // this class to avoid that. + std::shared_ptr<Tensor> input0Fallback, input1Fallback, input2Fallback; + const auto& input0 = op.getInput(0)->refCastFrom(input0Fallback, *op.getInput(0)); + const auto& input1 = op.getInput(1)->refCastFrom(input1Fallback, *op.getInput(1)); + + + impl.forward(op.getInput(0)->dims(), + op.getInput(1)->dims(), + op.getOutput(0)->dims(), + input0.getImpl()->rawPtr(), + input1.getImpl()->rawPtr(), + getCPUPtr(op.getRawOutput(0))); + } template <> @@ -69,4 +84,4 @@ void Aidge::PowImpl_cpu::backward() { getCPUPtr(out0grad), getCPUPtr(in0grad), getCPUPtr(in1grad)); -} \ No newline at end of file +} diff --git a/src/operator/SubImpl.cpp b/src/operator/SubImpl.cpp index d43771b967889183801cb93418c967ce9d9c8453..e36abe2a9d68a2b56ab1777aa04b0e911df514c8 100644 --- a/src/operator/SubImpl.cpp +++ b/src/operator/SubImpl.cpp @@ -25,18 +25,15 @@ template <> void Aidge::SubImpl_cpu::forward() { - const std::vector<std::size_t> inputDims0 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(), - std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims()); - const std::vector<std::size_t> inputDims1 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(), - std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dims()); + const Sub_Op& op_ = dynamic_cast<const Sub_Op&>(mOp); // Find the correct kernel type const auto impl = Registrar<SubImpl_cpu>::create(getBestMatch(getRequiredSpec())); // Call kernel - impl.forward(inputDims0, - inputDims1, - std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(), + impl.forward(op_.getInput(0)->dims(), + op_.getInput(1)->dims(), + op_.getOutput(0)->dims(), getCPUPtr(mOp.getRawInput(0)), getCPUPtr(mOp.getRawInput(1)), getCPUPtr(mOp.getRawOutput(0))); diff --git a/unit_tests/CMakeLists.txt b/unit_tests/CMakeLists.txt index 8178df93beb96a3a7538dae8d9a706380c06ecf8..5984524fdc8c596641e505897d16e12de78024cc 100644 --- a/unit_tests/CMakeLists.txt +++ b/unit_tests/CMakeLists.txt @@ -3,7 +3,7 @@ Include(FetchContent) FetchContent_Declare( Catch2 GIT_REPOSITORY https://github.com/catchorg/Catch2.git - GIT_TAG v3.0.1 # or a later release + GIT_TAG v3.7.1 # or a later release ) FetchContent_MakeAvailable(Catch2)