diff --git a/include/aidge/backend/cpu/operator/AddImpl.hpp b/include/aidge/backend/cpu/operator/AddImpl.hpp index 5e795922a67be178dde588e8e5e346ec268efe86..e39c35b42fdb6065aa72aee092cd1cd23b2b1011 100644 --- a/include/aidge/backend/cpu/operator/AddImpl.hpp +++ b/include/aidge/backend/cpu/operator/AddImpl.hpp @@ -25,7 +25,7 @@ namespace Aidge { // Operator implementation entry point for the backend using AddImpl_cpu = OperatorImpl_cpu<Add_Op, - void(const std::vector<const void*>, const std::vector<std::vector<std::size_t>>&, const std::size_t, const std::vector<std::size_t>&, void*)>; + void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*, void*)>; // Implementation entry point registration to Operator REGISTRAR(Add_Op, "cpu", Aidge::AddImpl_cpu::create); diff --git a/include/aidge/backend/cpu/operator/AddImpl_kernels.hpp b/include/aidge/backend/cpu/operator/AddImpl_kernels.hpp index fd1a0305cf734e556a4217e9303c011cb0d25a3a..e6d13fcf3699824a8410015d35ff766adf617c11 100644 --- a/include/aidge/backend/cpu/operator/AddImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/AddImpl_kernels.hpp @@ -14,31 +14,137 @@ #include "aidge/utils/Registrar.hpp" -#include <cstdint> // std::int32_t, std::int64_t +#include <cstddef> // std::size_t #include "aidge/backend/cpu/data/Broadcasting.hpp" #include "aidge/backend/cpu/operator/AddImpl.hpp" namespace Aidge { +namespace { +// suppose values are contiguous in memory template <class I, class O> -void AddImpl_cpu_forward_kernel(const std::vector<const void*> inputs_, const std::vector<std::vector<std::size_t>>& inputDims, const std::size_t outputLength, const std::vector<std::size_t>& outDims, void* output_) { - // FIXME: missing Add attributes as arguments - std::vector<const I*> inputs; - for (const auto& input_ : inputs_) { - inputs.push_back(static_cast<const I*>(input_)); +void add_contiguous_arrays(const std::size_t input1size, + const std::size_t input2size, + const std::size_t output1size, + const I* input1, + const I* input2, + O* output) +{ + for (std::size_t i = 0; i < output1size; ++i) + { + const std::size_t in1_id = (input1size != 1) ? i : 0; + const std::size_t in2_id = (input2size != 1) ? i : 0; + output[i] = static_cast<O>(input1[in1_id] + input2[in2_id]); } +} +} + +template <class I, class O> +void AddImpl_cpu_forward_kernel(std::vector<std::size_t> dims0, + std::vector<std::size_t> dims1, + const std::vector<std::size_t>& outputDims, + const void* input0_, + const void* input1_, + void* output_) { + + const I* input_0 = static_cast<const I*>(input0_); + const I* input_1 = static_cast<const I*>(input1_); O* output = static_cast<O*>(output_); - for (std::size_t oIndex = 0; oIndex < outputLength; ++oIndex) - { - output[oIndex] = 0; - std::vector<size_t> indexes = getMultiDimIndices(outDims, oIndex); - for(std::size_t iIndex = 0; iIndex < inputs.size(); ++iIndex) { - std::size_t idx = getFlattenedIndex(inputDims[iIndex], indexes); - output[oIndex] += inputs[iIndex][idx]; - } - } + // [5,2,1,7] & [2,6,7] + // 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7] + // 2. Find the highest equal dimension -> 3 + // Exception: if the first diverging dimension is the last one, then -> 4 (dims.size()) + // 3. Compute the highest number of contiguous data -> 7 + // 4. Compute stride and offset step for the broadcast mechanism + // 5. 
Call a simple kernel + + // special case for equal dimensions, the kernel is called with the entire arrays at once + if (dims0 == dims1) { + const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); + for (std::size_t i = 0; i < input0_contiguous_size; ++i) + { + output[i] = static_cast<O>(input_0[i] + input_1[i]); + } + return; + } + + // set dimensions to be of equal size by filling the smallest one with ones. + if (dims0.size() > dims1.size()) { + dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1)); + } + else if (dims1.size() > dims0.size()) { + dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1)); + } + + const std::size_t nbDims = dims0.size(); + + // Find the highest equal dimension + // std::size_t contiguousIdx = nbDims - 1; + std::size_t contiguousIdx = nbDims; + while (contiguousIdx-- > 0) { + // for (; contiguousIdx+1 > 0; --contiguousIdx) { + if (dims0[contiguousIdx] != dims1[contiguousIdx]) { + if (contiguousIdx == (nbDims -1)) { // last dimensions of one of the input Tensor are of size 1 + const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1; + while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) { + --contiguousIdx; + } + } + break; + } + } + ++contiguousIdx; + + // Compute the highest number of contiguous data for each Tensor + const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + + // initialize strides to iterate through data because of broadcasting + std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx); + if (contiguousIdx > 0) { + stride_post0[contiguousIdx - 1] = 1; + stride_post1[contiguousIdx - 1] = 1; + for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) { + stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]); + stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]); + } + for (std::size_t i = 0; i != contiguousIdx; ++i) { + stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1; + stride_step1[i] = (dims1[i] == 1) ? 
1 - stride_post1[i] : 1; + } + } + + // variables for arrays offsets + std::size_t offsetIn0 = 0; + std::size_t offsetIn1 = 0; + std::size_t offsetOut = 0; + + + std::size_t dim = contiguousIdx - 1; + const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>()); + for (std::size_t stack = 0; stack < nbStacks;) { + add_contiguous_arrays<I,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size, + input_0 + offsetIn0*input0_contiguous_size, + input_1 + offsetIn1*input1_contiguous_size, + output + offsetOut*output_contiguous_size); + if (++stack < nbStacks) { + std::size_t tmp_stack = stack; + while(tmp_stack % outputDims[dim] == 0) { + tmp_stack /= outputDims[dim]; + dim--; + } + offsetIn0 += stride_step0[dim]; + offsetIn1 += stride_step1[dim]; + ++offsetOut; + dim = contiguousIdx - 1; + } + } } // Kernels registration to implementation entry point diff --git a/include/aidge/backend/cpu/operator/BitShiftImpl.hpp b/include/aidge/backend/cpu/operator/BitShiftImpl.hpp index 6da67bb7dd4469b6ca609c5aea1ae70dfca3f939..807d2b972ba385f9382d4121173a75207600d098 100644 --- a/include/aidge/backend/cpu/operator/BitShiftImpl.hpp +++ b/include/aidge/backend/cpu/operator/BitShiftImpl.hpp @@ -24,13 +24,13 @@ namespace Aidge { // Operator implementation entry point for the backend using BitShiftImpl_cpu = OperatorImpl_cpu<BitShift_Op, void(const BitShift_Op::BitShiftDirection, - const std::vector<std::size_t>&, - const std::vector<std::size_t>&, - const std::vector<std::size_t>&, - const void*, + std::vector<std::size_t>, + std::vector<std::size_t>, + const std::vector<std::size_t>&, + const void*, const void*, void*)>; - + // Implementation entry point registration to Operator REGISTRAR(BitShift_Op,"cpu",Aidge::BitShiftImpl_cpu::create); } // namespace Aidge diff --git a/include/aidge/backend/cpu/operator/BitShiftImpl_kernels.hpp b/include/aidge/backend/cpu/operator/BitShiftImpl_kernels.hpp index f815e946ea2e4abaff48a6e5155368d564e88e8c..1f2561afe0be9997116cbd82f754c485a1760090 100644 --- a/include/aidge/backend/cpu/operator/BitShiftImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/BitShiftImpl_kernels.hpp @@ -12,47 +12,150 @@ #ifndef AIDGE_CPU_OPERATOR_BITSHIFTIMPL_KERNELS_H_ #define AIDGE_CPU_OPERATOR_BITSHIFTIMPL_KERNELS_H_ -#include "aidge/utils/Registrar.hpp" -#include <cstdint> // std::int32_t, std::int64_t -#include "aidge/operator/BitShift.hpp" +#include <cstdint> // std::int32_t, std::int64_t +#include <cstddef> // std::size_t #include "aidge/backend/cpu/data/Broadcasting.hpp" #include "aidge/backend/cpu/operator/BitShiftImpl.hpp" +#include "aidge/operator/BitShift.hpp" +#include "aidge/utils/Registrar.hpp" +namespace { +// suppose values are contiguous in memory +template <class I1, class I2, class O> +void bitshift_contiguous_arrays( + const Aidge::BitShift_Op::BitShiftDirection direction, + const std::size_t input1size, + const std::size_t input2size, + const std::size_t output1size, + const I1* input_1, + const I2* input_2, + O* output) +{ + if(direction == Aidge::BitShift_Op::BitShiftDirection::right) { + for (std::size_t i = 0; i < output1size; ++i) { + const std::size_t idx1 = (input1size != 1) ? i : 0; + const std::size_t idx2 = (input2size != 1) ? i : 0; + output[i]= input_1[idx1] >> input_2[idx2]; + } + + } else { + for (std::size_t i = 0; i < output1size; ++i) { + const std::size_t idx1 = (input1size != 1) ? i : 0; + const std::size_t idx2 = (input2size != 1) ? 
i : 0; + output[i] = input_1[idx1] << input_2[idx2]; + } + } +} +} namespace Aidge { template <class I1, class I2, class O> void BitShiftImpl_cpu_forward_kernel( const BitShift_Op::BitShiftDirection direction, - const std::vector<std::size_t>& input1Dims, - const std::vector<std::size_t>& input2Dims, + std::vector<std::size_t> dims0, + std::vector<std::size_t> dims1, const std::vector<std::size_t>& outputDims, + const void* input0_, const void* input1_, - const void* input2_, void* output_ ) { - const I1* input_1 = static_cast<const I1*>(input1_); - const I2* input_2 = static_cast<const I2*>(input2_); + const I1* input_0 = static_cast<const I1*>(input0_); + const I2* input_1 = static_cast<const I2*>(input1_); O* output = static_cast<O*>(output_); - const size_t totalElements = std::accumulate(outputDims.begin(), outputDims.end(), std::size_t(1), std::multiplies<std::size_t>()); - - for (std::size_t oIndex = 0; oIndex < totalElements; ++oIndex) - { - std::vector<size_t> indexes = getMultiDimIndices(outputDims, oIndex); - std::size_t idx1 = getFlattenedIndex(input1Dims, indexes); - std::size_t idx2 = getFlattenedIndex(input2Dims, indexes); - if(direction == BitShift_Op::BitShiftDirection::right) - - { - output[oIndex]= input_1[idx1] >> input_2[idx2]; + // [5,2,1,7] & [2,6,7] + // 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7] + // 2. Find the highest equal dimension -> 3 + // Exception: if the first diverging dimension is the last one, then -> 4 (dims.size()) + // 3. Compute the highest number of contiguous data -> 7 + // 4. Compute stride and offset step for the broadcast mechanism + // 5. Call a simple kernel + + // ## Compute compatible input dimensions + // special case for equal dimensions, the kernel is called with the entire arrays at once + if (dims0 == dims1) { + const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); + bitshift_contiguous_arrays(direction, input0_contiguous_size, input0_contiguous_size, input0_contiguous_size, input_0, input_1, output); + return; + } + + // set dimensions to be of equal size by filling the smallest one with ones. + if (dims0.size() > dims1.size()) { + dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1)); + } + else if (dims1.size() > dims0.size()) { + dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1)); + } + + const std::size_t nbDims = dims0.size(); + + // Find the highest equal dimension + // std::size_t contiguousIdx = nbDims - 1; + std::size_t contiguousIdx = nbDims; + while (contiguousIdx-- > 0) { + // for (; contiguousIdx+1 > 0; --contiguousIdx) { + if (dims0[contiguousIdx] != dims1[contiguousIdx]) { + if (contiguousIdx == (nbDims -1)) { // last dimensions of one of the input Tensor are of size 1 + const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? 
dims0 : dims1; + while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) { + --contiguousIdx; + } + } + break; } - else - { - output[oIndex] = input_1[idx1] << input_2[idx2]; + } + ++contiguousIdx; + + // Compute the highest number of contiguous data for each Tensor + const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + + // initialize strides to iterate through data because of broadcasting + std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx); + if (contiguousIdx > 0) { + stride_post0[contiguousIdx - 1] = 1; + stride_post1[contiguousIdx - 1] = 1; + for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) { + stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]); + stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]); + } + for (std::size_t i = 0; i != contiguousIdx; ++i) { + stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1; + stride_step1[i] = (dims1[i] == 1) ? 1 - stride_post1[i] : 1; + } + } + + // variables for arrays offsets + std::size_t offsetIn0 = 0; + std::size_t offsetIn1 = 0; + std::size_t offsetOut = 0; + + + std::size_t dim = contiguousIdx - 1; + const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>()); + for (std::size_t stack = 0; stack < nbStacks;) { + bitshift_contiguous_arrays<I1,I2,O>(direction, input0_contiguous_size, input1_contiguous_size, output_contiguous_size, + input_0 + offsetIn0*input0_contiguous_size, + input_1 + offsetIn1*input1_contiguous_size, + output + offsetOut*output_contiguous_size); + if (++stack < nbStacks) { + std::size_t tmp_stack = stack; + while(tmp_stack % outputDims[dim] == 0) { + tmp_stack /= outputDims[dim]; + dim--; + } + offsetIn0 += stride_step0[dim]; + offsetIn1 += stride_step1[dim]; + ++offsetOut; + dim = contiguousIdx - 1; } } } diff --git a/include/aidge/backend/cpu/operator/MulImpl.hpp b/include/aidge/backend/cpu/operator/MulImpl.hpp index 05fceba17471229d83d9f8738614b2e747121b49..c927af9ebd4d658c764cc059df9778c273ba178e 100644 --- a/include/aidge/backend/cpu/operator/MulImpl.hpp +++ b/include/aidge/backend/cpu/operator/MulImpl.hpp @@ -23,21 +23,21 @@ namespace Aidge { // Operator implementation entry point for the backend using MulImpl_cpu = OperatorImpl_cpu<Mul_Op, - void(const std::vector<std::size_t>&, - const std::vector<std::size_t>&, - const std::vector<std::size_t>&, - const void*, + void(std::vector<std::size_t>, + std::vector<std::size_t>, + const std::vector<std::size_t>&, + const void*, const void*, void*), - void(const std::size_t, - const std::size_t, + void(const std::size_t, + const std::size_t, const std::size_t, const std::vector<std::size_t>, const std::vector<std::size_t>, - const void*, - const void*, - 
const void*, - void*, + const void*, + const void*, + const void*, + void*, void*)>; // Implementation entry point registration to Operator diff --git a/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp b/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp index c015b8f0182608fecd3da94220e9411decfd186c..556dd56cd32f28de14a43d20b97deb0083341fee 100644 --- a/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp @@ -19,44 +19,143 @@ #include "aidge/backend/cpu/data/Broadcasting.hpp" #include "aidge/backend/cpu/operator/MulImpl.hpp" +namespace { +// suppose values are contiguous in memory +template <class I1, class I2, class O> +void mul_contiguous_arrays(const std::size_t input1size, + const std::size_t input2size, + const std::size_t output1size, + const I1* input1, + const I2* input2, + O* output) +{ + for (std::size_t i = 0; i < output1size; ++i) + { + const std::size_t in1_id = (input1size != 1) ? i : 0; + const std::size_t in2_id = (input2size != 1) ? i : 0; + output[i] = static_cast<O>(input1[in1_id] * input2[in2_id]); + } +} +} + namespace Aidge { + template <class I1, class I2, class O> -void MulImpl_cpu_forward_kernel(const std::vector<std::size_t>& input1Dims, - const std::vector<std::size_t>& input2Dims, +void MulImpl_cpu_forward_kernel(std::vector<std::size_t> dims0, + std::vector<std::size_t> dims1, const std::vector<std::size_t>& outputDims, + const void* input0_, const void* input1_, - const void* input2_, void* output_) { - - const I1* input_1 = static_cast<const I1*>(input1_); - const I2* input_2 = static_cast<const I2*>(input2_); + const I1* input_0 = static_cast<const I1*>(input0_); + const I2* input_1 = static_cast<const I2*>(input1_); O* output = static_cast<O*>(output_); - size_t totalElements = 1; - for (size_t dimSize : outputDims) { - totalElements *= dimSize; + // [5,2,1,7] & [2,6,7] + // 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7] + // 2. Find the highest equal dimension -> 3 + // Exception: if the first diverging dimension is the last one, then -> 4 (dims.size()) + // 3. Compute the highest number of contiguous data -> 7 + // 4. Compute stride and offset step for the broadcast mechanism + // 5. Call a simple kernel + + // ## Compute compatible input dimensions + // special case for equal dimensions, the kernel is called with the entire arrays at once + if (dims0 == dims1) { + const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); + for (std::size_t i = 0; i < input0_contiguous_size; ++i) + { + output[i] = static_cast<O>(input_0[i] * input_1[i]); + } + return; } - for (std::size_t oIndex = 0; oIndex < totalElements; ++oIndex) - { - std::vector<size_t> indexes = getMultiDimIndices(outputDims, oIndex); + // set dimensions to be of equal size by filling the smallest one with ones. 
+ if (dims0.size() > dims1.size()) { + dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1)); + } + else if (dims1.size() > dims0.size()) { + dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1)); + } - std::size_t idx1 = getFlattenedIndex(input1Dims, indexes); - std::size_t idx2 = getFlattenedIndex(input2Dims, indexes); + const std::size_t nbDims = dims0.size(); + + // Find the highest equal dimension + // std::size_t contiguousIdx = nbDims - 1; + std::size_t contiguousIdx = nbDims; + while (contiguousIdx-- > 0) { + // for (; contiguousIdx+1 > 0; --contiguousIdx) { + if (dims0[contiguousIdx] != dims1[contiguousIdx]) { + if (contiguousIdx == (nbDims -1)) { // last dimensions of one of the input Tensor are of size 1 + const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1; + while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) { + --contiguousIdx; + } + } + break; + } + } + ++contiguousIdx; + + // Compute the highest number of contiguous data for each Tensor + const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + + // initialize strides to iterate through data because of broadcasting + std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx); + if (contiguousIdx > 0) { + stride_post0[contiguousIdx - 1] = 1; + stride_post1[contiguousIdx - 1] = 1; + for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) { + stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]); + stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]); + } + for (std::size_t i = 0; i != contiguousIdx; ++i) { + stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1; + stride_step1[i] = (dims1[i] == 1) ? 
1 - stride_post1[i] : 1; + } + } - output[oIndex] = input_1[idx1] * input_2[idx2]; + // variables for arrays offsets + std::size_t offsetIn0 = 0; + std::size_t offsetIn1 = 0; + std::size_t offsetOut = 0; + + + std::size_t dim = contiguousIdx - 1; + const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>()); + for (std::size_t stack = 0; stack < nbStacks;) { + mul_contiguous_arrays<I1,I2,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size, + input_0 + offsetIn0*input0_contiguous_size, + input_1 + offsetIn1*input1_contiguous_size, + output + offsetOut*output_contiguous_size); + if (++stack < nbStacks) { + std::size_t tmp_stack = stack; + while(tmp_stack % outputDims[dim] == 0) { + tmp_stack /= outputDims[dim]; + dim--; + } + offsetIn0 += stride_step0[dim]; + offsetIn1 += stride_step1[dim]; + ++offsetOut; + dim = contiguousIdx - 1; + } } } template <class I1, class I2, class O> -void MulImpl_cpu_backward_kernel(const std::size_t input0Length, +void MulImpl_cpu_backward_kernel(const std::size_t input0Length, const std::size_t input1Length, const std::size_t grad0Length, const std::vector<std::size_t> input0Dims, const std::vector<std::size_t> input1Dims, - const void* input0_, - const void* input1_, - const void* grad_output_, + const void* input0_, + const void* input1_, + const void* grad_output_, void* gradientInput0, void* gradientInput1) { diff --git a/include/aidge/backend/cpu/operator/SubImpl.hpp b/include/aidge/backend/cpu/operator/SubImpl.hpp index 2bb22bda74edf7db09404fd5613b6714ddcdf513..eed26ddcc9f57b3bb7796049a62f3f6be7de4eb5 100644 --- a/include/aidge/backend/cpu/operator/SubImpl.hpp +++ b/include/aidge/backend/cpu/operator/SubImpl.hpp @@ -23,7 +23,7 @@ namespace Aidge { // Operator implementation entry point for the backend using SubImpl_cpu = OperatorImpl_cpu<Sub_Op, - void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*,void*)>; + void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*,void*)>; // Implementation entry point registration to Operator REGISTRAR(Sub_Op, "cpu", Aidge::SubImpl_cpu::create); diff --git a/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp b/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp index 30ef10191937a03a83a682542d62a6ffbd97c19b..1d789c3c8886d35ce6597d5704c76060bad196c1 100644 --- a/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp @@ -21,32 +21,132 @@ #include "aidge/backend/cpu/data/Broadcasting.hpp" #include "aidge/backend/cpu/operator/SubImpl.hpp" +namespace { +// suppose values are contiguous in memory +template <class I1, class I2, class O> +void sub_contiguous_arrays(const std::size_t input1size, + const std::size_t input2size, + const std::size_t output1size, + const I1* input1, + const I2* input2, + O* output) +{ + for (std::size_t i = 0; i < output1size; ++i) + { + const std::size_t in1_id = (input1size != 1) ? i : 0; + const std::size_t in2_id = (input2size != 1) ? 
i : 0; + output[i] = static_cast<O>(input1[in1_id] - input2[in2_id]); + } +} +} + namespace Aidge { template <class I1, class I2, class O> -void SubImpl_cpu_forward_kernel(const std::vector<std::size_t>& input1Dims, - const std::vector<std::size_t>& input2Dims, +void SubImpl_cpu_forward_kernel(std::vector<std::size_t> dims0, + std::vector<std::size_t> dims1, const std::vector<std::size_t>& outputDims, + const void* input0_, const void* input1_, - const void* input2_, void* output_) { - const I1* input_1 = static_cast<const I1*>(input1_); - const I2* input_2 = static_cast<const I2*>(input2_); + const I1* input_0 = static_cast<const I1*>(input0_); + const I2* input_1 = static_cast<const I2*>(input1_); O* output = static_cast<O*>(output_); - size_t totalElements = 1; - for (size_t dimSize : outputDims) { - totalElements *= dimSize; + // [5,2,1,7] & [2,6,7] + // 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7] + // 2. Find the highest equal dimension -> 3 + // Exception: if the first diverging dimension is the last one, then -> 4 (dims.size()) + // 3. Compute the highest number of contiguous data -> 7 + // 4. Compute stride and offset step for the broadcast mechanism + // 5. Call a simple kernel + + // special case for equal dimensions, the kernel is called with the entire arrays at once + if (dims0 == dims1) { + const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); + for (std::size_t i = 0; i < input0_contiguous_size; ++i) + { + output[i] = static_cast<O>(input_0[i] - input_1[i]); + } + return; + } + + // set dimensions to be of equal size by filling the smallest one with ones. + if (dims0.size() > dims1.size()) { + dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1)); + } + else if (dims1.size() > dims0.size()) { + dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1)); + } + + const std::size_t nbDims = dims0.size(); + + // Find the highest equal dimension + // std::size_t contiguousIdx = nbDims - 1; + std::size_t contiguousIdx = nbDims; + while (contiguousIdx-- > 0) { + // for (; contiguousIdx+1 > 0; --contiguousIdx) { + if (dims0[contiguousIdx] != dims1[contiguousIdx]) { + if (contiguousIdx == (nbDims -1)) { // last dimensions of one of the input Tensor are of size 1 + const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? 
dims0 : dims1; + while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) { + --contiguousIdx; + } + } + break; + } } + ++contiguousIdx; - for (std::size_t oIndex = 0; oIndex < totalElements; ++oIndex) - { - std::vector<size_t> indexes = getMultiDimIndices(outputDims, oIndex); - std::size_t idx1 = getFlattenedIndex(input1Dims, indexes); - std::size_t idx2 = getFlattenedIndex(input2Dims, indexes); - output[oIndex] = input_1[idx1] - input_2[idx2]; - } + // Compute the highest number of contiguous data for each Tensor + const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>()); + const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + + // initialize strides to iterate through data because of broadcasting + std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx); + std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx); + if (contiguousIdx > 0) { + stride_post0[contiguousIdx - 1] = 1; + stride_post1[contiguousIdx - 1] = 1; + for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) { + stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]); + stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]); + } + for (std::size_t i = 0; i != contiguousIdx; ++i) { + stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1; + stride_step1[i] = (dims1[i] == 1) ? 
1 - stride_post1[i] : 1; + } + } + + // variables for arrays offsets + std::size_t offsetIn0 = 0; + std::size_t offsetIn1 = 0; + std::size_t offsetOut = 0; + + + std::size_t dim = contiguousIdx - 1; + const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>()); + for (std::size_t stack = 0; stack < nbStacks;) { + sub_contiguous_arrays<I1,I2,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size, + input_0 + offsetIn0*input0_contiguous_size, + input_1 + offsetIn1*input1_contiguous_size, + output + offsetOut*output_contiguous_size); + if (++stack < nbStacks) { + std::size_t tmp_stack = stack; + while(tmp_stack % outputDims[dim] == 0) { + tmp_stack /= outputDims[dim]; + dim--; + } + offsetIn0 += stride_step0[dim]; + offsetIn1 += stride_step1[dim]; + ++offsetOut; + dim = contiguousIdx - 1; + } + } } // Kernels registration to implementation entry point diff --git a/src/operator/AddImpl.cpp b/src/operator/AddImpl.cpp index 457a0b17e531fac35ff873f9eedca7bbbe82d459..101743eccb606c998a38f49dd9b89f5ec279bcae 100644 --- a/src/operator/AddImpl.cpp +++ b/src/operator/AddImpl.cpp @@ -12,7 +12,6 @@ #include "aidge/backend/cpu/operator/AddImpl.hpp" #include <cassert> -#include <numeric> // std::accumulate #include <vector> #include "aidge/backend/cpu/data/GetCPUPtr.h" @@ -28,12 +27,11 @@ void Aidge::AddImpl_cpu::forward() { // Check inputs AIDGE_ASSERT(op.getInput(0), "missing input in Add operator"); AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run Add forward because the 0-th input has no implementation."); - DataType datatypeFirstInput = op.getInput(0)->dataType(); - for (IOIndex_t i = 1; i < op.nbInputs(); ++i) { - AIDGE_ASSERT(op.getInput(i), "missing input in Add operator"); - AIDGE_ASSERT(op.getInput(i)->hasImpl(), "cannot run Add forward because the {}-th input has no implementation.", i); - AIDGE_ASSERT(op.getInput(i)->dataType() == datatypeFirstInput, "Cannot add inputs with two differents data type."); - } + + AIDGE_ASSERT(op.getInput(1), "missing input in Add operator"); + AIDGE_ASSERT(op.getInput(1)->hasImpl(), "cannot run Add forward because the 1st input has no implementation."); + + AIDGE_ASSERT(op.getInput(1)->dataType() == op.getInput(0)->dataType(), "Cannot add inputs with two differents data type."); // Find the correct kernel type const auto impl = Registrar<AddImpl_cpu>::create(getBestMatch(getRequiredSpec())); @@ -42,28 +40,17 @@ void Aidge::AddImpl_cpu::forward() { // TODO: right now, if needed, memory will be allocated/deallocated at each // call to forward(). We might put the following shared_ptr as members of // this class to avoid that. 
- const std::size_t nbDims = op.getOutput(0)->nbDims(); - std::vector<std::vector<std::size_t>> inputsDims; - std::vector<const void*> opInputs; - std::vector<std::shared_ptr<Tensor>> inputsFallback(op.nbInputs()); - for (IOIndex_t i = 0; i < op.nbInputs(); ++i) { - std::vector<std::size_t> inputDims(nbDims, 1); - auto dims = op.getInput(i)->dims(); - for(std::size_t j=dims.size()-1; j+1>0; --j) - { - std::size_t idx = nbDims - (dims.size()-j); - inputDims[idx] = dims[j]; - } - inputsDims.push_back(inputDims); - const auto& input = op.getInput(i)->refCastFrom(inputsFallback[i], *op.getOutput(0)); - opInputs.push_back(input.getImpl()->rawPtr()); - } + std::shared_ptr<Tensor> input0Fallback, input1Fallback, input2Fallback; + const auto& input0 = op.getInput(0)->refCastFrom(input0Fallback, *op.getInput(0)); + const auto& input1 = op.getInput(1)->refCastFrom(input1Fallback, *op.getInput(1)); + - impl.forward(opInputs, - inputsDims, - op.getOutput(0)->size(), - op.getOutput(0)->dims(), - getCPUPtr(op.getRawOutput(0))); + impl.forward(op.getInput(0)->dims(), + op.getInput(1)->dims(), + op.getOutput(0)->dims(), + input0.getImpl()->rawPtr(), + input1.getImpl()->rawPtr(), + getCPUPtr(op.getRawOutput(0))); } template <> diff --git a/src/operator/BitShiftImpl.cpp b/src/operator/BitShiftImpl.cpp index 1e0f79fd29fd140f0b41c64d245b9b240da80028..c6940554dd925905a18de66651707c3d58594ade 100644 --- a/src/operator/BitShiftImpl.cpp +++ b/src/operator/BitShiftImpl.cpp @@ -28,27 +28,18 @@ void Aidge::BitShiftImpl_cpu::forward() { const auto& op_ = dynamic_cast<const BitShift_Op&>(mOp); - const auto impl = Registrar<BitShiftImpl_cpu>::create(getBestMatch(getRequiredSpec())); - - const std::vector<std::size_t> inputDims0 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(), - std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims()); - const std::vector<std::size_t> inputDims1 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(), - std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dims()); - - BitShift_Op::BitShiftDirection direction = op_.direction(); - // Call kernel impl.forward( - direction, - inputDims0, - inputDims1, - std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(), + op_.direction(), + op_.getInput(0)->dims(), + op_.getInput(1)->dims(), + op_.getOutput(0)->dims(), getCPUPtr(mOp.getRawInput(0)), getCPUPtr(mOp.getRawInput(1)), getCPUPtr(mOp.getRawOutput(0))); - + } template <> diff --git a/src/operator/MulImpl.cpp b/src/operator/MulImpl.cpp index ea5e3d3ab8ac24934a0cb6f9042858fa094700af..422bdd005f058fc9200cf5f7962bfc8d5877e6e1 100644 --- a/src/operator/MulImpl.cpp +++ b/src/operator/MulImpl.cpp @@ -25,18 +25,15 @@ template <> void Aidge::MulImpl_cpu::forward() { - const std::vector<std::size_t> inputDims0 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(), - std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims()); - const std::vector<std::size_t> inputDims1 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(), - std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dims()); + const Mul_Op& op_ = dynamic_cast<const Mul_Op&>(mOp); // Find the correct kernel type const auto impl = Registrar<MulImpl_cpu>::create(getBestMatch(getRequiredSpec())); // Call kernel - impl.forward(inputDims0, - inputDims1, - std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(), + impl.forward(op_.getInput(0)->dims(), + op_.getInput(1)->dims(), + op_.getOutput(0)->dims(), 
getCPUPtr(mOp.getRawInput(0)), getCPUPtr(mOp.getRawInput(1)), getCPUPtr(mOp.getRawOutput(0))); @@ -45,7 +42,7 @@ void Aidge::MulImpl_cpu::forward() { template <> void Aidge::MulImpl_cpu::backward() { const Mul_Op& op_ = dynamic_cast<const Mul_Op&>(mOp); - + auto in0 = op_.getInput(0); auto in1 = op_.getInput(1); auto in0grad = op_.getInput(0)->grad(); @@ -56,14 +53,14 @@ void Aidge::MulImpl_cpu::backward() { const auto impl = Registrar<MulImpl_cpu>::create(getBestMatch(getRequiredSpec())); // Call kernel - impl.backward(/* input0Length */ in0grad->size(), + impl.backward(/* input0Length */ in0grad->size(), /* input1Length */ in1grad->size(), /* grad0Length */ out0grad->size(), /* input0Dims */ in0->dims(), /* input1Dims */ in1->dims(), - getCPUPtr(in0), - getCPUPtr(in1), - getCPUPtr(out0grad), - getCPUPtr(in0grad), + getCPUPtr(in0), + getCPUPtr(in1), + getCPUPtr(out0grad), + getCPUPtr(in0grad), getCPUPtr(in1grad)); } diff --git a/src/operator/SubImpl.cpp b/src/operator/SubImpl.cpp index d43771b967889183801cb93418c967ce9d9c8453..e36abe2a9d68a2b56ab1777aa04b0e911df514c8 100644 --- a/src/operator/SubImpl.cpp +++ b/src/operator/SubImpl.cpp @@ -25,18 +25,15 @@ template <> void Aidge::SubImpl_cpu::forward() { - const std::vector<std::size_t> inputDims0 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(), - std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims()); - const std::vector<std::size_t> inputDims1 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(), - std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dims()); + const Sub_Op& op_ = dynamic_cast<const Sub_Op&>(mOp); // Find the correct kernel type const auto impl = Registrar<SubImpl_cpu>::create(getBestMatch(getRequiredSpec())); // Call kernel - impl.forward(inputDims0, - inputDims1, - std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(), + impl.forward(op_.getInput(0)->dims(), + op_.getInput(1)->dims(), + op_.getOutput(0)->dims(), getCPUPtr(mOp.getRawInput(0)), getCPUPtr(mOp.getRawInput(1)), getCPUPtr(mOp.getRawOutput(0)));
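
Note: the four forward kernels above (Add, Sub, Mul, BitShift) repeat the same broadcast strategy spelled out in their step 1-5 comments. The sketch below is a standalone, compilable re-implementation of that strategy so the mechanism can be read in isolation; broadcast_binary_op, contiguous_op and the lambda element operation are illustrative names only (not the Aidge API), and the trailing-ones exception from the kernels is reproduced.

// Standalone sketch (assumption: not part of the Aidge API) of the broadcast
// mechanism used by the kernels in this patch: shapes are right-aligned, the
// trailing matching axes form one contiguous block handled by a simple inner
// kernel, and signed "stride steps" advance or rewind each input per block.
#include <cstddef>
#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

// Inner kernel on contiguous blocks; a block size of 1 acts as a broadcast scalar.
template <class I, class O, class Op>
void contiguous_op(std::size_t size0, std::size_t size1, std::size_t sizeOut,
                   const I* in0, const I* in1, O* out, Op op) {
    for (std::size_t i = 0; i < sizeOut; ++i)
        out[i] = static_cast<O>(op(in0[(size0 != 1) ? i : 0], in1[(size1 != 1) ? i : 0]));
}

template <class I, class O, class Op>
void broadcast_binary_op(std::vector<std::size_t> dims0, std::vector<std::size_t> dims1,
                         const std::vector<std::size_t>& outDims,
                         const I* in0, const I* in1, O* out, Op op) {
    // 1. Right-align the two shapes by padding the shorter one with leading 1s.
    if (dims0.size() < dims1.size())
        dims0.insert(dims0.begin(), dims1.size() - dims0.size(), std::size_t(1));
    else if (dims1.size() < dims0.size())
        dims1.insert(dims1.begin(), dims0.size() - dims1.size(), std::size_t(1));
    const std::size_t nbDims = dims0.size();

    // 2. contiguousIdx = start of the trailing run where both shapes agree;
    //    everything at or after it is handled by one contiguous_op call.
    std::size_t contiguousIdx = nbDims;
    while (contiguousIdx > 0 && dims0[contiguousIdx - 1] == dims1[contiguousIdx - 1])
        --contiguousIdx;
    if (contiguousIdx == nbDims && nbDims > 0) {
        // Exception (as in the kernels): the innermost axes diverge because one input
        // has trailing 1s there. Fold that run into the contiguous part; that input's
        // contiguous size becomes 1 and the inner kernel reads it as a scalar.
        const std::vector<std::size_t>& d = (dims0[nbDims - 1] == 1) ? dims0 : dims1;
        while (contiguousIdx > 0 && d[contiguousIdx - 1] == 1)
            --contiguousIdx;
    }

    // 3. Number of contiguous elements per "stack" for each tensor.
    const auto prod = [](const std::vector<std::size_t>& d, std::size_t from) {
        return std::accumulate(d.begin() + from, d.end(), std::size_t(1), std::multiplies<std::size_t>());
    };
    const std::size_t contig0 = prod(dims0, contiguousIdx);
    const std::size_t contig1 = prod(dims1, contiguousIdx);
    const std::size_t contigOut = prod(outDims, contiguousIdx);

    // 4. Block strides over the leading (possibly broadcast) axes. step[d] is the
    //    block-offset change when axis d advances and all inner axes roll over:
    //    +1 normally, 1 - post[d] (a rewind) when the input is broadcast along d.
    std::vector<std::int64_t> post0(contiguousIdx), post1(contiguousIdx);
    std::vector<std::int64_t> step0(contiguousIdx), step1(contiguousIdx);
    for (std::size_t d = contiguousIdx; d-- > 0;) {
        post0[d] = (d + 1 == contiguousIdx) ? 1 : post0[d + 1] * static_cast<std::int64_t>(dims0[d + 1]);
        post1[d] = (d + 1 == contiguousIdx) ? 1 : post1[d + 1] * static_cast<std::int64_t>(dims1[d + 1]);
    }
    for (std::size_t d = 0; d < contiguousIdx; ++d) {
        step0[d] = (dims0[d] == 1) ? 1 - post0[d] : 1;
        step1[d] = (dims1[d] == 1) ? 1 - post1[d] : 1;
    }

    // 5. Iterate over the stacks, calling the contiguous kernel once per stack.
    const std::size_t nbStacks = std::accumulate(outDims.begin(), outDims.begin() + contiguousIdx,
                                                 std::size_t(1), std::multiplies<std::size_t>());
    std::int64_t off0 = 0, off1 = 0;
    for (std::size_t stack = 0; stack < nbStacks;) {
        contiguous_op(contig0, contig1, contigOut,
                      in0 + off0 * static_cast<std::int64_t>(contig0),
                      in1 + off1 * static_cast<std::int64_t>(contig1),
                      out + stack * contigOut, op);
        if (++stack < nbStacks) {
            // Find the outermost output axis that just rolled over.
            std::size_t dim = contiguousIdx - 1, tmp = stack;
            while (tmp % outDims[dim] == 0) { tmp /= outDims[dim]; --dim; }
            off0 += step0[dim];
            off1 += step1[dim];
        }
    }
}

int main() {
    // A has shape [2, 1, 3], B has shape [2, 3]; the result has shape [2, 2, 3].
    const std::vector<std::size_t> dimsA{2, 1, 3}, dimsB{2, 3}, dimsOut{2, 2, 3};
    const std::vector<float> A{0, 1, 2, 10, 11, 12};          // A[i0][0][k]
    const std::vector<float> B{100, 200, 300, 400, 500, 600}; // B[i1][k]
    std::vector<float> out(12);
    broadcast_binary_op(dimsA, dimsB, dimsOut, A.data(), B.data(), out.data(),
                        [](float a, float b) { return a + b; });
    for (float v : out) std::cout << v << ' ';  // 100 201 302 400 501 602 110 211 312 410 511 612
    std::cout << '\n';
}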
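
Walking the shape example from the kernel comments, [5,2,1,7] combined with [2,6,7]: the second shape is padded to [1,2,6,7], the trailing match gives contiguousIdx = 3, so each inner call handles a contiguous block of 7 values and the outer loop runs nbStacks = 5*2*6 = 60 times. With the stride formulas above, the step values for the first input come out as [1, 1, 0] (it never advances along its broadcast axis of size 1), and for the second input as [-11, 1, 1]; the -11 is the rewind that sends its block offset from 11 back to 0 each time the outermost output dimension increments.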