From 6186554cb6e9bdea18d337920fd59603979ec70e Mon Sep 17 00:00:00 2001
From: cmoineau <cyril.moineau@cea.fr>
Date: Mon, 18 Nov 2024 15:50:57 +0000
Subject: [PATCH] perf: update Pow forward kernel.

---
 .../aidge/backend/cpu/operator/PowImpl.hpp    |  11 +-
 .../backend/cpu/operator/PowImpl_kernels.hpp  | 158 +++++++++++++++---
 src/operator/PowImpl.cpp                      |  39 +++--
 3 files changed, 170 insertions(+), 38 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/PowImpl.hpp b/include/aidge/backend/cpu/operator/PowImpl.hpp
index cfbb8173..b31ce08c 100644
--- a/include/aidge/backend/cpu/operator/PowImpl.hpp
+++ b/include/aidge/backend/cpu/operator/PowImpl.hpp
@@ -12,18 +12,21 @@
 #ifndef AIDGE_CPU_OPERATOR_POWIMPL_H_
 #define AIDGE_CPU_OPERATOR_POWIMPL_H_
 
+#include <cstddef>  // std::size_t
+#include <memory>   // std::unique_ptr, std::make_unique
+#include <string>
+#include <vector>
+
 #include "aidge/backend/cpu/operator/OperatorImpl.hpp"
 #include "aidge/operator/Pow.hpp"
 #include "aidge/utils/Registrar.hpp"
 #include "aidge/utils/Types.h"
-#include "aidge/backend/cpu/data/GetCPUPtr.h"
-#include <memory>
-#include <vector>
+
 
 namespace Aidge {
 // Operator implementation entry point for the backend
 using PowImpl_cpu = OperatorImpl_cpu<Pow_Op,
-    void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*,void*),
+    void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*, void*),
     void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*, const void*, void*, void*)>;
 
diff --git a/include/aidge/backend/cpu/operator/PowImpl_kernels.hpp b/include/aidge/backend/cpu/operator/PowImpl_kernels.hpp
index ab9b2ccc..cae10663 100644
--- a/include/aidge/backend/cpu/operator/PowImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/PowImpl_kernels.hpp
@@ -13,36 +13,141 @@
 #define AIDGE_CPU_OPERATOR_POWIMPL_KERNELS_H_
 
 #include "aidge/utils/Registrar.hpp"
-#include <cmath>
+
+#include <cstddef>  // std::size_t
 
 #include "aidge/backend/cpu/data/Broadcasting.hpp"
 #include "aidge/backend/cpu/operator/PowImpl.hpp"
 
 namespace Aidge {
-template <class I1, class I2, class O>
-void PowImpl_cpu_forward_kernel(const std::vector<std::size_t>& input1Dims,
-                                const std::vector<std::size_t>& input2Dims,
+
+namespace {
+// suppose values are contiguous in memory
+template <class I, class O>
+void pow_contiguous_arrays(const std::size_t input1size,
+                           const std::size_t input2size,
+                           const std::size_t output1size,
+                           const I* input1,
+                           const I* input2,
+                           O* output)
+{
+    for (std::size_t i = 0; i < output1size; ++i)
+    {
+        const std::size_t in1_id = (input1size != 1) ? i : 0;
+        const std::size_t in2_id = (input2size != 1) ? i : 0;
+        output[i] = static_cast<O>(std::pow(input1[in1_id], input2[in2_id]));
+    }
+}
+}
+
+template <class I, class O>
+void PowImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
+                                std::vector<std::size_t> dims1,
                                 const std::vector<std::size_t>& outputDims,
+                                const void* input0_,
                                 const void* input1_,
-                                const void* input2_,
                                 void* output_) {
 
-    const I1* input_1 = static_cast<const I1*>(input1_);
-    const I2* input_2 = static_cast<const I2*>(input2_);
+    const I* input_0 = static_cast<const I*>(input0_);
+    const I* input_1 = static_cast<const I*>(input1_);
     O* output = static_cast<O*>(output_);
 
-    std::size_t totalElements = std::accumulate(outputDims.cbegin(), outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
-    for (std::size_t oIndex = 0; oIndex < totalElements; ++oIndex)
-    {
-        std::vector<std::size_t> indexes = getMultiDimIndices(outputDims, oIndex);
+    // [5,2,1,7] & [2,6,7]
+    // 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7]
+    // 2. Find the highest equal dimension -> 3
+    //    Exception: if the first diverging dimension is the last one, then -> 4 (dims.size())
+    // 3. Compute the highest number of contiguous data -> 7
+    // 4. Compute stride and offset step for the broadcast mechanism
+    // 5. Call a simple kernel
+
+    // special case for equal dimensions, the kernel is called with the entire arrays at once
+    if (dims0 == dims1) {
+        const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
+        for (std::size_t i = 0; i < input0_contiguous_size; ++i)
+        {
+            output[i] = static_cast<O>(std::pow(input_0[i], input_1[i]));
+        }
+        return;
+    }
+
+    // set dimensions to be of equal size by filling the smallest one with ones.
+    if (dims0.size() > dims1.size()) {
+        dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1));
+    }
+    else if (dims1.size() > dims0.size()) {
+        dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1));
+    }
 
-        std::size_t idx1 = getFlattenedIndex(input1Dims, indexes);
-        std::size_t idx2 = getFlattenedIndex(input2Dims, indexes);
-
-        output[oIndex] = std::pow(input_1[idx1], input_2[idx2]);
-    }
+    const std::size_t nbDims = dims0.size();
+
+    // Find the highest equal dimension
+    // std::size_t contiguousIdx = nbDims - 1;
+    std::size_t contiguousIdx = nbDims;
+    while (contiguousIdx-- > 0) {
+    // for (; contiguousIdx+1 > 0; --contiguousIdx) {
+        if (dims0[contiguousIdx] != dims1[contiguousIdx]) {
+            if (contiguousIdx == (nbDims -1)) { // last dimensions of one of the input Tensor are of size 1
+                const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1;
+                while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) {
+                    --contiguousIdx;
+                }
+            }
+            break;
+        }
+    }
+    ++contiguousIdx;
+
+    // Compute the highest number of contiguous data for each Tensor
+    const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
+    const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
+    const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+
+    // initialize strides to iterate through data because of broadcasting
+    std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    if (contiguousIdx > 0) {
+        stride_post0[contiguousIdx - 1] = 1;
+        stride_post1[contiguousIdx - 1] = 1;
+        for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) {
+            stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]);
+            stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]);
+        }
+        for (std::size_t i = 0; i != contiguousIdx; ++i) {
+            stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1;
+            stride_step1[i] = (dims1[i] == 1) ? 1 - stride_post1[i] : 1;
+        }
+    }
+
+    // variables for arrays offsets
+    std::size_t offsetIn0 = 0;
+    std::size_t offsetIn1 = 0;
+    std::size_t offsetOut = 0;
+
+
+    std::size_t dim = contiguousIdx - 1;
+    const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>());
+    for (std::size_t stack = 0; stack < nbStacks;) {
+        pow_contiguous_arrays<I,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
+                                   input_0 + offsetIn0*input0_contiguous_size,
+                                   input_1 + offsetIn1*input1_contiguous_size,
+                                   output + offsetOut*output_contiguous_size);
+        if (++stack < nbStacks) {
+            std::size_t tmp_stack = stack;
+            while(tmp_stack % outputDims[dim] == 0) {
+                tmp_stack /= outputDims[dim];
+                dim--;
+            }
+            offsetIn0 += stride_step0[dim];
+            offsetIn1 += stride_step1[dim];
+            ++offsetOut;
+            dim = contiguousIdx - 1;
+        }
+    }
 }
 
+
 template <class I1, class I2, class O>
 void PowImpl_cpu_backward_kernel(const std::vector<std::size_t>& input0Dims,
                                  const std::vector<std::size_t>& input1Dims,
@@ -82,14 +187,23 @@ void PowImpl_cpu_backward_kernel(const std::vector<std::size_t>& input0Dims,
 
 // Kernels registration to implementation entry point
 REGISTRAR(PowImpl_cpu,
-    {DataType::Float32},
-    {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<float, float, float>, Aidge::PowImpl_cpu_backward_kernel<float, float, float>});
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float32}},
+    {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<float, float>, Aidge::PowImpl_cpu_backward_kernel<float, float, float>});
+REGISTRAR(PowImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float64}},
+    {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<double, double>, Aidge::PowImpl_cpu_backward_kernel<double, double, double>});
+REGISTRAR(PowImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int32}},
+    {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<int32_t, int32_t>, Aidge::PowImpl_cpu_backward_kernel<int32_t, int32_t, int32_t>});
+REGISTRAR(PowImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int64}},
+    {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, Aidge::PowImpl_cpu_backward_kernel<std::int64_t, std::int64_t, std::int64_t>});
 REGISTRAR(PowImpl_cpu,
-    {DataType::Float64},
-    {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<double, double, double>, Aidge::PowImpl_cpu_backward_kernel<double, double, double>});
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int8}},
+    {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<std::int8_t, std::int8_t>, Aidge::PowImpl_cpu_backward_kernel<std::int8_t, std::int8_t, std::int8_t>});
 REGISTRAR(PowImpl_cpu,
-    {DataType::Int32},
-    {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<int32_t, int32_t, int32_t>, Aidge::PowImpl_cpu_backward_kernel<int32_t, int32_t, int32_t>});
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::UInt8}},
+    {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<std::uint8_t, std::uint8_t>, Aidge::PowImpl_cpu_backward_kernel<std::uint8_t, std::uint8_t, std::uint8_t>});
 }  // namespace Aidge
 
 #endif /* AIDGE_CPU_OPERATOR_POWIMPL_KERNELS_H_ */
diff --git a/src/operator/PowImpl.cpp b/src/operator/PowImpl.cpp
index 74a7be71..4448c8e9 100644
--- a/src/operator/PowImpl.cpp
+++ b/src/operator/PowImpl.cpp
@@ -25,21 +25,36 @@
 
 template <>
 void Aidge::PowImpl_cpu::forward() {
-    const std::vector<std::size_t> inputDims0 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
-                                                                   std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims());
-    const std::vector<std::size_t> inputDims1 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
-                                                                   std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dims());
+
+    const Pow_Op& op = static_cast<const Pow_Op&>(mOp);
+    // Check inputs
+    AIDGE_ASSERT(op.getInput(0), "missing input in Pow operator");
+    AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run Pow forward because the 0-th input has no implementation.");
+
+    AIDGE_ASSERT(op.getInput(1), "missing input in Pow operator");
+    AIDGE_ASSERT(op.getInput(1)->hasImpl(), "cannot run Pow forward because the 1st input has no implementation.");
+
+    AIDGE_ASSERT(op.getInput(1)->dataType() == op.getInput(0)->dataType(), "Cannot compute Pow with inputs of two different data types.");
 
     // Find the correct kernel type
     const auto impl = Registrar<PowImpl_cpu>::create(getBestMatch(getRequiredSpec()));
 
-    // Call kernel
-    impl.forward(inputDims0,
-                 inputDims1,
-                 std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
-                 getCPUPtr(mOp.getRawInput(0)),
-                 getCPUPtr(mOp.getRawInput(1)),
-                 getCPUPtr(mOp.getRawOutput(0)));
+    // Convert input data (no overhead if not needed!)
+    // TODO: right now, if needed, memory will be allocated/deallocated at each
+    // call to forward(). We might put the following shared_ptr as members of
+    // this class to avoid that.
+    std::shared_ptr<Tensor> input0Fallback, input1Fallback, input2Fallback;
+    const auto& input0 = op.getInput(0)->refCastFrom(input0Fallback, *op.getInput(0));
+    const auto& input1 = op.getInput(1)->refCastFrom(input1Fallback, *op.getInput(1));
+
+
+    impl.forward(op.getInput(0)->dims(),
+                 op.getInput(1)->dims(),
+                 op.getOutput(0)->dims(),
+                 input0.getImpl()->rawPtr(),
+                 input1.getImpl()->rawPtr(),
+                 getCPUPtr(op.getRawOutput(0)));
+
 }
 
 template <>
@@ -69,4 +84,4 @@ void Aidge::PowImpl_cpu::backward() {
         getCPUPtr(out0grad),
         getCPUPtr(in0grad),
         getCPUPtr(in1grad));
-}
\ No newline at end of file
+}
--
GitLab
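The numbered comment inside the new forward kernel summarises the broadcast scheme only tersely. The standalone program below is a sketch added for illustration and is not part of the patch: the file name, variable names and test values are ours, and the kernel's special case for trailing dimensions of size 1 is left out because the example shapes never trigger it. It walks the same five steps on the comment's example, [5,2,1,7] pow [2,6,7], and cross-checks the block-wise result against a naive per-element reference.

// pow_broadcast_sketch.cpp -- illustration only, not part of the patch above.
// Mirrors the contiguous-block broadcast scheme of the new
// PowImpl_cpu_forward_kernel on the shapes from its comment.
#include <cassert>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
    // Step 1: same rank after padding with leading ones; output is the element-wise max.
    const std::vector<std::size_t> dims0{5, 2, 1, 7};
    const std::vector<std::size_t> dims1{1, 2, 6, 7};   // [2,6,7] padded to rank 4
    const std::vector<std::size_t> outDims{5, 2, 6, 7};

    // Steps 2-3: trailing dimensions on which both shapes agree form the contiguous block.
    std::size_t contiguousIdx = outDims.size();
    while (contiguousIdx > 0 && dims0[contiguousIdx - 1] == dims1[contiguousIdx - 1]) {
        --contiguousIdx;
    }
    const auto prod = [](const std::vector<std::size_t>& d, std::size_t from) {
        return std::accumulate(d.cbegin() + from, d.cend(), std::size_t(1),
                               std::multiplies<std::size_t>());
    };
    const std::size_t blockSize = prod(outDims, contiguousIdx);   // 7
    const std::size_t nbStacks  = std::accumulate(outDims.cbegin(),
                                                  outDims.cbegin() + contiguousIdx,
                                                  std::size_t(1), std::multiplies<std::size_t>()); // 60

    // Step 4: per-dimension offset steps; a broadcast dimension (size 1) rewinds its input.
    std::vector<std::int64_t> post0(contiguousIdx, 1), post1(contiguousIdx, 1);
    for (std::size_t i = contiguousIdx - 1; i-- > 0;) {
        post0[i] = post0[i + 1] * static_cast<std::int64_t>(dims0[i + 1]);
        post1[i] = post1[i + 1] * static_cast<std::int64_t>(dims1[i + 1]);
    }
    std::vector<std::int64_t> step0(contiguousIdx), step1(contiguousIdx);
    for (std::size_t i = 0; i < contiguousIdx; ++i) {
        step0[i] = (dims0[i] == 1) ? 1 - post0[i] : 1;   // expected: [1, 1, 0]
        step1[i] = (dims1[i] == 1) ? 1 - post1[i] : 1;   // expected: [-11, 1, 1]
    }

    // Test data.
    std::vector<float> in0(prod(dims0, 0)), in1(prod(dims1, 0)), out(prod(outDims, 0));
    for (std::size_t i = 0; i < in0.size(); ++i) { in0[i] = 1.0f + 0.01f * static_cast<float>(i); }
    for (std::size_t i = 0; i < in1.size(); ++i) { in1[i] = 0.5f + 0.02f * static_cast<float>(i); }

    // Step 5: apply std::pow block by block, moving the input offsets with the steps above.
    std::int64_t off0 = 0, off1 = 0;
    std::size_t dim = contiguousIdx - 1;
    for (std::size_t stack = 0; stack < nbStacks;) {
        const float* a = in0.data() + off0 * static_cast<std::int64_t>(blockSize);
        const float* b = in1.data() + off1 * static_cast<std::int64_t>(blockSize);
        float* o = out.data() + stack * blockSize;
        for (std::size_t i = 0; i < blockSize; ++i) {
            o[i] = std::pow(a[i], b[i]);
        }
        if (++stack < nbStacks) {
            std::size_t tmp = stack;
            while (tmp % outDims[dim] == 0) { tmp /= outDims[dim]; --dim; }
            off0 += step0[dim];
            off1 += step1[dim];
            dim = contiguousIdx - 1;
        }
    }

    // Naive reference: resolve every output coordinate independently.
    std::size_t mismatches = 0;
    for (std::size_t o = 0; o < out.size(); ++o) {
        std::size_t rem = o, i0 = 0, i1 = 0;
        for (std::size_t d = outDims.size(); d-- > 0;) {
            const std::size_t c = rem % outDims[d];
            rem /= outDims[d];
            i0 += (dims0[d] == 1 ? 0 : c) * prod(dims0, d + 1);
            i1 += (dims1[d] == 1 ? 0 : c) * prod(dims1, d + 1);
        }
        if (std::abs(out[o] - std::pow(in0[i0], in1[i1])) > 1e-5f) { ++mismatches; }
    }
    std::cout << "contiguousIdx=" << contiguousIdx << " blockSize=" << blockSize
              << " nbStacks=" << nbStacks << " mismatches=" << mismatches << "\n";
    assert(mismatches == 0);
    return 0;
}

The point of the precomputed stride_step values, mirrored here by step0/step1, is that a broadcast dimension contributes a negative "rewind" step, so each contiguous block is visited exactly once and no per-element multi-dimensional index has to be rebuilt, which is what the previous getMultiDimIndices/getFlattenedIndex loop did for every output element.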