Commit 4c8412cc authored by Cyril Moineau's avatar Cyril Moineau Committed by Maxence Naud

perf: update Pow forward kernel.

parent 5166330c
This commit is part of merge request !40.
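In short, the new forward kernel replaces the per-element getMultiDimIndices/getFlattenedIndex lookups with block-wise processing: both input shapes are padded to the same rank, the largest contiguous tail common to both inputs is found and handled as flat arrays, and the remaining leading (broadcast) axes are walked with precomputed stride steps. The kernel's two input-type template parameters are merged into a single type, registrations now cover Float32, Float64, Int8, UInt8, Int32, and Int64 through ImplSpec::IOSpec, and forward() casts its inputs with refCastFrom when their type does not match the registered kernel.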
@@ -12,18 +12,21 @@
#ifndef AIDGE_CPU_OPERATOR_POWIMPL_H_
#define AIDGE_CPU_OPERATOR_POWIMPL_H_
+#include <cstddef>  // std::size_t
+#include <memory>   // std::unique_ptr, std::make_unique
#include <string>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/Pow.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
-#include <memory>
-#include <vector>
namespace Aidge {
// Operator implementation entry point for the backend
using PowImpl_cpu = OperatorImpl_cpu<Pow_Op,
-    void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*,void*),
+    void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*, void*),
    void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*, const void*, void*, void*)>;
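Note that the forward kernel now takes its two input-dims vectors by value: the kernel pads its local copies with leading ones to equalize the ranks (see the kernel body in the next file), so it needs mutable copies rather than const references.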
@@ -13,36 +13,141 @@
#define AIDGE_CPU_OPERATOR_POWIMPL_KERNELS_H_
#include "aidge/utils/Registrar.hpp"
#include <cmath>
+#include <cstddef>  // std::size_t
#include "aidge/backend/cpu/data/Broadcasting.hpp"
#include "aidge/backend/cpu/operator/PowImpl.hpp"
namespace Aidge {
-template <class I1, class I2, class O>
-void PowImpl_cpu_forward_kernel(const std::vector<std::size_t>& input1Dims,
-                                const std::vector<std::size_t>& input2Dims,
+namespace {
+// assumes the values are contiguous in memory
+template <class I, class O>
+void pow_contiguous_arrays(const std::size_t input1size,
+                           const std::size_t input2size,
+                           const std::size_t output1size,
+                           const I* input1,
+                           const I* input2,
+                           O* output)
+{
+    for (std::size_t i = 0; i < output1size; ++i)
+    {
+        const std::size_t in1_id = (input1size != 1) ? i : 0;
+        const std::size_t in2_id = (input2size != 1) ? i : 0;
+        output[i] = static_cast<O>(std::pow(input1[in1_id], input2[in2_id]));
+    }
+}
+} // namespace
+
+template <class I, class O>
+void PowImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
+                                std::vector<std::size_t> dims1,
                                const std::vector<std::size_t>& outputDims,
+                               const void* input0_,
                                const void* input1_,
-                               const void* input2_,
                                void* output_) {
-    const I1* input_1 = static_cast<const I1*>(input1_);
-    const I2* input_2 = static_cast<const I2*>(input2_);
+    const I* input_0 = static_cast<const I*>(input0_);
+    const I* input_1 = static_cast<const I*>(input1_);
    O* output = static_cast<O*>(output_);
-    std::size_t totalElements = std::accumulate(outputDims.cbegin(), outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
-    for (std::size_t oIndex = 0; oIndex < totalElements; ++oIndex)
-    {
-        std::vector<std::size_t> indexes = getMultiDimIndices(outputDims, oIndex);
+    // [5,2,1,7] & [2,6,7]
+    // 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7]
+    // 2. Find the highest equal dimension -> 3
+    //    Exception: if the first diverging dimension is the last one, then -> 4 (dims.size())
+    // 3. Compute the highest number of contiguous data -> 7
+    // 4. Compute the stride and offset step for the broadcast mechanism
+    // 5. Call a simple kernel
+
+    // Special case for equal dimensions: the kernel is called once on the entire arrays
+    if (dims0 == dims1) {
+        const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
+        for (std::size_t i = 0; i < input0_contiguous_size; ++i)
+        {
+            output[i] = static_cast<O>(std::pow(input_0[i], input_1[i]));
+        }
+        return;
+    }
+
+    // Set the dimensions to equal sizes by padding the smaller one with leading ones
+    if (dims0.size() > dims1.size()) {
+        dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1));
+    }
+    else if (dims1.size() > dims0.size()) {
+        dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1));
+    }
-        std::size_t idx1 = getFlattenedIndex(input1Dims, indexes);
-        std::size_t idx2 = getFlattenedIndex(input2Dims, indexes);
-        output[oIndex] = std::pow(input_1[idx1], input_2[idx2]);
-    }
+    const std::size_t nbDims = dims0.size();
+
+    // Find the highest equal dimension
+    std::size_t contiguousIdx = nbDims;
+    while (contiguousIdx-- > 0) {
+        if (dims0[contiguousIdx] != dims1[contiguousIdx]) {
+            if (contiguousIdx == (nbDims - 1)) { // the last dimensions of one of the input Tensors are of size 1
+                const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1;
+                while ((contiguousIdx + 1 > 0) && (dims[contiguousIdx] == 1)) {
+                    --contiguousIdx;
+                }
+            }
+            break;
+        }
+    }
+    ++contiguousIdx;
+
+    // Compute the highest number of contiguous data for each Tensor
+    const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
+    const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
+    const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+
+    // Initialize the strides used to iterate through the data because of broadcasting
+    std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    if (contiguousIdx > 0) {
+        stride_post0[contiguousIdx - 1] = 1;
+        stride_post1[contiguousIdx - 1] = 1;
+        for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) {
+            stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]);
+            stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]);
+        }
+        for (std::size_t i = 0; i != contiguousIdx; ++i) {
+            stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1;
+            stride_step1[i] = (dims1[i] == 1) ? 1 - stride_post1[i] : 1;
+        }
+    }
+
+    // Variables for the array offsets
+    std::size_t offsetIn0 = 0;
+    std::size_t offsetIn1 = 0;
+    std::size_t offsetOut = 0;
+    std::size_t dim = contiguousIdx - 1;
+    const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>());
+    for (std::size_t stack = 0; stack < nbStacks;) {
+        pow_contiguous_arrays<I,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
+                                   input_0 + offsetIn0*input0_contiguous_size,
+                                   input_1 + offsetIn1*input1_contiguous_size,
+                                   output + offsetOut*output_contiguous_size);
+        if (++stack < nbStacks) {
+            std::size_t tmp_stack = stack;
+            while (tmp_stack % outputDims[dim] == 0) {
+                tmp_stack /= outputDims[dim];
+                dim--;
+            }
+            offsetIn0 += stride_step0[dim];
+            offsetIn1 += stride_step1[dim];
+            ++offsetOut;
+            dim = contiguousIdx - 1;
+        }
+    }
+}
template <class I1, class I2, class O>
void PowImpl_cpu_backward_kernel(const std::vector<std::size_t>& input0Dims,
                                 const std::vector<std::size_t>& input1Dims,
@@ -82,14 +187,23 @@ void PowImpl_cpu_backward_kernel(const std::vector<std::size_t>& input0Dims,
// Kernels registration to implementation entry point
REGISTRAR(PowImpl_cpu,
-    {DataType::Float32},
-    {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<float, float, float>, Aidge::PowImpl_cpu_backward_kernel<float, float, float>});
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float32}},
+    {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<float, float>, Aidge::PowImpl_cpu_backward_kernel<float, float, float>});
+REGISTRAR(PowImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float64}},
+    {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<double, double>, Aidge::PowImpl_cpu_backward_kernel<double, double, double>});
+REGISTRAR(PowImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int32}},
+    {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<int32_t, int32_t>, Aidge::PowImpl_cpu_backward_kernel<int32_t, int32_t, int32_t>});
+REGISTRAR(PowImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int64}},
+    {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, Aidge::PowImpl_cpu_backward_kernel<std::int64_t, std::int64_t, std::int64_t>});
REGISTRAR(PowImpl_cpu,
-    {DataType::Float64},
-    {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<double, double, double>, Aidge::PowImpl_cpu_backward_kernel<double, double, double>});
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int8}},
+    {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<std::int8_t, std::int8_t>, Aidge::PowImpl_cpu_backward_kernel<std::int8_t, std::int8_t, std::int8_t>});
REGISTRAR(PowImpl_cpu,
-    {DataType::Int32},
-    {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<int32_t, int32_t, int32_t>, Aidge::PowImpl_cpu_backward_kernel<int32_t, int32_t, int32_t>});
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::UInt8}},
+    {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<std::uint8_t, std::uint8_t>, Aidge::PowImpl_cpu_backward_kernel<std::uint8_t, std::uint8_t, std::uint8_t>});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_POWIMPL_KERNELS_H_ */
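The numbered comment in the new kernel (the [5,2,1,7] & [2,6,7] example) is the core idea: pad the shapes, find the first axis from the back where they diverge, and treat everything to the right as one contiguous block. Steps 4-5 then precompute, for each leading axis, how far each input's block offset moves when the output advances past that axis (a broadcast axis of size 1 contributes 1 - stride_post[i], i.e. it rewinds instead of advancing). Below is a minimal, self-contained sketch of steps 1-3; it mirrors the kernel's loop but is not part of the commit and uses no Aidge headers:

// Self-contained sketch of steps 1-3 (assumed demo values, not part of the commit).
#include <cstddef>
#include <cstdio>
#include <functional>
#include <numeric>
#include <vector>

int main() {
    std::vector<std::size_t> dims0{5, 2, 1, 7};
    std::vector<std::size_t> dims1{2, 6, 7};

    // 1. pad the shorter shape with leading ones -> [1,2,6,7]
    if (dims0.size() > dims1.size()) {
        dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1));
    } else if (dims1.size() > dims0.size()) {
        dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1));
    }

    // 2. scan from the last axis towards the first, stopping at the first mismatch
    const std::size_t nbDims = dims0.size();
    std::size_t contiguousIdx = nbDims;
    while (contiguousIdx-- > 0) {
        if (dims0[contiguousIdx] != dims1[contiguousIdx]) {
            // exception from the kernel: a mismatch on the last axis means one
            // side ends in ones, which are still contiguous -> keep absorbing them
            if (contiguousIdx == nbDims - 1) {
                const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1;
                while ((contiguousIdx + 1 > 0) && (dims[contiguousIdx] == 1)) {
                    --contiguousIdx;
                }
            }
            break;
        }
    }
    ++contiguousIdx; // first axis of the shared contiguous tail

    // 3. every axis from contiguousIdx onward is processed as one flat block
    const std::size_t blockSize = std::accumulate(dims0.cbegin() + contiguousIdx, dims0.cend(),
                                                  std::size_t(1), std::multiplies<std::size_t>());
    std::printf("contiguousIdx = %zu, contiguous block = %zu\n", contiguousIdx, blockSize);
    // prints: contiguousIdx = 3, contiguous block = 7
    return 0;
}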
@@ -25,21 +25,36 @@
template <>
void Aidge::PowImpl_cpu::forward() {
-    const std::vector<std::size_t> inputDims0 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
-                                                                   std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims());
-    const std::vector<std::size_t> inputDims1 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
-                                                                   std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dims());
+    const Pow_Op& op = static_cast<const Pow_Op&>(mOp);
+
+    // Check the inputs
+    AIDGE_ASSERT(op.getInput(0), "missing input in Pow operator");
+    AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run Pow forward because the 0-th input has no implementation.");
+    AIDGE_ASSERT(op.getInput(1), "missing input in Pow operator");
+    AIDGE_ASSERT(op.getInput(1)->hasImpl(), "cannot run Pow forward because the 1st input has no implementation.");
+    AIDGE_ASSERT(op.getInput(1)->dataType() == op.getInput(0)->dataType(), "Cannot compute Pow with inputs of two different data types.");
    // Find the correct kernel type
    const auto impl = Registrar<PowImpl_cpu>::create(getBestMatch(getRequiredSpec()));

    // Call kernel
-    impl.forward(inputDims0,
-                 inputDims1,
-                 std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
-                 getCPUPtr(mOp.getRawInput(0)),
-                 getCPUPtr(mOp.getRawInput(1)),
-                 getCPUPtr(mOp.getRawOutput(0)));
+    // Convert the input data (no overhead if no conversion is needed!)
+    // TODO: right now, if needed, memory will be allocated/deallocated at each
+    // call to forward(). We might make the following shared_ptr members of
+    // this class to avoid that.
+    std::shared_ptr<Tensor> input0Fallback, input1Fallback;
+    const auto& input0 = op.getInput(0)->refCastFrom(input0Fallback, *op.getInput(0));
+    const auto& input1 = op.getInput(1)->refCastFrom(input1Fallback, *op.getInput(1));
+
+    impl.forward(op.getInput(0)->dims(),
+                 op.getInput(1)->dims(),
+                 op.getOutput(0)->dims(),
+                 input0.getImpl()->rawPtr(),
+                 input1.getImpl()->rawPtr(),
+                 getCPUPtr(op.getRawOutput(0)));
}
template <>
@@ -69,4 +84,4 @@ void Aidge::PowImpl_cpu::backward() {
                 getCPUPtr(out0grad),
                 getCPUPtr(in0grad),
                 getCPUPtr(in1grad));
-}
\ No newline at end of file
+}
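Once the broadcast bookkeeping is done, every dispatch in the new kernel lands in the small pow_contiguous_arrays routine from the kernels header, which handles the remaining "full block or single broadcast value" case. A standalone copy with a quick check (assumed demo values, not part of the commit):

// Standalone copy of the inner routine, with a small usage check.
#include <cmath>
#include <cstddef>
#include <cstdio>

template <class I, class O>
void pow_contiguous_arrays(const std::size_t input1size,
                           const std::size_t input2size,
                           const std::size_t output1size,
                           const I* input1, const I* input2, O* output) {
    for (std::size_t i = 0; i < output1size; ++i) {
        // an input of size 1 is broadcast: its single value is reused for every i
        const std::size_t in1_id = (input1size != 1) ? i : 0;
        const std::size_t in2_id = (input2size != 1) ? i : 0;
        output[i] = static_cast<O>(std::pow(input1[in1_id], input2[in2_id]));
    }
}

int main() {
    const float base[4] = {1.f, 2.f, 3.f, 4.f};
    const float exponent = 2.f; // single value, broadcast over the whole block
    float out[4];
    pow_contiguous_arrays<float, float>(4, 1, 4, base, &exponent, out);
    for (float v : out) {
        std::printf("%g ", v); // 1 4 9 16
    }
    std::printf("\n");
    return 0;
}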