Commit ae5f86da authored by Maxence Naud

perf: update 'And' forward kernel

parent 4c8412cc
This commit is part of merge request !40.
@@ -23,7 +23,7 @@
namespace Aidge {
// Operator implementation entry point for the backend
using AndImpl_cpu = OperatorImpl_cpu<And_Op,
-void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*,void*)>;
void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*, void*)>;
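// Note: the first two dimension vectors are now taken by value because the kernel pads the
// shorter one with leading 1s before broadcasting.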
// Implementation entry point registration to Operator
REGISTRAR(And_Op, "cpu", Aidge::AndImpl_cpu::create);
......
@@ -12,52 +12,153 @@
#ifndef AIDGE_CPU_OPERATOR_ANDIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_ANDIMPL_KERNELS_H_
#include "aidge/backend/cpu/data/Broadcasting.hpp"
#include "aidge/backend/cpu/operator/AndImpl.hpp"
#include "aidge/utils/Registrar.hpp"
namespace Aidge {
-template <class I1, class I2, class O>
-void AndImpl_cpu_forward_kernel(const std::vector<std::size_t>& input1Dims,
-const std::vector<std::size_t>& input2Dims,
namespace {
// assumes values are contiguous in memory
template <class I, class O>
void equal_contiguous_arrays(const std::size_t input1size,
const std::size_t input2size,
const std::size_t output1size,
const I* input1,
const I* input2,
O* output)
{
for (std::size_t i = 0; i < output1size; ++i)
{
const std::size_t in1_id = (input1size != 1) ? i : 0;
const std::size_t in2_id = (input2size != 1) ? i : 0;
output[i] = static_cast<O>(input1[in1_id] == input2[in2_id]);
}
}
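// Illustration only: equal_contiguous_arrays<float, float>(3, 1, 3, in0, in1, out) with
// in0 = {1, 2, 3} and in1 = {2} writes out = {0, 1, 0}; the size-1 input is read at index 0
// for every output element, which is how a fully-broadcast (scalar) chunk is handled.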
}
template <class I, class O>
void EqualImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
std::vector<std::size_t> dims1,
const std::vector<std::size_t>& outputDims,
const void* input0_,
const void* input1_,
-const void* input2_,
void* output_) {
-const I1* input_1 = static_cast<const I1*>(input1_);
-const I2* input_2 = static_cast<const I2*>(input2_);
const I* input_0 = static_cast<const I*>(input0_);
const I* input_1 = static_cast<const I*>(input1_);
O* output = static_cast<O*>(output_);
-size_t totalElements = 1;
-for (size_t dimSize : outputDims) {
-totalElements *= dimSize;
// [5,2,1,7] & [2,6,7]
// 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7]
// 2. Find the highest equal dimension -> 3
// Exception: if the first diverging dimension is the last one, then -> 4 (dims.size())
// 3. Compute the highest number of contiguous data -> 7
// 4. Compute stride and offset step for the broadcast mechanism
// 5. Call a simple kernel
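// Worked trace of the example above (values follow directly from the code below):
//   dims0 = [5,2,1,7], dims1 = [2,6,7] -> dims1 is padded to [1,2,6,7]
//   scanning from the last dimension: 7 == 7 matches, then 1 != 6 -> contiguousIdx = 3
//   contiguous chunk sizes: input0 = 7, input1 = 7, output ([5,2,6,7]) = 7
//   number of chunks to process: nbStacks = 5*2*6 = 60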
// special case: when both inputs have identical dimensions, compare the whole contiguous arrays at once
if (dims0 == dims1) {
const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
for (std::size_t i = 0; i < input0_contiguous_size; ++i)
{
output[i] = static_cast<O>(input_0[i] == input_1[i]);
}
return;
}
-for (std::size_t oIndex = 0; oIndex < totalElements; ++oIndex)
-{
-std::vector<size_t> indexes = getMultiDimIndices(outputDims, oIndex);
// set dimensions to be of equal size by filling the smallest one with ones.
if (dims0.size() > dims1.size()) {
dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1));
}
else if (dims1.size() > dims0.size()) {
dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1));
}
-std::size_t idx1 = getFlattenedIndex(input1Dims, indexes);
-std::size_t idx2 = getFlattenedIndex(input2Dims, indexes);
const std::size_t nbDims = dims0.size();
-output[oIndex] = static_cast<O>(input_1[idx1] == input_2[idx2]);
// Find the highest equal dimension
// std::size_t contiguousIdx = nbDims - 1;
std::size_t contiguousIdx = nbDims;
while (contiguousIdx-- > 0) {
// for (; contiguousIdx+1 > 0; --contiguousIdx) {
if (dims0[contiguousIdx] != dims1[contiguousIdx]) {
if (contiguousIdx == (nbDims - 1)) { // the last dimensions of one of the input Tensors are of size 1
const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1;
while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) {
--contiguousIdx;
}
}
break;
}
}
++contiguousIdx;
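// From here on, dims0[i] == dims1[i] for every i >= contiguousIdx, except in the trailing
// size-1 case handled above; in that case one input contributes a contiguous chunk of size 1,
// which the size-1 test in equal_contiguous_arrays() broadcasts over the whole output chunk.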
// Compute the highest number of contiguous data for each Tensor
const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
// initialize strides to iterate through data because of broadcasting
std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx);
if (contiguousIdx > 0) {
stride_post0[contiguousIdx - 1] = 1;
stride_post1[contiguousIdx - 1] = 1;
for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) {
stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]);
stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]);
}
for (std::size_t i = 0; i != contiguousIdx; ++i) {
stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1;
stride_step1[i] = (dims1[i] == 1) ? 1 - stride_post1[i] : 1;
}
}
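// For the [5,2,1,7] & [1,2,6,7] example: stride_post0 = [2,1,1] and stride_post1 = [12,6,1],
// hence stride_step0 = [1,1,0] and stride_step1 = [-11,1,1]. In the loop below, a step of 1
// advances that input by one chunk, 0 re-reads the same chunk (broadcast along that dim), and
// a negative step rewinds the accumulated inner advances when an outer size-1 dimension of
// that input moves on.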
// variables for arrays offsets
std::size_t offsetIn0 = 0;
std::size_t offsetIn1 = 0;
std::size_t offsetOut = 0;
std::size_t dim = contiguousIdx - 1;
const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>());
for (std::size_t stack = 0; stack < nbStacks;) {
equal_contiguous_arrays<I,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
input_0 + offsetIn0*input0_contiguous_size,
input_1 + offsetIn1*input1_contiguous_size,
output + offsetOut*output_contiguous_size);
if (++stack < nbStacks) {
std::size_t tmp_stack = stack;
while(tmp_stack % outputDims[dim] == 0) {
tmp_stack /= outputDims[dim];
dim--;
}
offsetIn0 += stride_step0[dim];
offsetIn1 += stride_step1[dim];
++offsetOut;
dim = contiguousIdx - 1;
}
}
}
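// Illustration only (not part of the operator): a minimal sketch of calling the kernel
// directly, assuming float inputs and a float output buffer; the second input is broadcast
// over the first dimension.
inline void equal_kernel_usage_sketch() {
    const float in0[6] = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f};  // shape {2, 3}
    const float in1[3] = {1.f, 0.f, 3.f};                  // shape {3}
    float out[6] = {};                                     // shape {2, 3}
    EqualImpl_cpu_forward_kernel<float, float>({2, 3}, {3}, {2, 3}, in0, in1, out);
    // out now holds {1, 0, 1, 0, 0, 0}
}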
// Kernels registration to implementation entry point
REGISTRAR(AndImpl_cpu,
{DataType::Float32},
-{ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<float, float, float>, nullptr});
{ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<float, float>, nullptr});
REGISTRAR(AndImpl_cpu,
{DataType::Float64},
-{ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<double, double, double>, nullptr});
{ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<double, double>, nullptr});
REGISTRAR(AndImpl_cpu,
{DataType::Int32},
-{ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t>, nullptr});
{ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr});
REGISTRAR(AndImpl_cpu,
{DataType::Int64},
-{ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<std::int64_t, std::int64_t, std::int64_t>, nullptr});
{ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_ANDIMPL_KERNELS_H_ */
@@ -25,22 +25,34 @@
template <>
void Aidge::AndImpl_cpu::forward() {
-const std::vector<std::size_t> inputDims0 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
-std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims());
-const std::vector<std::size_t> inputDims1 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
-std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dims());
const And_Op& op = static_cast<const And_Op&>(mOp);
// Check inputs
AIDGE_ASSERT(op.getInput(0), "missing input in And operator");
AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run And forward because the 0-th input has no implementation.");
AIDGE_ASSERT(op.getInput(1), "missing input in And operator");
AIDGE_ASSERT(op.getInput(1)->hasImpl(), "cannot run And forward because the 1st input has no implementation.");
AIDGE_ASSERT(op.getInput(1)->dataType() == op.getInput(0)->dataType(), "Cannot And inputs with two differents data type.");
// Find the correct kernel type
const auto impl = Registrar<AndImpl_cpu>::create(getBestMatch(getRequiredSpec()));
// Call kernel
-impl.forward(inputDims0,
-inputDims1,
-std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
-getCPUPtr(mOp.getRawInput(0)),
-getCPUPtr(mOp.getRawInput(1)),
-getCPUPtr(mOp.getRawOutput(0)));
// Convert input data (no overhead if not needed!)
// TODO: right now, if needed, memory will be allocated/deallocated at each
// call to forward(). We might put the following shared_ptr as members of
// this class to avoid that.
std::shared_ptr<Tensor> input0Fallback, input1Fallback;
const auto& input0 = op.getInput(0)->refCastFrom(input0Fallback, *op.getInput(0));
const auto& input1 = op.getInput(1)->refCastFrom(input1Fallback, *op.getInput(1));
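// refCastFrom() is expected to return the input tensor itself when it already has the
// requested data type, and otherwise a converted copy stored in the matching *Fallback tensor.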
impl.forward(op.getInput(0)->dims(),
op.getInput(1)->dims(),
op.getOutput(0)->dims(),
input0.getImpl()->rawPtr(),
input1.getImpl()->rawPtr(),
getCPUPtr(op.getRawOutput(0)));
}
template <>
......