Commit 0c483a85, authored by Maxence Naud and committed by Maxence Naud

perf: update Add, BitShift, Mul and Sub forward kernels

parent f634dac0
Part of 2 merge requests: !118 "v0.4.0" and !40 "Resolve 'Arithmetic Operator optimization'"
Showing with 530 additions and 150 deletions
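
Editor's note: all four forward kernels updated by this commit follow the same recipe: pad the shorter input shape with leading 1s, find the contiguous tail on which both shapes agree, precompute stride steps for the leading (broadcast) dimensions, and then apply the element-wise operation chunk by chunk instead of recomputing getMultiDimIndices/getFlattenedIndex for every output element. The standalone sketch below is illustrative only and is not part of the commit: the names, the float element type and the plain multi-index bookkeeping are placeholders, and the trailing-size-1 exception handled by the real kernels is omitted.

// broadcast_sketch.cpp -- simplified, standalone illustration of the chunked broadcast strategy
#include <cstddef>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

// Add two contiguous chunks; a chunk of logical size 1 is broadcast against the other one.
static void add_chunk(std::size_t size0, std::size_t size1, std::size_t outSize,
                      const float* in0, const float* in1, float* out) {
    for (std::size_t i = 0; i < outSize; ++i) {
        out[i] = in0[(size0 != 1) ? i : 0] + in1[(size1 != 1) ? i : 0];
    }
}

static void broadcast_add(std::vector<std::size_t> dims0, std::vector<std::size_t> dims1,
                          const std::vector<std::size_t>& outDims,
                          const float* in0, const float* in1, float* out) {
    // 1. pad the shorter shape with leading 1s
    if (dims0.size() < dims1.size()) dims0.insert(dims0.begin(), dims1.size() - dims0.size(), 1);
    if (dims1.size() < dims0.size()) dims1.insert(dims1.begin(), dims0.size() - dims1.size(), 1);
    // 2. longest common suffix on which both shapes agree = the contiguous tail
    std::size_t ctg = dims0.size();
    while (ctg > 0 && dims0[ctg - 1] == dims1[ctg - 1]) { --ctg; }
    // 3. chunk sizes (products of the tail dimensions)
    auto tailProd = [ctg](const std::vector<std::size_t>& d) {
        return std::accumulate(d.begin() + ctg, d.end(), std::size_t(1), std::multiplies<std::size_t>());
    };
    const std::size_t c0 = tailProd(dims0), c1 = tailProd(dims1), co = tailProd(outDims);
    // 4. iterate over the leading ("stacked") dimensions with an explicit multi-index;
    //    the real kernels replace this with precomputed stride_step offsets
    const std::size_t nbStacks = std::accumulate(outDims.begin(), outDims.begin() + ctg,
                                                 std::size_t(1), std::multiplies<std::size_t>());
    std::vector<std::size_t> idx(ctg, 0);
    for (std::size_t s = 0; s < nbStacks; ++s) {
        std::size_t off0 = 0, off1 = 0;                 // flatten idx per input, clamping broadcast dims to 0
        for (std::size_t d = 0; d < ctg; ++d) {
            off0 = off0 * dims0[d] + ((dims0[d] == 1) ? 0 : idx[d]);
            off1 = off1 * dims1[d] + ((dims1[d] == 1) ? 0 : idx[d]);
        }
        add_chunk(c0, c1, co, in0 + off0 * c0, in1 + off1 * c1, out + s * co);
        for (std::size_t d = ctg; d-- > 0;) {           // increment the output multi-index
            if (++idx[d] < outDims[d]) break;
            idx[d] = 0;
        }
    }
}

int main() {
    // [2,1,3] + [3] -> [2,1,3]; expected output: 100 201 302 110 211 312
    std::vector<float> a{0, 1, 2, 10, 11, 12}, b{100, 200, 300}, c(6, 0.f);
    broadcast_add({2, 1, 3}, {3}, {2, 1, 3}, a.data(), b.data(), c.data());
    for (float v : c) { std::cout << v << ' '; }
    std::cout << '\n';
    return 0;
}
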
...
@@ -25,7 +25,7 @@
 namespace Aidge {
 // Operator implementation entry point for the backend
 using AddImpl_cpu = OperatorImpl_cpu<Add_Op,
-    void(const std::vector<const void*>, const std::vector<std::vector<std::size_t>>&, const std::size_t, const std::vector<std::size_t>&, void*)>;
+    void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*, void*)>;
 // Implementation entry point registration to Operator
 REGISTRAR(Add_Op, "cpu", Aidge::AddImpl_cpu::create);
...
...
@@ -14,31 +14,137 @@
 #include "aidge/utils/Registrar.hpp"
-#include <cstdint>  // std::int32_t, std::int64_t
+#include <cstddef>  // std::size_t
 #include "aidge/backend/cpu/data/Broadcasting.hpp"
 #include "aidge/backend/cpu/operator/AddImpl.hpp"

 namespace Aidge {

-template <class I, class O>
-void AddImpl_cpu_forward_kernel(const std::vector<const void*> inputs_, const std::vector<std::vector<std::size_t>>& inputDims, const std::size_t outputLength, const std::vector<std::size_t>& outDims, void* output_) {
-    // FIXME: missing Add attributes as arguments
-    std::vector<const I*> inputs;
-    for (const auto& input_ : inputs_) {
-        inputs.push_back(static_cast<const I*>(input_));
-    }
-    O* output = static_cast<O*>(output_);
-    for (std::size_t oIndex = 0; oIndex < outputLength; ++oIndex)
-    {
-        output[oIndex] = 0;
-        std::vector<size_t> indexes = getMultiDimIndices(outDims, oIndex);
-        for(std::size_t iIndex = 0; iIndex < inputs.size(); ++iIndex) {
-            std::size_t idx = getFlattenedIndex(inputDims[iIndex], indexes);
-            output[oIndex] += inputs[iIndex][idx];
-        }
-    }
-}
+namespace {
+// suppose values are contiguous in memory
+template <class I, class O>
+void add_contiguous_arrays(const std::size_t input1size,
+                           const std::size_t input2size,
+                           const std::size_t output1size,
+                           const I* input1,
+                           const I* input2,
+                           O* output)
+{
+    for (std::size_t i = 0; i < output1size; ++i)
+    {
+        const std::size_t in1_id = (input1size != 1) ? i : 0;
+        const std::size_t in2_id = (input2size != 1) ? i : 0;
+        output[i] = static_cast<O>(input1[in1_id] + input2[in2_id]);
+    }
+}
+}
+
+template <class I, class O>
+void AddImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
+                                std::vector<std::size_t> dims1,
+                                const std::vector<std::size_t>& outputDims,
+                                const void* input0_,
+                                const void* input1_,
+                                void* output_) {
+    const I* input_0 = static_cast<const I*>(input0_);
+    const I* input_1 = static_cast<const I*>(input1_);
+    O* output = static_cast<O*>(output_);
+
+    // [5,2,1,7] & [2,6,7]
+    // 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7]
+    // 2. Find the highest equal dimension -> 3
+    //    Exception: if the first diverging dimension is the last one, then -> 4 (dims.size())
+    // 3. Compute the highest number of contiguous data -> 7
+    // 4. Compute stride and offset step for the broadcast mechanism
+    // 5. Call a simple kernel
+
+    // special case for equal dimensions, the kernel is called with the entire arrays at once
+    if (dims0 == dims1) {
+        const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
+        for (std::size_t i = 0; i < input0_contiguous_size; ++i)
+        {
+            output[i] = static_cast<O>(input_0[i] + input_1[i]);
+        }
+        return;
+    }
+
+    // set dimensions to be of equal size by filling the smallest one with ones.
+    if (dims0.size() > dims1.size()) {
+        dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1));
+    }
+    else if (dims1.size() > dims0.size()) {
+        dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1));
+    }
+    const std::size_t nbDims = dims0.size();
+
+    // Find the highest equal dimension
+    // std::size_t contiguousIdx = nbDims - 1;
+    std::size_t contiguousIdx = nbDims;
+    while (contiguousIdx-- > 0) {
+    // for (; contiguousIdx+1 > 0; --contiguousIdx) {
+        if (dims0[contiguousIdx] != dims1[contiguousIdx]) {
+            if (contiguousIdx == (nbDims -1)) { // last dimensions of one of the input Tensor are of size 1
+                const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1;
+                while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) {
+                    --contiguousIdx;
+                }
+            }
+            break;
+        }
+    }
+    ++contiguousIdx;
+
+    // Compute the highest number of contiguous data for each Tensor
+    const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
+    const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
+    const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+
+    // initialize strides to iterate through data because of broadcasting
+    std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    if (contiguousIdx > 0) {
+        stride_post0[contiguousIdx - 1] = 1;
+        stride_post1[contiguousIdx - 1] = 1;
+        for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) {
+            stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]);
+            stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]);
+        }
+        for (std::size_t i = 0; i != contiguousIdx; ++i) {
+            stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1;
+            stride_step1[i] = (dims1[i] == 1) ? 1 - stride_post1[i] : 1;
+        }
+    }
+
+    // variables for arrays offsets
+    std::size_t offsetIn0 = 0;
+    std::size_t offsetIn1 = 0;
+    std::size_t offsetOut = 0;
+    std::size_t dim = contiguousIdx - 1;
+    const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>());
+    for (std::size_t stack = 0; stack < nbStacks;) {
+        add_contiguous_arrays<I,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
+                                   input_0 + offsetIn0*input0_contiguous_size,
+                                   input_1 + offsetIn1*input1_contiguous_size,
+                                   output + offsetOut*output_contiguous_size);
+        if (++stack < nbStacks) {
+            std::size_t tmp_stack = stack;
+            while (tmp_stack % outputDims[dim] == 0) {
+                tmp_stack /= outputDims[dim];
+                dim--;
+            }
+            offsetIn0 += stride_step0[dim];
+            offsetIn1 += stride_step1[dim];
+            ++offsetOut;
+            dim = contiguousIdx - 1;
+        }
+    }
+}

 // Kernels registration to implementation entry point
...
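
Editor's note, a worked trace of the [5,2,1,7] & [2,6,7] example used in the comments above (my reading of the new kernel, not text from the commit):
    dims0 = [5,2,1,7], dims1 padded to [1,2,6,7], outputDims = [5,2,6,7]
    first divergence walking from the back is at index 2 (1 vs 6)  ->  contiguousIdx = 3
    contiguous chunk sizes: input0 = 7, input1 = 7, output = 7
    nbStacks = 5*2*6 = 60 calls to add_contiguous_arrays, each on 7 contiguous values
    stride_post0 = [2,1,1], stride_post1 = [12,6,1]
    stride_step0 = [1,1,0]   (input 0 does not advance along its broadcast dimension of size 1)
    stride_step1 = [-11,1,1] (input 1 rewinds 11 chunks whenever the size-5 dimension it broadcasts rolls over)
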
...
@@ -24,13 +24,13 @@ namespace Aidge {
 // Operator implementation entry point for the backend
 using BitShiftImpl_cpu = OperatorImpl_cpu<BitShift_Op,
     void(const BitShift_Op::BitShiftDirection,
-         const std::vector<std::size_t>&,
-         const std::vector<std::size_t>&,
+         std::vector<std::size_t>,
+         std::vector<std::size_t>,
          const std::vector<std::size_t>&,
          const void*,
          const void*,
          void*)>;

 // Implementation entry point registration to Operator
 REGISTRAR(BitShift_Op,"cpu",Aidge::BitShiftImpl_cpu::create);
 } // namespace Aidge
...
...
@@ -12,47 +12,150 @@
 #ifndef AIDGE_CPU_OPERATOR_BITSHIFTIMPL_KERNELS_H_
 #define AIDGE_CPU_OPERATOR_BITSHIFTIMPL_KERNELS_H_

-#include "aidge/utils/Registrar.hpp"
 #include <cstdint>  // std::int32_t, std::int64_t
-#include "aidge/operator/BitShift.hpp"
+#include <cstddef>  // std::size_t
 #include "aidge/backend/cpu/data/Broadcasting.hpp"
 #include "aidge/backend/cpu/operator/BitShiftImpl.hpp"
+#include "aidge/operator/BitShift.hpp"
+#include "aidge/utils/Registrar.hpp"

+namespace {
+// suppose values are contiguous in memory
+template <class I1, class I2, class O>
+void bitshift_contiguous_arrays(
+    const Aidge::BitShift_Op::BitShiftDirection direction,
+    const std::size_t input1size,
+    const std::size_t input2size,
+    const std::size_t output1size,
+    const I1* input_1,
+    const I2* input_2,
+    O* output)
+{
+    if (direction == Aidge::BitShift_Op::BitShiftDirection::right) {
+        for (std::size_t i = 0; i < output1size; ++i) {
+            const std::size_t idx1 = (input1size != 1) ? i : 0;
+            const std::size_t idx2 = (input2size != 1) ? i : 0;
+            output[i] = input_1[idx1] >> input_2[idx2];
+        }
+    } else {
+        for (std::size_t i = 0; i < output1size; ++i) {
+            const std::size_t idx1 = (input1size != 1) ? i : 0;
+            const std::size_t idx2 = (input2size != 1) ? i : 0;
+            output[i] = input_1[idx1] << input_2[idx2];
+        }
+    }
+}
+}

 namespace Aidge {
 template <class I1, class I2, class O>
 void BitShiftImpl_cpu_forward_kernel(
     const BitShift_Op::BitShiftDirection direction,
-    const std::vector<std::size_t>& input1Dims,
-    const std::vector<std::size_t>& input2Dims,
-    const std::vector<std::size_t>& outputDims,
-    const void* input1_,
-    const void* input2_,
-    void* output_
-    ) {
-    const I1* input_1 = static_cast<const I1*>(input1_);
-    const I2* input_2 = static_cast<const I2*>(input2_);
-    O* output = static_cast<O*>(output_);
-    const size_t totalElements = std::accumulate(outputDims.begin(), outputDims.end(), std::size_t(1), std::multiplies<std::size_t>());
-    for (std::size_t oIndex = 0; oIndex < totalElements; ++oIndex)
-    {
-        std::vector<size_t> indexes = getMultiDimIndices(outputDims, oIndex);
-        std::size_t idx1 = getFlattenedIndex(input1Dims, indexes);
-        std::size_t idx2 = getFlattenedIndex(input2Dims, indexes);
-        if(direction == BitShift_Op::BitShiftDirection::right)
-        {
-            output[oIndex]= input_1[idx1] >> input_2[idx2];
-        }
-        else
-        {
-            output[oIndex] = input_1[idx1] << input_2[idx2];
-        }
-    }
-}
+    std::vector<std::size_t> dims0,
+    std::vector<std::size_t> dims1,
+    const std::vector<std::size_t>& outputDims,
+    const void* input0_,
+    const void* input1_,
+    void* output_
+    ) {
+    const I1* input_0 = static_cast<const I1*>(input0_);
+    const I2* input_1 = static_cast<const I2*>(input1_);
+    O* output = static_cast<O*>(output_);
+
+    // [5,2,1,7] & [2,6,7]
+    // 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7]
+    // 2. Find the highest equal dimension -> 3
+    //    Exception: if the first diverging dimension is the last one, then -> 4 (dims.size())
+    // 3. Compute the highest number of contiguous data -> 7
+    // 4. Compute stride and offset step for the broadcast mechanism
+    // 5. Call a simple kernel
+
+    // ## Compute compatible input dimensions
+    // special case for equal dimensions, the kernel is called with the entire arrays at once
+    if (dims0 == dims1) {
+        const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
+        bitshift_contiguous_arrays(direction, input0_contiguous_size, input0_contiguous_size, input0_contiguous_size, input_0, input_1, output);
+        return;
+    }
+
+    // set dimensions to be of equal size by filling the smallest one with ones.
+    if (dims0.size() > dims1.size()) {
+        dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1));
+    }
+    else if (dims1.size() > dims0.size()) {
+        dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1));
+    }
+    const std::size_t nbDims = dims0.size();
+
+    // Find the highest equal dimension
+    // std::size_t contiguousIdx = nbDims - 1;
+    std::size_t contiguousIdx = nbDims;
+    while (contiguousIdx-- > 0) {
+    // for (; contiguousIdx+1 > 0; --contiguousIdx) {
+        if (dims0[contiguousIdx] != dims1[contiguousIdx]) {
+            if (contiguousIdx == (nbDims -1)) { // last dimensions of one of the input Tensor are of size 1
+                const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1;
+                while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) {
+                    --contiguousIdx;
+                }
+            }
+            break;
+        }
+    }
+    ++contiguousIdx;
+
+    // Compute the highest number of contiguous data for each Tensor
+    const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
+    const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
+    const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+
+    // initialize strides to iterate through data because of broadcasting
+    std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    if (contiguousIdx > 0) {
+        stride_post0[contiguousIdx - 1] = 1;
+        stride_post1[contiguousIdx - 1] = 1;
+        for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) {
+            stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]);
+            stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]);
+        }
+        for (std::size_t i = 0; i != contiguousIdx; ++i) {
+            stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1;
+            stride_step1[i] = (dims1[i] == 1) ? 1 - stride_post1[i] : 1;
+        }
+    }
+
+    // variables for arrays offsets
+    std::size_t offsetIn0 = 0;
+    std::size_t offsetIn1 = 0;
+    std::size_t offsetOut = 0;
+    std::size_t dim = contiguousIdx - 1;
+    const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>());
+    for (std::size_t stack = 0; stack < nbStacks;) {
+        bitshift_contiguous_arrays<I1,I2,O>(direction, input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
+                                            input_0 + offsetIn0*input0_contiguous_size,
+                                            input_1 + offsetIn1*input1_contiguous_size,
+                                            output + offsetOut*output_contiguous_size);
+        if (++stack < nbStacks) {
+            std::size_t tmp_stack = stack;
+            while (tmp_stack % outputDims[dim] == 0) {
+                tmp_stack /= outputDims[dim];
+                dim--;
+            }
+            offsetIn0 += stride_step0[dim];
+            offsetIn1 += stride_step1[dim];
+            ++offsetOut;
+            dim = contiguousIdx - 1;
+        }
+    }
+}
...
...
@@ -23,21 +23,21 @@
 namespace Aidge {
 // Operator implementation entry point for the backend
 using MulImpl_cpu = OperatorImpl_cpu<Mul_Op,
-    void(const std::vector<std::size_t>&,
-         const std::vector<std::size_t>&,
+    void(std::vector<std::size_t>,
+         std::vector<std::size_t>,
          const std::vector<std::size_t>&,
          const void*,
          const void*,
          void*),
     void(const std::size_t,
          const std::size_t,
          const std::size_t,
          const std::vector<std::size_t>,
          const std::vector<std::size_t>,
          const void*,
          const void*,
          const void*,
          void*,
          void*)>;

 // Implementation entry point registration to Operator
...
...
@@ -19,44 +19,143 @@
 #include "aidge/backend/cpu/data/Broadcasting.hpp"
 #include "aidge/backend/cpu/operator/MulImpl.hpp"

+namespace {
+// suppose values are contiguous in memory
+template <class I1, class I2, class O>
+void mul_contiguous_arrays(const std::size_t input1size,
+                           const std::size_t input2size,
+                           const std::size_t output1size,
+                           const I1* input1,
+                           const I2* input2,
+                           O* output)
+{
+    for (std::size_t i = 0; i < output1size; ++i)
+    {
+        const std::size_t in1_id = (input1size != 1) ? i : 0;
+        const std::size_t in2_id = (input2size != 1) ? i : 0;
+        output[i] = static_cast<O>(input1[in1_id] * input2[in2_id]);
+    }
+}
+}

 namespace Aidge {
 template <class I1, class I2, class O>
-void MulImpl_cpu_forward_kernel(const std::vector<std::size_t>& input1Dims,
-                                const std::vector<std::size_t>& input2Dims,
-                                const std::vector<std::size_t>& outputDims,
-                                const void* input1_,
-                                const void* input2_,
-                                void* output_) {
-    const I1* input_1 = static_cast<const I1*>(input1_);
-    const I2* input_2 = static_cast<const I2*>(input2_);
-    O* output = static_cast<O*>(output_);
-    size_t totalElements = 1;
-    for (size_t dimSize : outputDims) {
-        totalElements *= dimSize;
-    }
-    for (std::size_t oIndex = 0; oIndex < totalElements; ++oIndex)
-    {
-        std::vector<size_t> indexes = getMultiDimIndices(outputDims, oIndex);
-        std::size_t idx1 = getFlattenedIndex(input1Dims, indexes);
-        std::size_t idx2 = getFlattenedIndex(input2Dims, indexes);
-        output[oIndex] = input_1[idx1] * input_2[idx2];
-    }
-}
+void MulImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
+                                std::vector<std::size_t> dims1,
+                                const std::vector<std::size_t>& outputDims,
+                                const void* input0_,
+                                const void* input1_,
+                                void* output_) {
+    const I1* input_0 = static_cast<const I1*>(input0_);
+    const I2* input_1 = static_cast<const I2*>(input1_);
+    O* output = static_cast<O*>(output_);
+
+    // [5,2,1,7] & [2,6,7]
+    // 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7]
+    // 2. Find the highest equal dimension -> 3
+    //    Exception: if the first diverging dimension is the last one, then -> 4 (dims.size())
+    // 3. Compute the highest number of contiguous data -> 7
+    // 4. Compute stride and offset step for the broadcast mechanism
+    // 5. Call a simple kernel
+
+    // ## Compute compatible input dimensions
+    // special case for equal dimensions, the kernel is called with the entire arrays at once
+    if (dims0 == dims1) {
+        const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
+        for (std::size_t i = 0; i < input0_contiguous_size; ++i)
+        {
+            output[i] = static_cast<O>(input_0[i] * input_1[i]);
+        }
+        return;
+    }
+
+    // set dimensions to be of equal size by filling the smallest one with ones.
+    if (dims0.size() > dims1.size()) {
+        dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1));
+    }
+    else if (dims1.size() > dims0.size()) {
+        dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1));
+    }
+    const std::size_t nbDims = dims0.size();
+
+    // Find the highest equal dimension
+    // std::size_t contiguousIdx = nbDims - 1;
+    std::size_t contiguousIdx = nbDims;
+    while (contiguousIdx-- > 0) {
+    // for (; contiguousIdx+1 > 0; --contiguousIdx) {
+        if (dims0[contiguousIdx] != dims1[contiguousIdx]) {
+            if (contiguousIdx == (nbDims -1)) { // last dimensions of one of the input Tensor are of size 1
+                const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1;
+                while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) {
+                    --contiguousIdx;
+                }
+            }
+            break;
+        }
+    }
+    ++contiguousIdx;
+
+    // Compute the highest number of contiguous data for each Tensor
+    const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
+    const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
+    const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+
+    // initialize strides to iterate through data because of broadcasting
+    std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    if (contiguousIdx > 0) {
+        stride_post0[contiguousIdx - 1] = 1;
+        stride_post1[contiguousIdx - 1] = 1;
+        for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) {
+            stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]);
+            stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]);
+        }
+        for (std::size_t i = 0; i != contiguousIdx; ++i) {
+            stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1;
+            stride_step1[i] = (dims1[i] == 1) ? 1 - stride_post1[i] : 1;
+        }
+    }
+
+    // variables for arrays offsets
+    std::size_t offsetIn0 = 0;
+    std::size_t offsetIn1 = 0;
+    std::size_t offsetOut = 0;
+    std::size_t dim = contiguousIdx - 1;
+    const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>());
+    for (std::size_t stack = 0; stack < nbStacks;) {
+        mul_contiguous_arrays<I1,I2,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
+                                       input_0 + offsetIn0*input0_contiguous_size,
+                                       input_1 + offsetIn1*input1_contiguous_size,
+                                       output + offsetOut*output_contiguous_size);
+        if (++stack < nbStacks) {
+            std::size_t tmp_stack = stack;
+            while (tmp_stack % outputDims[dim] == 0) {
+                tmp_stack /= outputDims[dim];
+                dim--;
+            }
+            offsetIn0 += stride_step0[dim];
+            offsetIn1 += stride_step1[dim];
+            ++offsetOut;
+            dim = contiguousIdx - 1;
+        }
+    }
+}

 template <class I1, class I2, class O>
 void MulImpl_cpu_backward_kernel(const std::size_t input0Length,
                                  const std::size_t input1Length,
                                  const std::size_t grad0Length,
                                  const std::vector<std::size_t> input0Dims,
                                  const std::vector<std::size_t> input1Dims,
                                  const void* input0_,
                                  const void* input1_,
                                  const void* grad_output_,
                                  void* gradientInput0,
                                  void* gradientInput1)
 {
...
...
@@ -23,7 +23,7 @@
 namespace Aidge {
 // Operator implementation entry point for the backend
 using SubImpl_cpu = OperatorImpl_cpu<Sub_Op,
-    void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*, void*)>;
+    void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*, void*)>;
 // Implementation entry point registration to Operator
 REGISTRAR(Sub_Op, "cpu", Aidge::SubImpl_cpu::create);
...
...
@@ -21,32 +21,132 @@
 #include "aidge/backend/cpu/data/Broadcasting.hpp"
 #include "aidge/backend/cpu/operator/SubImpl.hpp"

+namespace {
+// suppose values are contiguous in memory
+template <class I1, class I2, class O>
+void sub_contiguous_arrays(const std::size_t input1size,
+                           const std::size_t input2size,
+                           const std::size_t output1size,
+                           const I1* input1,
+                           const I2* input2,
+                           O* output)
+{
+    for (std::size_t i = 0; i < output1size; ++i)
+    {
+        const std::size_t in1_id = (input1size != 1) ? i : 0;
+        const std::size_t in2_id = (input2size != 1) ? i : 0;
+        output[i] = static_cast<O>(input1[in1_id] - input2[in2_id]);
+    }
+}
+}

 namespace Aidge {
 template <class I1, class I2, class O>
-void SubImpl_cpu_forward_kernel(const std::vector<std::size_t>& input1Dims,
-                                const std::vector<std::size_t>& input2Dims,
-                                const std::vector<std::size_t>& outputDims,
-                                const void* input1_,
-                                const void* input2_,
-                                void* output_) {
-    const I1* input_1 = static_cast<const I1*>(input1_);
-    const I2* input_2 = static_cast<const I2*>(input2_);
-    O* output = static_cast<O*>(output_);
-    size_t totalElements = 1;
-    for (size_t dimSize : outputDims) {
-        totalElements *= dimSize;
-    }
-    for (std::size_t oIndex = 0; oIndex < totalElements; ++oIndex)
-    {
-        std::vector<size_t> indexes = getMultiDimIndices(outputDims, oIndex);
-        std::size_t idx1 = getFlattenedIndex(input1Dims, indexes);
-        std::size_t idx2 = getFlattenedIndex(input2Dims, indexes);
-        output[oIndex] = input_1[idx1] - input_2[idx2];
-    }
-}
+void SubImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
+                                std::vector<std::size_t> dims1,
+                                const std::vector<std::size_t>& outputDims,
+                                const void* input0_,
+                                const void* input1_,
+                                void* output_) {
+    const I1* input_0 = static_cast<const I1*>(input0_);
+    const I2* input_1 = static_cast<const I2*>(input1_);
+    O* output = static_cast<O*>(output_);
+
+    // [5,2,1,7] & [2,6,7]
+    // 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7]
+    // 2. Find the highest equal dimension -> 3
+    //    Exception: if the first diverging dimension is the last one, then -> 4 (dims.size())
+    // 3. Compute the highest number of contiguous data -> 7
+    // 4. Compute stride and offset step for the broadcast mechanism
+    // 5. Call a simple kernel
+
+    // special case for equal dimensions, the kernel is called with the entire arrays at once
+    if (dims0 == dims1) {
+        const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
+        for (std::size_t i = 0; i < input0_contiguous_size; ++i)
+        {
+            output[i] = static_cast<O>(input_0[i] - input_1[i]);
+        }
+        return;
+    }
+
+    // set dimensions to be of equal size by filling the smallest one with ones.
+    if (dims0.size() > dims1.size()) {
+        dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1));
+    }
+    else if (dims1.size() > dims0.size()) {
+        dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1));
+    }
+    const std::size_t nbDims = dims0.size();
+
+    // Find the highest equal dimension
+    // std::size_t contiguousIdx = nbDims - 1;
+    std::size_t contiguousIdx = nbDims;
+    while (contiguousIdx-- > 0) {
+    // for (; contiguousIdx+1 > 0; --contiguousIdx) {
+        if (dims0[contiguousIdx] != dims1[contiguousIdx]) {
+            if (contiguousIdx == (nbDims -1)) { // last dimensions of one of the input Tensor are of size 1
+                const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1;
+                while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) {
+                    --contiguousIdx;
+                }
+            }
+            break;
+        }
+    }
+    ++contiguousIdx;
+
+    // Compute the highest number of contiguous data for each Tensor
+    const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
+    const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
+    const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+
+    // initialize strides to iterate through data because of broadcasting
+    std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    if (contiguousIdx > 0) {
+        stride_post0[contiguousIdx - 1] = 1;
+        stride_post1[contiguousIdx - 1] = 1;
+        for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) {
+            stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]);
+            stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]);
+        }
+        for (std::size_t i = 0; i != contiguousIdx; ++i) {
+            stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1;
+            stride_step1[i] = (dims1[i] == 1) ? 1 - stride_post1[i] : 1;
+        }
+    }
+
+    // variables for arrays offsets
+    std::size_t offsetIn0 = 0;
+    std::size_t offsetIn1 = 0;
+    std::size_t offsetOut = 0;
+    std::size_t dim = contiguousIdx - 1;
+    const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>());
+    for (std::size_t stack = 0; stack < nbStacks;) {
+        sub_contiguous_arrays<I1,I2,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
+                                       input_0 + offsetIn0*input0_contiguous_size,
+                                       input_1 + offsetIn1*input1_contiguous_size,
+                                       output + offsetOut*output_contiguous_size);
+        if (++stack < nbStacks) {
+            std::size_t tmp_stack = stack;
+            while (tmp_stack % outputDims[dim] == 0) {
+                tmp_stack /= outputDims[dim];
+                dim--;
+            }
+            offsetIn0 += stride_step0[dim];
+            offsetIn1 += stride_step1[dim];
+            ++offsetOut;
+            dim = contiguousIdx - 1;
+        }
+    }
+}

 // Kernels registration to implementation entry point
...
...
@@ -12,7 +12,6 @@
 #include "aidge/backend/cpu/operator/AddImpl.hpp"

 #include <cassert>
-#include <numeric> // std::accumulate
 #include <vector>

 #include "aidge/backend/cpu/data/GetCPUPtr.h"
...
@@ -28,12 +27,11 @@ void Aidge::AddImpl_cpu::forward() {
     // Check inputs
     AIDGE_ASSERT(op.getInput(0), "missing input in Add operator");
     AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run Add forward because the 0-th input has no implementation.");
-    DataType datatypeFirstInput = op.getInput(0)->dataType();
-    for (IOIndex_t i = 1; i < op.nbInputs(); ++i) {
-        AIDGE_ASSERT(op.getInput(i), "missing input in Add operator");
-        AIDGE_ASSERT(op.getInput(i)->hasImpl(), "cannot run Add forward because the {}-th input has no implementation.", i);
-        AIDGE_ASSERT(op.getInput(i)->dataType() == datatypeFirstInput, "Cannot add inputs with two differents data type.");
-    }
+    AIDGE_ASSERT(op.getInput(1), "missing input in Add operator");
+    AIDGE_ASSERT(op.getInput(1)->hasImpl(), "cannot run Add forward because the 1st input has no implementation.");
+    AIDGE_ASSERT(op.getInput(1)->dataType() == op.getInput(0)->dataType(), "Cannot add inputs with two differents data type.");

     // Find the correct kernel type
     const auto impl = Registrar<AddImpl_cpu>::create(getBestMatch(getRequiredSpec()));
...
@@ -42,28 +40,17 @@ void Aidge::AddImpl_cpu::forward() {
     // TODO: right now, if needed, memory will be allocated/deallocated at each
     // call to forward(). We might put the following shared_ptr as members of
     // this class to avoid that.
-    const std::size_t nbDims = op.getOutput(0)->nbDims();
-    std::vector<std::vector<std::size_t>> inputsDims;
-    std::vector<const void*> opInputs;
-    std::vector<std::shared_ptr<Tensor>> inputsFallback(op.nbInputs());
-    for (IOIndex_t i = 0; i < op.nbInputs(); ++i) {
-        std::vector<std::size_t> inputDims(nbDims, 1);
-        auto dims = op.getInput(i)->dims();
-        for(std::size_t j=dims.size()-1; j+1>0; --j)
-        {
-            std::size_t idx = nbDims - (dims.size()-j);
-            inputDims[idx] = dims[j];
-        }
-        inputsDims.push_back(inputDims);
-        const auto& input = op.getInput(i)->refCastFrom(inputsFallback[i], *op.getOutput(0));
-        opInputs.push_back(input.getImpl()->rawPtr());
-    }
+    std::shared_ptr<Tensor> input0Fallback, input1Fallback, input2Fallback;
+    const auto& input0 = op.getInput(0)->refCastFrom(input0Fallback, *op.getInput(0));
+    const auto& input1 = op.getInput(1)->refCastFrom(input1Fallback, *op.getInput(1));

-    impl.forward(opInputs,
-                 inputsDims,
-                 op.getOutput(0)->size(),
-                 op.getOutput(0)->dims(),
-                 getCPUPtr(op.getRawOutput(0)));
+    impl.forward(op.getInput(0)->dims(),
+                 op.getInput(1)->dims(),
+                 op.getOutput(0)->dims(),
+                 input0.getImpl()->rawPtr(),
+                 input1.getImpl()->rawPtr(),
+                 getCPUPtr(op.getRawOutput(0)));
 }

 template <>
...
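
Editor's note: the host-side forward() above now hands the kernel the two raw input shapes plus the output shape and lets the kernel align them itself, so a call now looks roughly like the following (shapes and pointer names are hypothetical, for illustration only):

    impl.forward({5, 2, 1, 7}, {2, 6, 7}, {5, 2, 6, 7}, in0Ptr, in1Ptr, outPtr);

Previously the caller padded every input shape to the output rank and packed the pointers and shapes into vectors; the new entry point is strictly binary.
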
...
@@ -28,27 +28,18 @@ void Aidge::BitShiftImpl_cpu::forward() {
     const auto& op_ = dynamic_cast<const BitShift_Op&>(mOp);
     const auto impl = Registrar<BitShiftImpl_cpu>::create(getBestMatch(getRequiredSpec()));

-    const std::vector<std::size_t> inputDims0 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
-                                                                   std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims());
-    const std::vector<std::size_t> inputDims1 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
-                                                                   std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dims());
-    BitShift_Op::BitShiftDirection direction = op_.direction();

     // Call kernel
     impl.forward(
-        direction,
-        inputDims0,
-        inputDims1,
-        std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
+        op_.direction(),
+        op_.getInput(0)->dims(),
+        op_.getInput(1)->dims(),
+        op_.getOutput(0)->dims(),
         getCPUPtr(mOp.getRawInput(0)),
         getCPUPtr(mOp.getRawInput(1)),
         getCPUPtr(mOp.getRawOutput(0)));
 }

 template <>
...
...
@@ -25,18 +25,15 @@
 template <>
 void Aidge::MulImpl_cpu::forward() {
-    const std::vector<std::size_t> inputDims0 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
-                                                                   std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims());
-    const std::vector<std::size_t> inputDims1 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
-                                                                   std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dims());
+    const Mul_Op& op_ = dynamic_cast<const Mul_Op&>(mOp);

     // Find the correct kernel type
     const auto impl = Registrar<MulImpl_cpu>::create(getBestMatch(getRequiredSpec()));

     // Call kernel
-    impl.forward(inputDims0,
-                 inputDims1,
-                 std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
+    impl.forward(op_.getInput(0)->dims(),
+                 op_.getInput(1)->dims(),
+                 op_.getOutput(0)->dims(),
                  getCPUPtr(mOp.getRawInput(0)),
                  getCPUPtr(mOp.getRawInput(1)),
                  getCPUPtr(mOp.getRawOutput(0)));
...
@@ -45,7 +42,7 @@ void Aidge::MulImpl_cpu::forward() {
 template <>
 void Aidge::MulImpl_cpu::backward() {
     const Mul_Op& op_ = dynamic_cast<const Mul_Op&>(mOp);

     auto in0 = op_.getInput(0);
     auto in1 = op_.getInput(1);
     auto in0grad = op_.getInput(0)->grad();
...
@@ -56,14 +53,14 @@ void Aidge::MulImpl_cpu::backward() {
     const auto impl = Registrar<MulImpl_cpu>::create(getBestMatch(getRequiredSpec()));

     // Call kernel
     impl.backward(/* input0Length */ in0grad->size(),
                   /* input1Length */ in1grad->size(),
                   /* grad0Length  */ out0grad->size(),
                   /* input0Dims   */ in0->dims(),
                   /* input1Dims   */ in1->dims(),
                   getCPUPtr(in0),
                   getCPUPtr(in1),
                   getCPUPtr(out0grad),
                   getCPUPtr(in0grad),
                   getCPUPtr(in1grad));
 }
...
@@ -25,18 +25,15 @@
 template <>
 void Aidge::SubImpl_cpu::forward() {
-    const std::vector<std::size_t> inputDims0 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
-                                                                   std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims());
-    const std::vector<std::size_t> inputDims1 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
-                                                                   std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dims());
+    const Sub_Op& op_ = dynamic_cast<const Sub_Op&>(mOp);

     // Find the correct kernel type
     const auto impl = Registrar<SubImpl_cpu>::create(getBestMatch(getRequiredSpec()));

     // Call kernel
-    impl.forward(inputDims0,
-                 inputDims1,
-                 std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
+    impl.forward(op_.getInput(0)->dims(),
+                 op_.getInput(1)->dims(),
+                 op_.getOutput(0)->dims(),
                  getCPUPtr(mOp.getRawInput(0)),
                  getCPUPtr(mOp.getRawInput(1)),
                  getCPUPtr(mOp.getRawOutput(0)));
...