Compare revisions

Showing 712 additions and 96 deletions
......@@ -15,39 +15,36 @@
#include "aidge/utils/Registrar.hpp"
#include <cmath>
#include "aidge/backend/cpu/data/Broadcasting.hpp"
#include "aidge/backend/cpu/operator/PowImpl.hpp"
namespace Aidge {
template <class I1, class I2, class O>
void PowImpl_cpu_forward_kernel(std::size_t input1Length,
std::size_t input2Length,
const void* input1_,
const void* input2_,
void* output_) {
void PowImpl_cpu_forward_kernel(const std::vector<std::size_t>& input1Dims,
const std::vector<std::size_t>& input2Dims,
const std::vector<std::size_t>& outputDims,
const void* input1_,
const void* input2_,
void* output_) {
const I1* input_1 = static_cast<const I1*>(input1_);
const I2* input_2 = static_cast<const I2*>(input2_);
O* output = static_cast<O*>(output_);
if (input2Length == input1Length)
{
for (std::size_t i = 0; i < input1Length; ++i) {
output[i] = std::pow(input_1[i], input_2[i]);
}
}
else if (input2Length == 1)
{
for (std::size_t i = 0; i < input1Length; ++i) {
output[i] = std::pow(input_1[i], input_2[0]);
}
}
else // input_2 is 1d and of size the number of channels of input_1
{
for (std::size_t i = 0; i < input1Length; ++i) {
std::size_t channelIdx = i % input2Length;
output[i] = std::pow(input_1[i], input_2[channelIdx]);
}
size_t totalElements = 1;
for (size_t dimSize : outputDims) {
totalElements *= dimSize;
}
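// For every output element, recover its multi-dimensional index, then map it back to a
// flattened index in each input; broadcast dimensions (size 1) always map to index 0.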
for (std::size_t oIndex = 0; oIndex < totalElements; ++oIndex)
{
std::vector<size_t> indexes = getMultiDimIndices(outputDims, oIndex);
std::size_t idx1 = getFlattenedIndex(input1Dims, indexes);
std::size_t idx2 = getFlattenedIndex(input2Dims, indexes);
output[oIndex] = std::pow(input_1[idx1], input_2[idx2]);
}
}
namespace {
......
......@@ -25,6 +25,7 @@ void ReLUImpl_cpu_forward_kernel(std::size_t inputLenght,
const I* input = static_cast<const I*>(input_);
O* output = static_cast<O*>(output_);
//#pragma omp parallel for if (inputLenght > 1024)
for (std::size_t i = 0; i < inputLenght; ++i) {
output[i] = input[i] > 0 ? input[i] : 0;
}
......
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_SIGMOIDIMPL_H_
#define AIDGE_CPU_OPERATOR_SIGMOIDIMPL_H_
#include "aidge/backend/OperatorImpl.hpp"
#include "aidge/operator/Sigmoid.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include <memory>
#include <vector>
namespace Aidge {
// class Sigmoid_Op;
// compute kernel registry for forward and backward
class SigmoidImplForward_cpu
: public Registrable<SigmoidImplForward_cpu, std::tuple<DataType, DataType>, void(const std::size_t, const void*, void*)> {
};
class SigmoidImplBackward_cpu
: public Registrable<SigmoidImplBackward_cpu, std::tuple<DataType, DataType>, void(const std::size_t, const void*, void*)> {
};
class SigmoidImpl_cpu : public OperatorImpl {
public:
SigmoidImpl_cpu(const Sigmoid_Op& op) : OperatorImpl(op) {}
static std::unique_ptr<SigmoidImpl_cpu> create(const Sigmoid_Op& op) {
return std::make_unique<SigmoidImpl_cpu>(op);
}
NbElts_t getNbRequiredProtected(const IOIndex_t inputIdx) const override final;
void forward() override;
};
namespace {
static Registrar<Sigmoid_Op> registrarSigmoidImpl_cpu("cpu", Aidge::SigmoidImpl_cpu::create);
}
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_SIGMOIDIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_SIGMOIDIMPL_FORWARD_KERNEL_H_
#define AIDGE_CPU_OPERATOR_SIGMOIDIMPL_FORWARD_KERNEL_H_
#include "aidge/utils/Registrar.hpp"
#include "aidge/backend/cpu/operator/SigmoidImpl.hpp"
namespace Aidge {
template <class I, class O>
void SigmoidImpl_cpu_forward_kernel(std::size_t inputLenght,
const void* input_,
void* output_) {
const I* input = static_cast<const I*>(input_);
O* output = static_cast<O*>(output_);
//#pragma omp parallel for if (inputLenght > 1024)
for (std::size_t i = 0; i < inputLenght; ++i) {
output[i] = static_cast<O>(1.0) / (static_cast<O>(1.0) + std::exp(-input[i]));
}
}
namespace {
static Registrar<SigmoidImplForward_cpu> registrarSigmoidImplForward_cpu_Float32(
{DataType::Float32, DataType::Float32}, Aidge::SigmoidImpl_cpu_forward_kernel<float, float>);
static Registrar<SigmoidImplForward_cpu> registrarSigmoidImplForward_cpu_Float64(
{DataType::Float64, DataType::Float64}, Aidge::SigmoidImpl_cpu_forward_kernel<double, double>);
} // namespace
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_SIGMOIDIMPL_FORWARD_KERNEL_H_ */
......@@ -25,10 +25,10 @@ namespace Aidge {
// compute kernel registry for forward and backward
class SubImplForward_cpu
: public Registrable<SubImplForward_cpu, std::tuple<DataType, DataType, DataType>, void(const std::size_t, const std::size_t, const void*, const void*,void*)> {
: public Registrable<SubImplForward_cpu, std::tuple<DataType, DataType, DataType>, void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*,void*)> {
};
class SubImplBackward_cpu
: public Registrable<SubImplBackward_cpu, std::tuple<DataType, DataType, DataType>, void(const std::size_t, const std::size_t, const void*, const void*, void*)> {
: public Registrable<SubImplBackward_cpu, std::tuple<DataType, DataType, DataType>, void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*, void*)> {
};
class SubImpl_cpu : public OperatorImpl {
......
......@@ -14,39 +14,35 @@
#include "aidge/utils/Registrar.hpp"
#include "aidge/backend/cpu/data/Broadcasting.hpp"
#include "aidge/backend/cpu/operator/SubImpl.hpp"
namespace Aidge {
template <class I1, class I2, class O>
void SubImpl_cpu_forward_kernel(std::size_t input1Length,
std::size_t input2Length,
const void* input1_,
const void* input2_,
void* output_) {
void SubImpl_cpu_forward_kernel(const std::vector<std::size_t>& input1Dims,
const std::vector<std::size_t>& input2Dims,
const std::vector<std::size_t>& outputDims,
const void* input1_,
const void* input2_,
void* output_) {
const I1* input_1 = static_cast<const I1*>(input1_);
const I2* input_2 = static_cast<const I2*>(input2_);
O* output = static_cast<O*>(output_);
if (input2Length == input1Length)
{
for (std::size_t i = 0; i < input1Length; ++i) {
output[i] = input_1[i] - input_2[i];
}
}
else if (input2Length == 1)
{
for (std::size_t i = 0; i < input1Length; ++i) {
output[i] = input_1[i] - input_2[0];
}
}
else // input_2 is 1d and of size the number of channels of input_1
{
for (std::size_t i = 0; i < input1Length; ++i) {
std::size_t channelIdx = i % input2Length;
output[i] = input_1[i] - input_2[channelIdx];
}
size_t totalElements = 1;
for (size_t dimSize : outputDims) {
totalElements *= dimSize;
}
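// Same per-element broadcast index mapping as in the Pow kernel above.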
for (std::size_t oIndex = 0; oIndex < totalElements; ++oIndex)
{
std::vector<size_t> indexes = getMultiDimIndices(outputDims, oIndex);
std::size_t idx1 = getFlattenedIndex(input1Dims, indexes);
std::size_t idx2 = getFlattenedIndex(input2Dims, indexes);
output[oIndex] = input_1[idx1] - input_2[idx2];
}
}
namespace {
......
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_TANHIMPL_H_
#define AIDGE_CPU_OPERATOR_TANHIMPL_H_
#include "aidge/backend/OperatorImpl.hpp"
#include "aidge/operator/Tanh.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include <memory>
#include <vector>
namespace Aidge {
// class Tanh_Op;
// compute kernel registry for forward and backward
class TanhImplForward_cpu
: public Registrable<TanhImplForward_cpu, std::tuple<DataType, DataType>, void(const std::size_t, const void*, void*)> {
};
class TanhImplBackward_cpu
: public Registrable<TanhImplBackward_cpu, std::tuple<DataType, DataType>, void(const std::size_t, const void*, void*)> {
};
class TanhImpl_cpu : public OperatorImpl {
public:
TanhImpl_cpu(const Tanh_Op& op) : OperatorImpl(op) {}
static std::unique_ptr<TanhImpl_cpu> create(const Tanh_Op& op) {
return std::make_unique<TanhImpl_cpu>(op);
}
NbElts_t getNbRequiredProtected(const IOIndex_t inputIdx) const override final;
void forward() override;
};
namespace {
static Registrar<Tanh_Op> registrarTanhImpl_cpu("cpu", Aidge::TanhImpl_cpu::create);
}
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_TANHIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_TANHIMPL_FORWARD_KERNEL_H_
#define AIDGE_CPU_OPERATOR_TANHIMPL_FORWARD_KERNEL_H_
#include "aidge/utils/Registrar.hpp"
#include "aidge/backend/cpu/operator/TanhImpl.hpp"
namespace Aidge {
template <class I, class O>
void TanhImpl_cpu_forward_kernel(std::size_t inputLenght,
const void* input_,
void* output_) {
const I* input = static_cast<const I*>(input_);
O* output = static_cast<O*>(output_);
//#pragma omp parallel for if (inputLenght > 1024)
for (std::size_t i = 0; i < inputLenght; ++i) {
output[i] = std::tanh(input[i]);
}
}
namespace {
static Registrar<TanhImplForward_cpu> registrarTanhImplForward_cpu_Float32(
{DataType::Float32, DataType::Float32}, Aidge::TanhImpl_cpu_forward_kernel<float, float>);
static Registrar<TanhImplForward_cpu> registrarTanhImplForward_cpu_Float64(
{DataType::Float64, DataType::Float64}, Aidge::TanhImpl_cpu_forward_kernel<double, double>);
} // namespace
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_TANHIMPL_FORWARD_KERNEL_H_ */
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include "aidge/backend/cpu/data/Broadcasting.hpp"
std::vector<std::size_t> Aidge::getBroadcastedDims(const std::vector<std::size_t>& outputDims, const std::vector<std::size_t>& dimsToBroadcast){
std::vector<std::size_t> broadcastedDims(outputDims.size(), 1);
for(int j=dimsToBroadcast.size()-1; j>=0; --j)
{
std::size_t idx = outputDims.size() - (dimsToBroadcast.size()-j);
broadcastedDims[idx] = dimsToBroadcast[j];
}
return broadcastedDims;
}
std::vector<std::size_t> Aidge::getMultiDimIndices(const std::vector<std::size_t>& dimensions, std::size_t idx){
std::vector<std::size_t> indices(dimensions.size(), 0);
for (int i = dimensions.size() - 1; i >= 0; --i) {
indices[i] = idx % dimensions[i];
idx /= dimensions[i];
}
return indices;
}
std::size_t Aidge::getFlattenedIndex(const std::vector<std::size_t>& dimensions, const std::vector<std::size_t>& indices){
std::size_t flattenedIdx = 0;
std::size_t stride = 1;
for (int i = dimensions.size() - 1; i >= 0; --i) {
std::size_t idx = dimensions[i]>1 ? indices[i] : 0;
flattenedIdx += idx * stride;
stride *= dimensions[i];
}
return flattenedIdx;
}
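A minimal usage sketch of these two helpers (illustration only, not part of the diff), assuming they are declared in aidge/backend/cpu/data/Broadcasting.hpp as the kernels above suggest:
#include <cstddef>
#include <vector>
#include "aidge/backend/cpu/data/Broadcasting.hpp"
void broadcastReadExample() {
    const std::vector<std::size_t> outputDims{2, 3}; // broadcast result shape
    const std::vector<std::size_t> inputDims{1, 3};  // input whose first dimension is broadcast
    // Flat output index 4 corresponds to the multi-dimensional index {1, 1}.
    const std::vector<std::size_t> indices = Aidge::getMultiDimIndices(outputDims, 4);
    // For the input, the size-1 dimension is clamped to 0, so the flattened index is 0*3 + 1 = 1:
    // output element (1, 1) reads input element (0, 1).
    const std::size_t inputIdx = Aidge::getFlattenedIndex(inputDims, indices);
    static_cast<void>(inputIdx);
}
The element-wise kernels in this diff apply this mapping once per output element, whereas the Div and MatMul implementations further down iterate over contiguous blocks instead.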
......@@ -55,15 +55,26 @@ void Aidge::AddImpl_cpu::forward() {
// TODO: right now, if needed, memory will be allocated/deallocated at each
// call to forward(). We might put the following shared_ptr as members of
// this class to avoid that.
std::size_t nbDims = std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->nbDims();
std::vector<std::vector<std::size_t>> inputsDims;
std::vector<const void*> opInputs;
std::vector<std::shared_ptr<Tensor>> inputsFallback(mOp.nbInputs());
for (IOIndex_t i = 0; i < mOp.nbInputs(); ++i) {
std::vector<std::size_t> inputDims(nbDims, 1);
auto dims = std::static_pointer_cast<Tensor>(mOp.getRawInput(i))->dims();
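// Right-align this input's dimensions with the output dimensions; missing leading
// dimensions stay at 1 so the kernel can broadcast over them.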
for(std::size_t j=dims.size()-1; j+1>0; --j)
{
std::size_t idx = nbDims - (dims.size()-j);
inputDims[idx] = dims[j];
}
inputsDims.push_back(inputDims);
const auto& input = std::static_pointer_cast<Tensor>(mOp.getRawInput(i))->refCastFrom(inputsFallback[i], *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)));
opInputs.push_back(input.getImpl()->rawPtr());
}
// Call kernel
kernelFunc(std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->size(),
opInputs,
kernelFunc(opInputs,
inputsDims,
std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->size(),
std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
getCPUPtr(mOp.getRawOutput(0)));
}
......@@ -87,4 +87,4 @@ void Aidge::ConcatImpl_cpu::forward() {
getCPUPtr(mOp.getRawOutput(0)));
}
void Aidge::ConcatImpl_cpu::backward() { printf("Not implemented yet.\n"); }
\ No newline at end of file
void Aidge::ConcatImpl_cpu::backward() { fmt::print("Not implemented yet.\n"); }
\ No newline at end of file
......@@ -9,18 +9,15 @@
*
********************************************************************************/
#include <cassert>
#include <chrono> // std::chrono::milliseconds
#include <numeric> // std::accumulate
#include <thread> // std::this_thread::sleep_for
#include <memory>
#include <vector>
#include "aidge/operator/Div.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/Broadcasting.hpp"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include "aidge/backend/cpu/operator/DivImpl.hpp"
#include "aidge/backend/cpu/operator/DivImpl_forward_kernels.hpp"
#include "aidge/data/Tensor.hpp"
#include "aidge/utils/Types.h"
Aidge::NbElts_t Aidge::DivImpl_cpu::getNbRequiredProtected(const Aidge::IOIndex_t /*inputIdx*/) const {
// this implementation can be in-place
......@@ -28,16 +25,139 @@ Aidge::NbElts_t Aidge::DivImpl_cpu::getNbRequiredProtected(const Aidge::IOIndex_
}
void Aidge::DivImpl_cpu::forward() {
// Find the correct kernel type
// auto kernelFunc = Registrar<DivImplForward_cpu>::create({
// std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dataType(),
// std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dataType(),
// std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()});
// const std::vector<std::size_t> inputDims0 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
// std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims());
// const std::vector<std::size_t> inputDims1 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
// std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dims());
// auto a = std::static_pointer_cast<Tensor>(mOp.getRawInput(0));
// auto b = std::static_pointer_cast<Tensor>(mOp.getRawInput(1));
// // Call kernel
// kernelFunc(inputDims0,
// inputDims1,
// std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
// getCPUPtr(mOp.getRawInput(0)),
// getCPUPtr(mOp.getRawInput(1)),
// getCPUPtr(mOp.getRawOutput(0)));
/////////////////////////////////////////////////////////////////
// [5,2,1,7] & [2,6,7]
// 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7]
// 2. Find the highest equal dimension -> 3
// Exception: if the first diverging dimension is the last one, then -> 4 (dims.size())
// 3. Compute the highest number of contiguous data -> 7
// 4. Compute stride and offset step for the broadcast mechanism
// 5. Call a simple kernel
// Find the correct kernel type
auto kernelFunc = Registrar<DivImplForward_cpu>::create({
std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dataType(),
std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dataType(),
std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()});
// Call kernel
kernelFunc(std::static_pointer_cast<Tensor>(std::static_pointer_cast<Tensor>(mOp.getRawInput(0)))->size(),
std::static_pointer_cast<Tensor>(std::static_pointer_cast<Tensor>(mOp.getRawInput(1)))->size(),
getCPUPtr(mOp.getRawInput(0)),
getCPUPtr(mOp.getRawInput(1)),
getCPUPtr(mOp.getRawOutput(0)));
// Compute compatible input dimensions
std::vector<std::size_t> dims0 = static_cast<const Div_Op&>(mOp).getInput(0)->dims();
std::vector<std::size_t> dims1 = static_cast<const Div_Op&>(mOp).getInput(1)->dims();
const std::vector<std::size_t>& outDims = static_cast<const Div_Op&>(mOp).getOutput(0)->dims();
// if (dims0 == dims1) {
// const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
// kernelFunc(input0_contiguous_size, input0_contiguous_size, input0_contiguous_size,
// getCPUPtr(mOp.getRawInput(0)),
// getCPUPtr(mOp.getRawInput(1)),
// getCPUPtr(mOp.getRawOutput(0)));
// return;
// }
if (dims0.size() > dims1.size()) {
dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1));
}
else if (dims1.size() > dims0.size()) {
dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1));
}
const std::size_t nbDims = dims0.size();
// Find the highest equal dimension
std::size_t contiguousIdx = nbDims - 1;
for (; contiguousIdx+1 > 0; --contiguousIdx) {
if (dims0[contiguousIdx] != dims1[contiguousIdx]) {
if (contiguousIdx == (nbDims -1)) { // the last dimensions of one of the input Tensors are of size 1
const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1;
while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) {
--contiguousIdx;
}
}
break;
}
}
++contiguousIdx;
// Compute the highest number of contiguous data for each Tensor
const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
const std::size_t output_contiguous_size = std::accumulate(outDims.cbegin()+contiguousIdx, outDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
// initialize strides to iterate through data because of broadcasting
std::size_t *stride_post0;
std::size_t *stride_post1;
std::int32_t *stride_step0;
std::int32_t *stride_step1;
if (contiguousIdx > 0) {
stride_post0 = new std::size_t[contiguousIdx];
stride_post0[contiguousIdx - 1] = 1;
stride_post1 = new std::size_t[contiguousIdx];
stride_post1[contiguousIdx - 1] = 1;
for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) {
stride_post0[i] = stride_post0[i+1]*dims0[i+1];
stride_post1[i] = stride_post1[i+1]*dims1[i+1];
}
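// stride_step[i] is the change (in contiguous blocks) applied to an input offset when the
// output index increments at dimension i: +1 for a regular dimension, or a rewind of
// (1 - stride_post[i]) for a broadcast dimension so the same input data is read again.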
stride_step0 = new std::int32_t[contiguousIdx];
stride_step1 = new std::int32_t[contiguousIdx];
for (std::size_t i = 0; i != contiguousIdx; ++i) {
stride_step0[i] = (dims0[i] == 1) ? 1 - static_cast<std::int32_t>(stride_post0[i]) : 1;
stride_step1[i] = (dims1[i] == 1) ? 1 - static_cast<std::int32_t>(stride_post1[i]) : 1;
}
}
// variables for array offsets
std::size_t offsetIn0 = 0;
std::size_t offsetIn1 = 0;
std::size_t offsetOut = 0;
std::size_t dim = contiguousIdx - 1;
const std::size_t nbStacks = std::accumulate(outDims.cbegin(), outDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>());
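// Each 'stack' is one contiguous block of the output: the kernel processes the block, then
// the per-input block offsets are advanced (or rewound) according to stride_step.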
for (std::size_t stack = 0; stack < nbStacks;) {
kernelFunc(input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
getCPUPtr(mOp.getRawInput(0), offsetIn0*input0_contiguous_size),
getCPUPtr(mOp.getRawInput(1), offsetIn1*input1_contiguous_size),
getCPUPtr(mOp.getRawOutput(0), offsetOut*output_contiguous_size));
if (++stack < nbStacks) {
std::size_t tmp_stack = stack;
while(tmp_stack % outDims[dim] == 0) {
tmp_stack /= outDims[dim];
dim--;
}
offsetIn0 += stride_step0[dim];
offsetIn1 += stride_step1[dim];
++offsetOut;
dim = contiguousIdx - 1;
}
}
if (contiguousIdx > 0) {
delete[] stride_post0;
delete[] stride_post1;
delete[] stride_step0;
delete[] stride_step1;
}
}
......@@ -57,9 +57,10 @@ void Aidge::FCImpl_cpu::forward()
const auto& input2 = std::static_pointer_cast<Tensor>(mOp.getRawInput(2))->refCastFrom(input2Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)));
// Call kernel
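// A 1-D input is treated as a single sample: batch size 1, with all elements as features.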
const auto batchSize = (input0.dims().size() > 1) ? input0.dims()[0] : 1;
kernelFunc(dynamic_cast<const FC_Op&>(mOp).getStaticAttributes(),
input0.dims()[0],
input0.size() / input0.dims()[0],
batchSize,
input0.size() / batchSize,
input0.getImpl()->rawPtr(), input1.getImpl()->rawPtr(), input2.getImpl()->rawPtr(),
getCPUPtr(mOp.getRawOutput(0)));
}
......@@ -9,15 +9,14 @@
*
********************************************************************************/
#include <cassert>
#include <chrono> // std::chrono::milliseconds
#include <numeric> // std::accumulate
#include <thread> // std::this_thread::sleep_for
#include <cstddef> // std::size_t
#include <cstdint> // std::int32_t
#include <numeric> // std::accumulate
#include <vector>
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include "aidge/operator/MatMul.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include "aidge/backend/cpu/operator/MatMulImpl.hpp"
#include "aidge/backend/cpu/operator/MatMulImpl_forward_kernels.hpp"
......@@ -30,27 +29,110 @@ void Aidge::MatMulImpl_cpu::forward()
// Find the correct kernel type
auto kernelFunc = Registrar<MatMulImplForward_cpu>::create(
{std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dataType(),
std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dataType(),
std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()});
// Call kernel
// if (mOp.getInput(0)->nbDims() == 4) {
// kernelFunc(
// mOp.getStaticAttributes(),
// std::static_pointer_cast<Tensor>(mOp.getInput(0))->template dims<4>(),
// mOp.getInput(0))->getImpl()->rawPtr(),
// mOp.mInputs[1]->getImpl()->rawPtr(),
// mOp.mInputs[2]->getImpl()->rawPtr(),
// getCPUPtr(mOp.getRawOutput(0));
// }
// else
kernelFunc(
dynamic_cast<const MatMul_Op&>(mOp).getStaticAttributes(),
std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims()[0],
std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->size() / std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims()[0],
getCPUPtr(mOp.getRawInput(0)),
getCPUPtr(mOp.getRawInput(1)),
getCPUPtr(mOp.getRawOutput(0)));
// Compute compatible input dimensions
std::vector<std::size_t> dims0 = static_cast<const MatMul_Op&>(mOp).getInput(0)->dims();
std::vector<std::size_t> dims1 = static_cast<const MatMul_Op&>(mOp).getInput(1)->dims();
// keep second-to-last dimension of dims0
const std::size_t keepDim0 = (dims0.size() > 1) ? 1 : 0;
// keep last dimension of dims1
const std::size_t keepDim1 = (dims1.size() > 1) ? 1 : 0;
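// NumPy-style promotion below: a 1-D input 0 is treated as a row vector (a 1 is prepended)
// and a 1-D input 1 as a column vector (a 1 is appended); keepDim0/keepDim1 record whether
// the corresponding output dimension is actually present.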
if (dims0.size() == 1) {
dims0.insert(dims0.cbegin(), 1);
}
if (dims1.size() == 1) {
dims1.push_back(1);
}
if (dims0.size() > dims1.size()) {
dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1));
}
else if (dims1.size() > dims0.size()) {
dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1));
}
// const std::size_t dims_size = std::max(dims0.size(), dims1.size());
// at this point, dims0.size() == dims1.size()
const std::size_t nbDims = dims0.size();
// initialize strides to iterate through data because of broadcasting
std::size_t *stride_post0;
std::size_t *stride_post1;
std::int32_t *stride_step0;
std::int32_t *stride_step1;
if (nbDims > 2) {
stride_post0 = new std::size_t[nbDims-2];
stride_post0[nbDims - 3] = 1;
stride_post1 = new std::size_t[nbDims-2];
stride_post1[nbDims - 3] = 1;
for (std::size_t i = nbDims-4; i != static_cast<std::size_t>(-1); --i) {
stride_post0[i] = stride_post0[i+1]*dims0[i+1];
stride_post1[i] = stride_post1[i+1]*dims1[i+1];
}
stride_step0 = new std::int32_t[nbDims-2];
stride_step1 = new std::int32_t[nbDims-2];
for (std::size_t i = 0; i != nbDims-2; ++i) {
stride_step0[i] = (dims0[i] == 1) ? 1 - static_cast<std::int32_t>(stride_post0[i]) : 1;
stride_step1[i] = (dims1[i] == 1) ? 1 - static_cast<std::int32_t>(stride_post1[i]) : 1;
}
}
const std::vector<std::size_t>& outDims = static_cast<const MatMul_Op&>(mOp).getOutput(0)->dims();
const std::size_t nbMatrices = std::accumulate(outDims.cbegin(), outDims.cend() - keepDim0 - keepDim1, 1, std::multiplies<std::size_t>());
std::size_t dim = outDims.size() - 1 - keepDim0 - keepDim1;
// variables for array offsets
std::size_t offsetIn0 = 0;
std::size_t offsetIn1 = 0;
std::size_t offsetOut = 0;
const std::size_t n = dims0[nbDims - 2];
const std::size_t k = dims0[nbDims - 1];
const std::size_t m = dims1[nbDims - 1];
const std::size_t matrix0Size = n*k;
const std::size_t matrix1Size = k*m;
const std::size_t matrixOutSize = n*m;
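// Each stack multiplies one (n x k) matrix of input 0 with one (k x m) matrix of input 1;
// the broadcast batch dimensions are walked with the stride_step offsets computed above.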
for (std::size_t stack = 0; stack < nbMatrices;) {
kernelFunc(n, k, m,
getCPUPtr(mOp.getRawInput(0), offsetIn0*matrix0Size),
getCPUPtr(mOp.getRawInput(1), offsetIn1*matrix1Size),
getCPUPtr(mOp.getRawOutput(0), offsetOut*matrixOutSize));
if (++stack < nbMatrices) {
std::size_t tmp_stack = stack;
while(tmp_stack % outDims[dim] == 0) {
tmp_stack /= outDims[dim];
dim--;
}
offsetIn0 += stride_step0[dim];
offsetIn1 += stride_step1[dim];
++offsetOut;
dim = outDims.size() - 1 - keepDim0 - keepDim1;
}
}
if (nbDims > 2) {
delete[] stride_post0;
delete[] stride_post1;
delete[] stride_step0;
delete[] stride_step1;
}
}
// void Aidge::MatMulImpl_cpu::forward()
// {
// assert(std::static_pointer_cast<Tensor>(mOp.getRawInput(0)) && "missing input #0");
// assert(std::static_pointer_cast<Tensor>(mOp.getRawInput(1)) && "missing input #1");
// // Find the correct kernel type
// auto kernelFunc = Registrar<MatMulImplForward_cpu>::create(
// {std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dataType(),
// std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()});
// kernelFunc(
// std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims(),
// std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dims(),
// getCPUPtr(mOp.getRawInput(0)),
// getCPUPtr(mOp.getRawInput(1)),
// getCPUPtr(mOp.getRawOutput(0)));
// }
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <cassert>
#include <chrono> // std::chrono::milliseconds
#include <numeric> // std::accumulate
#include <thread> // std::this_thread::sleep_for
#include <vector>
#include "aidge/operator/Memorize.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include "aidge/backend/cpu/operator/MemorizeImpl.hpp"
Aidge::DimSize_t Aidge::MemorizeImpl_cpu::getNbRequiredData(
Aidge::IOIndex_t inputIdx) const
{
const Memorize_Op& op = dynamic_cast<const Memorize_Op&>(mOp);
const unsigned int scheduleStep = op.template getAttr<MemorizeAttr::ScheduleStep>();
if (scheduleStep == 0 && inputIdx == 0) {
// No data input is required for the initial step.
// Initialization data is required however.
return 0;
}
else if (scheduleStep > 0 && inputIdx == 1) {
// No initialization data is required after the initial step.
return 0;
}
else {
return OperatorImpl::getNbRequiredData(inputIdx);
}
}
Aidge::NbElts_t Aidge::MemorizeImpl_cpu::getRequiredMemory(const Aidge::IOIndex_t outputIdx,
const std::vector<Aidge::DimSize_t> &/*inputsSize*/) const {
assert(mOp.getRawOutput(outputIdx) && "requires valid output");
const Memorize_Op& op = dynamic_cast<const Memorize_Op&>(mOp);
const unsigned int scheduleStep = op.template getAttr<MemorizeAttr::ScheduleStep>();
const unsigned int endStep = op.template getAttr<MemorizeAttr::EndStep>();
if (endStep > 0 && outputIdx == 1 && scheduleStep >= endStep) {
return 0;
}
else {
return std::static_pointer_cast<Tensor>(mOp.getRawOutput(outputIdx))->size();
}
}
void Aidge::MemorizeImpl_cpu::updateConsummerProducer() {
OperatorImpl::updateConsummerProducer();
const Memorize_Op& op = dynamic_cast<const Memorize_Op&>(mOp);
const unsigned int scheduleStep = op.template getAttr<MemorizeAttr::ScheduleStep>();
const unsigned int endStep = op.template getAttr<MemorizeAttr::EndStep>();
AIDGE_ASSERT(endStep == 0 || scheduleStep <= endStep, "cannot update consumer producer anymore, number of cycles exceeded");
}
void Aidge::MemorizeImpl_cpu::forward() {
const Memorize_Op& op = dynamic_cast<const Memorize_Op&>(mOp);
const unsigned int forwardStep = op.template getAttr<MemorizeAttr::ForwardStep>();
const unsigned int endStep = op.template getAttr<MemorizeAttr::EndStep>();
AIDGE_ASSERT(endStep == 0 || forwardStep <= endStep, "cannot forward anymore, number of cycles exceeded");
if (forwardStep == 0) {
op.getOutput(0)->getImpl()->copy(op.getInput(1)->getImpl()->rawPtr(), op.getInput(1)->size());
}
else {
op.getOutput(0)->getImpl()->copy(op.getInput(0)->getImpl()->rawPtr(), op.getInput(0)->size());
}
}
......@@ -17,6 +17,7 @@
#include "aidge/operator/Mul.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/Broadcasting.hpp"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include "aidge/backend/cpu/operator/MulImpl.hpp"
......@@ -34,9 +35,15 @@ void Aidge::MulImpl_cpu::forward() {
std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dataType(),
std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()});
const std::vector<std::size_t> inputDims0 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims());
const std::vector<std::size_t> inputDims1 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dims());
// Call kernel
kernelFunc(std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->size(),
std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->size(),
kernelFunc(inputDims0,
inputDims1,
std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
getCPUPtr(mOp.getRawInput(0)),
getCPUPtr(mOp.getRawInput(1)),
getCPUPtr(mOp.getRawOutput(0)));
......
......@@ -10,26 +10,30 @@
********************************************************************************/
#include <cassert>
#include <chrono> // std::chrono::milliseconds
#include <numeric> // std::accumulate
#include <thread> // std::this_thread::sleep_for
#include <vector>
#include "aidge/data/Tensor.hpp"
#include "aidge/operator/Producer.hpp"
#include "aidge/operator/Pop.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include "aidge/backend/cpu/operator/ProducerImpl.hpp"
#include "aidge/backend/cpu/operator/PopImpl.hpp"
Aidge::DimSize_t Aidge::ProducerImpl_cpu::getNbProducedData(
Aidge::IOIndex_t outputIdx) const
{
// Requires the whole tensors, regardless of available data on inputs
assert(outputIdx == 0 && "operator has only one output");
(void) outputIdx;
Aidge::NbElts_t Aidge::PopImpl_cpu::getNbRequiredData(const Aidge::IOIndex_t inputIdx) const {
assert(mOp.getRawInput(inputIdx) && "requires valid input");
return std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->size();
return std::static_pointer_cast<Tensor>(mOp.getRawInput(inputIdx))->size()
/ std::static_pointer_cast<Tensor>(mOp.getRawInput(inputIdx))->dims()[0];
}
void Aidge::ProducerImpl_cpu::forward()
{
void Aidge::PopImpl_cpu::forward() {
assert(std::static_pointer_cast<Tensor>(mOp.getRawInput(0)) && "missing input #0");
const Pop_Op& op = dynamic_cast<const Pop_Op&>(mOp);
const unsigned int forwardStep = op.template getAttr<PopAttr::ForwardStep>();
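// At forward step t, the output is the t-th slice of the input along its first dimension.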
*std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))
= std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->extract({forwardStep});
}
......@@ -17,6 +17,7 @@
#include "aidge/operator/Pow.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/Broadcasting.hpp"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include "aidge/backend/cpu/operator/PowImpl.hpp"
......@@ -34,9 +35,15 @@ void Aidge::PowImpl_cpu::forward() {
std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dataType(),
std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()});
const std::vector<std::size_t> inputDims0 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims());
const std::vector<std::size_t> inputDims1 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dims());
// Call kernel
kernelFunc(std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->size(),
std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->size(),
kernelFunc(inputDims0,
inputDims1,
std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
getCPUPtr(mOp.getRawInput(0)),
getCPUPtr(mOp.getRawInput(1)),
getCPUPtr(mOp.getRawOutput(0)));
......
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <cassert>
#include <chrono> // std::chrono::milliseconds
#include <numeric> // std::accumulate
#include <thread> // std::this_thread::sleep_for
#include <vector>
#include "aidge/operator/Sigmoid.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include "aidge/backend/cpu/operator/SigmoidImpl.hpp"
#include "aidge/backend/cpu/operator/SigmoidImpl_forward_kernels.hpp"
Aidge::NbElts_t Aidge::SigmoidImpl_cpu::getNbRequiredProtected(const Aidge::IOIndex_t /*inputIdx*/) const {
// this implementation can be in-place
return 0;
}
void Aidge::SigmoidImpl_cpu::forward() {
assert(std::static_pointer_cast<Tensor>(mOp.getRawInput(0)) && "missing input #0");
// Find the correct kernel type
auto kernelFunc = Registrar<SigmoidImplForward_cpu>::create({
std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dataType(),
std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()});
// Call kernel
kernelFunc(std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->size(),
getCPUPtr(mOp.getRawInput(0)),
getCPUPtr(mOp.getRawOutput(0)));
}
......@@ -79,4 +79,4 @@ void Aidge::SliceImpl_cpu::forward() {
mNbProducedData[0] += getRequiredMemory(0, {});
}
void Aidge::SliceImpl_cpu::backward() { printf("Not implemented yet.\n"); }
void Aidge::SliceImpl_cpu::backward() { fmt::print("Not implemented yet.\n"); }