/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_MaxPOOLINGIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_MaxPOOLINGIMPL_KERNELS_H_
#include <array>
#include <cmath>
#include <cstdint>
#include <tuple>
#include "aidge/backend/cpu/operator/MaxPoolingImpl.hpp"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include "aidge/data/Data.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
/**
* @brief Forward kernel for 2D MaxPooling on CPU backend.
* @tparam I Input data type.
* @tparam O Output data type.
* @param strideDims Array of stride dimensions.
* @param kernelDims Array of kernel dimensions.
* @param dilations Array of dilation factors.
* @param ceilMode If true, output sizes are rounded up (ceil); otherwise rounded down (floor).
* @param dims Array of input dimensions.
* @param input_ const input Tensor.
* @param output_ Output Tensor.
*/
template <class I, class O>
void MaxPoolingImpl2D_cpu_forward_kernel(
const std::array<DimSize_t, 2>& strideDims,
const std::array<DimSize_t, 2>& kernelDims,
const std::array<DimSize_t, 2>& dilations,
const bool ceilMode,
const std::array<DimSize_t, 4> &dims,
const void *input_,
void *output_)
{
const I *input = static_cast<const I *>(input_);
O *output = static_cast<O *>(output_);
// output H size
auto hOut = static_cast<float>(
dims[2] - (kernelDims[0] - 1) * dilations[0] - 1 + strideDims[0]
) / static_cast<float>(strideDims[0]);
const std::size_t outXSize = ceilMode
? static_cast<std::size_t>(std::ceil(hOut))
: static_cast<std::size_t>(std::floor(hOut));
// output W size
auto wOut = static_cast<float>(
dims[3] - ( kernelDims[1] - 1) * dilations[1] - 1 + strideDims[1]
) / static_cast<float>(strideDims[1]);
const std::size_t outYSize = ceilMode
? static_cast<std::size_t>(std::ceil(wOut))
: static_cast<std::size_t>(std::floor(wOut));
using signedsize = std::make_signed<std::size_t>::type;
#ifdef _OPENMP
#pragma omp parallel for collapse(2) if (dims[0] * dims[1] >= 16)
#endif
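// The (batch, channel) loops are parallelized only when there are at least 16 batch*channel
// pairs, so that small workloads do not pay the OpenMP threading overhead.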
for (int batch = 0; batch < static_cast<int>(dims[0]); ++batch){
for (int channel = 0; channel < static_cast<int>(dims[1]); ++channel){
auto batchChannelIndex = (channel + batch * dims[1]);
const std::size_t outputBaseIndex = batchChannelIndex * outXSize * outYSize;
const std::size_t inputBaseIndex = batchChannelIndex * dims[2] * dims[3];
for (std::size_t outX = 0; outX < outXSize; ++outX) {
const signedsize negStrideX = static_cast<signedsize>(
-outX * strideDims[0]
);
const std::size_t kernelXMin = static_cast<std::size_t>(
std::max(negStrideX, signedsize(0))
);
/* Compute kernelXMax */
std::size_t kernelXMax = dims[2] + negStrideX;
if ((static_cast<signedsize>(dims[2]) + negStrideX) < 0){
kernelXMax = 0;
}
else if (kernelXMax > kernelDims[0]){
kernelXMax = kernelDims[0];
}
for (std::size_t outY = 0; outY < outYSize; ++outY) {
const signedsize negStrideY = static_cast<signedsize>(-outY * strideDims[1]);
const std::size_t kernelYMin = static_cast<std::size_t>(
std::max(negStrideY, signedsize(0))
);
/* Compute kernelYMax */
std::size_t kernelYMax = dims[3] + negStrideY;
const std::size_t outputIndex = outputBaseIndex + outX * outYSize + outY;
const std::size_t strideXoffset = outX * strideDims[0];
const std::size_t strideYoffset = outY * strideDims[1];
I poolValue(0.0);
bool valid = false;
if (static_cast<signedsize>(dims[3]) + negStrideY < 0){
kernelYMax = 0;
}
else if(kernelYMax > kernelDims[1]){
kernelYMax = kernelDims[1];
}
for (unsigned int kY = kernelYMin; kY < kernelYMax ; ++kY){
for (unsigned int kX = kernelXMin; kX < kernelXMax; ++kX){
// Apply dilation factor to kernel indices
const std::size_t dilatedkernelX = kX * dilations[0];
const std::size_t dilatedkernelY = kY * dilations[1];
// Ensure indices are within bounds
auto inputXPostDilation = strideXoffset + dilatedkernelX;
auto inputYPostDilation = strideYoffset + dilatedkernelY;
if (inputXPostDilation < dims[2] && inputYPostDilation < dims[3]){
const I inputValue = input[
inputBaseIndex + inputXPostDilation * dims[3]
+ inputYPostDilation
];
if (!valid || inputValue > poolValue) {
poolValue = inputValue;
valid = true;
}
}
}
}
output[outputIndex] = poolValue;
}
}
}
}
}
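// Illustrative usage sketch (assuming a contiguous NCHW float buffer): applying the forward
// kernel above to a single 4x4 feature map with a 2x2 kernel, stride 2, dilation 1 and floor
// rounding yields a 2x2 output holding the maximum of each non-overlapping 2x2 window:
//
//   std::array<Aidge::DimSize_t, 2> stride{2, 2}, kernel{2, 2}, dilation{1, 1};
//   std::array<Aidge::DimSize_t, 4> dims{1, 1, 4, 4};
//   std::vector<float> in  = { 1,  2,  3,  4,
//                              5,  6,  7,  8,
//                              9, 10, 11, 12,
//                             13, 14, 15, 16};
//   std::vector<float> out(2 * 2);
//   Aidge::MaxPoolingImpl2D_cpu_forward_kernel<float, float>(
//       stride, kernel, dilation, /*ceilMode=*/false, dims, in.data(), out.data());
//   // out == {6, 8, 14, 16}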
template <class I, class O>
void MaxPoolingImpl2D_cpu_backward_kernel(
const std::array<DimSize_t, 2>& strideDims,
const std::array<DimSize_t, 2>& kernelDims,
const std::array<DimSize_t, 2>& dilations,
const bool ceilMode,
const std::array<DimSize_t, 4> &dims,
const void *input_,
void *grad_
)
{
const I *input = static_cast<const I *>(input_);
I *grad = static_cast<I *>(grad_);
// output H size
auto hOut = static_cast<float>(
dims[2] - (kernelDims[0] - 1) * dilations[0] - 1 + strideDims[0]
) / static_cast<float>(strideDims[0]);
const std::size_t outXSize = ceilMode
? static_cast<std::size_t>(std::ceil(hOut))
: static_cast<std::size_t>(std::floor(hOut));
// output W size
auto wOut = static_cast<float>(
dims[3] - ( kernelDims[1] - 1) * dilations[1] - 1 + strideDims[1]
) / static_cast<float>(strideDims[1]);
const std::size_t outYSize = ceilMode
? static_cast<std::size_t>(std::ceil(wOut))
: static_cast<std::size_t>(std::floor(wOut));
using signedsize = std::make_signed<std::size_t>::type;
for (std::size_t batch = 0; batch < dims[0]; ++batch){
for (std::size_t channel = 0; channel < dims[1]; ++channel){
auto batchChannelIndex = (channel + batch * dims[1]);
const std::size_t inputBaseIndex = batchChannelIndex * dims[2] * dims[3];
for (std::size_t outX = 0; outX < outXSize; ++outX) {
const signedsize negStrideX = static_cast<signedsize>(
-outX * strideDims[0]
);
const std::size_t kernelXMin = static_cast<std::size_t>(
std::max(negStrideX, signedsize(0))
);
/* Compute kernelXMax */
std::size_t kernelXMax = dims[2] + negStrideX;
if ((static_cast<signedsize>(dims[2]) + negStrideX) < 0){
kernelXMax = 0;
}
else if (kernelXMax > kernelDims[0]){
kernelXMax = kernelDims[0];
}
for (std::size_t outY = 0; outY < outYSize; ++outY) {
const signedsize negStrideY = static_cast<signedsize>(-outY * strideDims[1]);
const std::size_t kernelYMin = static_cast<std::size_t>(
std::max(negStrideY, signedsize(0))
);
/* Compute kernelYMax */
std::size_t kernelYMax = dims[3] + negStrideY;
const std::size_t strideXoffset = outX * strideDims[0];
const std::size_t strideYoffset = outY * strideDims[1];
I poolValue(0.0);
bool valid = false;
if (static_cast<signedsize>(dims[3]) + negStrideY < 0){
kernelYMax = 0;
}
else if(kernelYMax > kernelDims[1]){
kernelYMax = kernelDims[1];
}
std::size_t saveIndex = 0;
for (unsigned int kY = kernelYMin; kY < kernelYMax ; ++kY){
for (unsigned int kX = kernelXMin; kX < kernelXMax; ++kX){
// Apply dilation factor to kernel indices
const std::size_t dilatedkernelX = kX * dilations[0];
const std::size_t dilatedkernelY = kY * dilations[1];
// Ensure indices are within bounds
auto inputXPostDilation = strideXoffset + dilatedkernelX;
auto inputYPostDilation = strideYoffset + dilatedkernelY;
if (inputXPostDilation < dims[2] && inputYPostDilation < dims[3]){
std::size_t inputIndex =
inputBaseIndex + inputXPostDilation * dims[3]
+ inputYPostDilation;
const I inputValue = input[inputIndex];
if (!valid || inputValue > poolValue) {
poolValue = inputValue;
saveIndex = inputIndex;
valid = true;
}
}
}
}
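// Max pooling routes gradient only to the arg-max location of each window: this backward
// kernel assumes an implicit upstream gradient of 1, in effect counting, for every input
// element, how many output windows selected it as their maximum.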
if (valid){
grad[saveIndex]++;
}
}
}
}
}
}
// Kernels registration to implementation entry point
REGISTRAR(MaxPoolingImpl2D_cpu,
{DataType::Float32},
{
ProdConso::inPlaceModel,
Aidge::MaxPoolingImpl2D_cpu_forward_kernel<float, float>,
Aidge::MaxPoolingImpl2D_cpu_backward_kernel<float, float>,
}
);
REGISTRAR(MaxPoolingImpl2D_cpu,
{DataType::Float64},
{
ProdConso::inPlaceModel,
Aidge::MaxPoolingImpl2D_cpu_forward_kernel<double, double>,
Aidge::MaxPoolingImpl2D_cpu_backward_kernel<double, double>,
}
);
REGISTRAR(MaxPoolingImpl2D_cpu,
{DataType::Int32},
{
ProdConso::inPlaceModel,
Aidge::MaxPoolingImpl2D_cpu_forward_kernel<int32_t, int32_t>,
Aidge::MaxPoolingImpl2D_cpu_backward_kernel<int32_t, int32_t>,
}
);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_MaxPOOLINGIMPL_KERNELS_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_MODIMPL_H_
#define AIDGE_CPU_OPERATOR_MODIMPL_H_
#include <memory>
#include <tuple>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/Mod.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
// Operator implementation entry point for the backend
using ModImpl_cpu = OperatorImpl_cpu<Mod_Op,
void(bool, const std::size_t, const std::size_t, const std::size_t, const void*, const void*,void*)>;
// Implementation entry point registration to Operator
REGISTRAR(Mod_Op, "cpu", Aidge::ModImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_MODIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_MODIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_MODIMPL_KERNELS_H_
#include <numeric> // std::accumulate
#include <cstddef> // std::size_t
#include <cstdint> // std::int32_t, std::int64_t
#include <functional> // std::multiplies
#include "aidge/utils/Registrar.hpp"
#include "aidge/backend/cpu/data/Broadcasting.hpp"
#include "aidge/backend/cpu/operator/ModImpl.hpp"
namespace Aidge {
template <typename T,
typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
static inline T modulus(T a, T b) {
return a % b;
}
template <typename T,
typename std::enable_if<!std::is_integral<T>::value>::type* = nullptr>
static inline T modulus(T /*a*/, T /*b*/) {
AIDGE_THROW_OR_ABORT(std::runtime_error, "Mod Operator with fmod attribute set to false only supports integer types.");
}
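// The two overloads above dispatch on std::is_integral: integer types use the native % operator,
// while calling this path with a floating-point type raises a runtime error. Floating-point
// inputs are only supported through the fmod flag handled by std::fmod in the kernel below.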
template <class I1, class I2, class O>
constexpr void ModImpl_cpu_forward_kernel(bool fmod,
const std::size_t input1size_,
const std::size_t input2size_,
const std::size_t output1size_,
const void* input1_,
const void* input2_,
void* output_) {
const I1* input_1 = static_cast<const I1*>(input1_);
const I2* input_2 = static_cast<const I2*>(input2_);
O* output = static_cast<O*>(output_);
// suppose values are contiguous in memory
for (std::size_t i = 0; i < output1size_; ++i) {
const std::size_t in1_id = (input1size_ != 1) ? i : 0;
const std::size_t in2_id = (input2size_ != 1) ? i : 0;
if (fmod) {
output[i] = static_cast<O>(std::fmod(input_1[in1_id], input_2[in2_id]));
}
else {
output[i] = static_cast<O>(modulus(input_1[in1_id], input_2[in2_id]));
}
}
}
// Kernels registration to implementation entry point
REGISTRAR(ModImpl_cpu,
{DataType::Float32},
{ProdConso::inPlaceModel, Aidge::ModImpl_cpu_forward_kernel<float, float, float>, nullptr});
REGISTRAR(ModImpl_cpu,
{DataType::Float64},
{ProdConso::inPlaceModel, Aidge::ModImpl_cpu_forward_kernel<double, double, double>, nullptr});
REGISTRAR(ModImpl_cpu,
{DataType::Int32},
{ProdConso::inPlaceModel, Aidge::ModImpl_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t>, nullptr});
REGISTRAR(ModImpl_cpu,
{DataType::UInt64},
{ProdConso::inPlaceModel, Aidge::ModImpl_cpu_forward_kernel<std::uint64_t, std::uint64_t, std::uint64_t>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_MODIMPL_KERNELS_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_MULIMPL_H_
#define AIDGE_CPU_OPERATOR_MULIMPL_H_
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/Mul.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include <memory>
#include <vector>
namespace Aidge {
// Operator implementation entry point for the backend
using MulImpl_cpu = OperatorImpl_cpu<Mul_Op,
void(std::vector<std::size_t>,
std::vector<std::size_t>,
const std::vector<std::size_t>&,
const void*,
const void*,
void*),
void(const std::size_t,
const std::size_t,
const std::size_t,
const std::vector<std::size_t>,
const std::vector<std::size_t>,
const std::vector<std::size_t>,
const void*,
const void*,
const void*,
void*,
void*)>;
// Implementation entry point registration to Operator
REGISTRAR(Mul_Op, "cpu", Aidge::MulImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_MULIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_MULIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_MULIMPL_KERNELS_H_
#include "aidge/utils/Registrar.hpp"
#include <cstdint> // std::int32_t, std::int64_t
#include "aidge/backend/cpu/data/Broadcasting.hpp"
#include "aidge/backend/cpu/operator/MulImpl.hpp"
namespace {
// suppose values are contiguous in memory
template <class I1, class I2, class O>
void mul_contiguous_arrays(const std::size_t input1size,
const std::size_t input2size,
const std::size_t output1size,
const I1* input1,
const I2* input2,
O* output)
{
for (std::size_t i = 0; i < output1size; ++i)
{
const std::size_t in1_id = (input1size != 1) ? i : 0;
const std::size_t in2_id = (input2size != 1) ? i : 0;
output[i] = static_cast<O>(input1[in1_id] * input2[in2_id]);
}
}
}
namespace Aidge {
template <class I1, class I2, class O>
void MulImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
std::vector<std::size_t> dims1,
const std::vector<std::size_t>& outputDims,
const void* input0_,
const void* input1_,
void* output_) {
const I1* input_0 = static_cast<const I1*>(input0_);
const I2* input_1 = static_cast<const I2*>(input1_);
O* output = static_cast<O*>(output_);
// [5,2,1,7] & [2,6,7]
// 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7]
// 2. Find the highest equal dimension -> 3
// Exception: if the first diverging dimension is the last one, then -> 4 (dims.size())
// 3. Compute the highest number of contiguous data -> 7
// 4. Compute stride and offset step for the broadcast mechanism
// 5. Call a simple kernel
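// Worked numbers for the example above (illustration, derived from the steps implemented below):
//   padded dims:  dims0 = [5,2,1,7], dims1 = [1,2,6,7], outputDims = [5,2,6,7]
//   contiguousIdx = 3, so each contiguous block holds 7 elements for both inputs and the output
//   stride_post0 = [2,1,1], stride_post1 = [12,6,1]
//   stride_step0 = [1,1,0]   (dims0[2] == 1 is broadcast, so its offset does not advance)
//   stride_step1 = [-11,1,1] (dims1[0] == 1 is broadcast, so its offset rewinds)
//   nbStacks = 5*2*6 = 60 calls to mul_contiguous_arrays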
// ## Compute compatible input dimensions
// special case for equal dimensions, the kernel is called with the entire arrays at once
if (dims0 == dims1) {
const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
for (std::size_t i = 0; i < input0_contiguous_size; ++i)
{
output[i] = static_cast<O>(input_0[i] * input_1[i]);
}
return;
}
// set dimensions to be of equal size by filling the smallest one with ones.
if (dims0.size() > dims1.size()) {
dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1));
}
else if (dims1.size() > dims0.size()) {
dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1));
}
const std::size_t nbDims = dims0.size();
// Find the highest equal dimension
// std::size_t contiguousIdx = nbDims - 1;
std::size_t contiguousIdx = nbDims;
while (contiguousIdx-- > 0) {
// for (; contiguousIdx+1 > 0; --contiguousIdx) {
if (dims0[contiguousIdx] != dims1[contiguousIdx]) {
if (contiguousIdx == (nbDims -1)) { // last dimensions of one of the input Tensor are of size 1
const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1;
while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) {
--contiguousIdx;
}
}
break;
}
}
++contiguousIdx;
// Compute the highest number of contiguous data for each Tensor
const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
// initialize strides to iterate through data because of broadcasting
std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx);
if (contiguousIdx > 0) {
stride_post0[contiguousIdx - 1] = 1;
stride_post1[contiguousIdx - 1] = 1;
for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) {
stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]);
stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]);
}
for (std::size_t i = 0; i != contiguousIdx; ++i) {
stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1;
stride_step1[i] = (dims1[i] == 1) ? 1 - stride_post1[i] : 1;
}
}
// variables for arrays offsets
std::size_t offsetIn0 = 0;
std::size_t offsetIn1 = 0;
std::size_t offsetOut = 0;
std::size_t dim = contiguousIdx - 1;
const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>());
for (std::size_t stack = 0; stack < nbStacks;) {
mul_contiguous_arrays<I1,I2,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
input_0 + offsetIn0*input0_contiguous_size,
input_1 + offsetIn1*input1_contiguous_size,
output + offsetOut*output_contiguous_size);
if (++stack < nbStacks) {
std::size_t tmp_stack = stack;
while(tmp_stack % outputDims[dim] == 0) {
tmp_stack /= outputDims[dim];
dim--;
}
offsetIn0 += stride_step0[dim];
offsetIn1 += stride_step1[dim];
++offsetOut;
dim = contiguousIdx - 1;
}
}
}
template <class I1, class I2, class O>
void MulImpl_cpu_backward_kernel(const std::size_t /*input0Length*/,
const std::size_t /*input1Length*/,
const std::size_t gradOutputLength,
const std::vector<std::size_t>& dims0,
const std::vector<std::size_t>& dims1,
const std::vector<std::size_t>& outputDims,
const void* input0_,
const void* input1_,
const void* grad_output_,
void* gradientInput0_,
void* gradientInput1_)
{
const I1* input0 = static_cast<const I1*>(input0_);
const I2* input1 = static_cast<const I2*>(input1_);
const O* grad_output = static_cast<const O*>(grad_output_);
auto* grad_input_0 = static_cast<I1*>(gradientInput0_);
auto* grad_input_1 = static_cast<I2*>(gradientInput1_);
// Broadcast dims0 and dims1 to match the shape of outputDims
auto broadcastedDims0 = getBroadcastedDims(outputDims, dims0);
auto broadcastedDims1 = getBroadcastedDims(outputDims, dims1);
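// For an elementwise product o = a * b, do/da = b and do/db = a, so each output gradient element
// is multiplied by the other input and accumulated (+=) into the gradient of the input location
// it was broadcast from.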
for (std::size_t i = 0; i < gradOutputLength; ++i) {
auto idxOutputGrad = getMultiDimIndices(outputDims, i);
std::vector<std::size_t> idxInput0(broadcastedDims0.size());
std::vector<std::size_t> idxInput1(broadcastedDims1.size());
// Map output indices to input0 indices, considering broadcasting
for (std::size_t dimension = 0; dimension < broadcastedDims0.size(); ++dimension) {
// If input0 is broadcasted along this dimension (== 1) or both dimensions are 1, index is 0.
// idxInput0 represent the multi dim index of input0 contributing
// to the output at index i.
idxInput0[dimension] = (broadcastedDims0[dimension] == 1) ? 0 : idxOutputGrad[dimension];
}
for (std::size_t dimension = 0; dimension < broadcastedDims1.size(); ++dimension) {
idxInput1[dimension] = (broadcastedDims1[dimension] == 1) ? 0 : idxOutputGrad[dimension];
}
// We have to access tensors with a flat index, hence the conversion
auto idx0 = getFlattenedIndex(broadcastedDims0, idxInput0);
auto idx1 = getFlattenedIndex(broadcastedDims1, idxInput1);
grad_input_0[idx0] += static_cast<I1>(grad_output[i] * input1[idx1]);
grad_input_1[idx1] += static_cast<I2>(grad_output[i] * input0[idx0]);
}
}
// Kernels registration to implementation entry point
REGISTRAR(MulImpl_cpu,
{DataType::Float32},
{ProdConso::inPlaceModel, Aidge::MulImpl_cpu_forward_kernel<float, float, float>, Aidge::MulImpl_cpu_backward_kernel<float, float, float>});
REGISTRAR(MulImpl_cpu,
{{{DataType::Float32}, {DataType::Float64}}, {DataType::Float32}},
{ProdConso::inPlaceModel, Aidge::MulImpl_cpu_forward_kernel<float, double, float>, Aidge::MulImpl_cpu_backward_kernel<float, double, float>});
REGISTRAR(MulImpl_cpu,
{DataType::Float64},
{ProdConso::inPlaceModel, Aidge::MulImpl_cpu_forward_kernel<double, double, double>, Aidge::MulImpl_cpu_backward_kernel<double, double, double>});
REGISTRAR(MulImpl_cpu,
{DataType::Int32},
{ProdConso::inPlaceModel, Aidge::MulImpl_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t>, Aidge::MulImpl_cpu_backward_kernel<std::int32_t, std::int32_t, std::int32_t>});
REGISTRAR(MulImpl_cpu,
{DataType::Int64},
{ProdConso::inPlaceModel, Aidge::MulImpl_cpu_forward_kernel<std::int64_t, std::int64_t, std::int64_t>, Aidge::MulImpl_cpu_backward_kernel<std::int64_t, std::int64_t, std::int64_t>});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_MULIMPL_KERNELS_H_ */
@@ -9,43 +9,44 @@
 *
 ********************************************************************************/
-#ifndef AIDGE_CPU_OPERATOR_PRODUCERIMPL_H_
-#define AIDGE_CPU_OPERATOR_PRODUCERIMPL_H_
+#ifndef AIDGE_CPU_OPERATOR_IMPL_H_
+#define AIDGE_CPU_OPERATOR_IMPL_H_
 #include <cstddef> // std::size_t
 #include <memory>
 #include <tuple> // std::tuple
 #include <vector>
 #include "aidge/backend/OperatorImpl.hpp"
-#include "aidge/operator/Producer.hpp"
 #include "aidge/utils/Registrar.hpp"
 #include "aidge/utils/Types.h"
 namespace Aidge {
-class ProducerImpl_cpu : public OperatorImpl {
-private:
-    const Producer_Op &mOp;
-public:
-    ProducerImpl_cpu(const Producer_Op &op) : mOp(op) {}
-    static std::unique_ptr<ProducerImpl_cpu> create(const Producer_Op &op) {
-        return std::make_unique<ProducerImpl_cpu>(op);
-    }
-public:
-    NbElts_t getNbRequiredData(const IOIndex_t inputIdx) const override final;
-    NbElts_t getNbRequiredProtected(const IOIndex_t inputIdx) const override final;
-    NbElts_t getRequiredMemory(__attribute__((unused)) const IOIndex_t outputIdx, __attribute__((unused)) const std::vector<DimSize_t> &inputsSize) const override final;
-    NbElts_t getNbConsumedData(const IOIndex_t inputIdx) const override final;
-    NbElts_t getNbProducedData(const IOIndex_t outputIdx) const override final;
-    void forward();
-    void backward();
-};
-namespace {
-static Registrar<Producer_Op> registrarProducer1DImpl_cpu("cpu", Aidge::ProducerImpl_cpu::create);
-} // namespace
+template <class Op, class FwdFunc, class BwdFunc = void()>
+class OperatorImpl_cpu : public OperatorImpl,
+    public Registrable<OperatorImpl_cpu<Op, FwdFunc, BwdFunc>, ImplSpec, Impl<FwdFunc, BwdFunc>>
+{
+public:
+    OperatorImpl_cpu(const Op& op) : OperatorImpl(op, "cpu") {}
+    static std::unique_ptr<OperatorImpl_cpu<Op, FwdFunc, BwdFunc>> create(const Op& op) {
+        return std::make_unique<OperatorImpl_cpu<Op, FwdFunc, BwdFunc>>(op);
+    }
+    virtual std::shared_ptr<ProdConso> getProdConso() const override {
+        const auto impl = Registrar<OperatorImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+        return impl.prodConso(mOp);
+    }
+    virtual std::vector<ImplSpec> getAvailableImplSpecs() const override {
+        // return Registrar<OperatorImpl_cpu>::getKeys(); // Note: cannot return set due to python binding
+        std::set<ImplSpec> implSpecsSet = Registrar<OperatorImpl_cpu>::getKeys();
+        return std::vector<ImplSpec>(implSpecsSet.begin(), implSpecsSet.end());
+    }
+    void forward() override;
+    void backward() override;
+};
 } // namespace Aidge
-#endif /* AIDGE_CPU_OPERATOR_PRODUCERIMPL_H_ */
\ No newline at end of file
+#endif /* AIDGE_CPU_OPERATOR_IMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_PADIMPL_H_
#define AIDGE_CPU_OPERATOR_PADIMPL_H_
#include <array>
#include <memory>
#include <tuple>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/Pad.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
namespace Aidge {
class Pad_ProdConso_cpu : public ProdConso {
public:
Pad_ProdConso_cpu(const Operator& op): ProdConso(op) {}
static std::unique_ptr<ProdConso> defaultModel(const Operator& op) {
return std::make_unique<Pad_ProdConso_cpu>(op);
}
Elts_t getNbRequiredProtected(const IOIndex_t inputIdx) const override final;
};
// Operator implementation entry point for the backend
using Pad1D_Op = Pad_Op<1>;
using PadImpl1D_cpu = OperatorImpl_cpu<Pad_Op<1>,
void(const std::array<DimSize_t, 2>&,
const PadBorderType,
const double,
const std::array<DimSize_t, 3> &,
const void *,
void *)>;
using Pad2D_Op = Pad_Op<2>;
using PadImpl2D_cpu = OperatorImpl_cpu<Pad_Op<2>,
void(const std::array<DimSize_t, 4>&,
const PadBorderType,
const double,
const std::array<DimSize_t, 4> &,
const void *,
void *)>;
// Implementation entry point registration to Operator
REGISTRAR(Pad1D_Op, "cpu", Aidge::PadImpl1D_cpu::create);
REGISTRAR(Pad2D_Op, "cpu", Aidge::PadImpl2D_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_PADIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_PADIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_PADIMPL_KERNELS_H_
#include <algorithm> // std::max, std::min
#include <array>
#include <cstddef> // std::size_t
#include <cstdint> // std::int32_t
#include "aidge/backend/cpu/operator/PadImpl.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
/**
* @brief Forward kernel for 1D Padding on CPU backend.
* @tparam I Input data type.
* @tparam O Output data type.
* @param beginEndBorders Array of begin and end padding sizes.
* @param borderType Type of padding (Constant, Edge, Reflect or Wrap).
* @param borderValue Value used when borderType is Constant.
* @param dims Array of input dimensions.
* @param input_ const input Tensor.
* @param output_ Output Tensor.
*/
template <class I, class O>
void PadImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 2>& beginEndBorders,
const PadBorderType borderType,
const double borderValue,
const std::array<DimSize_t, 3>& dims,
const void *input_,
void *output_)
{
const I *input = static_cast<const I *>(input_);
O *output = static_cast<O *>(output_);
const std::size_t oxSize = dims[2] + beginEndBorders[0] + beginEndBorders[1];
for (std::size_t batch = 0; batch < dims[0]; ++batch) {
for (std::size_t ch = 0; ch < dims[1]; ++ch) {
const std::size_t iIndex = (ch + batch*dims[1]) * dims[2];
const std::size_t oIndex = (ch + batch*dims[1]) * oxSize;
for (unsigned int ox = 0; ox < oxSize; ++ox) {
const std::size_t oIndexFull = oIndex + ox;
O outputValue = static_cast<O>(borderValue);
if (borderType == PadBorderType::Constant) {
int ix = static_cast<int>(ox) - static_cast<int>(beginEndBorders[0]);
if (ix >= 0 && ix < static_cast<int>(dims[2])) {
outputValue = input[iIndex + static_cast<std::size_t>(ix)];
}
}
else if (borderType == PadBorderType::Edge) {
int ix = std::max(0, std::min(static_cast<int>(dims[2]) - 1, static_cast<int>(ox) - static_cast<int>(beginEndBorders[0])));
outputValue = input[iIndex + static_cast<std::size_t>(ix)];
}
else if (borderType == PadBorderType::Reflect) {
int ix = static_cast<int>(ox) - static_cast<int>(beginEndBorders[0]);
if (ix < 0)
ix = 0 - ix;
if (ix >= static_cast<int>(dims[2]))
ix = static_cast<int>(dims[2]) - ix;
outputValue = input[iIndex + static_cast<std::size_t>(ix)];
}
else if (borderType == PadBorderType::Wrap) {
int ix = (static_cast<int>(dims[2]) + static_cast<int>(ox) - static_cast<int>(beginEndBorders[0])) % static_cast<int>(dims[2]);
outputValue = input[iIndex + static_cast<std::size_t>(ix)];
}
output[oIndexFull] = outputValue;
}
}
}
}
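// For instance (illustrative), padding the 1D input [1, 2, 3] with beginEndBorders = {2, 1},
// PadBorderType::Constant and borderValue = 0 produces [0, 0, 1, 2, 3, 0]; with
// PadBorderType::Edge the same borders produce [1, 1, 1, 2, 3, 3].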
// Kernels registration to implementation entry point
REGISTRAR(PadImpl1D_cpu,
{{DataType::Float32, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}},
{Pad_ProdConso_cpu::defaultModel, Aidge::PadImpl1D_cpu_forward_kernel<cpptype_t<DataType::Float32>, cpptype_t<DataType::Float32>>, nullptr});
REGISTRAR(PadImpl1D_cpu,
{{DataType::Float64, DataFormat::NCHW}, {DataType::Float64, DataFormat::NCHW}},
{Pad_ProdConso_cpu::defaultModel, Aidge::PadImpl1D_cpu_forward_kernel<cpptype_t<DataType::Float64>, cpptype_t<DataType::Float64>>, nullptr});
REGISTRAR(PadImpl1D_cpu,
{{DataType::Int32, DataFormat::NCHW}, {DataType::Int32, DataFormat::NCHW}},
{Pad_ProdConso_cpu::defaultModel, Aidge::PadImpl1D_cpu_forward_kernel<cpptype_t<DataType::Int32>, cpptype_t<DataType::Int32>>, nullptr});
/**
* @brief Forward kernel for 2D Padding on CPU backend.
* @tparam I Input data type.
* @tparam O Output data type.
* @param beginEndBorders Array of begin and end padding sizes.
* @param borderType Type of padding (Constant, Edge, Reflect or Wrap).
* @param borderValue Value used when borderType is Constant.
* @param dims Array of input dimensions.
* @param input_ const input Tensor.
* @param output_ Output Tensor.
*/
template <class I, class O>
void PadImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 4>& beginEndBorders,
const PadBorderType borderType,
const double borderValue,
const std::array<DimSize_t, 4> &dims,
const void *input_,
void *output_)
{
const I *input = static_cast<const I *>(input_);
O *output = static_cast<O *>(output_);
const std::size_t oySize = dims[2] + beginEndBorders[0] + beginEndBorders[2];
const std::size_t oxSize = dims[3] + beginEndBorders[1] + beginEndBorders[3];
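// beginEndBorders is laid out as {beginH, beginW, endH, endW}: indices 0/2 pad dims[2] (H)
// and indices 1/3 pad dims[3] (W), as used in the size computations above.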
for (std::size_t batch = 0; batch < dims[0]; ++batch) {
for (std::size_t ch = 0; ch < dims[1]; ++ch) {
const std::size_t iIndex = (ch + batch*dims[1]) * dims[2] * dims[3];
const std::size_t oIndex = (ch + batch*dims[1]) * oxSize * oySize;
for (std::uint32_t oy = 0; oy < oySize; ++oy) {
for (std::uint32_t ox = 0; ox < oxSize; ++ox) {
const std::size_t oIndexFull = oIndex + oy*oxSize + ox;
O outputValue = static_cast<O>(borderValue);
if (borderType == PadBorderType::Constant) {
std::int32_t ix = static_cast<std::int32_t>(ox) - static_cast<std::int32_t>(beginEndBorders[1]);
std::int32_t iy = static_cast<std::int32_t>(oy) - static_cast<std::int32_t>(beginEndBorders[0]);
if (ix >= 0 && ix < static_cast<std::int32_t>(dims[3]) && iy >= 0 && iy < static_cast<std::int32_t>(dims[2])) {
outputValue = input[iIndex + static_cast<std::size_t>(iy)*dims[3] + static_cast<std::size_t>(ix)];
}
}
else if (borderType == PadBorderType::Edge) {
std::int32_t ix = std::max(0, std::min(static_cast<std::int32_t>(dims[3]) - 1, static_cast<std::int32_t>(ox) - static_cast<std::int32_t>(beginEndBorders[1])));
std::int32_t iy = std::max(0, std::min(static_cast<std::int32_t>(dims[2]) - 1, static_cast<std::int32_t>(oy) - static_cast<std::int32_t>(beginEndBorders[0])));
outputValue = input[iIndex + static_cast<std::size_t>(iy)*dims[3] + static_cast<std::size_t>(ix)];
}
else if (borderType == PadBorderType::Reflect) {
std::int32_t ix = static_cast<std::int32_t>(ox) - static_cast<std::int32_t>(beginEndBorders[1]);
std::int32_t iy = static_cast<std::int32_t>(oy) - static_cast<std::int32_t>(beginEndBorders[0]);
if (ix < 0)
ix = 0 - ix;
if (iy < 0)
iy = 0 - iy;
if (ix >= static_cast<std::int32_t>(dims[3]))
ix = static_cast<std::int32_t>(dims[3]) - ix;
if (iy >= static_cast<std::int32_t>(dims[2]))
iy = static_cast<std::int32_t>(dims[2]) - iy;
outputValue = input[iIndex + static_cast<std::size_t>(iy)*dims[3] + static_cast<std::size_t>(ix)];
}
else if (borderType == PadBorderType::Wrap) {
std::int32_t ix = (static_cast<std::int32_t>(dims[3]) + static_cast<std::int32_t>(ox) - static_cast<std::int32_t>(beginEndBorders[1])) % static_cast<std::int32_t>(dims[3]);
std::int32_t iy = (static_cast<std::int32_t>(dims[2]) + static_cast<std::int32_t>(oy) - static_cast<std::int32_t>(beginEndBorders[0])) % static_cast<std::int32_t>(dims[2]);
outputValue = input[iIndex + static_cast<std::size_t>(iy)*dims[3] + static_cast<std::size_t>(ix)];
}
output[oIndexFull] = outputValue;
}
}
}
}
}
// Kernels registration to implementation entry point
REGISTRAR(PadImpl2D_cpu,
{{DataType::Float32, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}},
{Pad_ProdConso_cpu::defaultModel, Aidge::PadImpl2D_cpu_forward_kernel<cpptype_t<DataType::Float32>, cpptype_t<DataType::Float32>>, nullptr});
REGISTRAR(PadImpl2D_cpu,
{{DataType::Float64, DataFormat::NCHW}, {DataType::Float64, DataFormat::NCHW}},
{Pad_ProdConso_cpu::defaultModel, Aidge::PadImpl2D_cpu_forward_kernel<cpptype_t<DataType::Float64>, cpptype_t<DataType::Float64>>, nullptr});
REGISTRAR(PadImpl2D_cpu,
{{DataType::Int32, DataFormat::NCHW}, {DataType::Int32, DataFormat::NCHW}},
{Pad_ProdConso_cpu::defaultModel, Aidge::PadImpl2D_cpu_forward_kernel<cpptype_t<DataType::Int32>, cpptype_t<DataType::Int32>>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_PADIMPL_KERNELS_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_PADDEDCONVIMPL_H_
#define AIDGE_CPU_OPERATOR_PADDEDCONVIMPL_H_
#include <array>
#include <memory>
#include <tuple>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/MetaOperatorDefs.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
// Operator implementation entry point for the backend
using PaddedConv1D_Op = MetaOperator_Op;
using PaddedConvImpl1D_cpu = OperatorImpl_cpu<MetaOperator_Op,
void(const std::array<DimSize_t, 2>&,
const std::array<DimSize_t, 1>&,
const std::array<DimSize_t, 1>&,
const std::array<DimSize_t, 1>&,
const std::array<DimSize_t, 3> &,
DimSize_t,
const void *,
const void *,
const void *,
void *)>;
using PaddedConv2D_Op = MetaOperator_Op;
using PaddedConvImpl2D_cpu = OperatorImpl_cpu<MetaOperator_Op,
void(const std::array<DimSize_t, 4>&,
const std::array<DimSize_t, 2>&,
const std::array<DimSize_t, 2>&,
const std::array<DimSize_t, 2>&,
const std::array<DimSize_t, 4> &,
DimSize_t,
const void *,
const void *,
const void *,
void *)>;
// Implementation entry point registration to Operator
// Uncomment to activate the implementation for PaddedConv. It is currently less efficient, which is why it is commented out.
// REGISTRAR(PaddedConv1D_Op, std::array<std::string, 2>({"cpu", "PaddedConv1D"}), Aidge::PaddedConvImpl1D_cpu::create);
// REGISTRAR(PaddedConv2D_Op, std::array<std::string, 2>({"cpu", "PaddedConv2D"}), Aidge::PaddedConvImpl2D_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_PADDEDCONVIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_PADDEDCONVIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_PADDEDCONVIMPL_KERNELS_H_
#include <array>
#include <cstddef>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/backend/cpu/operator/PaddedConvImpl.hpp"
#include "aidge/operator/Pad.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
// Only works for constant padding zero
/**
* @brief Forward kernel for 1D padded Convolution on CPU backend.
* @tparam I Input data type.
* @tparam W Weight data type.
* @tparam B Bias data type.
* @tparam O Output data type.
* @param beginEndBorders Array of begin and end padding sizes.
* @param strideDims Array of stride dimensions.
* @param dilationDims Array of dilation factors.
* @param kernelDims Array of kernel dimensions.
* @param inputDims Array of input dimensions.
* @param outChannels Number of output channels.
* @param input_ const input Tensor.
* @param weights_ const weight Tensor.
* @param biases_ const bias Tensor.
* @param output_ Output Tensor.
*/
template <class I, class W, class B, class O>
void PaddedConvImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 2>& beginEndBorders,
const std::array<DimSize_t, 1>& strideDims,
const std::array<DimSize_t, 1>& dilationDims,
const std::array<DimSize_t, 1>& kernelDims,
const std::array<DimSize_t, 3>& inputDims,
DimSize_t outChannels,
const void *input_,
const void *weights_,
const void *biases_,
void *output_)
{
// FIXME: missing convolution attributes as arguments
const I *input = static_cast<const I *>(input_);
const W *weights = static_cast<const W *>(weights_);
const B *biases = static_cast<const B *>(biases_);
O *output = static_cast<O *>(output_);
// output H size
const DimSize_t dilated_kernel_x = dilationDims[0]*(kernelDims[0] - 1) + 1;
const std::size_t oxSize =
static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - dilated_kernel_x + strideDims[0]) /
static_cast<float>(strideDims[0])));
// TODO: kernel computation
// output (batch, outCh, Xout, Yout)
// input (batch, inCh, Xin, Yin)
// weight (outCh, inCh, kernelX, kernelY)
// does not take Dilation attribute into account
using signedsize = std::make_signed<std::size_t>::type;
for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
const std::size_t oIndex = (outCh + batch*outChannels) * oxSize;
// If bias = nullptr, set B(0)
B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
std::fill(output + oIndex, output+(oIndex+oxSize), biasVal);
for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
const std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2];
const std::size_t wIndex = (inCh + outCh*inputDims[1]) * kernelDims[0];
for (std::size_t ox = 0; ox < oxSize; ++ox) {
const signedsize difx = static_cast<signedsize>(ox * strideDims[0]);
const std::size_t sxMin = static_cast<std::size_t>(std::max(static_cast<signedsize>(beginEndBorders[0]) - difx, signedsize(0)));
const std::size_t sxMax = (static_cast<signedsize>(inputDims[2]) + static_cast<signedsize>(beginEndBorders[1]) - difx) < 0 ? 0 : ((inputDims[2] + difx) > kernelDims[0] ? kernelDims[0] : inputDims[2] + difx);
const std::size_t oIndexFull = oIndex + ox;
const signedsize ix = static_cast<signedsize>(ox * strideDims[0]) - static_cast<signedsize>(beginEndBorders[0]);
for (std::size_t sx = sxMin; sx*dilationDims[0] < sxMax; ++sx) {
output[oIndexFull] += weights[wIndex + sx] *
input[iIndex + static_cast<std::size_t>(ix+static_cast<signedsize>(sx*dilationDims[0]))];
}
}
}
}
}
}
// Kernels registration to implementation entry point
REGISTRAR(PaddedConvImpl1D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}, DynamicAttributes(std::map<std::string, future_std::any>({std::make_pair("type", future_std::any(std::string("PaddedConv1D")))}))},
{ProdConso::inPlaceModel, Aidge::PaddedConvImpl1D_cpu_forward_kernel<float, float, float, float>, nullptr});
REGISTRAR(PaddedConvImpl1D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Float16, DataFormat::NCHW}, DynamicAttributes(std::map<std::string, future_std::any>({std::make_pair("type", future_std::any(std::string("PaddedConv1D")))}))},
{ProdConso::inPlaceModel, Aidge::PaddedConvImpl1D_cpu_forward_kernel<half_float::half, half_float::half, half_float::half, half_float::half>, nullptr});
REGISTRAR(PaddedConvImpl1D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Int32, DataFormat::NCHW}, DynamicAttributes(std::map<std::string, future_std::any>({std::make_pair("type", future_std::any(std::string("PaddedConv1D")))}))},
{ProdConso::inPlaceModel, Aidge::PaddedConvImpl1D_cpu_forward_kernel<int32_t, int32_t, int32_t, int32_t>, nullptr});
REGISTRAR(PaddedConvImpl1D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Float64, DataFormat::NCHW}, DynamicAttributes(std::map<std::string, future_std::any>({std::make_pair("type", future_std::any(std::string("PaddedConv1D")))}))},
{ProdConso::inPlaceModel, Aidge::PaddedConvImpl1D_cpu_forward_kernel<double, double, double, double>, nullptr});
/**
* @brief Forward kernel for 2D padded Convolution on CPU backend.
* @tparam I Input data type.
* @tparam W Weight data type.
* @tparam B Bias data type.
* @tparam O Output data type.
* @param beginEndBorders Array of begin and end padding sizes.
* @param strideDims Array of stride dimensions.
* @param dilationDims Array of dilation factors.
* @param kernelDims Array of kernel dimensions.
* @param inputDims Array of input dimensions.
* @param outChannels Number of output channels.
* @param input_ const input Tensor.
* @param weights_ const weight Tensor.
* @param biases_ const bias Tensor.
* @param output_ Output Tensor.
*/
template <class I, class W, class B, class O>
void PaddedConvImpl2D_cpu_forward_kernel(
const std::array<DimSize_t, 4>& beginEndBorders,
const std::array<DimSize_t, 2>& strideDims,
const std::array<DimSize_t, 2>& dilationDims,
const std::array<DimSize_t, 2>& kernelDims,
const std::array<DimSize_t, 4> &inputDims,
DimSize_t outChannels,
const void *input_,
const void *weights_,
const void *biases_,
void *output_)
{
// FIXME: missing convolution attributes as arguments
const I *input = static_cast<const I *>(input_);
const W *weights = static_cast<const W *>(weights_);
const B *biases = static_cast<const B *>(biases_);
O *output = static_cast<O *>(output_);
// output H size
const DimSize_t dilated_kernel_x = dilationDims[0]*(kernelDims[0] - 1) + 1;
const std::size_t oxSize =
static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - dilated_kernel_x + beginEndBorders[0] + beginEndBorders[2] + strideDims[0]) /
static_cast<float>(strideDims[0])));
// output W size
const DimSize_t dilated_kernel_y = dilationDims[1]*(kernelDims[1] - 1) + 1;
const std::size_t oySize =
static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[3] - dilated_kernel_y + beginEndBorders[1] + beginEndBorders[3] + strideDims[1]) /
static_cast<float>(strideDims[1])));
for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
const std::size_t oIndex = (outCh + batch*outChannels) * oxSize * oySize;
// If bias = nullptr, set B(0)
B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
std::fill(output + oIndex, output+(oIndex+oxSize*oySize), biasVal);
for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
const std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3];
const std::size_t wIndex = (inCh + outCh*inputDims[1]) * kernelDims[0] * kernelDims[1];
for (std::size_t ox = 0; ox < oxSize; ++ox) {
const std::size_t difx = ox * strideDims[0];
const std::size_t sxMin = beginEndBorders[0] < difx ? std::size_t(0) : beginEndBorders[0] - difx;
const std::size_t sxMax = (inputDims[2] + beginEndBorders[2]) < difx ?
0 :
((inputDims[2] + beginEndBorders[2]) > dilated_kernel_x + difx ?
dilated_kernel_x :
(inputDims[2] + beginEndBorders[2] - difx));
for (std::size_t oy = 0; oy < oySize; ++oy) {
const std::size_t dify = oy * strideDims[1];
const std::size_t syMin = beginEndBorders[1] < dify ? std::size_t(0) : beginEndBorders[1] - dify;
const std::size_t syMax = (inputDims[3] + beginEndBorders[3]) < dify ?
0 :
((inputDims[3] + beginEndBorders[3]) > dilated_kernel_y + dify ?
dilated_kernel_y :
(inputDims[3] + beginEndBorders[3] - dify));
const std::size_t oIndexFull = oIndex + ox*oySize + oy;
const std::size_t ix = ox * strideDims[0] - beginEndBorders[0];
const std::size_t iy = oy * strideDims[1] - beginEndBorders[1];
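// Fast path: when the clamped kernel bounds span a full 3x3 window (i.e. a 3x3 kernel with
// dilation 1 whose window is not clipped), the accumulation is fully unrolled; any other
// configuration falls back to the generic dilated loop below.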
if (sxMin == 0 && syMin == 0 && sxMax == 3 && syMax == 3) {
output[oIndexFull] += (weights[wIndex + 0*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+0)] +
weights[wIndex + 0*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+1)] +
weights[wIndex + 0*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+2)] +
weights[wIndex + 1*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+0)] +
weights[wIndex + 1*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+1)] +
weights[wIndex + 1*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+2)] +
weights[wIndex + 2*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+0)] +
weights[wIndex + 2*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+1)] +
weights[wIndex + 2*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+2)]);
} else {
for (std::size_t sx = sxMin; sx*dilationDims[0] < sxMax; ++sx) {
for (std::size_t sy = syMin; sy*dilationDims[1] < syMax; ++sy) {
output[oIndexFull] += weights[wIndex + sx*kernelDims[1] + sy] *
input[iIndex + (sx*dilationDims[0] + ix)*inputDims[3] + sy*dilationDims[1] + iy];
}
}
}
}
}
}
}
}
}
// Kernels registration to implementation entry point
REGISTRAR(PaddedConvImpl2D_cpu,
// ImplSpec{std::vector<ImplSpec::IOSpec>({ImplSpec::IOSpec{DataType::Any, DataFormat::NCHW}, ImplSpec::IOSpec{DataType::Any, DataFormat::NCHW}}) , std::vector<ImplSpec::IOSpec>({ImplSpec::IOSpec{DataType::Int32, DataFormat::NCHW}})},
{{DataType::Any, DataFormat::NCHW}, {DataType::Int32, DataFormat::NCHW}, DynamicAttributes(std::map<std::string, future_std::any>({std::make_pair("type", future_std::any(std::string("PaddedConv2D")))}))},
{ProdConso::inPlaceModel, Aidge::PaddedConvImpl2D_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t, std::int32_t>, nullptr});
REGISTRAR(PaddedConvImpl2D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Float16, DataFormat::NCHW}, DynamicAttributes(std::map<std::string, future_std::any>({std::make_pair("type", future_std::any(std::string("PaddedConv2D")))}))},
{ProdConso::inPlaceModel, Aidge::PaddedConvImpl2D_cpu_forward_kernel<half_float::half, half_float::half, half_float::half, half_float::half>, nullptr});
REGISTRAR(PaddedConvImpl2D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}, DynamicAttributes(std::map<std::string, future_std::any>({std::make_pair("type", future_std::any(std::string("PaddedConv2D")))}))},
{ProdConso::inPlaceModel, Aidge::PaddedConvImpl2D_cpu_forward_kernel<float, float, float, float>, nullptr});
REGISTRAR(PaddedConvImpl2D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Float64, DataFormat::NCHW}, DynamicAttributes(std::map<std::string, future_std::any>({std::make_pair("type", future_std::any(std::string("PaddedConv2D")))}))},
{ProdConso::inPlaceModel, Aidge::PaddedConvImpl2D_cpu_forward_kernel<double, double, double, double>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_PADDEDCONVIMPL_KERNELS_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_POWIMPL_H_
#define AIDGE_CPU_OPERATOR_POWIMPL_H_
#include <cstddef> // std::size_t
#include <memory> // std::unique_ptr, std::make_unique
#include <string>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/Pow.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
// Operator implementation entry point for the backend
using PowImpl_cpu = OperatorImpl_cpu<Pow_Op,
void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*, void*),
void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*, const void*, void*, void*)>;
// Implementation entry point registration to Operator
REGISTRAR(Pow_Op, "cpu", Aidge::PowImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_POWIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_POWIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_POWIMPL_KERNELS_H_
#include "aidge/utils/Registrar.hpp"
#include <cstddef> // std::size_t
#include "aidge/backend/cpu/data/Broadcasting.hpp"
#include "aidge/backend/cpu/operator/PowImpl.hpp"
namespace Aidge {
namespace {
// suppose values are contiguous in memory
template <class I, class O>
void pow_contiguous_arrays(const std::size_t input1size,
const std::size_t input2size,
const std::size_t output1size,
const I* input1,
const I* input2,
O* output)
{
for (std::size_t i = 0; i < output1size; ++i)
{
const std::size_t in1_id = (input1size != 1) ? i : 0;
const std::size_t in2_id = (input2size != 1) ? i : 0;
output[i] = static_cast<O>(std::pow(input1[in1_id], input2[in2_id]));
}
}
}
template <class I, class O>
void PowImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
std::vector<std::size_t> dims1,
const std::vector<std::size_t>& outputDims,
const void* input0_,
const void* input1_,
void* output_) {
const I* input_0 = static_cast<const I*>(input0_);
const I* input_1 = static_cast<const I*>(input1_);
O* output = static_cast<O*>(output_);
// [5,2,1,7] & [2,6,7]
// 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7]
// 2. Find the highest equal dimension -> 3
// Exception: if the first diverging dimension is the last one, then -> 4 (dims.size())
// 3. Compute the highest number of contiguous data -> 7
// 4. Compute stride and offset step for the broadcast mechanism
// 5. Call a simple kernel
// special case for equal dimensions, the kernel is called with the entire arrays at once
if (dims0 == dims1) {
const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
for (std::size_t i = 0; i < input0_contiguous_size; ++i)
{
output[i] = static_cast<O>(std::pow(input_0[i], input_1[i]));
}
return;
}
// set dimensions to be of equal size by filling the smallest one with ones.
if (dims0.size() > dims1.size()) {
dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1));
}
else if (dims1.size() > dims0.size()) {
dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1));
}
const std::size_t nbDims = dims0.size();
// Find the highest equal dimension
// std::size_t contiguousIdx = nbDims - 1;
std::size_t contiguousIdx = nbDims;
while (contiguousIdx-- > 0) {
// for (; contiguousIdx+1 > 0; --contiguousIdx) {
if (dims0[contiguousIdx] != dims1[contiguousIdx]) {
if (contiguousIdx == (nbDims -1)) { // last dimensions of one of the input Tensor are of size 1
const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1;
while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) {
--contiguousIdx;
}
}
break;
}
}
++contiguousIdx;
// Compute the highest number of contiguous data for each Tensor
const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
// initialize strides to iterate through data because of broadcasting
std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx);
if (contiguousIdx > 0) {
stride_post0[contiguousIdx - 1] = 1;
stride_post1[contiguousIdx - 1] = 1;
for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) {
stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]);
stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]);
}
for (std::size_t i = 0; i != contiguousIdx; ++i) {
stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1;
stride_step1[i] = (dims1[i] == 1) ? 1 - stride_post1[i] : 1;
}
}
// variables for arrays offsets
std::size_t offsetIn0 = 0;
std::size_t offsetIn1 = 0;
std::size_t offsetOut = 0;
std::size_t dim = contiguousIdx - 1;
const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>());
for (std::size_t stack = 0; stack < nbStacks;) {
pow_contiguous_arrays<I,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
input_0 + offsetIn0*input0_contiguous_size,
input_1 + offsetIn1*input1_contiguous_size,
output + offsetOut*output_contiguous_size);
if (++stack < nbStacks) {
std::size_t tmp_stack = stack;
while(tmp_stack % outputDims[dim] == 0) {
tmp_stack /= outputDims[dim];
dim--;
}
offsetIn0 += stride_step0[dim];
offsetIn1 += stride_step1[dim];
++offsetOut;
dim = contiguousIdx - 1;
}
}
}
template <class I1, class I2, class O>
void PowImpl_cpu_backward_kernel(const std::vector<std::size_t>& input0Dims,
const std::vector<std::size_t>& input1Dims,
const std::vector<std::size_t>& outputDims,
const void* input0_,
const void* input1_,
const void* gradOutput_,
void* gradientInput0_,
void* gradientInput1_) {
const I1* input0 = static_cast<const I1*>(input0_);
I1* grad0 = static_cast<I1*>(gradientInput0_);
const I2* input1 = static_cast<const I2*>(input1_);
I2* grad1 = static_cast<I2*>(gradientInput1_);
const O* gradOut = static_cast<const O*>(gradOutput_);
std::size_t totalElements = std::accumulate(outputDims.cbegin(), outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
for (size_t oIndex = 0; oIndex < totalElements; ++oIndex)
{
// Compute indexes in inputs 0 and 1 to support broadcasting
std::vector<std::size_t> indexes = getMultiDimIndices(outputDims, oIndex);
std::size_t idx0 = getFlattenedIndex(input0Dims, indexes);
std::size_t idx1 = getFlattenedIndex(input1Dims, indexes);
// grad0 = grad_output * (input1 * pow(input0, (input1 -1)))
grad0[idx0] += gradOut[oIndex]*input1[idx1]* std::pow(input0[idx0], input1[idx1]-1);
// grad1 = grad_output * (output * ln(input0))
grad1[idx1] += gradOut[oIndex] * std::pow(input0[idx0], input1[idx1]) * std::log(input0[idx0]);
}
}
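// Illustration only (not part of the library): exercising the backward kernel above with a
// broadcast exponent of shape {1, 3} against an input of shape {2, 3}.
//
//   std::vector<std::size_t> dims0{2, 3}, dims1{1, 3}, outDims{2, 3};
//   std::vector<float> in0{1.f, 2.f, 3.f, 4.f, 5.f, 6.f}, in1{2.f, 2.f, 2.f};
//   std::vector<float> gradOut(6, 1.f), grad0(6, 0.f), grad1(3, 0.f);  // grads zero-initialized
//   PowImpl_cpu_backward_kernel<float, float, float>(dims0, dims1, outDims,
//       in0.data(), in1.data(), gradOut.data(), grad0.data(), grad1.data());
//   // grad0[i] == in1 * in0[i]^(in1 - 1) == 2 * in0[i];
//   // grad1[j] accumulates gradOut * in0^in1 * ln(in0) over the broadcast (first) axis.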
// Kernels registration to implementation entry point
REGISTRAR(PowImpl_cpu,
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float32}},
{ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<float, float>, Aidge::PowImpl_cpu_backward_kernel<float, float, float>});
REGISTRAR(PowImpl_cpu,
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float64}},
{ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<double, double>, Aidge::PowImpl_cpu_backward_kernel<double, double, double>});
REGISTRAR(PowImpl_cpu,
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int32}},
{ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<int32_t, int32_t>, Aidge::PowImpl_cpu_backward_kernel<int32_t, int32_t, int32_t>});
REGISTRAR(PowImpl_cpu,
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int64}},
{ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, Aidge::PowImpl_cpu_backward_kernel<std::int64_t, std::int64_t, std::int64_t>});
REGISTRAR(PowImpl_cpu,
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int8}},
{ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<std::int8_t, std::int8_t>, Aidge::PowImpl_cpu_backward_kernel<std::int8_t, std::int8_t, std::int8_t>});
REGISTRAR(PowImpl_cpu,
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::UInt8}},
{ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<std::uint8_t, std::uint8_t>, Aidge::PowImpl_cpu_backward_kernel<std::uint8_t, std::uint8_t, std::uint8_t>});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_POWIMPL_KERNELS_H_ */
#ifndef AIDGE_CPU_OPERATOR_RELUIMPL_H_
#define AIDGE_CPU_OPERATOR_RELUIMPL_H_
#include "aidge/backend/OperatorImpl.hpp"
#include <cstddef> // std::size_t
#include <memory>
#include <tuple> // std::tuple
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/ReLU.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include <memory>
#include <vector>
namespace Aidge {
// Operator implementation entry point for the backend
using ReLUImpl_cpu = OperatorImpl_cpu<ReLU_Op,
    void(const std::size_t, const void*, void*),
    void(const std::size_t, const void*, const void*, void*)>;
// Implementation entry point registration to Operator
REGISTRAR(ReLU_Op, "cpu", Aidge::ReLUImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_RELUIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_RELUIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_RELUIMPL_KERNELS_H_
#include <cstddef> // std::size_t
#include <memory>
#include <tuple> // std::tuple
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/backend/cpu/operator/ReLUImpl.hpp"
#include "aidge/operator/ReLU.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
// Kernels
template <class I, class O>
void ReLUImpl_cpu_forward_kernel(std::size_t inputLength,
const void* input_,
void* output_) {
const I* input = static_cast<const I*>(input_);
O* output = static_cast<O*>(output_);
//#pragma omp parallel for if (inputLength > 1024)
for (std::size_t i = 0; i < inputLength; ++i) {
output[i] = (input[i] > 0) ? input[i] : 0;
}
}
template <class I, class GI, class GO>
void ReLUImpl_cpu_backward_kernel(const std::size_t inputLength,
const void* input_, const void* grad_output_,
void* grad_input_) {
const I* input = static_cast<const I*>(input_);
const GO* grad_output = static_cast<const GO*>(grad_output_);
GI* grad_input = static_cast<GI*>(grad_input_);
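// d(ReLU)/dx is 1 where the forward input was strictly positive and 0 elsewhere:
// the incoming gradient is masked accordingly and accumulated into grad_input.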
for (std::size_t i = 0; i < inputLength; ++i) {
grad_input[i] += (input[i] > 0) ? grad_output[i] : 0;
}
}
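// Illustration only (not part of the library): applying the forward kernel to a small buffer.
//
//   std::vector<float> in{-1.f, 0.f, 2.f}, out(3);
//   ReLUImpl_cpu_forward_kernel<float, float>(in.size(), in.data(), out.data());
//   // out == {0.f, 0.f, 2.f}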
// Kernels registration to implementation entry point
REGISTRAR(ReLUImpl_cpu,
{DataType::Float32},
{ProdConso::inPlaceModel, Aidge::ReLUImpl_cpu_forward_kernel<float, float>, Aidge::ReLUImpl_cpu_backward_kernel<float, float, float>});
REGISTRAR(ReLUImpl_cpu,
{DataType::Float64},
{ProdConso::inPlaceModel, Aidge::ReLUImpl_cpu_forward_kernel<double, double>, Aidge::ReLUImpl_cpu_backward_kernel<double, double, double>});
REGISTRAR(ReLUImpl_cpu,
{DataType::Int32},
{ProdConso::inPlaceModel, Aidge::ReLUImpl_cpu_forward_kernel<int32_t, int32_t>, nullptr});
REGISTRAR(ReLUImpl_cpu,
{DataType::Int8},
{ProdConso::inPlaceModel, Aidge::ReLUImpl_cpu_forward_kernel<int8_t, int8_t>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_RELUIMPL_KERNELS_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_REDUCEMEANIMPL_H_
#define AIDGE_CPU_OPERATOR_REDUCEMEANIMPL_H_
#include <memory>
#include <tuple>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/ReduceMean.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
// Operator implementation entry point for the backend
using ReduceMeanImpl_cpu = OperatorImpl_cpu<ReduceMean_Op,
void(const std::vector<std::int32_t>&,
DimSize_t,
const std::vector<DimSize_t>&,
const void *,
void *)>;
// Implementation entry point registration to Operator
REGISTRAR(ReduceMean_Op, "cpu", Aidge::ReduceMeanImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_REDUCEMEANIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_REDUCEMEANIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_REDUCEMEANIMPL_KERNELS_H_
#include <algorithm> // std::for_each
#include <cstddef> // std::size_t
#include <cstdint> // std::int32_t
#include <functional> //std::multiplies
#include <numeric> //std::accumulate
#include <vector>
#include "aidge/backend/cpu/operator/ReduceMeanImpl.hpp"
#include "aidge/data/Data.hpp"
#include "aidge/operator/ReduceMean.hpp"
#include "aidge/utils/Registrar.hpp"
namespace Aidge {
template <typename T>
using Acc_T = typename std::conditional_t<std::is_floating_point<T>::value, T, double>;
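// Incremental (running) mean: m_{i+1} = m_i + (x_{i+1} - m_i) / (i + 1), evaluated with std::fma
// to limit rounding error and avoid the large intermediate values of a naive running sum.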
template <typename T>
typename std::enable_if<std::is_floating_point<T>::value, T>::type
stableMean(const T* vec, std::size_t len, std::size_t stride) {
T mean = 0;
for (std::size_t i = 0; i < len; ++i) {
mean = std::fma(vec[i * stride] - mean, static_cast<T>(1) / static_cast<T>(i + 1), mean);
}
return mean;
}
// Specialization for integers: perform the mean computation in float
template <typename T>
typename std::enable_if_t<!std::is_floating_point<T>::value, double>
stableMean(const T* vec, std::size_t len, std::size_t stride) {
double mean = 0;
for (size_t i = 0; i < len; ++i) {
mean = std::fma<double>(static_cast<double>(vec[i * stride]) - mean, 1.0 / static_cast<double>(i + 1), mean);
}
return mean;
}
template <typename T>
typename std::enable_if_t<std::is_floating_point<T>::value, T>
castFromFloat(T value) {
return value;
}
template <typename T>
typename std::enable_if_t<!std::is_floating_point<T>::value, T>
castFromFloat(double value) {
return static_cast<T>(std::nearbyint(value));
}
template <class I, class O>
void ReduceMeanImpl_cpu_forward_kernel(const std::vector<std::int32_t>& axes,
DimSize_t /*keepDims*/,
const std::vector<DimSize_t>& inputDims,
const void* input_,
void* output_) {
const I* input = static_cast<const I*>(input_);
O* output = static_cast<O*>(output_);
const std::size_t nb_dims = inputDims.size();
const std::size_t totalElements = std::accumulate(inputDims.cbegin(), inputDims.cend(), 1, std::multiplies<std::size_t>());
if (axes.empty()){
std::copy_n(input,totalElements, output);
}
else if (axes.size() == 1) {
const std::size_t stride_pre = std::accumulate(inputDims.cbegin(), inputDims.cbegin() + axes[0], 1, std::multiplies<std::size_t>());
const std::size_t stride_post = std::accumulate(inputDims.crbegin(), inputDims.crbegin() + nb_dims -1 - axes[0], 1, std::multiplies<std::size_t>());
const std::size_t dim_i = inputDims[axes[0]];
for (std::size_t pre = 0; pre < stride_pre; ++pre) {
for (std::size_t post = 0; post < stride_post; ++post) {
const std::size_t idx_i = pre * dim_i * stride_post + post;
const std::size_t idx_o = pre * stride_post + post;
output[idx_o] = castFromFloat<O>(stableMean(input + idx_i, dim_i, stride_post));
}
}
} else {
std::size_t outputElements = totalElements;
auto stride_post = std::unique_ptr<std::size_t[]>(new std::size_t[nb_dims]);
stride_post[nb_dims - 1] = 1;
for (std::size_t i = nb_dims-2; i != static_cast<std::size_t>(-1); --i) {
stride_post[i] = stride_post[i+1]*inputDims[i+1];
}
auto stride_pre = std::unique_ptr<std::size_t[]>(new std::size_t[nb_dims]);
stride_pre[0] = 1;
for (std::size_t i = 1; i < nb_dims; ++i) {
stride_pre[i] = stride_pre[i-1]*inputDims[i-1];
}
// Type should be the return type of stableMean<I>(), which is always floating point
const Acc_T<I>* inputAccumulation = nullptr;
Acc_T<I>* outputAccumulation = nullptr;
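// Reduce one axis per pass: each pass averages along axis 'a' into a freshly allocated buffer,
// which becomes the input of the next pass; the pre-strides of the remaining axes are rescaled
// after each pass since the reduced dimension disappears.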
for (const auto& axisInt : axes) {
const std::size_t a = static_cast<std::size_t>(axisInt);
outputElements /= inputDims[a];
outputAccumulation = new Acc_T<I>[outputElements];
const std::size_t dim_i = inputDims[a];
for (std::size_t pre = 0; pre < stride_pre[a]; ++pre) {
for (std::size_t post = 0; post < stride_post[a]; ++post) {
const std::size_t idx_i = pre * dim_i * stride_post[a] + post;
const std::size_t idx_o = pre * stride_post[a] + post;
if (inputAccumulation == nullptr) {
outputAccumulation[idx_o] = stableMean<I>(input + idx_i, dim_i, stride_post[a]);
}
else {
outputAccumulation[idx_o] = stableMean<Acc_T<I>>(inputAccumulation + idx_i, dim_i, stride_post[a]);
}
}
}
std::for_each(stride_pre.get()+a+1, stride_pre.get()+nb_dims, [dim_i] (std::size_t& val) { val /= dim_i; });
if (inputAccumulation != nullptr) {
delete[] inputAccumulation;
}
inputAccumulation = outputAccumulation;
}
std::transform(inputAccumulation, inputAccumulation + outputElements, output,
[](auto value) { return castFromFloat<O>(value); });
if (outputAccumulation) {
delete[] outputAccumulation;
}
}
}
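// Illustration only (not part of the library): reducing a {2, 3} tensor over axis 1.
//
//   std::vector<DimSize_t> dims{2, 3};
//   std::vector<float> in{1.f, 2.f, 3.f, 4.f, 5.f, 6.f}, out(2);
//   ReduceMeanImpl_cpu_forward_kernel<float, float>({1}, /*keepDims*/ 1, dims, in.data(), out.data());
//   // out == {2.f, 5.f}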
// Kernels registration to implementation entry point
REGISTRAR(ReduceMeanImpl_cpu,
{DataType::Float32},
{ProdConso::inPlaceModel, Aidge::ReduceMeanImpl_cpu_forward_kernel<float, float>, nullptr});
REGISTRAR(ReduceMeanImpl_cpu,
{DataType::Float64},
{ProdConso::inPlaceModel, Aidge::ReduceMeanImpl_cpu_forward_kernel<double, double>, nullptr});
REGISTRAR(ReduceMeanImpl_cpu,
{DataType::Int32},
{ProdConso::inPlaceModel, Aidge::ReduceMeanImpl_cpu_forward_kernel<int32_t, int32_t>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_REDUCEMEANIMPL_KERNELS_H_ */
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_REDUCESUMIMPL_H_
#define AIDGE_CPU_OPERATOR_REDUCESUMIMPL_H_
#include <array>
#include <memory>
#include <tuple>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/ReduceSum.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
// Operator implementation entry point for the backend
using ReduceSumImpl_cpu = OperatorImpl_cpu<ReduceSum_Op,
void(const std::vector<std::int32_t>&,
DimSize_t,
const std::vector<DimSize_t>&,
const void *,
void *)>;
// Implementation entry point registration to Operator
REGISTRAR(ReduceSum_Op, "cpu", Aidge::ReduceSumImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_REDUCESUMIMPL_H_ */
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_REDUCESUMIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_REDUCESUMIMPL_KERNELS_H_
#include <algorithm> // std::for_each
#include <cstddef> // std::size_t
#include <cstdint> // std::int32_t
#include <functional> //std::multiplies
#include <numeric> //std::accumulate
#include <vector>
#include "aidge/backend/cpu/operator/ReduceSumImpl.hpp"
#include "aidge/data/Data.hpp"
#include "aidge/operator/ReduceSum.hpp"
#include "aidge/utils/Registrar.hpp"
namespace Aidge {
template <class I, class O>
void ReduceSumImpl_cpu_forward_kernel(const std::vector<std::int32_t>& axes,
DimSize_t /*keepDims*/,
const std::vector<DimSize_t>& inputDims,
const void* input_,
void* output_) {
const I* input = static_cast<const I*>(input_);
O* output = static_cast<O*>(output_);
const std::size_t nb_dims = inputDims.size();
const std::size_t totalElements = std::accumulate(inputDims.cbegin(), inputDims.cend(), 1, std::multiplies<std::size_t>());
if (axes.empty()){
std::copy_n(input,totalElements, output);
}
else if (axes.size() == 1) {
const std::size_t stride_pre = std::accumulate(inputDims.cbegin(), inputDims.cbegin() + axes[0], 1, std::multiplies<std::size_t>());
const std::size_t stride_post = std::accumulate(inputDims.crbegin(), inputDims.crbegin() + nb_dims -1 - axes[0], 1, std::multiplies<std::size_t>());
const std::size_t dim_i = inputDims[axes[0]];
for (std::size_t pre = 0; pre < stride_pre; ++pre) {
for (std::size_t post = 0; post < stride_post; ++post) {
const std::size_t idx_i = pre * dim_i * stride_post + post;
const std::size_t idx_o = pre * stride_post + post;
O sum = 0;
for (std::size_t i = 0; i < dim_i; ++i) {
sum +=input[idx_i + i*stride_post];
}
output[idx_o] = sum;
}
}
} else {
std::size_t outputElements = totalElements;
auto stride_post = std::unique_ptr<std::size_t[]>(new std::size_t[nb_dims]);
stride_post[nb_dims - 1] = 1;
for (std::size_t i = nb_dims-2; i != static_cast<std::size_t>(-1); --i) {
stride_post[i] = stride_post[i+1]*inputDims[i+1];
}
auto stride_pre = std::unique_ptr<std::size_t[]>(new std::size_t[nb_dims]);
stride_pre[0] = 1;
for (std::size_t i = 1; i < nb_dims; ++i) {
stride_pre[i] = stride_pre[i-1]*inputDims[i-1];
}
const I* inputAccumulation = input;
I* outputAccumulation = nullptr;
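// Same successive-reduction scheme as ReduceMean: sum along one axis per pass into a temporary
// buffer that becomes the input of the next pass.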
for (const auto& axisInt : axes) {
const std::size_t a = static_cast<std::size_t>(axisInt);
outputElements /= inputDims[a];
outputAccumulation = new I[outputElements];
const std::size_t dim_i = inputDims[a];
for (std::size_t pre = 0; pre < stride_pre[a]; ++pre) {
for (std::size_t post = 0; post < stride_post[a]; ++post) {
const std::size_t idx_i = pre * dim_i * stride_post[a] + post;
const std::size_t idx_o = pre * stride_post[a] + post;
I sum = 0;
for (std::size_t i = 0; i < dim_i; ++i) {
sum += inputAccumulation[idx_i + i*stride_post[a]];
}
outputAccumulation[idx_o] = sum;
}
}
std::for_each(stride_pre.get()+a+1, stride_pre.get()+nb_dims, [dim_i] (std::size_t& val) { val /= dim_i; });
if (inputAccumulation != input) {
delete[] inputAccumulation;
}
inputAccumulation = outputAccumulation;
}
// Copy the accumulated sums to the output buffer
std::copy(inputAccumulation, inputAccumulation + outputElements, output);
if (outputAccumulation) {
delete[] outputAccumulation;
}
}
}
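// Illustration only (not part of the library): summing a {2, 3} tensor over axis 1.
//
//   std::vector<DimSize_t> dims{2, 3};
//   std::vector<float> in{1.f, 2.f, 3.f, 4.f, 5.f, 6.f}, out(2);
//   ReduceSumImpl_cpu_forward_kernel<float, float>({1}, /*keepDims*/ 1, dims, in.data(), out.data());
//   // out == {6.f, 15.f}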
// Kernels registration to implementation entry point
REGISTRAR(ReduceSumImpl_cpu,
{DataType::Float32},
{ProdConso::inPlaceModel, Aidge::ReduceSumImpl_cpu_forward_kernel<float, float>, nullptr});
REGISTRAR(ReduceSumImpl_cpu,
{DataType::Float64},
{ProdConso::inPlaceModel, Aidge::ReduceSumImpl_cpu_forward_kernel<double, double>, nullptr});
REGISTRAR(ReduceSumImpl_cpu,
{DataType::Int32},
{ProdConso::inPlaceModel, Aidge::ReduceSumImpl_cpu_forward_kernel<int32_t, int32_t>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_REDUCESUMIMPL_KERNELS_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_RESIZEIMPL_H_
#define AIDGE_CPU_OPERATOR_RESIZEIMPL_H_
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/Resize.hpp"
#include "aidge/utils/Registrar.hpp"
#include <aidge/data/Interpolation.hpp>
#include <aidge/operator/Pad.hpp>
#include <cstdint>
namespace Aidge {
// Operator implementation entry point for the backend
using ResizeImpl_cpu = OperatorImpl_cpu<
Resize_Op,
void(const void *, // input
const std::vector<DimSize_t> &, // input dims
const std::vector<DimSize_t> &, // output dims
const Interpolation::CoordinateTransformation, // coord transfo
const Interpolation::Mode, // interpolation mode
const PadBorderType, // padding mode
void *)>; // output
// Implementation entry point registration to Operator
REGISTRAR(Resize_Op, "cpu", Aidge::ResizeImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_RESIZEIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_RESIZEIMPL_FORWARD_KERNEL_H_
#define AIDGE_CPU_OPERATOR_RESIZEIMPL_FORWARD_KERNEL_H_
#include "aidge/backend/cpu/operator/ResizeImpl.hpp"
#include <aidge/data/Data.hpp>
#include <aidge/data/half.hpp>
#include <aidge/operator/Pad.hpp>
#include <cmath>
#include <cstdint>
#include <numeric>
#include "aidge/backend/cpu/data/Interpolation.hpp"
#include "aidge/data/Interpolation.hpp"
#include "aidge/data/Tensor.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
template <typename IO>
void ResizeImpl_cpu_forward_kernel(
const void *input_,
const std::vector<DimSize_t> &inputDims,
const std::vector<DimSize_t> &outputDims,
const Interpolation::CoordinateTransformation coordTransfoMode,
const Interpolation::Mode interpMode,
const PadBorderType paddingMode,
// const double * /*roi*/,
// const float * /*scales*/,
// const int64_t * /*sizes*/,
void *output_) {
// Set up typed input/output pointers
const IO *input = static_cast<const IO *>(input_);
IO *output = static_cast<IO *>(output_);
const DimSize_t outputLen = std::accumulate(outputDims.cbegin(),
outputDims.cend(),
1,
std::multiplies<DimSize_t>());
#ifdef _OPENMP
#pragma omp parallel for if (outputLen >= 16)
#endif
for (int idxFlatOut = 0; idxFlatOut < static_cast<int>(outputLen); ++idxFlatOut) {
const auto coordOut = Tensor::toCoord(outputDims, idxFlatOut);
auto coordInApprox =
Interpolation::untransformCoordinates(coordOut,
inputDims,
outputDims,
coordTransfoMode);
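// Nearest-style modes (Ceil, Floor, RoundPreferCeil, RoundPreferFloor) map the output coordinate
// to a single input pixel; the other modes gather neighbouring pixels and interpolate between them.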
if ((interpMode == Interpolation::Mode::Ceil) || (interpMode == Interpolation::Mode::Floor) || (interpMode == Interpolation::Mode::RoundPreferCeil) || (interpMode == Interpolation::Mode::RoundPreferFloor)) {
for (std::size_t i = 0; i < coordInApprox.size(); ++i) {
if (interpMode == Interpolation::Mode::Ceil) {
coordInApprox[i] = std::ceil(coordInApprox[i]);
} else if (interpMode == Interpolation::Mode::Floor) {
coordInApprox[i] = std::floor(coordInApprox[i]);
} else if (interpMode == Interpolation::Mode::RoundPreferCeil) {
coordInApprox[i] = std::floor(coordInApprox[i] + 0.5f);
} else { // (interpMode == Interpolation::Mode::RoundPreferFloor)
coordInApprox[i] = std::ceil(coordInApprox[i] - 0.5f);
}
}
std::vector<std::size_t> coordIn(inputDims.size());
if (Tensor::isInBounds<float>(inputDims, coordInApprox)) {
for (std::size_t i = 0; i < coordInApprox.size(); ++i) {
coordIn[i] = static_cast<std::size_t>(coordInApprox[i]);
}
} else {
if (paddingMode == PadBorderType::Edge) {
for (std::size_t i = 0; i < coordInApprox.size(); ++i) {
coordIn[i] = coordInApprox[i] < 0 ? 0 : (coordInApprox[i] >=inputDims[i] ? inputDims[i] - 1 : static_cast<std::size_t>(coordInApprox[i]));
}
} else {
AIDGE_THROW_OR_ABORT(std::runtime_error, "Padding mode not supported");
}
}
output[idxFlatOut] = input[Tensor::toIndex(inputDims, coordIn)];
} else {
std::set<Interpolation::Point<IO>> neighbours =
InterpolationCPU::retrieveNeighbours(input,
inputDims,
coordInApprox,
paddingMode);
output[idxFlatOut] = InterpolationCPU::interpolate(coordInApprox,
neighbours,
interpMode);
}
}
return;
}
// Kernels registration to implementation entry point
REGISTRAR(ResizeImpl_cpu,
{{{DataType::Int16},
{DataType::Any},
{DataType::Any},
{DataType::Any}},
{DataType::Int16}},
{ProdConso::inPlaceModel,
ResizeImpl_cpu_forward_kernel<int16_t>,
nullptr});
REGISTRAR(ResizeImpl_cpu,
{{{DataType::Int32},
{DataType::Any},
{DataType::Any},
{DataType::Any}},
{DataType::Int32}},
{ProdConso::inPlaceModel,
ResizeImpl_cpu_forward_kernel<int32_t>,
nullptr});
REGISTRAR(ResizeImpl_cpu,
{{{DataType::Int64},
{DataType::Any},
{DataType::Any},
{DataType::Any}},
{DataType::Int64}},
{ProdConso::inPlaceModel,
ResizeImpl_cpu_forward_kernel<int64_t>,
nullptr});
REGISTRAR(ResizeImpl_cpu,
{{{DataType::Float16},
{DataType::Any},
{DataType::Any},
{DataType::Any}},
{DataType::Float16}},
{ProdConso::inPlaceModel,
ResizeImpl_cpu_forward_kernel<half_float::half>,
nullptr});
REGISTRAR(ResizeImpl_cpu,
{{{DataType::Float32},
{DataType::Any},
{DataType::Any},
{DataType::Any}},
{DataType::Float32}},
{ProdConso::inPlaceModel,
ResizeImpl_cpu_forward_kernel<float>,
nullptr});
REGISTRAR(ResizeImpl_cpu,
{{{DataType::Float64},
{DataType::Any},
{DataType::Any},
{DataType::Any}},
{DataType::Float64}},
{ProdConso::inPlaceModel,
ResizeImpl_cpu_forward_kernel<double>,
nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_RESIZEIMPL_FORWARD_KERNEL_H_ */