Compare revisions

Showing with 2212 additions and 0 deletions
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_CONSTANTOFSHAPEIMPL_H_
#define AIDGE_CPU_OPERATOR_CONSTANTOFSHAPEIMPL_H_
#include <cstddef>
#include <memory>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/ConstantOfShape.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
// Operator implementation entry point for the backend
using ConstantOfShapeImpl_cpu = OperatorImpl_cpu<ConstantOfShape_Op,
void(const std::vector<DimSize_t>, const Tensor&, void *)>;
// Implementation entry point registration to Operator
REGISTRAR(ConstantOfShape_Op, "cpu", Aidge::ConstantOfShapeImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_CONSTANTOFSHAPEIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_CONSTANTOFSHAPEIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_CONSTANTOFSHAPEIMPL_KERNELS_H_
#include <aidge/data/Tensor.hpp>
#include <aidge/data/half.hpp>
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <functional> // std::multiplies
#include <numeric> // std::accumulate
#include <vector>
#include "aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp"
#include "aidge/data/Data.hpp"
#include "aidge/utils/ErrorHandling.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
template <class O>
void ConstantOfShapeimpl_cpu_forward_kernel(
const std::vector<DimSize_t> output_dims, const Tensor &value,
void *output_) {
O *output = static_cast<O *>(output_);
// read the single fill value from the value input Tensor
const O val = *static_cast<const O *>(value.getImpl()->hostPtr());
const std::size_t output_size = std::accumulate(
output_dims.begin(), output_dims.end(), static_cast<std::size_t>(1),
std::multiplies<DimSize_t>());
for (size_t i = 0; i < output_size; ++i) {
output[i] = val;
}
}
// Kernels registration to implementation entry point
REGISTRAR(ConstantOfShapeImpl_cpu,
{ImplSpec::IOSpec{DataType::Int64}, ImplSpec::IOSpec{DataType::Float16}},
{ProdConso::defaultModel, Aidge::ConstantOfShapeimpl_cpu_forward_kernel<half_float::half>, nullptr});
REGISTRAR(ConstantOfShapeImpl_cpu,
{ImplSpec::IOSpec{DataType::Int64}, ImplSpec::IOSpec{DataType::Float32}},
{ProdConso::defaultModel, Aidge::ConstantOfShapeimpl_cpu_forward_kernel<float>, nullptr});
REGISTRAR(ConstantOfShapeImpl_cpu,
{ImplSpec::IOSpec{DataType::Int64}, ImplSpec::IOSpec{DataType::Float64}},
{ProdConso::defaultModel, Aidge::ConstantOfShapeimpl_cpu_forward_kernel<double>, nullptr});
REGISTRAR(ConstantOfShapeImpl_cpu,
{ImplSpec::IOSpec{DataType::Int64}, ImplSpec::IOSpec{DataType::Int16}},
{ProdConso::defaultModel, Aidge::ConstantOfShapeimpl_cpu_forward_kernel<std::int16_t>, nullptr});
REGISTRAR(ConstantOfShapeImpl_cpu,
{ImplSpec::IOSpec{DataType::Int64}, ImplSpec::IOSpec{DataType::Int32}},
{ProdConso::defaultModel, Aidge::ConstantOfShapeimpl_cpu_forward_kernel<std::int32_t>, nullptr});
REGISTRAR(ConstantOfShapeImpl_cpu,
{ImplSpec::IOSpec{DataType::Int64}, ImplSpec::IOSpec{DataType::Int64}},
{ProdConso::defaultModel, Aidge::ConstantOfShapeimpl_cpu_forward_kernel<std::int64_t>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_CONSTANTOFSHAPEIMPL_KERNELS_H_ */
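As a sanity check, a minimal standalone sketch (hypothetical shape and fill value, plain C++ only, no Aidge types) of what the kernel above computes: the element count is the product of the requested dimensions, and every element receives the single fill value.

#include <cstddef>
#include <functional>
#include <numeric>
#include <vector>

int main() {
    const std::vector<std::size_t> output_dims{2, 3, 4}; // requested output shape
    const float fill_value = 0.5f;                       // the one value read from the value Tensor
    // total number of elements = 2 * 3 * 4 = 24
    const std::size_t output_size = std::accumulate(
        output_dims.begin(), output_dims.end(), static_cast<std::size_t>(1),
        std::multiplies<std::size_t>());
    std::vector<float> output(output_size, fill_value);  // every element set to the fill value
    return 0;
}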
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_CONVDEPTHWISEIMPL_H_
#define AIDGE_CPU_OPERATOR_CONVDEPTHWISEIMPL_H_
#include <array>
#include <memory>
#include <tuple>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/ConvDepthWise.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
namespace Aidge {
// Operator implementation entry point for the backend
using ConvDepthWise1D_Op = ConvDepthWise_Op<1>;
using ConvDepthWiseImpl1D_cpu = OperatorImpl_cpu<ConvDepthWise_Op<1>,
void(const std::array<DimSize_t, 1>&,
const std::array<DimSize_t, 1>&,
const std::array<DimSize_t, 1>&,
const std::array<DimSize_t, 3>&,
const void *,
const void *,
const void *,
void *)>;
using ConvDepthWise2D_Op = ConvDepthWise_Op<2>;
using ConvDepthWiseImpl2D_cpu = OperatorImpl_cpu<ConvDepthWise_Op<2>,
void(const std::array<DimSize_t, 2>&,
const std::array<DimSize_t, 2>&,
const std::array<DimSize_t, 2>&,
const std::array<DimSize_t, 4> &,
const void *,
const void *,
const void *,
void *)>;
// Implementation entry point registration to Operator
REGISTRAR(ConvDepthWise1D_Op, "cpu", Aidge::ConvDepthWiseImpl1D_cpu::create);
REGISTRAR(ConvDepthWise2D_Op, "cpu", Aidge::ConvDepthWiseImpl2D_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_CONVDEPTHWISEIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_CONVDEPTHWISEIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_CONVDEPTHWISEIMPL_KERNELS_H_
#include <algorithm>
#include <array>
#include <cmath>
#include <cstddef>
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include "aidge/backend/cpu/operator/ConvDepthWiseImpl.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
/**
* @brief Forward kernel for 1D depthwise convolution (ConvDepthWise) on CPU backend.
* @tparam I Input data type.
* @tparam W Weight data type.
* @tparam B Bias data type.
* @tparam O Output data type.
* @param strideDims Array of stride dimensions.
* @param dilationDims Array of dilation dimensions.
* @param kernelDims Array of kernel dimensions.
* @param inputDims Array of input dimensions.
* @param input_ const input Tensor.
* @param weights_ const weight Tensor.
* @param biases_ const bias Tensor.
* @param output_ Output Tensor.
*/
template <class I, class W, class B, class O>
void ConvDepthWiseImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 1>& strideDims,
const std::array<DimSize_t, 1>& dilationDims,
const std::array<DimSize_t, 1>& kernelDims,
const std::array<DimSize_t, 3>& inputDims,
const void *input_,
const void *weights_,
const void *biases_,
void *output_) {
// FIXME: missing convolution attributes as arguments
const I *input = static_cast<const I *>(input_);
const W *weights = static_cast<const W *>(weights_);
const B *biases = static_cast<const B *>(biases_);
O *output = static_cast<O *>(output_);
// output H size
const DimSize_t dilated_kernel_x = dilationDims[0]*(kernelDims[0] - 1) + 1;
const std::size_t oxSize =
static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - dilated_kernel_x + strideDims[0]) /
static_cast<float>(strideDims[0])));
// TODO: kernel computation
// output (batch, outCh, Xout, Yout)
// input (batch, ch, Xin, Yin)
// weight (outCh, ch, kernelX, kernelY)
using signedsize = std::make_signed<std::size_t>::type;
for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {
const std::size_t oIndex = (ch + batch*inputDims[1]) * oxSize;
B biasVal = (biases != nullptr) ? biases[ch] : B(0);
std::fill(output + oIndex, output+(oIndex+oxSize), biasVal);
const std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2];
const std::size_t wIndex = ch * kernelDims[0];
for (std::size_t ox = 0; ox < oxSize; ++ox) {
// const signedsize difx = static_cast<signedsize>(- ox * strideDims[0]);
// const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0)));
// const std::size_t sxMax = (static_cast<signedsize>(inputDims[2]) + difx) < 0 ? 0 : ((inputDims[2] + difx) > kernelDims[0] ? kernelDims[0] : inputDims[2] + difx);
const std::size_t sxMin = 0;
const std::size_t sxMax = dilated_kernel_x;
const std::size_t oIndexFull = oIndex + ox;
const signedsize ix = static_cast<signedsize>(ox * strideDims[0]);
for (std::size_t sx = sxMin; sx*dilationDims[0] < sxMax; ++sx) {
output[oIndexFull] += weights[wIndex + sx] *
input[iIndex + static_cast<std::size_t>(ix+static_cast<signedsize>(sx*dilationDims[0]))];
}
}
}
}
}
// Kernels registration to implementation entry point
REGISTRAR(ConvDepthWiseImpl1D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}},
{ProdConso::inPlaceModel, Aidge::ConvDepthWiseImpl1D_cpu_forward_kernel<float, float, float, float>, nullptr});
REGISTRAR(ConvDepthWiseImpl1D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Int32, DataFormat::NCHW}},
{ProdConso::inPlaceModel, Aidge::ConvDepthWiseImpl1D_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t, std::int32_t>, nullptr});
REGISTRAR(ConvDepthWiseImpl1D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Float64, DataFormat::NCHW}},
{ProdConso::inPlaceModel, Aidge::ConvDepthWiseImpl1D_cpu_forward_kernel<double, double, double, double>, nullptr});
/**
* @brief Forward kernel for 2D depthwise convolution (ConvDepthWise) on CPU backend.
* @tparam I Input data type.
* @tparam W Weight data type.
* @tparam B Bias data type.
* @tparam O Output data type.
* @param strideDims Array of stride dimensions.
* @param dilationDims Array of dilation dimensions.
* @param kernelDims Array of kernel dimensions.
* @param inputDims Array of input dimensions.
* @param input_ const input Tensor.
* @param weights_ const weight Tensor.
* @param biases_ const bias Tensor.
* @param output_ Output Tensor.
*/
template <class I, class W, class B, class O>
void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
const std::array<DimSize_t, 2>& dilationDims,
const std::array<DimSize_t, 2>& kernelDims,
const std::array<DimSize_t, 4>& inputDims,
const void *input_,
const void *weights_,
const void *biases_,
void *output_)
{
// FIXME: missing convolution attributes as arguments
const I *input = static_cast<const I *>(input_);
const W *weights = static_cast<const W *>(weights_);
const B *biases = static_cast<const B *>(biases_);
O *output = static_cast<O *>(output_);
// output H size
const DimSize_t dilated_kernel_x = dilationDims[0]*(kernelDims[0] - 1) + 1;
const std::size_t oxSize =
static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - dilated_kernel_x + strideDims[0]) /
static_cast<float>(strideDims[0])));
// output W size
const DimSize_t dilated_kernel_y = dilationDims[1]*(kernelDims[1] - 1) + 1;
const std::size_t oySize =
static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[3] - dilated_kernel_y + strideDims[1]) /
static_cast<float>(strideDims[1])));
// TODO: kernel computation
// output (batch, outCh, Xout, Yout)
// input (batch, ch, Xin, Yin)
// weight (outCh, ch, kernelX, kernelY)
const std::size_t outChannels_s = oxSize * oySize;
if (kernelDims[0] == 3 && kernelDims[1] == 3 && dilationDims[0] == 1 && dilationDims[1] == 1) {
for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {
B biasVal = (biases != nullptr) ? biases[ch] : B(0);
std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3];
const std::size_t wIndex = ch * 9;
if (strideDims[0] == 1 && strideDims[1]==1) {
for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex-=inputDims[3]) {
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] = biasVal + weights[wIndex+0]*input[iIndex+oy]+weights[wIndex+1]*input[iIndex+oy+1]+weights[wIndex+2]*input[iIndex+oy+2];
}
iIndex+=inputDims[3];
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] += weights[wIndex+3]*input[iIndex+oy]+weights[wIndex+4]*input[iIndex+oy+1]+weights[wIndex+5]*input[iIndex+oy+2];
}
iIndex+=inputDims[3];
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] += weights[wIndex+6]*input[iIndex+oy]+weights[wIndex+7]*input[iIndex+oy+1]+weights[wIndex+8]*input[iIndex+oy+2];
}
}
} else {
for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex+=(strideDims[0]-2)*inputDims[3]) {
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] = biasVal + weights[wIndex+0]*input[iIndex+oy*strideDims[1]]+weights[wIndex+1]*input[iIndex+oy*strideDims[1]+1]+weights[wIndex+2]*input[iIndex+oy*strideDims[1]+2];
}
iIndex+=inputDims[3];
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] += weights[wIndex+3]*input[iIndex+oy*strideDims[1]]+weights[wIndex+4]*input[iIndex+oy*strideDims[1]+1]+weights[wIndex+5]*input[iIndex+oy*strideDims[1]+2];
}
iIndex+=inputDims[3];
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] += weights[wIndex+6]*input[iIndex+oy*strideDims[1]]+weights[wIndex+7]*input[iIndex+oy*strideDims[1]+1]+weights[wIndex+8]*input[iIndex+oy*strideDims[1]+2];
}
}
}
output += outChannels_s;
}
}
} else if (dilated_kernel_x == 1 && dilated_kernel_y == 1) {
for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {
B biasVal = (biases != nullptr) ? biases[ch] : B(0);
std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3];
const std::size_t wIndex = ch;
if (strideDims[0] == 1 && strideDims[1] == 1) {
for (std::size_t i = iIndex; i < iIndex + oxSize*oySize; ++i) {
output[i] = biasVal + weights[wIndex] * input[i];
}
} else {
std::size_t oIndex = (ch + batch*inputDims[1]) * oxSize * oySize;
for (std::size_t ox = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex+=strideDims[0]*inputDims[3]) {
for (std::size_t oy = 0, iy = 0; oy < oySize; ++oy, iy+=strideDims[1]) {
output[oIndex + oy] = biasVal + weights[wIndex]*input[iIndex+iy];
}
}
}
}
}
} else {
for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {
B biasVal = (biases != nullptr) ? biases[ch] : B(0);
std::fill(output, output+outChannels_s, biasVal);
const std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3];
const std::size_t wIndex = ch * kernelDims[0] * kernelDims[1];
for (std::size_t ox = 0; ox < oxSize; ++ox) {
for (std::size_t oy = 0; oy < oySize; ++oy) {
const std::size_t oIndexFull = ox*oySize + oy;
const std::size_t ix = ox * strideDims[0];
const std::size_t iy = oy * strideDims[1];
for (std::size_t kx = 0; kx*dilationDims[0] < dilated_kernel_x; ++kx) {
for (std::size_t ky = 0; ky*dilationDims[1] < dilated_kernel_y; ++ky) {
output[oIndexFull] += weights[wIndex + kx*kernelDims[1] + ky] *
input[iIndex + (ix + kx*dilationDims[0])*inputDims[3] + (iy + ky*dilationDims[1])];
}
}
}
}
output += outChannels_s;
}
}
}
}
// Kernels registration to implementation entry point
REGISTRAR(ConvDepthWiseImpl2D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}},
{ProdConso::inPlaceModel, Aidge::ConvDepthWiseImpl2D_cpu_forward_kernel<float, float, float, float>, nullptr});
REGISTRAR(ConvDepthWiseImpl2D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Int32, DataFormat::NCHW}},
{ProdConso::inPlaceModel, Aidge::ConvDepthWiseImpl2D_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t, std::int32_t>, nullptr});
REGISTRAR(ConvDepthWiseImpl2D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Float64, DataFormat::NCHW}},
{ProdConso::inPlaceModel, Aidge::ConvDepthWiseImpl2D_cpu_forward_kernel<double, double, double, double>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_CONVDEPTHWISEIMPL_KERNELS_H_ */
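Both kernels above derive the output spatial size from the no-padding formula oxSize = floor((in - dilated_kernel + stride) / stride), with dilated_kernel = dilation * (kernel - 1) + 1. A small standalone check with made-up dimensions:

#include <cstddef>

// No-padding output size, as computed in the kernels above.
std::size_t convOutputSize(std::size_t in, std::size_t kernel,
                           std::size_t stride, std::size_t dilation) {
    const std::size_t dilated = dilation * (kernel - 1) + 1;
    return (in - dilated + stride) / stride; // integer division is floor here
}

// e.g. in=7, kernel=3, stride=2, dilation=1 -> dilated=3, (7-3+2)/2 = 3 outputs
// e.g. in=7, kernel=3, stride=1, dilation=2 -> dilated=5, (7-5+1)/1 = 3 outputs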
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_CONVIMPL_H_
#define AIDGE_CPU_OPERATOR_CONVIMPL_H_
#include <array>
#include <memory>
#include <tuple>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/Conv.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
namespace Aidge {
// Operator implementation entry point for the backend
using Conv1D_Op = Conv_Op<1>;
using ConvImpl1D_cpu = OperatorImpl_cpu<Conv_Op<1>,
void(const std::array<DimSize_t, 1>&,
const std::array<DimSize_t, 1>&,
const std::array<DimSize_t, 1>&,
const std::array<DimSize_t, 3> &,
DimSize_t,
const void *,
const void *,
const void *,
void *)>;
using Conv2D_Op = Conv_Op<2>;
using ConvImpl2D_cpu = OperatorImpl_cpu<Conv_Op<2>,
void(const std::array<DimSize_t, 2>&,
const std::array<DimSize_t, 2>&,
const std::array<DimSize_t, 2>&,
const std::array<DimSize_t, 4> &,
DimSize_t,
const void *,
const void *,
const void *,
void *)>;
// Implementation entry point registration to Operator
REGISTRAR(Conv1D_Op, "cpu", Aidge::ConvImpl1D_cpu::create);
REGISTRAR(Conv2D_Op, "cpu", Aidge::ConvImpl2D_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_CONVIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_CONVIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_CONVIMPL_KERNELS_H_
#include <array>
#include <memory>
#include <tuple>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/backend/cpu/operator/ConvImpl.hpp"
#include "aidge/operator/Conv.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
namespace Aidge {
/**
* @brief Forward kernel for 1D Convolution on CPU backend.
* @tparam I Input data type.
* @tparam W Weight data type.
* @tparam B Bias data type.
* @tparam O Output data type.
* @param strideDims Array of stride dimensions.
* @param dilationDims Array of dilation dimensions.
* @param kernelDims Array of kernel dimensions.
* @param inputDims Array of input dimensions.
* @param outChannels Number of output channels.
* @param input_ const input Tensor.
* @param weights_ const weight Tensor.
* @param biases_ const bias Tensor.
* @param output_ Output Tensor.
*/
template <class I, class W, class B, class O>
void ConvImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 1>& strideDims,
const std::array<DimSize_t, 1>& dilationDims,
const std::array<DimSize_t, 1>& kernelDims,
const std::array<DimSize_t, 3>& inputDims,
DimSize_t outChannels,
const void *input_,
const void *weights_,
const void *biases_,
void *output_)
{
// FIXME: missing convolution attributes as arguments
const I *input = static_cast<const I *>(input_);
const W *weights = static_cast<const W *>(weights_);
const B *biases = static_cast<const B *>(biases_);
O *output = static_cast<O *>(output_);
// output H size
const std::size_t oxSize =
static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - dilationDims[0]*(kernelDims[0] - 1) - 1 + strideDims[0]) /
static_cast<float>(strideDims[0])));
const DimSize_t dilated_kernel_x = dilationDims[0]*(kernelDims[0] - 1) + 1;
// TODO: kernel computation
// output (batch, outCh, Xout, Yout)
// input (batch, inCh, Xin, Yin)
// weight (outCh, inCh, kernelX, kernelY)
using signedsize = std::make_signed<std::size_t>::type;
for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
const std::size_t oIndex = (outCh + batch*outChannels) * oxSize;
// If bias = nullptr, set B(0)
B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
std::fill(output + oIndex, output+(oIndex+oxSize), biasVal);
for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
const std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2];
const std::size_t wIndex = (inCh + outCh*inputDims[1]) * kernelDims[0];
for (std::size_t ox = 0; ox < oxSize; ++ox) {
// const signedsize difx = static_cast<signedsize>(- ox * strideDims[0]);
// const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0)));
// const std::size_t sxMax = (static_cast<signedsize>(inputDims[2]) + difx) < 0 ? 0 : ((inputDims[2] + difx) > kernelDims[0] ? kernelDims[0] : inputDims[2] + difx);
const std::size_t sxMin = 0;
const std::size_t sxMax = dilated_kernel_x;
const std::size_t oIndexFull = oIndex + ox;
const signedsize ix = static_cast<signedsize>(ox * strideDims[0]);
for (std::size_t sx = sxMin; sx*dilationDims[0] < sxMax; ++sx) {
output[oIndexFull] += weights[wIndex + sx] *
input[iIndex + static_cast<std::size_t>(ix+static_cast<signedsize>(sx*dilationDims[0]))];
}
}
}
}
}
}
// Kernels registration to implementation entry point
REGISTRAR(ConvImpl1D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}},
{ProdConso::inPlaceModel, Aidge::ConvImpl1D_cpu_forward_kernel<float, float, float, float>, nullptr});
REGISTRAR(ConvImpl1D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Float16, DataFormat::NCHW}},
{ProdConso::inPlaceModel, Aidge::ConvImpl1D_cpu_forward_kernel<half_float::half, half_float::half, half_float::half, half_float::half>, nullptr});
REGISTRAR(ConvImpl1D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Int32, DataFormat::NCHW}},
{ProdConso::inPlaceModel, Aidge::ConvImpl1D_cpu_forward_kernel<int32_t, int32_t, int32_t, int32_t>, nullptr});
REGISTRAR(ConvImpl1D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Float64, DataFormat::NCHW}},
{ProdConso::inPlaceModel, Aidge::ConvImpl1D_cpu_forward_kernel<double, double, double, double>, nullptr});
/**
* @brief Forward kernel for 2D Convolution on CPU backend.
* @tparam I Input data type.
* @tparam W Weight data type.
* @tparam B Bias data type.
* @tparam O Output data type.
* @param strideDims Array of stride dimensions.
* @param dilationDims Array of dilation dimensions.
* @param kernelDims Array of kernel dimensions.
* @param inputDims Array of input dimensions.
* @param outChannels Number of output channels.
* @param input_ const input Tensor.
* @param weights_ const weight Tensor.
* @param biases_ const bias Tensor.
* @param output_ Output Tensor.
*/
template <class I, class W, class B, class O>
void ConvImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
const std::array<DimSize_t, 2>& dilationDims,
const std::array<DimSize_t, 2>& kernelDims,
const std::array<DimSize_t, 4> &inputDims,
DimSize_t outChannels,
const void *input_,
const void *weights_,
const void *biases_,
void *output_)
{
// FIXME: missing convolution attributes as arguments
const I *input = static_cast<const I *>(input_);
const W *weights = static_cast<const W *>(weights_);
const B *biases = static_cast<const B *>(biases_);
O *output = static_cast<O *>(output_);
// output H size
const DimSize_t dilated_kernel_x = dilationDims[0]*(kernelDims[0] - 1) + 1;
const std::size_t oxSize =
static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - dilated_kernel_x + strideDims[0]) /
static_cast<float>(strideDims[0])));
// output W size
const DimSize_t dilated_kernel_y = dilationDims[1]*(kernelDims[1] - 1) + 1;
const std::size_t oySize =
static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[3] - dilated_kernel_y + strideDims[1]) /
static_cast<float>(strideDims[1])));
// TODO: kernel computation
// output (batch, outCh, Xout, Yout)
// input (batch, inCh, Xin, Yin)
// weight (outCh, inCh, kernelX, kernelY)
const std::size_t outChannels_s = oxSize * oySize;
if (kernelDims[0] == 3 && kernelDims[1] == 3 && dilationDims[0] == 1 && dilationDims[1] == 1) {
for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
// If bias = nullptr, set B(0)
B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
std::fill(output, output+outChannels_s, biasVal);
for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3];
const std::size_t wIndex = (inCh + outCh*inputDims[1]) * 9;
if (strideDims[0] == 1 && strideDims[1]==1) {
for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex-=inputDims[3]) {
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] += weights[wIndex+0]*input[iIndex+oy]+weights[wIndex+1]*input[iIndex+oy+1]+weights[wIndex+2]*input[iIndex+oy+2];
}
iIndex+=inputDims[3];
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] += weights[wIndex+3]*input[iIndex+oy]+weights[wIndex+4]*input[iIndex+oy+1]+weights[wIndex+5]*input[iIndex+oy+2];
}
iIndex+=inputDims[3];
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] += weights[wIndex+6]*input[iIndex+oy]+weights[wIndex+7]*input[iIndex+oy+1]+weights[wIndex+8]*input[iIndex+oy+2];
}
}
} else {
for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex+=(strideDims[0]-2)*inputDims[3]) {
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] += weights[wIndex+0]*input[iIndex+oy*strideDims[1]]+weights[wIndex+1]*input[iIndex+oy*strideDims[1]+1]+weights[wIndex+2]*input[iIndex+oy*strideDims[1]+2];
}
iIndex+=inputDims[3];
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] += weights[wIndex+3]*input[iIndex+oy*strideDims[1]]+weights[wIndex+4]*input[iIndex+oy*strideDims[1]+1]+weights[wIndex+5]*input[iIndex+oy*strideDims[1]+2];
}
iIndex+=inputDims[3];
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] += weights[wIndex+6]*input[iIndex+oy*strideDims[1]]+weights[wIndex+7]*input[iIndex+oy*strideDims[1]+1]+weights[wIndex+8]*input[iIndex+oy*strideDims[1]+2];
}
}
}
}
output += outChannels_s;
}
}
} else if (dilated_kernel_x == 1 && dilated_kernel_y == 1) {
for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
// If bias = nullptr, set B(0)
B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
std::fill(output, output+outChannels_s, biasVal);
for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3];
const std::size_t wIndex = (inCh + outCh*inputDims[1]);
if (strideDims[0] == 1 && strideDims[1] == 1) {
for (std::size_t oIndex = 0; oIndex < oxSize*oySize; ++oIndex, ++iIndex) {
output[oIndex] += weights[wIndex] * input[iIndex];
}
} else {
for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex+=inputDims[3]*strideDims[0]) {
for (std::size_t oy = 0, iy = 0; oy < oySize; ++oy, iy+=strideDims[1]) {
output[oIndex + oy] += weights[wIndex+0]*input[iIndex+iy];
}
}
}
}
output += outChannels_s;
}
}
} else {
for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
// If bias = nullptr, set B(0)
B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
std::fill(output, output+outChannels_s, biasVal);
for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
std::size_t iIndex_channel = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3];
const std::size_t wIndex = (inCh + outCh*inputDims[1]) * kernelDims[0] * kernelDims[1];
// loop over each output line
for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex_channel+=inputDims[3]*strideDims[0]) {
// loop over kernel rows of the associated input lines
for (std::size_t ky = 0, ix = 0; ky < kernelDims[0]; ++ky, ix += inputDims[3]*dilationDims[0]) {
// loop over the entire output line
for (std::size_t oy = 0, iy = 0; oy < oySize; ++oy, iy+=strideDims[1]) {
const std::size_t iIndex = iIndex_channel + ix + iy;
// loop over kernel columns associated with one output
for (std::size_t kx = 0; kx < kernelDims[1]; ++kx) {
output[oIndex + oy] += weights[wIndex+kernelDims[1]*ky+kx]*input[iIndex+kx*dilationDims[1]];
}
}
}
}
}
output += outChannels_s;
}
}
}
}
// Kernels registration to implementation entry point
REGISTRAR(ConvImpl2D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}},
{ProdConso::inPlaceModel, Aidge::ConvImpl2D_cpu_forward_kernel<float, float, float, float>, nullptr});
REGISTRAR(ConvImpl2D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Float16, DataFormat::NCHW}},
{ProdConso::inPlaceModel, Aidge::ConvImpl2D_cpu_forward_kernel<half_float::half, half_float::half, half_float::half, half_float::half>, nullptr});
REGISTRAR(ConvImpl2D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Int32, DataFormat::NCHW}},
{ProdConso::inPlaceModel, Aidge::ConvImpl2D_cpu_forward_kernel<int32_t, int32_t, int32_t, int32_t>, nullptr});
REGISTRAR(ConvImpl2D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Float64, DataFormat::NCHW}},
{ProdConso::inPlaceModel, Aidge::ConvImpl2D_cpu_forward_kernel<double, double, double, double>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_CONVIMPL_KERNELS_H_ */
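Since the kernels are plain function templates over raw buffers, the 1D convolution can be driven directly; a sketch (the include path and the main wrapper are assumptions for illustration) with one batch, one channel, kernel 3, stride 1, no dilation:

#include <array>
#include "aidge/backend/cpu/operator/ConvImpl_kernels.hpp" // assumed install path

int main() {
    const float input[5]   = {1.f, 2.f, 3.f, 4.f, 5.f};  // (batch=1, inCh=1, X=5)
    const float weights[3] = {1.f, 0.f, -1.f};           // (outCh=1, inCh=1, kX=3)
    const float bias[1]    = {0.f};
    float output[3]        = {};                         // oxSize = (5 - 3 + 1) / 1 = 3
    Aidge::ConvImpl1D_cpu_forward_kernel<float, float, float, float>(
        {1},         // stride
        {1},         // dilation
        {3},         // kernel size
        {1, 1, 5},   // input dims (N, C, X)
        1,           // outChannels
        input, weights, bias, output);
    // output == {in[0]-in[2], in[1]-in[3], in[2]-in[4]} == {-2.f, -2.f, -2.f}
    return 0;
}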
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_DIVIMPL_H_
#define AIDGE_CPU_OPERATOR_DIVIMPL_H_
#include <memory>
#include <tuple>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/Div.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
// Operator implementation entry point for the backend
using DivImpl_cpu = OperatorImpl_cpu<Div_Op,
void(const std::size_t, const std::size_t, const std::size_t, const void*, const void*,void*)>;
// Implementation entry point registration to Operator
REGISTRAR(Div_Op, "cpu", Aidge::DivImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_DIVIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_DIVIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_DIVIMPL_KERNELS_H_
#include <numeric> // std::accumulate
#include <cstddef> // std::size_t
#include <cstdint> // std::int32_t, std::int64_t
#include <functional> // std::multiplies
#include "aidge/utils/Registrar.hpp"
#include "aidge/backend/cpu/data/Broadcasting.hpp"
#include "aidge/backend/cpu/operator/DivImpl.hpp"
namespace Aidge {
// template <class I1, class I2, class O>
// void DivImpl_cpu_forward_kernel(const std::vector<std::size_t>& input1Dims,
// const std::vector<std::size_t>& input2Dims,
// const std::vector<std::size_t>& outputDims,
// const void* input1_,
// const void* input2_,
// void* output_) {
// const I1* input_1 = static_cast<const I1*>(input1_);
// const I2* input_2 = static_cast<const I2*>(input2_);
// O* output = static_cast<O*>(output_);
// const std::size_t totalElements = std::accumulate(outputDims.cbegin(), outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
// for (std::size_t oIndex = 0; oIndex < totalElements; ++oIndex)
// {
// std::vector<std::size_t> indexes = getMultiDimIndices(outputDims, oIndex);
// std::size_t idx1 = getFlattenedIndex(input1Dims, indexes);
// std::size_t idx2 = getFlattenedIndex(input2Dims, indexes);
// // TODO assert if input_2 is bad?
// output[oIndex] = input_1[idx1] / input_2[idx2];
// }
// }
template <class I1, class I2, class O>
constexpr void DivImpl_cpu_forward_kernel(const std::size_t input1size_,
const std::size_t input2size_,
const std::size_t output1size_,
const void* input1_,
const void* input2_,
void* output_) {
const I1* input_1 = static_cast<const I1*>(input1_);
const I2* input_2 = static_cast<const I2*>(input2_);
O* output = static_cast<O*>(output_);
// suppose values are contiguous in memory
for (std::size_t i = 0; i < output1size_; ++i) {
const std::size_t in1_id = (input1size_ != 1) ? i : 0;
const std::size_t in2_id = (input2size_ != 1) ? i : 0;
output[i] = static_cast<O>(input_1[in1_id] / input_2[in2_id]);
}
}
// Kernels registration to implementation entry point
REGISTRAR(DivImpl_cpu,
{DataType::Float32},
{ProdConso::inPlaceModel, Aidge::DivImpl_cpu_forward_kernel<float, float, float>, nullptr});
REGISTRAR(DivImpl_cpu,
{DataType::Float64},
{ProdConso::inPlaceModel, Aidge::DivImpl_cpu_forward_kernel<double, double, double>, nullptr});
REGISTRAR(DivImpl_cpu,
{DataType::Int32},
{ProdConso::inPlaceModel, Aidge::DivImpl_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_DIVIMPL_KERNELS_H_ */
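Because the registered kernel is an ordinary function template over raw pointers, its contiguous-broadcast behavior is easy to check in isolation (fragment; assumes the header above is included):

// input_2 has size 1, so its single value divides every element of input_1
const float a[4] = {2.f, 4.f, 6.f, 8.f};
const float b[1] = {2.f};
float out[4] = {};
Aidge::DivImpl_cpu_forward_kernel<float, float, float>(4, 1, 4, a, b, out);
// out == {1.f, 2.f, 3.f, 4.f}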
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_ERFIMPL_H_
#define AIDGE_CPU_OPERATOR_ERFIMPL_H_
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/Erf.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include <memory>
#include <vector>
namespace Aidge {
// Operator implementation entry point for the backend
using ErfImpl_cpu = OperatorImpl_cpu<Erf_Op,
void(const std::size_t, const void*, void*)>;
// Implementation entry point registration to Operator
REGISTRAR(Erf_Op, "cpu", Aidge::ErfImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_ERFIMPL_H_ */
/********************************************************************************
 * Copyright (c) 2023 CEA-List
 *
 * This program and the accompanying materials are made available under the
 * terms of the Eclipse Public License 2.0 which is available at
 * http://www.eclipse.org/legal/epl-2.0.
 *
 * SPDX-License-Identifier: EPL-2.0
 *
 ********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_ERFIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_ERFIMPL_KERNELS_H_
#include <cmath>
#include "aidge/utils/Registrar.hpp"
#include "aidge/backend/cpu/operator/ErfImpl.hpp"
namespace Aidge {
template <class I, class O>
void ErfImpl_cpu_forward_kernel(std::size_t inputLenght,
                                const void* input_,
                                void* output_) {
    const I* input = static_cast<const I*>(input_);
    O* output = static_cast<O*>(output_);
    for (std::size_t i = 0; i < inputLenght; ++i) {
        output[i] = std::erf(input[i]);
    }
}
// Kernels registration to implementation entry point
REGISTRAR(ErfImpl_cpu,
    {DataType::Float32},
    {ProdConso::inPlaceModel, Aidge::ErfImpl_cpu_forward_kernel<float, float>, nullptr});
REGISTRAR(ErfImpl_cpu,
    {DataType::Float64},
    {ProdConso::inPlaceModel, Aidge::ErfImpl_cpu_forward_kernel<double, double>, nullptr});
REGISTRAR(ErfImpl_cpu,
    {DataType::Int32},
    {ProdConso::inPlaceModel, Aidge::ErfImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_ERFIMPL_KERNELS_H_ */
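A quick sketch exercising the element-wise kernel above on a small buffer (fragment; assumes the header is included):

const float in[3] = {0.f, 1.f, -1.f};
float out[3] = {};
Aidge::ErfImpl_cpu_forward_kernel<float, float>(3, in, out);
// out is approximately {0.f, 0.8427f, -0.8427f}, i.e. std::erf applied element-wise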
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_EXPANDIMPL_H_
#define AIDGE_CPU_OPERATOR_EXPANDIMPL_H_
#include <memory>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/Expand.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
// Operator implementation entry point for the backend
using ExpandImpl_cpu = OperatorImpl_cpu<Expand_Op,
void(const std::shared_ptr<Tensor> &,
const std::shared_ptr<Tensor> &,
void *,
const std::vector<DimSize_t> &)>;
// Implementation entry point registration to Operator
REGISTRAR(Expand_Op, "cpu", Aidge::ExpandImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_EXPANDIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_EXPANDIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_EXPANDIMPL_KERNELS_H_
#include "aidge/backend/cpu/operator/ExpandImpl.hpp"
#include "aidge/utils/Registrar.hpp"
#include <aidge/data/Data.hpp>
#include <aidge/data/Tensor.hpp>
#include <aidge/data/half.hpp>
#include <aidge/scheduler/ProdConso.hpp>
#include <aidge/utils/Types.h>
#include <cmath>
#include <cstdint> // std::int32_t, std::int64_t
#include <memory>
#include <numeric>
namespace {
// suppose values are contiguous in memory
template <class IO>
void expandContiguousArray(const std::size_t inputStackSize,
const std::size_t outputStackSize,
const IO *input,
IO *output) {
for (std::size_t i = 0; i < outputStackSize; ++i) {
output[i] = (inputStackSize == 1) ? input[0] : input[i];
}
return;
}
} // namespace
namespace Aidge {
template <class IO>
void ExpandImpl_cpu_forward_kernel(
const std::shared_ptr<Tensor> &inData,
const std::shared_ptr<Tensor> &_inExpandShape,
void *_output,
const std::vector<DimSize_t> &outputDims) {
// Retrieve the expand-shape values and the input dimensions,
// as both may need to be modified below.
IO *output = static_cast<IO *>(_output);
std::vector<DimSize_t> inExpandShape(_inExpandShape->size());
for (DimSize_t i = 0; i < _inExpandShape->size(); ++i) {
inExpandShape[i] = _inExpandShape->get<std::int64_t>(i);
}
std::vector<DimSize_t> inDataDims = inData->dims();
// Example with 2 tensors
// [5,2,1,7] & [2,6,7]
// 1. Same number of dimensions but adding 1s to the left of the "smallest"
// tensor -> [5,2,1,7] & [1,2,6,7]
// 2. Find the highest equal dimension -> 3
// Exception: if the first diverging dimension is the last one, then ->
// 4 (dims.size())
// 3. Compute the highest number of contiguous data -> 7
// 4. Compute stride and offset step for the broadcast mechanism
// 5. Call a simple kernel
// ## Compute compatible input dimensions
// special case for equal dimensions, the kernel is called with the entire
// arrays at once
if (inDataDims == inExpandShape) {
const std::size_t input0ContiguousSize =
std::accumulate(inDataDims.cbegin(),
inDataDims.cend(),
static_cast<std::size_t>(1),
std::multiplies<std::size_t>());
for (std::size_t i = 0; i < input0ContiguousSize; ++i) {
output[i] = inData->get<IO>(i);
}
return;
}
// set dimensions to be of equal size by filling the smallest one with
// ones.
if (inDataDims.size() > inExpandShape.size()) {
inExpandShape.insert(inExpandShape.cbegin(),
inDataDims.size() - inExpandShape.size(),
static_cast<DimSize_t>(1));
} else if (_inExpandShape->size() > inDataDims.size()) {
inDataDims.insert(inDataDims.cbegin(),
inExpandShape.size() - inDataDims.size(),
static_cast<DimSize_t>(1));
}
const std::size_t nbDims = inDataDims.size();
// Find the highest equal dimension
// std::size_t contiguousIdx = nbDims - 1;
std::size_t contiguousIdx = nbDims;
while (contiguousIdx-- > 0) {
// for (; contiguousIdx+1 > 0; --contiguousIdx) {
if (inDataDims[contiguousIdx] != inExpandShape[contiguousIdx]) {
break;
}
}
if (contiguousIdx == (nbDims - 1)) {
// last dimensions of one of the input Tensors are of size 1
const std::vector<std::size_t> &dims =
(inDataDims[contiguousIdx] == 1) ? inDataDims : inExpandShape;
while ((contiguousIdx + 1 > 0) && (dims[contiguousIdx] == 1)) {
--contiguousIdx;
}
}
++contiguousIdx;
// Compute the highest number of contiguous data for each Tensor
const std::size_t inputDataContiguousSize =
std::accumulate(inDataDims.cbegin() + contiguousIdx,
inDataDims.cend(),
static_cast<std::size_t>(1),
std::multiplies<std::size_t>());
const std::size_t outputContiguousSize =
std::accumulate(outputDims.cbegin() + contiguousIdx,
outputDims.cend(),
static_cast<std::size_t>(1),
std::multiplies<std::size_t>());
// initialize strides to iterate through data because of broadcasting
std::unique_ptr<std::int32_t[]> stridePostIn =
std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> strideStepIn =
std::make_unique<std::int32_t[]>(contiguousIdx);
if (contiguousIdx > 0) {
stridePostIn[contiguousIdx - 1] = 1;
for (std::size_t i = contiguousIdx - 2;
i != static_cast<std::size_t>(-1);
--i) {
stridePostIn[i] = stridePostIn[i + 1] *
static_cast<std::int32_t>(inDataDims[i + 1]);
}
for (std::size_t i = 0; i != contiguousIdx; ++i) {
strideStepIn[i] = (inDataDims[i] == 1) ? 1 - stridePostIn[i] : 1;
}
}
// variables for arrays offsets
std::size_t offsetInData = 0;
std::size_t offsetOut = 0;
std::size_t dim = contiguousIdx - 1;
const std::size_t nbStacks =
std::accumulate(outputDims.cbegin(),
outputDims.cbegin() + contiguousIdx,
static_cast<std::size_t>(1),
std::multiplies<std::size_t>());
for (std::size_t stack = 0; stack < nbStacks;) {
expandContiguousArray<IO>(
inputDataContiguousSize,
outputContiguousSize,
&static_cast<const IO *>(
inData->getImpl()
->rawPtr())[offsetInData * inputDataContiguousSize],
&output[offsetOut * outputContiguousSize]);
if (++stack < nbStacks) {
std::size_t tmpStack = stack;
while (tmpStack % outputDims[dim] == 0) {
tmpStack /= outputDims[dim];
dim--;
}
offsetInData += strideStepIn[dim];
++offsetOut;
dim = contiguousIdx - 1;
}
}
}
REGISTRAR(ExpandImpl_cpu,
{{DataType::Int16, DataType::Int64}, {DataType::Int16}},
{ProdConso::inPlaceModel,
Aidge::ExpandImpl_cpu_forward_kernel<std::int16_t>,
nullptr});
REGISTRAR(ExpandImpl_cpu,
{{DataType::Int32, DataType::Int64}, {DataType::Int32}},
{ProdConso::inPlaceModel,
Aidge::ExpandImpl_cpu_forward_kernel<std::int32_t>,
nullptr});
REGISTRAR(ExpandImpl_cpu,
{{DataType::Int64, DataType::Int64}, {DataType::Int64}},
{ProdConso::inPlaceModel,
Aidge::ExpandImpl_cpu_forward_kernel<std::int64_t>,
nullptr});
REGISTRAR(ExpandImpl_cpu,
{{DataType::Float16, DataType::Int64}, {DataType::Float16}},
{ProdConso::inPlaceModel,
Aidge::ExpandImpl_cpu_forward_kernel<half_float::half>,
nullptr});
REGISTRAR(ExpandImpl_cpu,
{{DataType::Float32, DataType::Int64}, {DataType::Float32}},
{ProdConso::inPlaceModel,
Aidge::ExpandImpl_cpu_forward_kernel<float>,
nullptr});
REGISTRAR(ExpandImpl_cpu,
{{DataType::Float64, DataType::Int64}, {DataType::Float64}},
{ProdConso::inPlaceModel,
Aidge::ExpandImpl_cpu_forward_kernel<double>,
nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_EXPANDIMPL_KERNELS_H_ */
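To make the index bookkeeping concrete, a small sketch of the shape scan, using the [5,2,1,7] / [2,6,7] example from the comments above (this mirrors only the first scan in the kernel; the trailing-ones special case is omitted):

#include <cstddef>
#include <vector>

// Find the first position, from the right, where the left-padded shapes
// disagree; everything after it forms one contiguous run to copy.
std::size_t contiguousIndex(const std::vector<std::size_t>& dataDims,
                            const std::vector<std::size_t>& expandDims) {
    std::size_t idx = dataDims.size();
    while (idx-- > 0) {
        if (dataDims[idx] != expandDims[idx]) {
            break;
        }
    }
    return idx + 1;
}

// contiguousIndex({5,2,1,7}, {1,2,6,7}) == 3: the kernel copies contiguous
// runs of 7 elements, re-reading the input stack where a size-1 axis is broadcast.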
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_FCIMPL_H_
#define AIDGE_CPU_OPERATOR_FCIMPL_H_
#include <array>
#include <memory>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/FC.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
// Operator implementation entry point for the backend
using FCImpl_cpu = OperatorImpl_cpu<FC_Op,
void(const DimSize_t,
const DimSize_t,
const DimSize_t,
const void *,
const void *,
const void *,
void *),
void(const DimSize_t,
const DimSize_t,
const DimSize_t,
const void *,
const void *,
const void *,
void *,
void *,
void *)>;
// Implementation entry point registration to Operator
REGISTRAR(FC_Op, "cpu", Aidge::FCImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_FCIMPL_H_ */
/******************************************************************************** /********************************************************************************
* Copyright (c) 2023 CEA-List * Copyright (c) 2023 CEA-List
* *
* This program and the accompanying materials are made available under the * This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at * terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0. * http://www.eclipse.org/legal/epl-2.0.
* *
* SPDX-License-Identifier: EPL-2.0 * SPDX-License-Identifier: EPL-2.0
* *
********************************************************************************/ ********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_FCIMPL_FORWARD_KERNEL_H_ #ifndef AIDGE_CPU_OPERATOR_FCIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_FCIMPL_FORWARD_KERNEL_H_ #define AIDGE_CPU_OPERATOR_FCIMPL_KERNELS_H_
#include "aidge/utils/Registrar.hpp" #include <algorithm>
#include <algorithm>
#include "aidge/backend/cpu/operator/FCImpl.hpp"
#include "aidge/operator/FCImpl.hpp" #include "aidge/utils/Registrar.hpp"
namespace Aidge { namespace Aidge {
// template <class I, class W, class B, class O> // template <class I, class W, class B, class O>
// void FCImpl_cpu_forward_kernel(const FC_Op::Parameters& params, const std::array<DimSize_t, 4>& dims, // void FCImpl_cpu_forward_kernel(const FC_Op::Attrs& attrs, const std::array<DimSize_t, 4>& dims,
// const void* input_, const void* weights_, const void* biases_, void* output_) { // const void* input_, const void* weights_, const void* biases_, void* output_) {
// // FIXME: missing FC parameters as arguments // // FIXME: missing FC attributes as arguments
// const I* input = static_cast<const I*>(input_); // const I* input = static_cast<const I*>(input_);
// const W* weights = static_cast<const W*>(weights_); // const W* weights = static_cast<const W*>(weights_);
// const B* biases = static_cast<const B*>(biases_); // const B* biases = static_cast<const B*>(biases_);
// O* output = static_cast<O*>(output_); // O* output = static_cast<O*>(output_);
// for (std::size_t outIdx = 0; outIdx < std::get<0>(params); ++outIdx) { // for (std::size_t outIdx = 0; outIdx < outputFeatureSize; ++outIdx) {
// std::size_t oIndex = outIdx * dims[3]; // std::size_t oIndex = outIdx * dims[3];
// const B bias = std::get<1>(params) ? B(0) : biases[outIdx]; // const B bias = std::get<0>(attrs) ? B(0) : biases[outIdx];
// for (std::size_t batch = 0; batch < dims[3]; ++batch) { // for (std::size_t batch = 0; batch < dims[3]; ++batch) {
// output[oIndex + batch] = bias; // output[oIndex + batch] = bias;
// } // }
// } // }
// for (std::size_t ix = 0; ix < dims[0]; ++ix) { // for (std::size_t ix = 0; ix < dims[0]; ++ix) {
// for (std::size_t iy = 0; iy < dims[1]; ++iy) { // for (std::size_t iy = 0; iy < dims[1]; ++iy) {
// for (std::size_t inCh = 0; inCh < dims[2]; ++inCh) { // for (std::size_t inCh = 0; inCh < dims[2]; ++inCh) {
// const std::size_t iIndex = dims[3] * (inCh + dims[2] * (iy + dims[1] * ix)); // const std::size_t iIndex = dims[3] * (inCh + dims[2] * (iy + dims[1] * ix));
// for (std::size_t outCh = 0; outCh < std::get<0>(params); ++outCh) { // for (std::size_t outCh = 0; outCh < outputFeatureSize; ++outCh) {
// const std::size_t oIndex = dims[3] * outCh; // const std::size_t oIndex = dims[3] * outCh;
// const std::size_t wIndex = (inCh + dims[2] * (iy + dims[1] * ix)) * std::get<0>(params) + // const std::size_t wIndex = (inCh + dims[2] * (iy + dims[1] * ix)) * outputFeatureSize +
// outCh; // (iIndex*std::get<0>(params) + oIndex)/dims[3]; // outCh; // (iIndex*outputFeatureSize + oIndex)/dims[3];
// for (std::size_t batch = 0; batch < dims[3]; ++batch) { // for (std::size_t batch = 0; batch < dims[3]; ++batch) {
// output[oIndex + batch] += weights[wIndex] * input[iIndex + batch]; // output[oIndex + batch] += weights[wIndex] * input[iIndex + batch];
// } // }
// } // }
// } // }
// } // }
// } // }
// } // }
// template <class I, class W, class B, class O> // template <class I, class W, class B, class O>
// void FCImpl_cpu_forward_kernel(const FC_Op::Parameters& params, const std::array<DimSize_t, 2>& dims, // void FCImpl_cpu_forward_kernel(const FC_Op::Attrs& attrs, const std::array<DimSize_t, 2>& dims,
// const void* input_, const void* weights_, const void* biases_, void* output_) { // const void* input_, const void* weights_, const void* biases_, void* output_) {
// // FIXME: missing FC parameters as arguments // // FIXME: missing FC attributes as arguments
// const I* input = static_cast<const I*>(input_); // const I* input = static_cast<const I*>(input_);
// const W* weights = static_cast<const W*>(weights_); // const W* weights = static_cast<const W*>(weights_);
// const B* biases = static_cast<const B*>(biases_); // const B* biases = static_cast<const B*>(biases_);
// O* output = static_cast<O*>(output_); // O* output = static_cast<O*>(output_);
// // let's have I.dims() = [N, C, H, W] instead of [H, W, C, N] // // let's have I.dims() = [N, C, H, W] instead of [H, W, C, N]
// for (std::size_t outIdx = 0; outIdx < std::get<0>(params); ++outIdx) { // for (std::size_t outIdx = 0; outIdx < outputFeatureSize; ++outIdx) {
// std::size_t oIndex = outIdx * dims[0]; // std::size_t oIndex = outIdx * dims[0];
// const B bias = std::get<1>(params) ? B(0) : biases[outIdx]; // const B bias = std::get<0>(attrs) ? B(0) : biases[outIdx];
// for (std::size_t batch = 0; batch < dims[0]; ++batch) { // for (std::size_t batch = 0; batch < dims[0]; ++batch) {
// output[oIndex + batch] = bias;
// }
// }
// for (std::size_t batch = 0; batch < dims[0]; ++batch) {
//     const std::size_t oIndex = dims[1] * batch;
//     for (std::size_t i = 0; i < dims[1]; ++i) {
//         for (std::size_t outCh = 0; outCh < outputFeatureSize; ++outCh) {
//             std::size_t wIndex = i * outputFeatureSize + outCh; // (iIndex*outputFeatureSize + oIndex)/dims[3];
//             output[oIndex + outCh] += weights[wIndex] * input[i + batch];
//         }
//     }
// }

template <class I, class W, class B, class O>
void FCImpl_cpu_forward_kernel(const DimSize_t batchSize,
                               const DimSize_t inputFeatureSize,
                               const DimSize_t outputFeatureSize,
                               const void* input_,
                               const void* weights_,
                               const void* biases_,
                               void* output_) {
    // FIXME: missing FC attributes as arguments
    const I* input = static_cast<const I*>(input_);
    const W* weights = static_cast<const W*>(weights_);
    const B* biases = static_cast<const B*>(biases_);
    O* output = static_cast<O*>(output_);

    if (biases == nullptr) {
        std::fill(output, output+(batchSize*outputFeatureSize), B(0));
    }
    else {
        for (std::size_t batch = 0; batch < batchSize; ++batch) {
            std::copy(biases, biases+outputFeatureSize, output+(batch*outputFeatureSize));
        }
    }

    for (std::size_t batch = 0; batch < batchSize; ++batch) {
        for (std::size_t out = 0; out < outputFeatureSize; ++out) {
            output[out + batch*outputFeatureSize] = std::inner_product(input + batch*inputFeatureSize,
                                                        input + (batch + 1)*inputFeatureSize,
                                                        weights + out*inputFeatureSize,
                                                        output[out + batch*outputFeatureSize]);
        }
    }
}

template <class I, class O, class W, class B>
void FCImpl_cpu_backward_kernel(const DimSize_t batchSize,
                                const DimSize_t inputFeatureSize,
                                const DimSize_t outputFeatureSize,
                                const void* input_,
                                const void* originalInput_,
                                const void* weight_,
                                void* output_,
                                void* weightGrad_,
                                void* biasesGrad_)
{
// FIXME: missing FC attributes as arguments
const I* input = static_cast<const I*>(input_);
const I* originalInput = static_cast<const I*>(originalInput_);
const W* weight = static_cast<const W*>(weight_);
O* output = static_cast<O*>(output_);
W* weightGrad = static_cast<W*>(weightGrad_);
B* biasesGrad = static_cast<B*>(biasesGrad_);
    // bias grad
    if (biasesGrad != nullptr) { // the bias gradient is only computed when a bias is present
        for (std::size_t o = 0; o < outputFeatureSize; ++o) { // nb outputs
            B sum{0};
            for (std::size_t b = 0; b < batchSize; ++b) {
                sum += input[b*outputFeatureSize + o];
            }
            biasesGrad[o] = sum;
        }
    }
// weight grad
for (std::size_t o = 0; o < outputFeatureSize; ++o) {
for (std::size_t c = 0; c < inputFeatureSize; ++c) {
W sum{0};
for (std::size_t b = 0; b < batchSize; ++b) {
sum += originalInput[b*inputFeatureSize + c]*input[b*outputFeatureSize + o];
}
weightGrad[o*inputFeatureSize + c] = sum;
}
}
// input grad
for (std::size_t b = 0; b < batchSize; ++b) {
for (std::size_t c = 0; c < inputFeatureSize; ++c) {
O sum{0};
for (std::size_t o = 0; o < outputFeatureSize; ++o) {
sum += weight[o*inputFeatureSize + c] * input[b*outputFeatureSize + o];
}
output[b*inputFeatureSize + c] = sum;
}
}
}
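    // Reading aid, in matrix terms: with X = [batchSize, inputFeatureSize],
    // W = [outputFeatureSize, inputFeatureSize] and the incoming gradient
    // dY = [batchSize, outputFeatureSize] (passed here as `input`), the loops
    // above compute
    //   dB[o]   = sum_b dY[b,o]
    //   dW[o,c] = sum_b X[b,c] * dY[b,o]   (dW = dY^T X)
    //   dX[b,c] = sum_o W[o,c] * dY[b,o]   (dX = dY W), written to `output`.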
// Kernels registration to implementation entry point
REGISTRAR(FCImpl_cpu,
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float32}},
{ProdConso::defaultModel, Aidge::FCImpl_cpu_forward_kernel<float, float, float, float>, Aidge::FCImpl_cpu_backward_kernel<float, float, float, float>});
REGISTRAR(FCImpl_cpu,
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float64}},
{ProdConso::defaultModel, Aidge::FCImpl_cpu_forward_kernel<double, double, double, double>, Aidge::FCImpl_cpu_backward_kernel<double, double, double, double>});
REGISTRAR(FCImpl_cpu,
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int32}},
{ProdConso::defaultModel, Aidge::FCImpl_cpu_forward_kernel<int32_t, int32_t, int32_t, int32_t>, Aidge::FCImpl_cpu_backward_kernel<int32_t, int32_t, int32_t, int32_t>});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_FCIMPL_KERNELS_H_ */
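To make the calling convention above concrete, here is a minimal standalone sketch of driving the forward kernel directly, outside the Aidge graph; the include path and the direct kernel call are illustrative assumptions, and the expected output follows from y = x·Wᵀ + b.
#include <iostream>
#include <vector>

#include "aidge/backend/cpu/operator/FCImpl_kernels.hpp"  // assumed install path

int main() {
    const Aidge::DimSize_t batchSize = 1, inF = 2, outF = 3;
    const std::vector<float> x = {1.f, 2.f};              // input   [batchSize, inF]
    const std::vector<float> w = {1.f, 0.f,               // weights [outF, inF], row-major
                                  0.f, 1.f,
                                  1.f, 1.f};
    const std::vector<float> b = {0.5f, 0.5f, 0.5f};      // biases  [outF]
    std::vector<float> y(batchSize * outF);
    Aidge::FCImpl_cpu_forward_kernel<float, float, float, float>(
        batchSize, inF, outF, x.data(), w.data(), b.data(), y.data());
    for (float v : y) std::cout << v << ' ';              // prints: 1.5 2.5 3.5
    return 0;
}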
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_FOLDIMPL_H_
#define AIDGE_CPU_OPERATOR_FOLDIMPL_H_
#include <array>
#include <memory>
#include <tuple>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/Fold.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
namespace Aidge {
// Operator implementation entry point for the backend
using Fold2D_Op = Fold_Op<2>;
using FoldImpl2D_cpu = OperatorImpl_cpu<Fold_Op<2>,
void(const std::array<DimSize_t, 2>&,
const std::array<DimSize_t, 2>&,
const std::array<DimSize_t, 2>&,
const std::array<DimSize_t, 2>&,
const std::vector<DimSize_t> &,
const void *,
void *)>;
// Implementation entry point registration to Operator
REGISTRAR(Fold2D_Op, "cpu", Aidge::FoldImpl2D_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_FOLDIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_FOLDIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_FOLDIMPL_KERNELS_H_
#include "aidge/utils/Registrar.hpp"
#include "aidge/backend/cpu/operator/FoldImpl.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include <cmath>
#include <array>
#include <algorithm>
namespace Aidge {
template <class I, class O>
void FoldImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& outputDims,
const std::array<DimSize_t, 2>& strideDims,
const std::array<DimSize_t, 2>& dilationDims,
const std::array<DimSize_t, 2>& kernelDims,
const std::vector<DimSize_t> &dims,
const void *input_, void *output_)
{
const I *input = static_cast<const I *>(input_);
O *output = static_cast<O *>(output_);
const DimSize_t inHeight = outputDims[0];
const DimSize_t inWidth = outputDims[1];
const DimSize_t kernelExtentHeight = dilationDims[0] *
(kernelDims[0] - 1) + 1;
const DimSize_t outHeight = 1 + static_cast<DimSize_t>(
floor(static_cast<float>(inHeight - kernelExtentHeight) /
static_cast<float>(strideDims[0])));
const DimSize_t kernelExtentWidth = dilationDims[1] *
(kernelDims[1] - 1) + 1;
const DimSize_t outWidth = 1 + static_cast<DimSize_t>(
floor(static_cast<float>(inWidth - kernelExtentWidth) /
static_cast<float>(strideDims[1])));
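    // Reading aid: outputDims is the folded spatial size of the *output*
    // tensor, while outHeight/outWidth count the sliding-block positions,
    // i.e. the column length of the *input*. Example: inHeight = 5,
    // kernelDims[0] = 3, dilationDims[0] = 2, strideDims[0] = 1 gives
    // kernelExtentHeight = 2*(3-1)+1 = 5 and outHeight = 1 + floor((5-5)/1) = 1.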
const DimSize_t outChannels = dims[dims.size() - 2];
const DimSize_t inChannels = outChannels / kernelDims[0] / kernelDims[1];
std::fill_n(output, dims[0] * outHeight * outWidth * outChannels, O(0));
for (DimSize_t n = 0; n < dims[0]; ++n) {
for (DimSize_t outC = 0; outC < outChannels; ++outC) {
const auto inOffsetW = outC % kernelDims[1];
const auto inOffsetH = (outC / kernelDims[1]) % kernelDims[0];
const auto inC = outC / kernelDims[0] / kernelDims[1];
for (DimSize_t outH = 0; outH < outHeight; ++outH) {
const auto inH = outH * strideDims[0] + inOffsetH * dilationDims[0];
for (DimSize_t outW = 0; outW < outWidth; ++outW) {
const auto inW = outW * strideDims[1] + inOffsetW * dilationDims[1];
output[((n * inChannels + inC) * inHeight + inH) * inWidth + inW] +=
input[((n * outChannels + outC) * outHeight + outH) * outWidth + outW];
}
}
}
}
}
// Kernels registration to implementation entry point
REGISTRAR(FoldImpl2D_cpu,
{DataType::Float32},
{ProdConso::defaultModel, Aidge::FoldImpl2D_cpu_forward_kernel<float, float>, nullptr});
REGISTRAR(FoldImpl2D_cpu,
{DataType::Float64},
{ProdConso::defaultModel, Aidge::FoldImpl2D_cpu_forward_kernel<double, double>, nullptr});
REGISTRAR(FoldImpl2D_cpu,
{DataType::Int32},
{ProdConso::defaultModel, Aidge::FoldImpl2D_cpu_forward_kernel<int32_t, int32_t>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_FOLDIMPL_KERNELS_H_ */
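As a sanity check of the indexing above, a minimal sketch (include path assumed): with a 1x1 kernel and unit stride/dilation, Fold degenerates to a plain reshape of the [N, C, L] columns back to [N, C, H, W].
#include <array>
#include <iostream>
#include <vector>

#include "aidge/backend/cpu/operator/FoldImpl_kernels.hpp"  // assumed install path

int main() {
    const std::array<Aidge::DimSize_t, 2> outDims{2, 2}, stride{1, 1}, dilation{1, 1}, kernel{1, 1};
    const std::vector<Aidge::DimSize_t> inDims = {1, 1, 4};  // N, C*kH*kW, L = outH*outW
    const std::vector<float> in = {1.f, 2.f, 3.f, 4.f};
    std::vector<float> out(4);
    Aidge::FoldImpl2D_cpu_forward_kernel<float, float>(
        outDims, stride, dilation, kernel, inDims, in.data(), out.data());
    for (float v : out) std::cout << v << ' ';               // prints: 1 2 3 4
    return 0;
}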
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_GLOBALAVERAGEPOOLINGIMPL_H_
#define AIDGE_CPU_OPERATOR_GLOBALAVERAGEPOOLINGIMPL_H_
#include <memory>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/GlobalAveragePooling.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
// Operator implementation entry point for the backend
using GlobalAveragePoolingImpl_cpu = OperatorImpl_cpu<GlobalAveragePooling_Op,
void(const std::vector<DimSize_t> &, const void *, void *)>;
// Implementation entry point registration to Operator
REGISTRAR(GlobalAveragePooling_Op, "cpu", Aidge::GlobalAveragePoolingImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_GLOBALAVERAGEPOOLINGIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_GLOBALAVERAGEPOOLINGIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_GLOBALAVERAGEPOOLINGIMPL_KERNELS_H_
#include <cstddef>
#include <functional> // std::multiplies
#include <numeric> // std::accumulate
#include <vector>
#include "aidge/backend/cpu/operator/GlobalAveragePoolingImpl.hpp"
#include "aidge/data/Data.hpp"
#include "aidge/utils/ErrorHandling.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
template <typename T>
typename std::enable_if<std::is_floating_point<T>::value, T>::type
stableMean(const T* vec, size_t size) {
T mean = 0;
for (size_t i = 0; i < size; ++i) {
mean = std::fma<T>(vec[i] - mean, 1.0f / (i + 1), mean);
}
return mean;
}
// Specialization for integer types: perform the mean computation in double
template <typename T>
typename std::enable_if<!std::is_floating_point<T>::value, T>::type
stableMean(const T* vec, size_t size) {
  double mean = 0;
  for (size_t i = 0; i < size; ++i) {
    mean = std::fma<double>(vec[i] - mean, 1.0 / (i + 1), mean);
  }
  return mean;
}
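// Reading aid: both overloads implement the running (Welford-style) mean,
//   mean_{i+1} = mean_i + (vec[i] - mean_i) / (i + 1),
// with std::fma so the multiply-add rounds only once. For {1, 2, 3, 4} the
// running value goes 1 -> 1.5 -> 2 -> 2.5, matching sum/size without ever
// forming a large intermediate sum that could overflow or lose precision.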
template <typename T>
typename std::enable_if<std::is_floating_point<T>::value, T>::type
castFromFloat(T value) {
return value;
}
template <typename T>
typename std::enable_if<!std::is_floating_point<T>::value, T>::type
castFromFloat(double value) {
return static_cast<T>(std::nearbyint(value));
}
template <class I, class O>
void GlobalAveragePoolingImpl_cpu_forward_kernel(
const std::vector<DimSize_t> &dims, const void *input_, void *output_) {
// error checking
  AIDGE_ASSERT(dims.size() >= 3, "GlobalAveragePooling needs an input with at least 3 dimensions, got {}",
               dims.size());
// computation
const I *input = static_cast<const I *>(input_);
O *output = static_cast<O *>(output_);
DimSize_t nb_elems = std::accumulate(dims.begin(), dims.end(), std::size_t(1),
std::multiplies<std::size_t>());
const DimSize_t in_batch_nb_elems{nb_elems / dims[0]};
const DimSize_t in_channel_nb_elems{in_batch_nb_elems / dims[1]};
const DimSize_t out_batch_nb_elems{dims[1]};
// parse channel by channel and fill each output with the average of the
// values in the channel
for (DimSize_t batch = 0; batch < dims[0]; ++batch) {
for (DimSize_t channel = 0; channel < dims[1]; ++channel) {
const I *filter_start = std::next(
input, (batch * in_batch_nb_elems) + (channel * in_channel_nb_elems));
output[batch * out_batch_nb_elems + channel] = castFromFloat<O>(stableMean<I>(filter_start, in_channel_nb_elems));
}
}
}
// Kernels registration to implementation entry point
REGISTRAR(GlobalAveragePoolingImpl_cpu,
{DataType::Float32},
{ProdConso::defaultModel, Aidge::GlobalAveragePoolingImpl_cpu_forward_kernel<float, float>, nullptr});
REGISTRAR(GlobalAveragePoolingImpl_cpu,
{DataType::Float64},
{ProdConso::defaultModel, Aidge::GlobalAveragePoolingImpl_cpu_forward_kernel<double, double>, nullptr});
REGISTRAR(GlobalAveragePoolingImpl_cpu,
{DataType::Int32},
{ProdConso::defaultModel, Aidge::GlobalAveragePoolingImpl_cpu_forward_kernel<int32_t, int32_t>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_GLOBALAVERAGEPOOLINGIMPL_KERNELS_H_ */
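A minimal sketch of calling this kernel standalone (the include path is an assumption): each channel of a [N=1, C=2, L=2] input is reduced to its mean.
#include <iostream>
#include <vector>

#include "aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp"  // assumed install path

int main() {
    const std::vector<Aidge::DimSize_t> dims = {1, 2, 2};  // N, C, spatial
    const std::vector<float> in = {1.f, 3.f, 10.f, 20.f};
    std::vector<float> out(dims[0] * dims[1]);
    Aidge::GlobalAveragePoolingImpl_cpu_forward_kernel<float, float>(dims, in.data(), out.data());
    std::cout << out[0] << ' ' << out[1];                  // prints: 2 15
    return 0;
}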
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_GRIDSAMPLEIMPL_H_
#define AIDGE_CPU_OPERATOR_GRIDSAMPLEIMPL_H_
#include <array>
#include <memory>
#include <tuple>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/GridSample.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
namespace Aidge {
// Operator implementation entry point for the backend
using GridSampleImpl_cpu = OperatorImpl_cpu<GridSample_Op,
void(const GridSample_Op&,
const std::shared_ptr<Tensor>&,
const std::shared_ptr<Tensor>&,
const std::shared_ptr<Tensor>&)>;
// Implementation entry point registration to Operator
REGISTRAR(GridSample_Op, "cpu", Aidge::GridSampleImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_GRIDSAMPLEIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_GRIDSAMPLEIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_GRIDSAMPLEIMPL_KERNELS_H_
#include <algorithm> // std::max, std::min
#include <cmath>     // std::fabs, std::truncf, std::nearbyint
#include <cstddef> // std::size_t
#include <cstdint> // std::int64_t
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include "aidge/backend/cpu/operator/GridSampleImpl.hpp"
#include "aidge/data/half.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
static bool in_bound(float coord, float lower_bound, float upper_bound) noexcept {
return (coord > lower_bound) && (coord < upper_bound);
}
static float unnormalized_coord(float coord, float new_lower_bound, float new_upper_bound) noexcept {
return (coord + 1) / 2 * (new_upper_bound - new_lower_bound) + new_lower_bound;
}
// unused
// static float normalized_coord(float coord, float prev_lower_bound, float prev_upper_bound) noexcept {
// return (coord + prev_lower_bound) / (prev_upper_bound-prev_lower_bound) * 2 - 1;
// }
static float unnormalize_grid_sample_coord(float coord, std::size_t size, bool align_corners) noexcept {
return align_corners ? unnormalized_coord(coord, 0.0f, static_cast<float>(size) - 1.0f)
: unnormalized_coord(coord, -0.5f, static_cast<float>(size) - 0.5f);
}
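// Example: for size = 5, coord = 0.0f maps to 2.0f under both conventions;
// coord = 1.0f maps to 4.0f with align_corners (range [0, 4], corners on the
// first/last pixel centers) and to 4.5f without (range [-0.5, 4.5]).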
// unused
// static float normalize_grid_sample_coord(float coord, std::size_t size, bool align_corners) noexcept {
// return align_corners ? normalized_coord(coord, 0.0f, static_cast<float>(size) - 1.0f)
// : normalized_coord(coord, -0.5f, static_cast<float>(size) - 0.5f);
// }
static float update_normalized_coord_with_padding(float coord, Aidge::GridSample_Op::PaddingMode padding_mode) {
if (!in_bound(coord, -1.0f, 1.0f)) {
if (padding_mode == Aidge::GridSample_Op::PaddingMode::Border) {
coord = std::min(std::max(-1.0f, coord), 1.0f);
}
else if (padding_mode == Aidge::GridSample_Op::PaddingMode::Reflection) {
float abs_coord = std::fabs(coord);
float int_coord = std::truncf(abs_coord);
std::int32_t nb_refl = static_cast<std::int32_t>((int_coord - 1) / 2);
float res = ((nb_refl + 1)*2) - abs_coord;
coord = (coord > 0) ? (nb_refl % 2 == 0 ? res : -res) \
: (nb_refl % 2 == 0 ? -res : res);
}
}
return coord;
}
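// Example: a normalized coord of 1.3 is clamped to 1.0 with Border padding,
// and folded back to 0.7 with Reflection (reflected about the +1 edge).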
static std::int64_t update_unnormalized_coord_with_padding(std::int64_t coord, std::int64_t size, Aidge::GridSample_Op::PaddingMode padding_mode) {
if (!in_bound(coord, 0, size)) {
// out of bound. switch padding mode
if (padding_mode == Aidge::GridSample_Op::PaddingMode::Border) {
coord = std::min(std::max(std::int64_t(0), coord), size-std::int64_t(1));
} else if (padding_mode == Aidge::GridSample_Op::PaddingMode::Reflection) {
            const std::int64_t quotient = coord / (size-1);
            const std::int64_t remainder = std::abs(coord - quotient*(size-1));
            coord = (quotient % 2 == 0) ? remainder : size - 1 - remainder;
}
}
return coord;
}
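// Example: for size = 5 (valid indices 0..4), coord = 6 becomes 4 with
// Border padding and 2 with Reflection (quotient = 1, remainder = 2).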
namespace Aidge {
/**
 * @brief Forward kernel for 1D GridSample on CPU backend.
 * @tparam I Input data type.
 * @tparam O Output data type.
 * @param op Reference to the GridSample_Op, used to read its attributes.
 * @param in0 const input Tensor.
 * @param in1 const grid Tensor.
 * @param out Output Tensor.
 */
template <class I, class O>
void GridSampleImpl1D_cpu_forward_kernel(const GridSample_Op& op,
const std::shared_ptr<Tensor>& in0,
const std::shared_ptr<Tensor>& in1,
const std::shared_ptr<Tensor>& out)
{
const I* const input = static_cast<const I *>(in0->getImpl()->rawPtr());
const I* input_ptr = input;
float* const grid = static_cast<float*>(in1->getImpl()->rawPtr());
float* grid_ptr = grid;
O* const output = static_cast<O*>(out->getImpl()->rawPtr());
O* output_ptr = output;
const std::size_t N = in0->dim(0);
const std::size_t C = in0->dim(1);
const std::size_t in_H = in0->dim(2);
const std::size_t grid_H = in1->dim(1);
const std::size_t in_N_s = in0->stride(0);
const std::size_t in_C_s = in0->stride(1);
const std::size_t in_H_s = in0->stride(2);
const std::size_t grid_N_s = in1->stride(0);
const std::size_t grid_H_s = in1->stride(1);
const std::size_t out_N_s = out->stride(0);
const std::size_t out_C_s = out->stride(1);
const std::size_t out_H_s = out->stride(2);
float* grid_ptr_N = grid;
const I* input_ptr_N = input;
O* output_ptr_N = output;
for (std::size_t n = 0; n < N; ++n) {
grid_ptr = grid_ptr_N;
for (std::size_t grid_x = 0; grid_x < grid_H; ++grid_x) {
output_ptr = output_ptr_N + grid_x*out_H_s;
            /*
             * 1. remap the grid coordinate according to padding_mode
             * 2. unnormalize it from [-1, 1] to [0, H-1] (align_corners) or [-0.5, H-0.5]
             * 3. interpolate according to the mode (nearest / linear / cubic);
             *    any sample falling outside the input bounds contributes 0
             */
float x = *grid_ptr;
x = update_normalized_coord_with_padding(x, op.paddingMode());
x = unnormalize_grid_sample_coord(x, in_H, op.alignCorners());
if (op.mode() == GridSample_Op::Mode::Nearest) {
const std::int64_t x_rounded = std::nearbyintf(x);
if (in_bound(x_rounded, 0, in_H)) {
input_ptr = input_ptr_N + x_rounded*in_H_s;
for (std::size_t c = 0; c < C; ++c) {
*output_ptr = *input_ptr;
input_ptr += in_C_s;
output_ptr += out_C_s;
}
} else {
for (std::size_t c = 0; c < C; ++c) {
*output_ptr = O(0);
output_ptr += out_C_s;
}
}
} else if (op.mode() == GridSample_Op::Mode::Linear) {
const std::int64_t x_inf = update_unnormalized_coord_with_padding(static_cast<std::int64_t>(std::floor(x)), in_H, op.paddingMode());
const std::int64_t x_sup = update_unnormalized_coord_with_padding(x_inf + 1, in_H, op.paddingMode());
const I* input_ptr_NC = input_ptr_N;
for (std::size_t c = 0; c < C; ++c) {
const I f_inf = in_bound(x_inf, 0, in_H) ?
input_ptr_NC[static_cast<std::size_t>(x_inf)*in_H_s] : I(0);
const I f_sup = in_bound(x_sup, 0, in_H) ?
input_ptr_NC[static_cast<std::size_t>(x_sup)*in_H_s] : I(0);
                    // linear interpolation: each neighbor is weighted by the distance to the opposite node
                    *output_ptr = static_cast<O>(static_cast<I>(x_sup - x)*f_inf \
                        + static_cast<I>(x - x_inf)*f_sup);
input_ptr_NC += in_C_s;
output_ptr += out_C_s;
}
} else if (op.mode() == GridSample_Op::Mode::Cubic) {
const std::int64_t x_inf = update_unnormalized_coord_with_padding(static_cast<std::int64_t>(std::floor(x)), in_H, op.paddingMode());
const std::int64_t x_sup = update_unnormalized_coord_with_padding(x_inf + 1, in_H, op.paddingMode());
const std::int64_t x_inf_inf = update_unnormalized_coord_with_padding(x_inf - 1, in_H, op.paddingMode());
const std::int64_t x_sup_sup = update_unnormalized_coord_with_padding(x_sup + 1, in_H, op.paddingMode());
const I x1 = static_cast<I>(x - static_cast<float>(x_inf));
const I x2 = x1 * x1;
const I x3 = x1 * x2;
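                // Cubic Hermite interpolation on [x_inf, x_sup]: m_inf/m_sup below are
                // central-difference slope estimates at the two inner nodes, making the
                // curve a Catmull-Rom spline through the four neighboring samples.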
const I* input_ptr_NC = input_ptr_N;
for (std::size_t c = 0; c < C; ++c) {
const I f_inf_inf = in_bound(x_inf_inf, 0, in_H) ? input_ptr_NC[x_inf_inf*in_H_s] : I(0);
const I f_inf = in_bound(x_inf, 0, in_H) ? input_ptr_NC[x_inf*in_H_s] : I(0);
const I f_sup = in_bound(x_sup, 0, in_H) ? input_ptr_NC[x_sup*in_H_s] : I(0);
const I f_sup_sup = in_bound(x_sup_sup, 0, in_H) ? input_ptr_NC[x_sup_sup*in_H_s] : I(0);
const I m_inf = (f_sup - f_inf_inf) / I(2);
const I m_sup = (f_sup_sup - f_inf) / I(2);
*output_ptr = f_inf \
+ x1 * m_inf \
+ x2 * (3 * (f_sup - f_inf) - 2 * m_inf - m_sup) \
+ x3 * (2*(f_inf - f_sup) + m_inf + m_sup);
input_ptr_NC += in_C_s;
output_ptr += out_C_s;
}
}
grid_ptr += grid_H_s;
}
input_ptr_N += in_N_s;
grid_ptr_N += grid_N_s;
output_ptr_N += out_N_s;
}
}
// Kernels registration to implementation entry point
// only accepts a 1st input with a single spatial dimension (1D case)
REGISTRAR(GridSampleImpl_cpu,
{{{DataType::Any, DataFormat::Any, {{-1, -1}}}, {DataType::Any}}, {{DataType::Float16}}},
{ProdConso::defaultModel, Aidge::GridSampleImpl1D_cpu_forward_kernel<half_float::half, half_float::half>, nullptr});
REGISTRAR(GridSampleImpl_cpu,
{{{DataType::Any, DataFormat::Any, {{-1, -1}}}, {DataType::Any}}, {{DataType::Float32}}},
{ProdConso::defaultModel, Aidge::GridSampleImpl1D_cpu_forward_kernel<float, float>, nullptr});
REGISTRAR(GridSampleImpl_cpu,
{{{DataType::Any, DataFormat::Any, {{-1, -1}}}, {DataType::Any}}, {{DataType::Float64}}},
{ProdConso::defaultModel, Aidge::GridSampleImpl1D_cpu_forward_kernel<double, double>, nullptr});
REGISTRAR(GridSampleImpl_cpu,
{{{DataType::Any, DataFormat::Any, {{-1, -1}}}, {DataType::Any}}, {{DataType::Int32}}},
{ProdConso::defaultModel, Aidge::GridSampleImpl1D_cpu_forward_kernel<int32_t, int32_t>, nullptr});
/**
 * @brief Forward kernel for 2D GridSample on CPU backend.
 * @tparam I Input data type.
 * @tparam O Output data type.
 * @param op Reference to the GridSample_Op, used to read its attributes.
 * @param in0 const input Tensor.
 * @param in1 const grid Tensor.
 * @param out Output Tensor.
 */
template <class I, class O>
void GridSampleImpl2D_cpu_forward_kernel(const GridSample_Op& op,
const std::shared_ptr<Tensor>& in0,
const std::shared_ptr<Tensor>& in1,
const std::shared_ptr<Tensor>& out)
{
    const I* const input = static_cast<const I *>(in0->getImpl()->rawPtr());
    const I* input_ptr = input;
    float* const grid = static_cast<float*>(in1->getImpl()->rawPtr());
    float* grid_ptr = grid;
O* const output = static_cast<O*>(out->getImpl()->rawPtr());
const std::size_t N = in0->dim(0);
const std::size_t C = in0->dim(1);
const std::size_t in_H = in0->dim(2);
const std::size_t in_W = in0->dim(3);
const std::size_t grid_H = in1->dim(1);
const std::size_t grid_W = in1->dim(2);
const std::size_t in_N_s = in0->stride(0);
const std::size_t in_C_s = in0->stride(1);
const std::size_t in_H_s = in0->stride(2);
const std::size_t in_W_s = in0->stride(3);
const std::size_t grid_N_s = in1->stride(0);
const std::size_t grid_H_s = in1->stride(1);
const std::size_t grid_W_s = in1->stride(2);
const std::size_t grid_Coord_s = in1->stride(3);
const std::size_t out_N_s = out->stride(0);
const std::size_t out_C_s = out->stride(1);
const std::size_t out_H_s = out->stride(2);
const std::size_t out_W_s = out->stride(3);
float* grid_ptr_N = grid;
const I* input_ptr_N = input;
O* output_ptr_N = output;
for (std::size_t n = 0; n < N; ++n) {
for (std::size_t grid_y = 0; grid_y < grid_H; ++grid_y) {
for (std::size_t grid_x = 0; grid_x < grid_W; ++grid_x) {
                O* output_ptr = output_ptr_N + grid_y*out_H_s + grid_x*out_W_s;
grid_ptr = grid_ptr_N + grid_y*grid_H_s + grid_x*grid_W_s;
                /*
                 * 1. remap the (x, y) grid coordinates according to padding_mode
                 * 2. unnormalize them from [-1, 1] to [0, W-1] / [0, H-1] (align_corners)
                 *    or [-0.5, W-0.5] / [-0.5, H-0.5]
                 * 3. interpolate according to the mode (nearest / linear / cubic);
                 *    any sample falling outside the input bounds contributes 0
                 */
float x = *grid_ptr;
float y = grid_ptr[grid_Coord_s];
x = update_normalized_coord_with_padding(x, op.paddingMode());
x = unnormalize_grid_sample_coord(x, in_W, op.alignCorners());
y = update_normalized_coord_with_padding(y, op.paddingMode());
y = unnormalize_grid_sample_coord(y, in_H, op.alignCorners());
if (op.mode() == GridSample_Op::Mode::Nearest) {
const std::int64_t x_rounded = std::nearbyintf(x);
const std::int64_t y_rounded = std::nearbyintf(y);
if (in_bound(x_rounded, 0, in_W) && in_bound(y_rounded, 0, in_H)) {
input_ptr = input_ptr_N + y_rounded*in_H_s + x_rounded*in_W_s;
for (std::size_t c = 0; c < C; ++c) {
*output_ptr = *input_ptr;
input_ptr += in_C_s;
output_ptr += out_C_s;
}
} else {
for (std::size_t c = 0; c < C; ++c) {
*output_ptr = O(0);
output_ptr += out_C_s;
}
}
} else if (op.mode() == GridSample_Op::Mode::Linear) {
                    const std::int64_t x_r = update_unnormalized_coord_with_padding(static_cast<std::int64_t>(std::floor(x)), in_W, op.paddingMode()); // floor of x (left column, despite the name)
                    const std::int64_t x_l = update_unnormalized_coord_with_padding(x_r + 1, in_W, op.paddingMode()); // floor + 1 (right column)
                    const std::int64_t y_t = update_unnormalized_coord_with_padding(static_cast<std::int64_t>(std::floor(y)), in_H, op.paddingMode()); // top
                    const std::int64_t y_b = update_unnormalized_coord_with_padding(y_t + 1, in_H, op.paddingMode()); // bottom
const I* input_ptr_NC = input_ptr_N;
for (std::size_t c = 0; c < C; ++c) {
const I f_tr = (in_bound(x_r, 0, in_W) && in_bound(y_t, 0, in_H)) ?
input_ptr_NC[static_cast<std::size_t>(y_t)*in_H_s
+ static_cast<std::size_t>(x_r)*in_W_s]
: I(0);
const I f_tl = (in_bound(x_l, 0, in_W) && in_bound(y_t, 0, in_H)) ?
input_ptr_NC[static_cast<std::size_t>(y_t)*in_H_s
+ static_cast<std::size_t>(x_l)*in_W_s]
: I(0);
const I f_br = (in_bound(x_r, 0, in_W) && in_bound(y_b, 0, in_H)) ?
input_ptr_NC[static_cast<std::size_t>(y_b)*in_H_s
+ static_cast<std::size_t>(x_r)*in_W_s]
: I(0);
const I f_bl = (in_bound(x_l, 0, in_W) && in_bound(y_b, 0, in_H)) ?
input_ptr_NC[static_cast<std::size_t>(y_b)*in_H_s
+ static_cast<std::size_t>(x_l)*in_W_s]
: I(0);
                        // compute the weighted sum of the 4 corners: each corner is weighted by
                        // the area of the cell opposite to it, so the weights sum to 1 (bilinear)
                        const I w_tr = static_cast<I>((static_cast<float>(y_b) - y)*(static_cast<float>(x_l) - x));
                        const I w_tl = static_cast<I>((static_cast<float>(y_b) - y)*(x - static_cast<float>(x_r)));
                        const I w_br = static_cast<I>((y - static_cast<float>(y_t))*(static_cast<float>(x_l) - x));
                        const I w_bl = static_cast<I>((y - static_cast<float>(y_t))*(x - static_cast<float>(x_r)));
*output_ptr = static_cast<O>(w_tr*f_tr + w_tl*f_tl + w_br*f_br + w_bl*f_bl);
input_ptr_NC += in_C_s;
output_ptr += out_C_s;
}
} else if (op.mode() == GridSample_Op::Mode::Cubic) {
/*
* .. .. .. .. .. ..
* .. 00 01 02 03 ..
* .. 10 11 12 13 ..
* .. 20 21 22 23 ..
* .. 30 31 32 33 ..
* .. .. .. .. .. ..
*/
const std::int64_t x_1 = update_unnormalized_coord_with_padding(static_cast<std::int64_t>(std::floor(x)), in_W, op.paddingMode());
const std::int64_t x_0 = update_unnormalized_coord_with_padding(x_1 - 1, in_W, op.paddingMode());
const std::int64_t x_2 = update_unnormalized_coord_with_padding(x_1 + 1, in_W, op.paddingMode());
const std::int64_t x_3 = update_unnormalized_coord_with_padding(x_1 + 2, in_W, op.paddingMode());
const std::int64_t y_1 = update_unnormalized_coord_with_padding(static_cast<std::int64_t>(std::floor(y)), in_H, op.paddingMode());
const std::int64_t y_0 = update_unnormalized_coord_with_padding(y_1 - 1, in_H, op.paddingMode());
const std::int64_t y_2 = update_unnormalized_coord_with_padding(y_1 + 1, in_H, op.paddingMode());
const std::int64_t y_3 = update_unnormalized_coord_with_padding(y_1 + 2, in_H, op.paddingMode());
const I* input_ptr_NC = input_ptr_N;
for (std::size_t c = 0; c < C; ++c) {
const I f_00 = in_bound(x_0, 0, in_W) && in_bound(y_0, 0, in_H) ?
input_ptr_NC[x_0*in_W_s + y_0*in_H_s] : I(0);
const I f_01 = in_bound(x_0, 0, in_W) && in_bound(y_1, 0, in_H) ?
input_ptr_NC[x_0*in_W_s + y_1*in_H_s] : I(0);
const I f_02 = in_bound(x_0, 0, in_W) && in_bound(y_2, 0, in_H) ?
input_ptr_NC[x_0*in_W_s + y_2*in_H_s] : I(0);
const I f_03 = in_bound(x_0, 0, in_W) && in_bound(y_3, 0, in_H) ?
input_ptr_NC[x_0*in_W_s + y_3*in_H_s] : I(0);
const I f_10 = in_bound(x_1, 0, in_W) && in_bound(y_0, 0, in_H) ?
input_ptr_NC[x_1*in_W_s + y_0*in_H_s] : I(0);
const I f_20 = in_bound(x_2, 0, in_W) && in_bound(y_0, 0, in_H) ?
input_ptr_NC[x_2*in_W_s + y_0*in_H_s] : I(0);
const I f_30 = in_bound(x_3, 0, in_W) && in_bound(y_0, 0, in_H) ?
input_ptr_NC[x_3*in_W_s + y_0*in_H_s] : I(0);
const I f_11 = in_bound(x_1, 0, in_W) && in_bound(y_1, 0, in_H) ?
input_ptr_NC[x_1*in_W_s + y_1*in_H_s] : I(0);
const I f_12 = in_bound(x_1, 0, in_W) && in_bound(y_2, 0, in_H) ?
input_ptr_NC[x_1*in_W_s + y_2*in_H_s] : I(0);
const I f_13 = in_bound(x_1, 0, in_W) && in_bound(y_3, 0, in_H) ?
input_ptr_NC[x_1*in_W_s + y_3*in_H_s] : I(0);
const I f_21 = in_bound(x_2, 0, in_W) && in_bound(y_1, 0, in_H) ?
input_ptr_NC[x_2*in_W_s + y_1*in_H_s] : I(0);
const I f_22 = in_bound(x_2, 0, in_W) && in_bound(y_2, 0, in_H) ?
input_ptr_NC[x_2*in_W_s + y_2*in_H_s] : I(0);
const I f_23 = in_bound(x_2, 0, in_W) && in_bound(y_3, 0, in_H) ?
input_ptr_NC[x_2*in_W_s + y_3*in_H_s] : I(0);
const I f_31 = in_bound(x_3, 0, in_W) && in_bound(y_1, 0, in_H) ?
input_ptr_NC[x_3*in_W_s + y_1*in_H_s] : I(0);
const I f_32 = in_bound(x_3, 0, in_W) && in_bound(y_2, 0, in_H) ?
input_ptr_NC[x_3*in_W_s + y_2*in_H_s] : I(0);
const I f_33 = in_bound(x_3, 0, in_W) && in_bound(y_3, 0, in_H) ?
input_ptr_NC[x_3*in_W_s + y_3*in_H_s] : I(0);
const I mx_11 = (f_21 - f_01) / I(2);
const I mx_12 = (f_22 - f_02) / I(2);
const I mx_21 = (f_31 - f_11) / I(2);
const I mx_22 = (f_32 - f_12) / I(2);
const I my_11 = (f_12 - f_10) / I(2);
const I my_12 = (f_13 - f_11) / I(2);
const I my_21 = (f_22 - f_20) / I(2);
const I my_22 = (f_23 - f_21) / I(2);
                        const I mxy_11 = (f_22 - f_20 - f_02 + f_00) / I(4);
                        const I mxy_12 = (f_23 - f_21 - f_03 + f_01) / I(4);
                        const I mxy_21 = (f_32 - f_30 - f_12 + f_10) / I(4);
                        const I mxy_22 = (f_33 - f_31 - f_13 + f_11) / I(4);
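                        // mx_*, my_* and mxy_* are central-difference estimates of df/dx,
                        // df/dy and d2f/dxdy at the four inner nodes; a_00..a_33 below are
                        // the coefficients of the resulting bicubic surface patch.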
const I a_00 = f_11;
const I a_10 = mx_11;
const I a_20 = I(3)*(f_21 - f_11) - I(2)*mx_11 - mx_21;
const I a_30 = I(2)*(f_11 - f_21) + mx_11 + mx_21;
const I a_01 = my_11;
const I a_11 = mxy_11;
const I a_21 = I(3)*(my_21 - my_11) - I(2)*mxy_11 - mxy_21;
const I a_31 = I(2)*(my_11 - my_21) + mxy_11 + mxy_21;
const I a_02 = I(3)*(f_12 - f_11) - I(2)*my_11 - my_12;
const I a_12 = I(3)*(mx_12 - mx_11) - I(2)*mxy_11 - mxy_12;
const I a_22 = I(9)*(f_11 + f_22 - f_21 - f_12) + I(3)*(I(2)*(mx_11 - mx_12 + my_11 - my_21) + mx_21 - mx_22 + my_12 - my_22) + mxy_22 + I(2)*(mxy_12 + mxy_21 + I(2)*mxy_11);
const I a_32 = - mxy_12 - mxy_22 + I(2)*(my_22 - my_12 - mxy_11 - mxy_21 + I(2)*(my_21 - my_11) + I(3)*(f_21 + f_12 - f_11 - f_22)) + I(3)*(mx_12 + mx_22 - mx_11 - mx_21);
const I a_03 = I(2)*(f_11 - f_12) + my_11 + my_12;
const I a_13 = I(2)*(mx_11 - mx_12) + mxy_11 + mxy_12;
const I a_23 = - mxy_21 - mxy_22 + I(2)*(-mx_21 + mx_22 - mxy_11 - mxy_12 + I(2)*(mx_12 - mx_11) + I(3)*(f_12 + f_21 - f_11 - f_22)) + I(3)*(my_21 + my_22 - my_11 - my_12);
const I a_33 = mxy_11 + mxy_21 + mxy_12 + mxy_22 + I(2)*(mx_11 + mx_21 - mx_12 - mx_22 + my_11 - my_21 + my_12 - my_22 + I(2)*(f_11 - f_21 - f_12 + f_22));
const I x2 = static_cast<I>(x*x);
const I x3 = static_cast<I>(x*x*x);
const I y2 = static_cast<I>(y*y);
const I y3 = static_cast<I>(y*y*y);
*output_ptr = static_cast<O>( \
a_00 + a_10*x + a_20*x2 + a_30*x3 \
+ a_01*y + a_11*x*y + a_21*x2*y + a_31*x3*y \
+ a_02*y2 + a_12*x*y2 + a_22*x2*y2 + a_32*x3*y2 \
+ a_03*y3 + a_13*x*y3 + a_23*x2*y3 + a_33*x3*y3);
input_ptr_NC += in_C_s;
output_ptr += out_C_s;
}
}
}
}
input_ptr_N += in_N_s;
grid_ptr_N += grid_N_s;
output_ptr_N += out_N_s;
}
}
// Kernels registration to implementation entry point
// only accepts a 1st input with two spatial dimensions (2D case)
REGISTRAR(GridSampleImpl_cpu,
{{{DataType::Any, DataFormat::Any, {{-1, -1}, {-1, -1}}}, {DataType::Any}}, {{DataType::Float16}}},
{ProdConso::defaultModel, Aidge::GridSampleImpl2D_cpu_forward_kernel<half_float::half, half_float::half>, nullptr});
REGISTRAR(GridSampleImpl_cpu,
{{{DataType::Any, DataFormat::Any, {{-1, -1}, {-1, -1}}}, {DataType::Any}}, {{DataType::Float32}}},
{ProdConso::defaultModel, Aidge::GridSampleImpl2D_cpu_forward_kernel<float, float>, nullptr});
REGISTRAR(GridSampleImpl_cpu,
{{{DataType::Any, DataFormat::Any, {{-1, -1}, {-1, -1}}}, {DataType::Any}}, {{DataType::Float64}}},
{ProdConso::defaultModel, Aidge::GridSampleImpl2D_cpu_forward_kernel<double, double>, nullptr});
REGISTRAR(GridSampleImpl_cpu,
{{{DataType::Any, DataFormat::Any, {{-1, -1}, {-1, -1}}}, {DataType::Any}}, {{DataType::Int32}}},
{ProdConso::defaultModel, Aidge::GridSampleImpl2D_cpu_forward_kernel<int32_t, int32_t>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_GRIDSAMPLEIMPL_KERNELS_H_ */