From 15465b1cc0e63d75ee643fc7f4e252a3de962442 Mon Sep 17 00:00:00 2001 From: Olivier BICHLER <olivier.bichler@cea.fr> Date: Thu, 29 Aug 2024 14:55:35 +0200 Subject: [PATCH] Adapted ConvImpl to new OperatorImpl mechanism --- .../aidge/backend/cpu/operator/ConvImpl.hpp | 259 ++++++++++++++---- .../cpu/operator/ConvImpl_forward_kernels.hpp | 259 ------------------ src/operator/ConvImpl.cpp | 57 ++-- 3 files changed, 224 insertions(+), 351 deletions(-) delete mode 100644 include/aidge/backend/cpu/operator/ConvImpl_forward_kernels.hpp diff --git a/include/aidge/backend/cpu/operator/ConvImpl.hpp b/include/aidge/backend/cpu/operator/ConvImpl.hpp index 5cd1c804..2c528933 100644 --- a/include/aidge/backend/cpu/operator/ConvImpl.hpp +++ b/include/aidge/backend/cpu/operator/ConvImpl.hpp @@ -17,21 +17,17 @@ #include <tuple> #include <vector> -#include "aidge/backend/OperatorImpl.hpp" +#include "aidge/backend/cpu/operator/OperatorImpl.hpp" #include "aidge/operator/Conv.hpp" #include "aidge/utils/Registrar.hpp" #include "aidge/utils/Types.h" #include "aidge/backend/cpu/data/GetCPUPtr.h" namespace Aidge { -// class Conv_Op; - -// compute kernel registry for forward and backward -// Conv 1D -class ConvImpl1DForward_cpu - : public Registrable<ConvImpl1DForward_cpu, - std::tuple<DataType, DataType, DataType, DataType>, - std::function<void(const std::array<DimSize_t, 1>&, +// Operator implementation entry point for the backend +using Conv1D_Op = Conv_Op<1>; +using ConvImpl1D_cpu = OperatorImpl_cpu<Conv_Op<1>, + void(const std::array<DimSize_t, 1>&, const std::array<DimSize_t, 1>&, const std::array<DimSize_t, 1>&, const std::array<DimSize_t, 3> &, @@ -39,31 +35,20 @@ class ConvImpl1DForward_cpu const void *, const void *, const void *, - void *)>> {}; - -class ConvImpl1D_cpu : public OperatorImpl { - public: - ConvImpl1D_cpu(const Conv_Op<1>& op) : OperatorImpl(op, "cpu") {} - - static std::unique_ptr<ConvImpl1D_cpu> create(const Conv_Op<1> &op) { - return std::make_unique<ConvImpl1D_cpu>(op); - } + void *), + void(const std::array<DimSize_t, 1>&, + const std::array<DimSize_t, 1>&, + const std::array<DimSize_t, 1>&, + bool, + const std::array<DimSize_t, 3> &, + const void *, + const void *, + const void *, + void *)>; - public: - std::shared_ptr<ProdConso> getProdConso() const override { return std::make_unique<ProdConso>(mOp, true); }; - void forward() override; -}; - -namespace { -// add cpu backend to Conv_Op<1> implementation registry -static Registrar<Conv_Op<1>> registrarConvImpl1D_cpu("cpu", Aidge::ConvImpl1D_cpu::create); -} // namespace - -// Conv 2D -class ConvImpl2DForward_cpu - : public Registrable<ConvImpl2DForward_cpu, - std::tuple<DataType, DataType, DataType, DataType>, - std::function<void(const std::array<DimSize_t, 2>&, +using Conv2D_Op = Conv_Op<2>; +using ConvImpl2D_cpu = OperatorImpl_cpu<Conv_Op<2>, + void(const std::array<DimSize_t, 2>&, const std::array<DimSize_t, 2>&, const std::array<DimSize_t, 2>&, const std::array<DimSize_t, 4> &, @@ -71,11 +56,8 @@ class ConvImpl2DForward_cpu const void *, const void *, const void *, - void *)>> {}; -class ConvImpl2DBackward_cpu - : public Registrable<ConvImpl2DBackward_cpu, - std::tuple<DataType, DataType, DataType, DataType>, - std::function<void(const std::array<DimSize_t, 2>&, + void *), + void(const std::array<DimSize_t, 2>&, const std::array<DimSize_t, 2>&, const std::array<DimSize_t, 2>&, bool, @@ -83,25 +65,198 @@ class ConvImpl2DBackward_cpu const void *, const void *, const void *, - void *)>> {}; + void *)>; -class ConvImpl2D_cpu : public OperatorImpl { - public: - ConvImpl2D_cpu(const Conv_Op<2>& op) : OperatorImpl(op, "cpu") {} +// Implementation entry point registration to Operator +REGISTRAR(Conv1D_Op, "cpu", Aidge::ConvImpl1D_cpu::create); +REGISTRAR(Conv2D_Op, "cpu", Aidge::ConvImpl2D_cpu::create); - static std::unique_ptr<ConvImpl2D_cpu> create(const Conv_Op<2> &op) { - return std::make_unique<ConvImpl2D_cpu>(op); +//////////////////////////////////////////////////////////////////////////////// + +/** + * @brief Forward kernel for 1D Convolution on CPU backend. + * @tparam I Input data type. + * @tparam W Weight data type. + * @tparam B Bias data type. + * @tparam O Output data type. + * @param params tuple of Attributes from the Operator + * @param inputDims Array of input dimensions. + * @param input_ const input Tensor. + * @param weights_ const weight Tensor. + * @param biases_ const Biais Tensor. + * @param output_ Output Tensor. + */ +template <class I, class W, class B, class O> +void ConvImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 1>& strideDims, + const std::array<DimSize_t, 1>& /*dilationDims*/, + const std::array<DimSize_t, 1>& kernelDims, + const std::array<DimSize_t, 3>& inputDims, + DimSize_t outChannels, + const void *input_, + const void *weights_, + const void *biases_, + void *output_) +{ + // FIXME: missing convolution attributes as arguments + const I *input = static_cast<const I *>(input_); + const W *weights = static_cast<const W *>(weights_); + const B *biases = static_cast<const B *>(biases_); + O *output = static_cast<O *>(output_); + + // output H size + const std::size_t oxSize = + static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - kernelDims[0] + strideDims[0]) / + static_cast<float>(strideDims[0]))); + + // TODO: kernel computation + // output (batch, outCh, Xout, Yout) + // input (batch, inCh, Xin, Yin) + // weight (outCh, inCh, kernelX, kernelY) + // does not take Dilation attribute into account + using signedsize = std::make_signed<std::size_t>::type; + for (std::size_t batch = 0; batch < inputDims[0]; ++batch) { + for (std::size_t outCh = 0; outCh < outChannels; ++outCh) { + const std::size_t oIndex = (outCh + batch*outChannels) * oxSize; + // If bias = nullptr, set B(0) + B biasVal = (biases != nullptr) ? biases[outCh] : B(0); + std::fill(output + oIndex, output+(oIndex+oxSize), biasVal); + for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) { + const std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2]; + const std::size_t wIndex = (inCh + outCh*inputDims[1]) * kernelDims[0]; + for (std::size_t ox = 0; ox < oxSize; ++ox) { + const signedsize difx = static_cast<signedsize>(- ox * strideDims[0]); + const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0))); + const std::size_t sxMax = (static_cast<signedsize>(inputDims[2]) + difx) < 0 ? 0 : ((inputDims[2] + difx) > kernelDims[0] ? kernelDims[0] : inputDims[2] + difx); + const std::size_t oIndexFull = oIndex + ox; + const signedsize ix = static_cast<signedsize>(ox * strideDims[0]); + + for (std::size_t sx = sxMin; sx < sxMax; ++sx) { + output[oIndexFull] += weights[wIndex + sx] * + input[iIndex + static_cast<std::size_t>(ix+static_cast<signedsize>(sx))]; + } + } + } + } } +} + +REGISTRAR(ConvImpl1D_cpu, + {{DataType::Any, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}}, + {ProdConso::inPlaceModel, Aidge::ConvImpl1D_cpu_forward_kernel<float, float, float, float>, nullptr}); +REGISTRAR(ConvImpl1D_cpu, + {{DataType::Any, DataFormat::NCHW}, {DataType::Float16, DataFormat::NCHW}}, + {ProdConso::inPlaceModel, Aidge::ConvImpl1D_cpu_forward_kernel<half_float::half, half_float::half, half_float::half, half_float::half>, nullptr}); +REGISTRAR(ConvImpl1D_cpu, + {{DataType::Any, DataFormat::NCHW}, {DataType::Int32, DataFormat::NCHW}}, + {ProdConso::inPlaceModel, Aidge::ConvImpl1D_cpu_forward_kernel<int, int, int, int>, nullptr}); +REGISTRAR(ConvImpl1D_cpu, + {{DataType::Any, DataFormat::NCHW}, {DataType::Float64, DataFormat::NCHW}}, + {ProdConso::inPlaceModel, Aidge::ConvImpl1D_cpu_forward_kernel<double, double, double, double>, nullptr}); - public: - std::shared_ptr<ProdConso> getProdConso() const override { return std::make_unique<ProdConso>(mOp, true); }; - void forward() override; -}; -namespace { -// add cpu backend to Conv_Op<2> implementation registry -static Registrar<Conv_Op<2>> registrarConvImpl2D_cpu("cpu", Aidge::ConvImpl2D_cpu::create); -} // namespace +/** + * @brief Forward kernel for 2D Convolution on CPU backend. + * @tparam I Input data type. + * @tparam W Weight data type. + * @tparam B Bias data type. + * @tparam O Output data type. + * @param params tuple of Attributes from the Operator + * @param inputDims Array of input dimensions. + * @param input_ const input Tensor. + * @param weights_ const weight Tensor. + * @param biases_ const Biais Tensor. + * @param output_ Output Tensor. + */ +template <class I, class W, class B, class O> +void ConvImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims, + const std::array<DimSize_t, 2>& /*dilationDims*/, + const std::array<DimSize_t, 2>& kernelDims, + const std::array<DimSize_t, 4> &inputDims, + DimSize_t outChannels, + const void *input_, + const void *weights_, + const void *biases_, + void *output_) +{ + // FIXME: missing convolution attributes as arguments + const I *input = static_cast<const I *>(input_); + const W *weights = static_cast<const W *>(weights_); + const B *biases = static_cast<const B *>(biases_); + O *output = static_cast<O *>(output_); + + // output H size + const std::size_t oxSize = + static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - kernelDims[0] + strideDims[0]) / + static_cast<float>(strideDims[0]))); + // output W size + const std::size_t oySize = + static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[3] - kernelDims[1] + strideDims[1]) / + static_cast<float>(strideDims[1]))); + + // TODO: kernel computation + // output (batch, outCh, Xout, Yout) + // input (batch, inCh, Xin, Yin) + // weight (outCh, inCh, kernelX, kernelY) + // does not take Dilation attribute into account + using signedsize = std::make_signed<std::size_t>::type; + for (std::size_t batch = 0; batch < inputDims[0]; ++batch) { + for (std::size_t outCh = 0; outCh < outChannels; ++outCh) { + const std::size_t oIndex = (outCh + batch*outChannels) * oxSize * oySize; + // If bias = nullptr, set B(0) + B biasVal = (biases != nullptr) ? biases[outCh] : B(0); + std::fill(output + oIndex, output+(oIndex+oxSize*oySize), biasVal); + for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) { + const std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3]; + const std::size_t wIndex = (inCh + outCh*inputDims[1]) * kernelDims[0] * kernelDims[1]; + for (std::size_t ox = 0; ox < oxSize; ++ox) { + const signedsize difx = static_cast<signedsize>(- ox * strideDims[0]); + const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0))); + const std::size_t sxMax = (static_cast<signedsize>(inputDims[2]) + difx) < 0 ? 0 : ((inputDims[2] + difx) > kernelDims[0] ? kernelDims[0] : inputDims[2] + difx); + for (std::size_t oy = 0; oy < oySize; ++oy) { + const signedsize dify = static_cast<signedsize>(- oy * strideDims[1]); + const std::size_t syMin = static_cast<std::size_t>(std::max(dify, signedsize(0))); + const std::size_t syMax = (static_cast<signedsize>(inputDims[3]) + dify) < 0 ? 0 : ((inputDims[3] + dify) > kernelDims[1] ? kernelDims[1] : inputDims[3] + dify); + const std::size_t oIndexFull = oIndex + ox*oySize + oy; + const signedsize ix = static_cast<signedsize>(ox * strideDims[0]); + const signedsize iy = static_cast<signedsize>(oy * strideDims[1]); + + if (sxMin == 0 && syMin == 0 && sxMax == 3 && syMax == 3) { + output[oIndexFull] += (weights[wIndex + 0*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+0)] + + weights[wIndex + 0*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+1)] + + weights[wIndex + 0*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+2)] + + weights[wIndex + 1*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+0)] + + weights[wIndex + 1*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+1)] + + weights[wIndex + 1*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+2)] + + weights[wIndex + 2*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+0)] + + weights[wIndex + 2*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+1)] + + weights[wIndex + 2*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+2)]); + } else { + for (std::size_t sx = sxMin; sx < sxMax; ++sx) { + for (std::size_t sy = syMin; sy < syMax; ++sy) { + output[oIndexFull] += weights[wIndex + sx*kernelDims[1] + sy] * + input[iIndex + static_cast<std::size_t>(ix+static_cast<signedsize>(sx))*inputDims[3] + static_cast<std::size_t>(iy+static_cast<signedsize>(sy))]; + } + } + } + } + } + } + } + } +} + +REGISTRAR(ConvImpl2D_cpu, + {{DataType::Any, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}}, + {ProdConso::inPlaceModel, Aidge::ConvImpl2D_cpu_forward_kernel<float, float, float, float>, nullptr}); +REGISTRAR(ConvImpl2D_cpu, + {{DataType::Any, DataFormat::NCHW}, {DataType::Float16, DataFormat::NCHW}}, + {ProdConso::inPlaceModel, Aidge::ConvImpl2D_cpu_forward_kernel<half_float::half, half_float::half, half_float::half, half_float::half>, nullptr}); +REGISTRAR(ConvImpl2D_cpu, + {{DataType::Any, DataFormat::NCHW}, {DataType::Int32, DataFormat::NCHW}}, + {ProdConso::inPlaceModel, Aidge::ConvImpl2D_cpu_forward_kernel<int, int, int, int>, nullptr}); +REGISTRAR(ConvImpl2D_cpu, + {{DataType::Any, DataFormat::NCHW}, {DataType::Float64, DataFormat::NCHW}}, + {ProdConso::inPlaceModel, Aidge::ConvImpl2D_cpu_forward_kernel<double, double, double, double>, nullptr}); } // namespace Aidge #endif /* AIDGE_CPU_OPERATOR_CONVIMPL_H_ */ diff --git a/include/aidge/backend/cpu/operator/ConvImpl_forward_kernels.hpp b/include/aidge/backend/cpu/operator/ConvImpl_forward_kernels.hpp deleted file mode 100644 index 88a71c47..00000000 --- a/include/aidge/backend/cpu/operator/ConvImpl_forward_kernels.hpp +++ /dev/null @@ -1,259 +0,0 @@ -/******************************************************************************** - * Copyright (c) 2023 CEA-List - * - * This program and the accompanying materials are made available under the - * terms of the Eclipse Public License 2.0 which is available at - * http://www.eclipse.org/legal/epl-2.0. - * - * SPDX-License-Identifier: EPL-2.0 - * - ********************************************************************************/ - -#ifndef AIDGE_CPU_OPERATOR_CONVIMPL_FORWARD_KERNEL_H_ -#define AIDGE_CPU_OPERATOR_CONVIMPL_FORWARD_KERNEL_H_ - -#include <algorithm> -#include <array> -#include <cmath> - -#include "aidge/backend/cpu/data/GetCPUPtr.h" -#include "aidge/backend/cpu/operator/ConvImpl.hpp" -#include "aidge/data/half.hpp" -#include "aidge/utils/Registrar.hpp" -#include "aidge/utils/Types.h" - -namespace Aidge { -/** - * @brief Forward kernel for 1D Convolution on CPU backend. - * @tparam I Input data type. - * @tparam W Weight data type. - * @tparam B Bias data type. - * @tparam O Output data type. - * @param params tuple of Attributes from the Operator - * @param inputDims Array of input dimensions. - * @param input_ const input Tensor. - * @param weights_ const weight Tensor. - * @param biases_ const Biais Tensor. - * @param output_ Output Tensor. - */ -template <class I, class W, class B, class O> -void ConvImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 1>& strideDims, - const std::array<DimSize_t, 1>& /*dilationDims*/, - const std::array<DimSize_t, 1>& kernelDims, - const std::array<DimSize_t, 3>& inputDims, - DimSize_t outChannels, - const void *input_, - const void *weights_, - const void *biases_, - void *output_) -{ - // FIXME: missing convolution attributes as arguments - const I *input = static_cast<const I *>(input_); - const W *weights = static_cast<const W *>(weights_); - const B *biases = static_cast<const B *>(biases_); - O *output = static_cast<O *>(output_); - - // output H size - const std::size_t oxSize = - static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - kernelDims[0] + strideDims[0]) / - static_cast<float>(strideDims[0]))); - - // TODO: kernel computation - // output (batch, outCh, Xout, Yout) - // input (batch, inCh, Xin, Yin) - // weight (outCh, inCh, kernelX, kernelY) - // does not take Dilation attribute into account - using signedsize = std::make_signed<std::size_t>::type; - for (std::size_t batch = 0; batch < inputDims[0]; ++batch) { - for (std::size_t outCh = 0; outCh < outChannels; ++outCh) { - const std::size_t oIndex = (outCh + batch*outChannels) * oxSize; - // If bias = nullptr, set B(0) - B biasVal = (biases != nullptr) ? biases[outCh] : B(0); - std::fill(output + oIndex, output+(oIndex+oxSize), biasVal); - for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) { - const std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2]; - const std::size_t wIndex = (inCh + outCh*inputDims[1]) * kernelDims[0]; - for (std::size_t ox = 0; ox < oxSize; ++ox) { - const signedsize difx = static_cast<signedsize>(- ox * strideDims[0]); - const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0))); - const std::size_t sxMax = (static_cast<signedsize>(inputDims[2]) + difx) < 0 ? 0 : ((inputDims[2] + difx) > kernelDims[0] ? kernelDims[0] : inputDims[2] + difx); - const std::size_t oIndexFull = oIndex + ox; - const signedsize ix = static_cast<signedsize>(ox * strideDims[0]); - - for (std::size_t sx = sxMin; sx < sxMax; ++sx) { - output[oIndexFull] += weights[wIndex + sx] * - input[iIndex + static_cast<std::size_t>(ix+static_cast<signedsize>(sx))]; - } - } - } - } - } -} - -namespace { -static Registrar<ConvImpl1DForward_cpu> registrarConvImpl1DForward_cpu_Float32( - {DataType::Float32, DataType::Float32, DataType::Float32, DataType::Float32}, - Aidge::ConvImpl1D_cpu_forward_kernel<float, float, float, float>); -static Registrar<ConvImpl1DForward_cpu> registrarConvImpl1DForward_cpu_Float16( - {DataType::Float16, DataType::Float16, DataType::Float16, DataType::Float16}, - Aidge::ConvImpl1D_cpu_forward_kernel<half_float::half, half_float::half, half_float::half, half_float::half>); -static Registrar<ConvImpl1DForward_cpu> registrarConvImpl1DForward_cpu_Int32( - {DataType::Int32, DataType::Int32, DataType::Int32, DataType::Int32}, - Aidge::ConvImpl1D_cpu_forward_kernel<int, int, int, int>); -static Registrar<ConvImpl1DForward_cpu> registrarConvImpl1DForward_cpu_Float64( - {DataType::Float64, DataType::Float64, DataType::Float64, DataType::Float64}, - Aidge::ConvImpl1D_cpu_forward_kernel<double, double, double, double>); -} // namespace - - -/** - * @brief Forward kernel for 2D Convolution on CPU backend. - * @tparam I Input data type. - * @tparam W Weight data type. - * @tparam B Bias data type. - * @tparam O Output data type. - * @param params tuple of Attributes from the Operator - * @param inputDims Array of input dimensions. - * @param input_ const input Tensor. - * @param weights_ const weight Tensor. - * @param biases_ const Biais Tensor. - * @param output_ Output Tensor. - */ -template <class I, class W, class B, class O> -void ConvImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims, - const std::array<DimSize_t, 2>& /*dilationDims*/, - const std::array<DimSize_t, 2>& kernelDims, - const std::array<DimSize_t, 4> &inputDims, - DimSize_t outChannels, - const void *input_, - const void *weights_, - const void *biases_, - void *output_) -{ - // FIXME: missing convolution attributes as arguments - const I *input = static_cast<const I *>(input_); - const W *weights = static_cast<const W *>(weights_); - const B *biases = static_cast<const B *>(biases_); - O *output = static_cast<O *>(output_); -/* - // output H size - const std::size_t oxSize = - static_cast<std::size_t>(static_cast<float>(inputDims[0] - kernelDims[0] + strideDims[0]) / - static_cast<float>(strideDims[0])); - // output W size - const std::size_t oySize = - static_cast<std::size_t>(static_cast<float>(inputDims[1] - kernelDims[1] + strideDims[1]) / - static_cast<float>(strideDims[1])); - - // TODO: kernel computation - // output (Xout, Yout, outCh, batch) - // input (Xin, Yin, inCh, batch) - // weight (kernelX, kernelY, inCh, outCh) - // does not take Dilation attribute into account - for (std::size_t ox = 0; ox < oxSize; ++ox) { - for (std::size_t oy = 0; oy < oySize; ++oy) { - const std::size_t ix = ox * strideDims[0]; - const std::size_t iy = oy * strideDims[1]; - - for (std::size_t outCh = 0; outCh < outChannels; ++outCh) { - const std::size_t oIndex = inputDims[3] * (outCh + outChannels * (oy + oySize * ox)); - B biasVal = (biases != nullptr) ? biases[outCh] : B(0); - for (std::size_t batch = 0; batch < inputDims[3]; ++batch) { - output[oIndex + batch] = biasVal; - } - for (std::size_t inCh = 0; inCh < inputDims[2]; ++inCh) { - for (std::size_t sx = 0; sx < kernelDims[0]; ++sx) { - for (std::size_t sy = 0; sy < kernelDims[1]; ++sy) { - const std::size_t wIndex = - outCh + outChannels * (inCh + inputDims[2] * (sy + kernelDims[1] * sx)); - std::size_t iIndex = inputDims[3] * (inCh + inputDims[2] * ((iy + sy) + inputDims[1] * (ix + sx))); - for (std::size_t batch = 0; batch < inputDims[3]; ++batch) { - output[oIndex + batch] += weights[wIndex] * input[iIndex + batch]; - } - } - } - } - } - } - } -*/ - - - // output H size - const std::size_t oxSize = - static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - kernelDims[0] + strideDims[0]) / - static_cast<float>(strideDims[0]))); - // output W size - const std::size_t oySize = - static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[3] - kernelDims[1] + strideDims[1]) / - static_cast<float>(strideDims[1]))); - - // TODO: kernel computation - // output (batch, outCh, Xout, Yout) - // input (batch, inCh, Xin, Yin) - // weight (outCh, inCh, kernelX, kernelY) - // does not take Dilation attribute into account - using signedsize = std::make_signed<std::size_t>::type; - for (std::size_t batch = 0; batch < inputDims[0]; ++batch) { - for (std::size_t outCh = 0; outCh < outChannels; ++outCh) { - const std::size_t oIndex = (outCh + batch*outChannels) * oxSize * oySize; - // If bias = nullptr, set B(0) - B biasVal = (biases != nullptr) ? biases[outCh] : B(0); - std::fill(output + oIndex, output+(oIndex+oxSize*oySize), biasVal); - for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) { - const std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3]; - const std::size_t wIndex = (inCh + outCh*inputDims[1]) * kernelDims[0] * kernelDims[1]; - for (std::size_t ox = 0; ox < oxSize; ++ox) { - const signedsize difx = static_cast<signedsize>(- ox * strideDims[0]); - const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0))); - const std::size_t sxMax = (static_cast<signedsize>(inputDims[2]) + difx) < 0 ? 0 : ((inputDims[2] + difx) > kernelDims[0] ? kernelDims[0] : inputDims[2] + difx); - for (std::size_t oy = 0; oy < oySize; ++oy) { - const signedsize dify = static_cast<signedsize>(- oy * strideDims[1]); - const std::size_t syMin = static_cast<std::size_t>(std::max(dify, signedsize(0))); - const std::size_t syMax = (static_cast<signedsize>(inputDims[3]) + dify) < 0 ? 0 : ((inputDims[3] + dify) > kernelDims[1] ? kernelDims[1] : inputDims[3] + dify); - const std::size_t oIndexFull = oIndex + ox*oySize + oy; - const signedsize ix = static_cast<signedsize>(ox * strideDims[0]); - const signedsize iy = static_cast<signedsize>(oy * strideDims[1]); - - if (sxMin == 0 && syMin == 0 && sxMax == 3 && syMax == 3) { - output[oIndexFull] += (weights[wIndex + 0*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+0)] + - weights[wIndex + 0*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+1)] + - weights[wIndex + 0*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+2)] + - weights[wIndex + 1*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+0)] + - weights[wIndex + 1*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+1)] + - weights[wIndex + 1*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+2)] + - weights[wIndex + 2*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+0)] + - weights[wIndex + 2*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+1)] + - weights[wIndex + 2*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+2)]); - } else { - for (std::size_t sx = sxMin; sx < sxMax; ++sx) { - for (std::size_t sy = syMin; sy < syMax; ++sy) { - output[oIndexFull] += weights[wIndex + sx*kernelDims[1] + sy] * - input[iIndex + static_cast<std::size_t>(ix+static_cast<signedsize>(sx))*inputDims[3] + static_cast<std::size_t>(iy+static_cast<signedsize>(sy))]; - } - } - } - } - } - } - } - } -} - -namespace { -static Registrar<ConvImpl2DForward_cpu> registrarConvImpl2DForward_cpu_Float32( - {DataType::Float32, DataType::Float32, DataType::Float32, DataType::Float32}, - Aidge::ConvImpl2D_cpu_forward_kernel<float, float, float, float>); -static Registrar<ConvImpl2DForward_cpu> registrarConvImpl2DForward_cpu_Float16( - {DataType::Float16, DataType::Float16, DataType::Float16, DataType::Float16}, - Aidge::ConvImpl2D_cpu_forward_kernel<half_float::half, half_float::half, half_float::half, half_float::half>); -static Registrar<ConvImpl2DForward_cpu> registrarConvImpl2DForward_cpu_Int32( - {DataType::Int32, DataType::Int32, DataType::Int32, DataType::Int32}, - Aidge::ConvImpl2D_cpu_forward_kernel<int, int, int, int>); -static Registrar<ConvImpl2DForward_cpu> registrarConvImpl2DForward_cpu_Float64( - {DataType::Float64, DataType::Float64, DataType::Float64, DataType::Float64}, - Aidge::ConvImpl2D_cpu_forward_kernel<double, double, double, double>); -} // namespace -} // namespace Aidge - -#endif /* AIDGE_CPU_OPERATOR_CONVIMPL_FORWARD_KERNEL_H_ */ diff --git a/src/operator/ConvImpl.cpp b/src/operator/ConvImpl.cpp index 3a6b331b..b57ffd5f 100644 --- a/src/operator/ConvImpl.cpp +++ b/src/operator/ConvImpl.cpp @@ -18,35 +18,18 @@ #include <vector> #include "aidge/backend/cpu/data/GetCPUPtr.h" -#include "aidge/backend/cpu/operator/ConvImpl_forward_kernels.hpp" #include "aidge/operator/Conv.hpp" #include "aidge/utils/Types.h" +template <> void Aidge::ConvImpl1D_cpu::forward() { const auto& op_ = static_cast<const Conv_Op<1>&>(mOp); // FIXME: uncomment the following code once memory handling will work -AIDGE_ASSERT(op_.getInput(0), "missing input #0 in Conv Operator."); + AIDGE_ASSERT(op_.getInput(0), "missing input #0 in Conv Operator."); AIDGE_ASSERT(op_.getInput(1), "missing input #1 in Conv Operator."); - // Find the correct kernel type - const auto outputDataType = op_.getOutput(0)->dataType(); - const Registrar<ConvImpl1DForward_cpu>::registrar_key registrarKey = { - op_.getInput(0)->dataType(), - op_.getInput(1)->dataType(), - (op_.getInput(2) ? op_.getInput(2)->dataType() : op_.getInput(1)->dataType()), - outputDataType}; - - Registrar<ConvImpl1DForward_cpu>::registrar_type kernelFunc; - if (Registrar<ConvImpl1DForward_cpu>::exists(registrarKey)) { - // One exists with the right inputs/output types - kernelFunc = Registrar<ConvImpl1DForward_cpu>::create(registrarKey); - } - else { - // Otherwise, fallback to the kernel with all types matching output type - kernelFunc = Registrar<ConvImpl1DForward_cpu>::create({ - outputDataType, outputDataType, outputDataType, outputDataType}); - } + const auto impl = Registrar<ConvImpl1D_cpu>::create(getBestMatch(getRequiredSpec())); // Convert input data (no overhead if not needed!) // TODO: right now, if needed, memory will be allocated/deallocated at each @@ -58,7 +41,7 @@ AIDGE_ASSERT(op_.getInput(0), "missing input #0 in Conv Operator."); const auto& input2 = (op_.getInput(2)) ? op_.getInput(2)->refCastFrom(input2Fallback, *op_.getOutput(0)) : Tensor(); // Call kernel - kernelFunc(op_.strideDims(), + impl.forward(op_.strideDims(), op_.dilationDims(), op_.kernelDims(), op_.getInput(0)->template dims<3>(), // input dimensions @@ -70,6 +53,12 @@ AIDGE_ASSERT(op_.getInput(0), "missing input #0 in Conv Operator."); ); } +template <> +void Aidge::ConvImpl1D_cpu::backward() { + AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Conv_Op<1> on backend cpu"); +} + +template <> void Aidge::ConvImpl2D_cpu::forward() { const auto& op_ = dynamic_cast<const Conv_Op<2>&>(mOp); @@ -77,24 +66,7 @@ void Aidge::ConvImpl2D_cpu::forward() { AIDGE_ASSERT(op_.getInput(0), "missing input #0 in Conv Operator."); AIDGE_ASSERT(op_.getInput(1), "missing input #1 in Conv Operator."); - // Find the correct kernel type - const auto outputDataType = op_.getOutput(0)->dataType(); - const Registrar<ConvImpl2DForward_cpu>::registrar_key registrarKey = { - op_.getInput(0)->dataType(), - op_.getInput(1)->dataType(), - (op_.getInput(2) ? op_.getInput(2)->dataType() : op_.getInput(1)->dataType()), - outputDataType}; - - Registrar<ConvImpl2DForward_cpu>::registrar_type kernelFunc; - if (Registrar<ConvImpl2DForward_cpu>::exists(registrarKey)) { - // One exists with the right inputs/output types - kernelFunc = Registrar<ConvImpl2DForward_cpu>::create(registrarKey); - } - else { - // Otherwise, fallback to the kernel with all types matching output type - kernelFunc = Registrar<ConvImpl2DForward_cpu>::create({ - outputDataType, outputDataType, outputDataType, outputDataType}); - } + const auto impl = Registrar<ConvImpl2D_cpu>::create(getBestMatch(getRequiredSpec())); // Convert input data (no overhead if not needed!) // TODO: right now, if needed, memory will be allocated/deallocated at each @@ -106,7 +78,7 @@ void Aidge::ConvImpl2D_cpu::forward() { const auto& input2 = (op_.getInput(2)) ? op_.getInput(2)->refCastFrom(input2Fallback, *op_.getOutput(0)) : Tensor(); // Call kernel - kernelFunc(op_.strideDims(), + impl.forward(op_.strideDims(), op_.dilationDims(), op_.kernelDims(), op_.getInput(0)->template dims<4>(), // input dimensions @@ -117,3 +89,8 @@ void Aidge::ConvImpl2D_cpu::forward() { getCPUPtr(mOp.getRawOutput(0)) // output ); } + +template <> +void Aidge::ConvImpl2D_cpu::backward() { + AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Conv_Op<2> on backend cpu"); +} -- GitLab