From 7bd9c550694f560301abe4bd04aaeb1c81fd8cea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gr=C3=A9goire=20KUBLER?= <gregoire.kubler@proton.me> Date: Tue, 25 Feb 2025 13:31:12 +0000 Subject: [PATCH] chore : conv forward 1/2D formatting --- .../aidge/backend/cpu/operator/ConvImpl.hpp | 1 + .../backend/cpu/operator/ConvImpl_kernels.hpp | 249 ++++++++++++------ src/operator/ConvImpl.cpp | 21 +- 3 files changed, 177 insertions(+), 94 deletions(-) diff --git a/include/aidge/backend/cpu/operator/ConvImpl.hpp b/include/aidge/backend/cpu/operator/ConvImpl.hpp index 8bf11ac0..e480697b 100644 --- a/include/aidge/backend/cpu/operator/ConvImpl.hpp +++ b/include/aidge/backend/cpu/operator/ConvImpl.hpp @@ -20,6 +20,7 @@ #include "aidge/utils/Types.h" namespace Aidge { + // Operator implementation entry point for the backend using Conv1D_Op = Conv_Op<1>; using ConvImpl1D_cpu = OperatorImpl_cpu<Conv_Op<1>, diff --git a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp index 4e1861b4..274f5f4f 100644 --- a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp @@ -37,16 +37,15 @@ using std::array; * @param output_ Output Tensor. 
*/ template <class I, class W, class B, class O> -void ConvImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 1>& strideDims, - const std::array<DimSize_t, 1>& dilationDims, - const std::array<DimSize_t, 1>& kernelDims, - const std::array<DimSize_t, 3>& inputDims, - DimSize_t outChannels, - const void *input_, - const void *weights_, - const void *biases_, - void *output_) -{ +void ConvImpl1D_cpu_forward_kernel(const array<DimSize_t, 1> &strideDim, + const array<DimSize_t, 1> &dilationDim, + const array<DimSize_t, 1> &kernelDim, + const std::array<DimSize_t, 3> &inputDims, + DimSize_t outChannels, + const void *input_, + const void *weights_, + const void *biases_, + void *output_) { // FIXME: missing convolution attributes as arguments const I *input = static_cast<const I *>(input_); const W *weights = static_cast<const W *>(weights_); @@ -54,34 +53,51 @@ void ConvImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 1>& strideDims, O *output = static_cast<O *>(output_); // output H size - const std::size_t oxSize = - static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - dilationDims[0]*(kernelDims[0] - 1) - 1 + strideDims[0]) / - static_cast<float>(strideDims[0]))); - const DimSize_t dilated_kernel_x = dilationDims[0]*(kernelDims[0] - 1) + 1; + const std::size_t oxSize = static_cast<std::size_t>(std::floor( + static_cast<float>(inputDims[2] - dilationDim[0] * (kernelDim[0] - 1) - + 1 + strideDim[0]) / + static_cast<float>(strideDim[0]))); + const DimSize_t dilated_kernel_x = dilationDim[0] * (kernelDim[0] - 1) + 1; - // TODO: kernel computation - // output (batch, outCh, Xout, Yout) - // input (batch, inCh, Xin, Yin) - // weight (outCh, inCh, kernelX, kernelY) - // does not take Dilation attribute into account using signedsize = std::make_signed<std::size_t>::type; for (std::size_t batch = 0; batch < inputDims[0]; ++batch) { for (std::size_t outCh = 0; outCh < outChannels; ++outCh) { - const std::size_t oIndex = (outCh + batch*outChannels) * 
oxSize; + const std::size_t oIndex = (outCh + batch * outChannels) * oxSize; // If bias = nullptr, set B(0) B biasVal = (biases != nullptr) ? biases[outCh] : B(0); - std::fill(output + oIndex, output+(oIndex+oxSize), biasVal); + std::fill(output + oIndex, output + (oIndex + oxSize), biasVal); for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) { - const std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2]; - const std::size_t wIndex = (inCh + outCh*inputDims[1]) * kernelDims[0]; + const std::size_t iIndex = + (inCh + batch * inputDims[1]) * inputDims[2]; + const std::size_t wIndex = + (inCh + outCh * inputDims[1]) * kernelDim[0]; for (std::size_t ox = 0; ox < oxSize; ++ox) { - // const signedsize difx = static_cast<signedsize>(- ox * strideDims[0]); - // const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0))); - // const std::size_t sxMax = (static_cast<signedsize>(inputDims[2]) + difx) < 0 ? 0 : ((inputDims[2] + difx) > kernelDims[0] ? kernelDims[0] : inputDims[2] + difx); + // const signedsize difx = static_cast<signedsize>(- ox * + // strideDim[0]); const std::size_t sxMin = + // static_cast<std::size_t>(std::max(difx, signedsize(0))); + // const std::size_t sxMax = + // (static_cast<signedsize>(inputDims[2]) + difx) < 0 ? 0 : + // ((inputDims[2] + difx) > kernelDim[0] ? 
kernelDim[0] + // : inputDims[2] + difx); const std::size_t sxMin = 0; const std::size_t sxMax = dilated_kernel_x; const std::size_t oIndexFull = oIndex + ox; - const signedsize ix = static_cast<signedsize>(ox * strideDims[0]); + const signedsize ix = + static_cast<signedsize>(ox * strideDim[0]); + + for (std::size_t sx = sxMin; sx * dilationDim[0] < sxMax; + ++sx) { + output[oIndexFull] += + weights[wIndex + sx] * + input[iIndex + static_cast<std::size_t>( + ix + static_cast<signedsize>( + sx * dilationDim[0]))]; + } + } + } + } + } +} /** * @brief perform 1D backpropagation for the data input @@ -119,9 +135,9 @@ void ConvImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 1>& strideDims, * @param[inout] iGrad gradients of the input to update */ template <class I, class W, class O> -void conv1DBackwardInput(const DimSize_t &stride, - const DimSize_t &dilation, - const DimSize_t &kDim, +void conv1DBackwardInput(const array<DimSize_t, 1> &stride, + const array<DimSize_t, 1> &dilation, + const array<DimSize_t, 1> &kDim, const array<DimSize_t, 2> &kStrides, const W *weights, const array<DimSize_t, 3> &oDims, @@ -434,16 +450,15 @@ REGISTRAR(ConvImpl1D_cpu, * @param output_ Output Tensor. 
*/ template <class I, class W, class B, class O> -void ConvImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims, - const std::array<DimSize_t, 2>& dilationDims, - const std::array<DimSize_t, 2>& kernelDims, - const std::array<DimSize_t, 4> &inputDims, - DimSize_t outChannels, - const void *input_, - const void *weights_, - const void *biases_, - void *output_) -{ +void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims, + const array<DimSize_t, 2> &dilationDims, + const array<DimSize_t, 2> &kernelDims, + const array<DimSize_t, 4> &inputDims, + DimSize_t outChannels, + const void *input_, + const void *weights_, + const void *biases_, + void *output_) { // FIXME: missing convolution attributes as arguments const I *input = static_cast<const I *>(input_); const W *weights = static_cast<const W *>(weights_); @@ -451,59 +466,102 @@ void ConvImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims, O *output = static_cast<O *>(output_); // output H size - const DimSize_t dilated_kernel_x = dilationDims[0]*(kernelDims[0] - 1) + 1; - const std::size_t oxSize = - static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - dilated_kernel_x + strideDims[0]) / - static_cast<float>(strideDims[0]))); + const DimSize_t dilated_kernel_x = + dilationDims[0] * (kernelDims[0] - 1) + 1; + const std::size_t oxSize = static_cast<std::size_t>(std::floor( + static_cast<float>(inputDims[2] - dilated_kernel_x + strideDims[0]) / + static_cast<float>(strideDims[0]))); // output W size - const DimSize_t dilated_kernel_y = dilationDims[1]*(kernelDims[1] - 1) + 1; - const std::size_t oySize = - static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[3] - dilated_kernel_y + strideDims[1]) / - static_cast<float>(strideDims[1]))); - + const DimSize_t dilated_kernel_y = + dilationDims[1] * (kernelDims[1] - 1) + 1; + const std::size_t oySize = static_cast<std::size_t>(std::floor( + static_cast<float>(inputDims[3] - dilated_kernel_y + 
strideDims[1]) / + static_cast<float>(strideDims[1]))); // TODO: kernel computation // output (batch, outCh, Xout, Yout) // input (batch, inCh, Xin, Yin) // weight (outCh, inCh, kernelX, kernelY) // does not take Dilation attribute into account - const std::size_t outChannels_s = oxSize * oySize; + const std::size_t outChannels_s = oxSize * oySize; if (dilated_kernel_x == 3 && dilated_kernel_y == 3) { for (std::size_t batch = 0; batch < inputDims[0]; ++batch) { for (std::size_t outCh = 0; outCh < outChannels; ++outCh) { // If bias = nullptr, set B(0) B biasVal = (biases != nullptr) ? biases[outCh] : B(0); - std::fill(output, output+outChannels_s, biasVal); + std::fill(output, output + outChannels_s, biasVal); for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) { - std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3]; - const std::size_t wIndex = (inCh + outCh*inputDims[1]) * 9; - if (strideDims[0] == 1 && strideDims[1]==1) { - for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex-=inputDims[3]) { + std::size_t iIndex = (inCh + batch * inputDims[1]) * + inputDims[2] * inputDims[3]; + const std::size_t wIndex = + (inCh + outCh * inputDims[1]) * 9; + if (strideDims[0] == 1 && strideDims[1] == 1) { + for (std::size_t ox = 0, oIndex = 0; ox < oxSize; + ++ox, oIndex += oySize, iIndex -= inputDims[3]) { for (std::size_t oy = 0; oy < oySize; ++oy) { - output[oIndex + oy] += weights[wIndex+0]*input[iIndex+oy]+weights[wIndex+1]*input[iIndex+oy+1]+weights[wIndex+2]*input[iIndex+oy+2]; + output[oIndex + oy] += + weights[wIndex + 0] * input[iIndex + oy] + + weights[wIndex + 1] * + input[iIndex + oy + 1] + + weights[wIndex + 2] * + input[iIndex + oy + 2]; } - iIndex+=inputDims[3]; + iIndex += inputDims[3]; for (std::size_t oy = 0; oy < oySize; ++oy) { - output[oIndex + oy] += weights[wIndex+3]*input[iIndex+oy]+weights[wIndex+4]*input[iIndex+oy+1]+weights[wIndex+5]*input[iIndex+oy+2]; + output[oIndex + oy] += + weights[wIndex 
+ 3] * input[iIndex + oy] + + weights[wIndex + 4] * + input[iIndex + oy + 1] + + weights[wIndex + 5] * + input[iIndex + oy + 2]; } - iIndex+=inputDims[3]; + iIndex += inputDims[3]; for (std::size_t oy = 0; oy < oySize; ++oy) { - output[oIndex + oy] += weights[wIndex+6]*input[iIndex+oy]+weights[wIndex+7]*input[iIndex+oy+1]+weights[wIndex+8]*input[iIndex+oy+2]; + output[oIndex + oy] += + weights[wIndex + 6] * input[iIndex + oy] + + weights[wIndex + 7] * + input[iIndex + oy + 1] + + weights[wIndex + 8] * + input[iIndex + oy + 2]; } } } else { - for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex+=(strideDims[0]-2)*inputDims[3]) { + for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, + oIndex += oySize, + iIndex += (strideDims[0] - + 2) * inputDims[3]) { for (std::size_t oy = 0; oy < oySize; ++oy) { - output[oIndex + oy] += weights[wIndex+0]*input[iIndex+oy*strideDims[1]]+weights[wIndex+1]*input[iIndex+oy*strideDims[1]+1]+weights[wIndex+2]*input[iIndex+oy*strideDims[1]+2]; + output[oIndex + oy] += + weights[wIndex + 0] * + input[iIndex + oy * strideDims[1]] + + weights[wIndex + 1] * + input[iIndex + oy * strideDims[1] + + 1] + + weights[wIndex + 2] * + input[iIndex + oy * strideDims[1] + 2]; } - iIndex+=inputDims[3]; + iIndex += inputDims[3]; for (std::size_t oy = 0; oy < oySize; ++oy) { - output[oIndex + oy] += weights[wIndex+3]*input[iIndex+oy*strideDims[1]]+weights[wIndex+4]*input[iIndex+oy*strideDims[1]+1]+weights[wIndex+5]*input[iIndex+oy*strideDims[1]+2]; + output[oIndex + oy] += + weights[wIndex + 3] * + input[iIndex + oy * strideDims[1]] + + weights[wIndex + 4] * + input[iIndex + oy * strideDims[1] + + 1] + + weights[wIndex + 5] * + input[iIndex + oy * strideDims[1] + 2]; } - iIndex+=inputDims[3]; + iIndex += inputDims[3]; for (std::size_t oy = 0; oy < oySize; ++oy) { - output[oIndex + oy] += 
weights[wIndex+6]*input[iIndex+oy*strideDims[1]]+weights[wIndex+7]*input[iIndex+oy*strideDims[1]+1]+weights[wIndex+8]*input[iIndex+oy*strideDims[1]+2]; + output[oIndex + oy] += + weights[wIndex + 6] * + input[iIndex + oy * strideDims[1]] + + weights[wIndex + 7] * + input[iIndex + oy * strideDims[1] + + 1] + + weights[wIndex + 8] * + input[iIndex + oy * strideDims[1] + 2]; } } } @@ -516,18 +574,26 @@ void ConvImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims, for (std::size_t outCh = 0; outCh < outChannels; ++outCh) { // If bias = nullptr, set B(0) B biasVal = (biases != nullptr) ? biases[outCh] : B(0); - std::fill(output, output+outChannels_s, biasVal); + std::fill(output, output + outChannels_s, biasVal); for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) { - std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3]; - const std::size_t wIndex = (inCh + outCh*inputDims[1]); + std::size_t iIndex = (inCh + batch * inputDims[1]) * + inputDims[2] * inputDims[3]; + const std::size_t wIndex = (inCh + outCh * inputDims[1]); if (strideDims[0] == 1 && strideDims[1] == 1) { - for (std::size_t oIndex = 0; oIndex < oxSize*oySize; ++oIndex, ++iIndex) { + for (std::size_t oIndex = 0; oIndex < oxSize * oySize; + ++oIndex, ++iIndex) { output[oIndex] += weights[wIndex] * input[iIndex]; } - } else { - for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex+=inputDims[3]*strideDims[0]) { - for (std::size_t oy = 0, iy = 0; oy < oySize; ++oy, iy+=strideDims[1]) { - output[oIndex + oy] += weights[wIndex+0]*input[iIndex+iy]; + } else { + for (std::size_t ox = 0, oIndex = 0; ox < oxSize; + ++ox, + oIndex += oySize, + iIndex += + inputDims[3] * strideDims[0]) { + for (std::size_t oy = 0, iy = 0; oy < oySize; + ++oy, iy += strideDims[1]) { + output[oIndex + oy] += + weights[wIndex + 0] * input[iIndex + iy]; } } } @@ -540,21 +606,36 @@ void ConvImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims, for 
(std::size_t outCh = 0; outCh < outChannels; ++outCh) { // If bias = nullptr, set B(0) B biasVal = (biases != nullptr) ? biases[outCh] : B(0); - std::fill(output, output+outChannels_s, biasVal); + std::fill(output, output + outChannels_s, biasVal); for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) { - std::size_t iIndex_channel = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3]; - const std::size_t wIndex = (inCh + outCh*inputDims[1]) * kernelDims[0] * kernelDims[1]; + std::size_t iIndex_channel = + (inCh + batch * inputDims[1]) * inputDims[2] * + inputDims[3]; + const std::size_t wIndex = (inCh + outCh * inputDims[1]) * + kernelDims[0] * kernelDims[1]; // loop over each ouput line - for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex_channel+=inputDims[3]*strideDims[0]) { + for (std::size_t ox = 0, oIndex = 0; ox < oxSize; + ++ox, + oIndex += oySize, + iIndex_channel += + inputDims[3] * strideDims[0]) { // loop over associated input line - for (std::size_t ky = 0, ix = 0; ky < kernelDims[0]; ++ky, ix += inputDims[3]*dilationDims[0]) { + for (std::size_t ky = 0, ix = 0; ky < kernelDims[0]; + ++ky, ix += inputDims[3] * dilationDims[0]) { // loop over the entire line - for (std::size_t oy = 0, iy = 0; oy < oySize; ++oy, iy+=strideDims[1]) { - const std::size_t iIndex = iIndex_channel + ix + iy; - // loop over elements assosicated with one output - for (std::size_t kx = 0; kx < kernelDims[0]; ++kx) { - output[oIndex + oy] += weights[wIndex+kernelDims[0]*ky+kx]*input[iIndex+kx*dilationDims[1]]; + for (std::size_t oy = 0, iy = 0; oy < oySize; + ++oy, iy += strideDims[1]) { + const std::size_t iIndex = + iIndex_channel + ix + iy; + // loop over elements associated with one + // output + for (std::size_t kx = 0; kx < kernelDims[0]; + ++kx) { + output[oIndex + oy] += + weights[wIndex + kernelDims[0] * ky + + kx] * + input[iIndex + kx * dilationDims[1]]; } } } diff --git a/src/operator/ConvImpl.cpp b/src/operator/ConvImpl.cpp index 
ffbb75c1..d23a9968 100644 --- a/src/operator/ConvImpl.cpp +++ b/src/operator/ConvImpl.cpp @@ -40,16 +40,17 @@ void Aidge::ConvImpl1D_cpu::forward() { const auto& input2 = (op_.getInput(2)) ? op_.getInput(2)->refCastFrom(input2Fallback, *op_.getOutput(0)) : Tensor(); // Call kernel - impl.forward(op_.strideDims(), - op_.dilationDims(), - op_.kernelDims(), - op_.getInput(0)->template dims<3>(), // input dimensions - dynamic_cast<const Conv_Op<1>&>(mOp).outChannels(), // outChannels - input0.getImpl()->rawPtr(), // input - input1.getImpl()->rawPtr(), // weight - op_.getInput(2) ? input2.getImpl()->rawPtr() : nullptr, // bias - getCPUPtr(mOp.getRawOutput(0)) // output - ); + impl.forward( + op_.strideDims(), + op_.dilationDims(), + op_.kernelDims(), + op_.getInput(0)->template dims<3>(), // input dimensions + dynamic_cast<const Conv_Op<1> &>(mOp).outChannels(), // outChannels + input0.getImpl()->rawPtr(), // input + input1.getImpl()->rawPtr(), // weight + op_.getInput(2) ? input2.getImpl()->rawPtr() : nullptr, // bias + getCPUPtr(mOp.getRawOutput(0)) // output + ); } template <> void ConvImpl1D_cpu::backward() { -- GitLab