diff --git a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
index 92210ae5f71ca223c4a67018570263240c1b400a..36b6d4e23c21cd8349ed9ad8263b7336615c99c8 100644
--- a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
@@ -450,16 +450,15 @@ REGISTRAR(ConvImpl1D_cpu,
  * @param output_ Output Tensor.
  */
 template <class I, class W, class B, class O>
-void ConvImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
-                                   const std::array<DimSize_t, 2>& dilationDims,
-                                   const std::array<DimSize_t, 2>& kernelDims,
-                                   const std::array<DimSize_t, 4> &inputDims,
-                                   DimSize_t outChannels,
-                                   const void *input_,
-                                   const void *weights_,
-                                   const void *biases_,
-                                   void *output_)
-{
+void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims,
+                                   const array<DimSize_t, 2> &dilationDims,
+                                   const array<DimSize_t, 2> &kernelDims,
+                                   const array<DimSize_t, 4> &inputDims,
+                                   DimSize_t outChannels,
+                                   const void *input_,
+                                   const void *weights_,
+                                   const void *biases_,
+                                   void *output_) {
     // FIXME: missing convolution attributes as arguments
     const I *input = static_cast<const I *>(input_);
     const W *weights = static_cast<const W *>(weights_);
@@ -467,59 +466,102 @@ void ConvImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
     O *output = static_cast<O *>(output_);
 
     // output H size
-    const DimSize_t dilated_kernel_x = dilationDims[0]*(kernelDims[0] - 1) + 1;
-    const std::size_t oxSize =
-            static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - dilated_kernel_x + strideDims[0]) /
-                                static_cast<float>(strideDims[0])));
+    const DimSize_t dilated_kernel_x =
+        dilationDims[0] * (kernelDims[0] - 1) + 1;
+    const std::size_t oxSize = static_cast<std::size_t>(std::floor(
+        static_cast<float>(inputDims[2] - dilated_kernel_x + strideDims[0]) /
+        static_cast<float>(strideDims[0])));
     // output W size
-    const DimSize_t dilated_kernel_y = dilationDims[1]*(kernelDims[1] - 1) + 1;
-    const std::size_t oySize =
-            static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[3] - dilated_kernel_y + strideDims[1]) /
-                                static_cast<float>(strideDims[1])));
-
+    const DimSize_t dilated_kernel_y =
+        dilationDims[1] * (kernelDims[1] - 1) + 1;
+    const std::size_t oySize = static_cast<std::size_t>(std::floor(
+        static_cast<float>(inputDims[3] - dilated_kernel_y + strideDims[1]) /
+        static_cast<float>(strideDims[1])));
 
     // TODO: kernel computation
     // output (batch, outCh, Xout, Yout)
     // input  (batch, inCh, Xin, Yin)
     // weight (outCh, inCh, kernelX, kernelY)
     // does not take Dilation attribute into account
-    const std::size_t outChannels_s =  oxSize * oySize;
+    const std::size_t outChannels_s = oxSize * oySize;
 
     if (dilated_kernel_x == 3 && dilated_kernel_y == 3) {
         for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
             for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
                 // If bias = nullptr, set B(0)
                 B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
-                std::fill(output, output+outChannels_s, biasVal);
+                std::fill(output, output + outChannels_s, biasVal);
                 for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
-                    std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3];
-                    const std::size_t wIndex = (inCh + outCh*inputDims[1]) * 9;
-                    if (strideDims[0] == 1 && strideDims[1]==1) {
-                        for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex-=inputDims[3]) {
+                    std::size_t iIndex = (inCh + batch * inputDims[1]) *
+                                         inputDims[2] * inputDims[3];
+                    const std::size_t wIndex =
+                        (inCh + outCh * inputDims[1]) * 9;
+                    if (strideDims[0] == 1 && strideDims[1] == 1) {
+                        for (std::size_t ox = 0, oIndex = 0; ox < oxSize;
+                             ++ox, oIndex += oySize, iIndex -= inputDims[3]) {
                             for (std::size_t oy = 0; oy < oySize; ++oy) {
-                                output[oIndex + oy] += weights[wIndex+0]*input[iIndex+oy]+weights[wIndex+1]*input[iIndex+oy+1]+weights[wIndex+2]*input[iIndex+oy+2];
+                                output[oIndex + oy] +=
+                                    weights[wIndex + 0] * input[iIndex + oy] +
+                                    weights[wIndex + 1] *
+                                        input[iIndex + oy + 1] +
+                                    weights[wIndex + 2] *
+                                        input[iIndex + oy + 2];
                             }
-                            iIndex+=inputDims[3];
+                            iIndex += inputDims[3];
                             for (std::size_t oy = 0; oy < oySize; ++oy) {
-                                output[oIndex + oy] += weights[wIndex+3]*input[iIndex+oy]+weights[wIndex+4]*input[iIndex+oy+1]+weights[wIndex+5]*input[iIndex+oy+2];
+                                output[oIndex + oy] +=
+                                    weights[wIndex + 3] * input[iIndex + oy] +
+                                    weights[wIndex + 4] *
+                                        input[iIndex + oy + 1] +
+                                    weights[wIndex + 5] *
+                                        input[iIndex + oy + 2];
                             }
-                            iIndex+=inputDims[3];
+                            iIndex += inputDims[3];
                             for (std::size_t oy = 0; oy < oySize; ++oy) {
-                                output[oIndex + oy] += weights[wIndex+6]*input[iIndex+oy]+weights[wIndex+7]*input[iIndex+oy+1]+weights[wIndex+8]*input[iIndex+oy+2];
+                                output[oIndex + oy] +=
+                                    weights[wIndex + 6] * input[iIndex + oy] +
+                                    weights[wIndex + 7] *
+                                        input[iIndex + oy + 1] +
+                                    weights[wIndex + 8] *
+                                        input[iIndex + oy + 2];
                             }
                         }
                     } else {
-                        for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex+=(strideDims[0]-2)*inputDims[3]) {
+                        for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox,
+                                         oIndex += oySize,
+                                         iIndex += (strideDims[0] -
+                                                    2) * inputDims[3]) {
                             for (std::size_t oy = 0; oy < oySize; ++oy) {
-                                output[oIndex + oy] += weights[wIndex+0]*input[iIndex+oy*strideDims[1]]+weights[wIndex+1]*input[iIndex+oy*strideDims[1]+1]+weights[wIndex+2]*input[iIndex+oy*strideDims[1]+2];
+                                output[oIndex + oy] +=
+                                    weights[wIndex + 0] *
+                                        input[iIndex + oy * strideDims[1]] +
+                                    weights[wIndex + 1] *
+                                        input[iIndex + oy * strideDims[1] +
+                                              1] +
+                                    weights[wIndex + 2] *
+                                        input[iIndex + oy * strideDims[1] + 2];
                             }
-                            iIndex+=inputDims[3];
+                            iIndex += inputDims[3];
                             for (std::size_t oy = 0; oy < oySize; ++oy) {
-                                output[oIndex + oy] += weights[wIndex+3]*input[iIndex+oy*strideDims[1]]+weights[wIndex+4]*input[iIndex+oy*strideDims[1]+1]+weights[wIndex+5]*input[iIndex+oy*strideDims[1]+2];
+                                output[oIndex + oy] +=
+                                    weights[wIndex + 3] *
+                                        input[iIndex + oy * strideDims[1]] +
+                                    weights[wIndex + 4] *
+                                        input[iIndex + oy * strideDims[1] +
+                                              1] +
+                                    weights[wIndex + 5] *
+                                        input[iIndex + oy * strideDims[1] + 2];
                             }
-                            iIndex+=inputDims[3];
+                            iIndex += inputDims[3];
                             for (std::size_t oy = 0; oy < oySize; ++oy) {
-                                output[oIndex + oy] += weights[wIndex+6]*input[iIndex+oy*strideDims[1]]+weights[wIndex+7]*input[iIndex+oy*strideDims[1]+1]+weights[wIndex+8]*input[iIndex+oy*strideDims[1]+2];
+                                output[oIndex + oy] +=
+                                    weights[wIndex + 6] *
+                                        input[iIndex + oy * strideDims[1]] +
+                                    weights[wIndex + 7] *
+                                        input[iIndex + oy * strideDims[1] +
+                                              1] +
+                                    weights[wIndex + 8] *
+                                        input[iIndex + oy * strideDims[1] + 2];
                             }
                         }
                     }
@@ -532,18 +574,26 @@ void ConvImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
             for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
                 // If bias = nullptr, set B(0)
                 B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
-                std::fill(output, output+outChannels_s, biasVal);
+                std::fill(output, output + outChannels_s, biasVal);
                 for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
-                    std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3];
-                    const std::size_t wIndex = (inCh + outCh*inputDims[1]);
+                    std::size_t iIndex = (inCh + batch * inputDims[1]) *
+                                         inputDims[2] * inputDims[3];
+                    const std::size_t wIndex = (inCh + outCh * inputDims[1]);
                     if (strideDims[0] == 1 && strideDims[1] == 1) {
-                        for (std::size_t oIndex = 0; oIndex < oxSize*oySize; ++oIndex, ++iIndex) {
+                        for (std::size_t oIndex = 0; oIndex < oxSize * oySize;
+                             ++oIndex, ++iIndex) {
                             output[oIndex] += weights[wIndex] * input[iIndex];
                         }
-                    } else {
-                        for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex+=inputDims[3]*strideDims[0]) {
-                            for (std::size_t oy = 0, iy = 0; oy < oySize; ++oy, iy+=strideDims[1]) {
-                                output[oIndex + oy] += weights[wIndex+0]*input[iIndex+iy];
+                    } else {
+                        for (std::size_t ox = 0, oIndex = 0; ox < oxSize;
+                             ++ox,
+                             oIndex += oySize,
+                             iIndex +=
+                                 inputDims[3] * strideDims[0]) {
+                            for (std::size_t oy = 0, iy = 0; oy < oySize;
+                                 ++oy, iy += strideDims[1]) {
+                                output[oIndex + oy] +=
+                                    weights[wIndex + 0] * input[iIndex + iy];
                             }
                         }
                     }
@@ -556,21 +606,36 @@ void ConvImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
             for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
                 // If bias = nullptr, set B(0)
                 B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
-                std::fill(output, output+outChannels_s, biasVal);
+                std::fill(output, output + outChannels_s, biasVal);
                 for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
-                    std::size_t iIndex_channel = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3];
-                    const std::size_t wIndex = (inCh + outCh*inputDims[1]) * kernelDims[0] * kernelDims[1];
+                    std::size_t iIndex_channel =
+                        (inCh + batch * inputDims[1]) * inputDims[2] *
+                        inputDims[3];
+                    const std::size_t wIndex = (inCh + outCh * inputDims[1]) *
+                                               kernelDims[0] * kernelDims[1];
                     // loop over each ouput line
-                    for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex_channel+=inputDims[3]*strideDims[0]) {
+                    for (std::size_t ox = 0, oIndex = 0; ox < oxSize;
+                         ++ox,
+                         oIndex += oySize,
+                         iIndex_channel +=
+                             inputDims[3] * strideDims[0]) {
                         // loop over associated input line
-                        for (std::size_t ky = 0, ix = 0; ky < kernelDims[0]; ++ky, ix += inputDims[3]*dilationDims[0]) {
+                        for (std::size_t ky = 0, ix = 0; ky < kernelDims[0];
+                             ++ky, ix += inputDims[3] * dilationDims[0]) {
                             // loop over the entire line
-                            for (std::size_t oy = 0, iy = 0; oy < oySize; ++oy, iy+=strideDims[1]) {
-                                const std::size_t iIndex = iIndex_channel + ix + iy;
-                                // loop over elements assosicated with one output
-                                for (std::size_t kx = 0; kx < kernelDims[0]; ++kx) {
-                                    output[oIndex + oy] += weights[wIndex+kernelDims[0]*ky+kx]*input[iIndex+kx*dilationDims[1]];
+                            for (std::size_t oy = 0, iy = 0; oy < oySize;
+                                 ++oy, iy += strideDims[1]) {
+                                const std::size_t iIndex =
+                                    iIndex_channel + ix + iy;
+                                // loop over elements assosicated with one
+                                // output
+                                for (std::size_t kx = 0; kx < kernelDims[0];
+                                     ++kx) {
+                                    output[oIndex + oy] +=
+                                        weights[wIndex + kernelDims[0] * ky +
+                                                kx] *
+                                        input[iIndex + kx * dilationDims[1]];
                                 }
                             }
                         }
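
For reference, below is a minimal standalone sketch (not part of the patch above) of the output-size formula that ConvImpl2D_cpu_forward_kernel relies on in every branch: the dilated kernel extent is dilation * (kernel - 1) + 1, and the "valid" output size is floor((in - dilated_kernel + stride) / stride). The helper name convOutputSize and the sample dimensions are illustrative only.

#include <array>
#include <cmath>
#include <cstddef>
#include <iostream>

// Same formula as in the kernel above: effective (dilated) kernel extent is
// dilation*(kernel-1)+1, output size is floor((in - dilatedKernel + stride) / stride).
static std::size_t convOutputSize(std::size_t in,
                                  std::size_t kernel,
                                  std::size_t stride,
                                  std::size_t dilation) {
    const std::size_t dilatedKernel = dilation * (kernel - 1) + 1;
    return static_cast<std::size_t>(
        std::floor(static_cast<float>(in - dilatedKernel + stride) /
                   static_cast<float>(stride)));
}

int main() {
    // Hypothetical 32x32 input, 3x3 kernel, stride 2, dilation 1.
    const std::array<std::size_t, 2> kernelDims{3, 3};
    const std::array<std::size_t, 2> strideDims{2, 2};
    const std::array<std::size_t, 2> dilationDims{1, 1};
    const std::size_t oxSize =
        convOutputSize(32, kernelDims[0], strideDims[0], dilationDims[0]);
    const std::size_t oySize =
        convOutputSize(32, kernelDims[1], strideDims[1], dilationDims[1]);
    std::cout << "output: " << oxSize << " x " << oySize << "\n"; // 15 x 15
    return 0;
}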