diff --git a/include/aidge/backend/cpu/operator/ConvImpl.hpp b/include/aidge/backend/cpu/operator/ConvImpl.hpp index c06d0912f419909013f930867ce3c3238c1a5555..8bf11ac0ad6afff62a79f0b6d9a0e876daf569b2 100644 --- a/include/aidge/backend/cpu/operator/ConvImpl.hpp +++ b/include/aidge/backend/cpu/operator/ConvImpl.hpp @@ -13,45 +13,63 @@ #define AIDGE_CPU_OPERATOR_CONVIMPL_H_ #include <array> -#include <memory> -#include <tuple> -#include <vector> #include "aidge/backend/cpu/operator/OperatorImpl.hpp" #include "aidge/operator/Conv.hpp" #include "aidge/utils/Registrar.hpp" #include "aidge/utils/Types.h" -#include "aidge/backend/cpu/data/GetCPUPtr.h" namespace Aidge { // Operator implementation entry point for the backend using Conv1D_Op = Conv_Op<1>; using ConvImpl1D_cpu = OperatorImpl_cpu<Conv_Op<1>, - void(const std::array<DimSize_t, 1>&, - const std::array<DimSize_t, 1>&, - const std::array<DimSize_t, 1>&, - const std::array<DimSize_t, 3> &, - DimSize_t, - const void *, - const void *, - const void *, - void *)>; + void(const std::array<DimSize_t, 1> &, + const std::array<DimSize_t, 1> &, + const std::array<DimSize_t, 1> &, + const std::array<DimSize_t, 3> &, + DimSize_t, + const void *, + const void *, + const void *, + void *), + void(const std::array<DimSize_t, 1> &, + const std::array<DimSize_t, 1> &, + const std::array<DimSize_t, 1> &, + const std::array<DimSize_t, 3> &, + const std::array<DimSize_t, 3> &, + const void *, + const void *, + const void *, + void *, + void *, + void *)>; using Conv2D_Op = Conv_Op<2>; -using ConvImpl2D_cpu = OperatorImpl_cpu<Conv_Op<2>, - void(const std::array<DimSize_t, 2>&, - const std::array<DimSize_t, 2>&, - const std::array<DimSize_t, 2>&, - const std::array<DimSize_t, 4> &, - DimSize_t, - const void *, - const void *, - const void *, - void *)>; +using ConvImpl2D_cpu = OperatorImpl_cpu<Conv2D_Op, + void(const std::array<DimSize_t, 2> &, + const std::array<DimSize_t, 2> &, + const std::array<DimSize_t, 2> &, + const 
std::array<DimSize_t, 4> &, + DimSize_t, + const void *, + const void *, + const void *, + void *), + void(const std::array<DimSize_t, 2> &, + const std::array<DimSize_t, 2> &, + const std::array<DimSize_t, 2> &, + const std::array<DimSize_t, 4> &, + const std::array<DimSize_t, 4> &, + const void *, + const void *, + const void *, + void *, + void *, + void *)>; // Implementation entry point registration to Operator REGISTRAR(Conv1D_Op, "cpu", Aidge::ConvImpl1D_cpu::create); REGISTRAR(Conv2D_Op, "cpu", Aidge::ConvImpl2D_cpu::create); -} // namespace Aidge +} // namespace Aidge #endif /* AIDGE_CPU_OPERATOR_CONVIMPL_H_ */ diff --git a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp index 1229d5714e6b0cbae4e42ece9130c2c2305f133e..703772605657d5d5eeb06df80ec4aa9341366de5 100644 --- a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp @@ -25,6 +25,8 @@ #include "aidge/backend/cpu/data/GetCPUPtr.h" namespace Aidge { +using std::array; + /** * @brief Forward kernel for 1D Convolution on CPU backend. * @tparam I Input data type. 
@@ -85,9 +87,80 @@ void ConvImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 1>& strideDims, const std::size_t oIndexFull = oIndex + ox; const signedsize ix = static_cast<signedsize>(ox * strideDims[0]); - for (std::size_t sx = sxMin; sx*dilationDims[0] < sxMax; ++sx) { - output[oIndexFull] += weights[wIndex + sx] * - input[iIndex + static_cast<std::size_t>(ix+static_cast<signedsize>(sx*dilationDims[0]))]; +/** + * @brief perform 1D backpropagation for the data input + * @note INPUT & OUTPUT convention is the same as in the + * forward function + * @note formula : + * for i in 0..input_size: + * for n in 0..weight_size: + * dL dYn dL + * ---- = ---- ---- + * dXi dXi Yn + * with : dYn / dXi = w_k + * for each input value + * for each weight + * for each output + * multiply the weight with the associated value + * @note kernel & stride are passed as single integers as they are just arrays + * of length 1 + * @note reminder that kernel dimensions are + * {outChannels, inChannels, {kernelDims}} + * <=> {oDims[1], iDims[1], kernelDim} + * @tparam I Input data type. + * @tparam W Weight data type. + * @tparam O Output data type. 
+ * @param[in] stride stride parameter of the convolution operator + * @param[in] dilation dilation parameter of the convolution operator + * @param[in] kDims dimension of the kernel + * @param[in] kStrides nb of elements contained per dimension of the kernel + * @param[in] weights kernel weights + * @param[in] oDims dimensions of the output + * @param[in] oStrides nb of elements contained per dimension of the output + * @param[in] oGrad output gradient + * @param[in] iDims input dimensions + * @param[in] iStrides nb of elements contained per dimension of the input + * @param[inout] iGrad gradients of the input to update + */ +template <class I, class W, class O> +void conv1DBackwardInput(const DimSize_t &stride, + const DimSize_t &dilation, + const DimSize_t &kDim, + const array<DimSize_t, 2> &kStrides, + const W *weights, + const array<DimSize_t, 3> &oDims, + const array<DimSize_t, 2> &oStrides, + const O *oGrad, + const array<DimSize_t, 3> &iDims, + const array<DimSize_t, 2> &iStrides, + I *iGrad) { + + array<DimSize_t, 2> iOffsets{0, 0}; + array<DimSize_t, 2> oOffsets{0, 0}; + array<DimSize_t, 2> kOffsets{0, 0}; + + for (std::size_t batch = 0; batch < iDims[0]; ++batch) { + iOffsets[0] = batch * iStrides[0]; + oOffsets[0] = batch * oStrides[0]; + + for (DimSize_t oChannel = 0; oChannel < oDims[1]; oChannel++) { + oOffsets[1] = (oChannel * oStrides[1]) + oOffsets[0]; + kOffsets[0] = oChannel * kStrides[0]; + + for (std::size_t iChannel = 0; iChannel < iDims[1]; ++iChannel) { + iOffsets[1] = (iChannel * iStrides[1]) + iOffsets[0]; + kOffsets[1] = iChannel * kStrides[1] + kOffsets[0]; + + for (DimSize_t oX = 0; oX < oDims[2]; ++oX) { + auto iX = oX * stride[0]; + auto inIdx = iX + iOffsets[1]; + + for (DimSize_t kX = 0; kX < kDim[0]; ++kX) { + auto dilatedKernelIdx = kX * dilation[0]; + + iGrad[inIdx + dilatedKernelIdx] += + weights[kOffsets[1] + kX] * + oGrad[oOffsets[1] + oX]; } } } @@ -95,20 +168,261 @@ void ConvImpl1D_cpu_forward_kernel(const 
std::array<DimSize_t, 1>& strideDims, } } +/** + * @brief computes weight backpropagation for conv1D + * @note INPUT & OUTPUT convention is the same as in the + * forward function + * weight grad + * for i in 0..weight_size: + * for n in 0..output_size: + * dL dYn dL + * ---- = ---- ---- + * dwi dwi Yn + * with : dYn / dwi = x_k + * @tparam I Input data type. + * @tparam W Weight data type. + * @tparam O Output data type. + * @param[in] stride stride parameter of the convolution operator + * @param[in] dilation dilation parameter of the convolution operator + * @param[in] iDims input dimensions + * @param[in] iStrides nb of elements contained per dimension of the input + * @param[inout] iGrad gradients of the input to update + * @param[in] oDims dimensions of the output + * @param[in] oStrides nb of elements contained per dimension of the output + * @param[in] oGrad output gradient + * @param[in] kDims dimension of the kernel + * @param[in] kStrides nb of elements contained per dimension of the kernel + * @param[in] weights kernel weights + */ +template <class I, class W, class O> +static void conv1DBackwardWeights(const array<DimSize_t, 1> &stride, + const array<DimSize_t, 1> &dilation, + const array<DimSize_t, 3> &iDims, + const array<DimSize_t, 2> iStrides, + const I *input, + const array<DimSize_t, 3> &oDims, + const array<DimSize_t, 2> oStrides, + const O *oGrad, + const array<DimSize_t, 1> &kDim, + const array<DimSize_t, 2> kStrides, + W *weightsGrad) { + + array<DimSize_t, 2> iOffsets{0, 0}; + array<DimSize_t, 2> oOffsets{0, 0}; + array<DimSize_t, 2> kOffsets{0, 0}; + + for (DimSize_t batch = 0; batch < oDims[0]; ++batch) { + iOffsets[0] = batch * iStrides[0]; + oOffsets[0] = batch * oStrides[0]; + + for (DimSize_t oChannel = 0; oChannel < oDims[1]; ++oChannel) { + oOffsets[1] = oChannel * oStrides[1] + oOffsets[0]; + kOffsets[0] = oChannel * kStrides[0]; + + for (DimSize_t iChannel = 0; iChannel < iDims[1]; ++iChannel) { + kOffsets[1] = iChannel * 
kStrides[1] + kOffsets[0]; + iOffsets[1] = iChannel * iStrides[1] + iOffsets[0]; + oOffsets[1] = oChannel * oStrides[1] + oOffsets[0]; + + for (DimSize_t kX = 0; kX < kDim[0]; ++kX) { + + for (DimSize_t oX = 0; oX < oDims[2]; ++oX) { + const DimSize_t iX = oX * stride[0] + kX * dilation[0] ; + + weightsGrad[kOffsets[1] + kX] += + input[iOffsets[1] + iX] * oGrad[oOffsets[1] + oX]; + } + } + } + } + } +} + +/** + * @brief computes bias backpropagation for conv1D operation + * @note INPUT & OUTPUT convention is the same as in the + * forward function + * @note formula : + * Bias grad: + * for i in 0..bias_size: + * for n in 0..output_size: + * dL dYn dL + * ---- = ---- ---- + * dbi dbi Yn + * with : dYn / dbi = 1 + * + * Hence the partial derivative of the loss wrt bias is the + * output loss. Hence the bias grad is just the sum of the + * loss values over the batch + * @tparam I Input data type. + * @tparam W Weight data type. + * @tparam B Bias data type. + * @tparam O Output data type. + * @param[in] oDims output tensor dimensions + * @param[in] oStrides nb of elements contained per dimension of the output + * tensor + * @param[in] oGrad output tensor gradients + * @param[inout] biasesGrad biases gradients + */ +template <class B, class O> +static void conv1DBackwardBias(const array<DimSize_t, 3> &oDims, + const array<DimSize_t, 2> &oStrides, + const O *oGrad, + B *biasesGrad) { + array<DimSize_t, 2> oOffsets{0, 0}; + + for (DimSize_t batchIdx = 0; batchIdx < oDims[0]; ++batchIdx) { + oOffsets[0] = batchIdx * oStrides[0]; + + for (DimSize_t oChannel = 0; oChannel < oDims[1]; ++oChannel) { + oOffsets[1] = oChannel * oStrides[1] + oOffsets[0]; + + for (DimSize_t oIdx = 0; oIdx < oDims[2]; oIdx++) { + biasesGrad[oChannel] += oGrad[oOffsets[1] + oIdx]; + } + } + } +} + +/** + * @brief Backward kernel for 1D Convolution on CPU backend. + * @note INPUT & OUTPUT convention is the same as in the + * forward function + * + * @tparam I Input data type. 
+ * @tparam W Weight data type. + * @tparam B Bias data type. + * @tparam O Output data type. + * @param[in] const stride + * @param[in] const kernelDims + * @param[in] const iDims input data dimensions + * @param[in] const oDims output data dimmensions + * @param[in] const oChannels output channel number + * @param[in] const input_ const input Tensor. + * @param[in] const weights_ const weight Tensor. + * @param[in] const biases_ const Biais Tensor. + * @param[in] const output_ Output Tensor. + * @param[in] const oGrad_ gradients of output data + * @param[inout] iGrad_ gradients of input data + * @param[inout] weightsGrad_ gradients of the kernel weights + * @param[inout] biasesGrad_ gradients of the kernel biases + */ +template <class I, class W, class B, class O> +void ConvImpl1D_cpu_backward_kernel(const array<DimSize_t,1> &stride, + const array<DimSize_t,1> &dilation, + const array<DimSize_t,1> &kernelDim, + const array<DimSize_t, 3> &inputDims, + const array<DimSize_t, 3> &outputDims, + const void *input_, + const void *weights_, + const void *oGrad_, + void *iGrad_, + void *weightsGrad_, + void *biasesGrad_) { + + const I *input = static_cast<const I *>(input_); + I *iGrad = static_cast<I *>(iGrad_); + const I *oGrad = static_cast<const I *>(oGrad_); + const W *weights = static_cast<const W *>(weights_); + W *weightsGrad = static_cast<W *>(weightsGrad_); + + ////////////////////////////// + // COMPUTING STRIDES + ////////////////////////////// + // NOTE: The ...Stride var represent the number of values contained in + // each dimension they will be used to compute the index offset of + // values while iterating on each tensor + // NOTE: They are 1 item shorter than their corresponding tensor as the + // number of total elements is not used except for gradient initialization + + // {batch_stride, channel_stride, dim0_stride, dim1_stride} + const array<DimSize_t, 2> inputStrides{inputDims[1] * inputDims[2], + inputDims[2]}; + const DimSize_t nbEltsInput = 
inputDims[0] * inputStrides[0]; + + // {batch_stride, channel_stride, dim0_stride, dim1_stride} + const array<DimSize_t, 2> outputStrides{outputDims[1] * outputDims[2], + outputDims[2]}; + + // NOTE: kernel dims = {iChannel, oChannel, kernelDim0, kernelDim1} + // kernel_strides = {iChannel, oChannel, kernelDim0} + const array<DimSize_t, 2> kernelStrides{ + inputDims[1] * kernelDim[0], + kernelDim[0], + }; + const DimSize_t nbEltsKernel = outputDims[1] * kernelStrides[0]; + + std::fill(iGrad, iGrad + nbEltsInput, I(0)); + std::fill(weightsGrad, weightsGrad + nbEltsKernel, W(0)); + + conv1DBackwardInput(stride, + dilation, + kernelDim, + kernelStrides, + weights, + outputDims, + outputStrides, + oGrad, + inputDims, + inputStrides, + iGrad); + + conv1DBackwardWeights(stride, + dilation, + inputDims, + inputStrides, + input, + outputDims, + outputStrides, + oGrad, + kernelDim, + kernelStrides, + weightsGrad); + + if (biasesGrad_ != nullptr) { + B *biasesGrad = static_cast<B *>(biasesGrad_); + std::fill(biasesGrad, biasesGrad + outputDims[1], B(0)); + conv1DBackwardBias(outputDims, outputStrides, oGrad, biasesGrad); + } +} + // Kernels registration to implementation entry point REGISTRAR(ConvImpl1D_cpu, - {{DataType::Any, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}}, - {ProdConso::inPlaceModel, Aidge::ConvImpl1D_cpu_forward_kernel<float, float, float, float>, nullptr}); + {{DataType::Any, DataFormat::NCHW}, + {DataType::Float32, DataFormat::NCHW}}, + {ProdConso::inPlaceModel, + ConvImpl1D_cpu_forward_kernel<float, float, float, float>, + ConvImpl1D_cpu_backward_kernel<float, float, float, float>}); REGISTRAR(ConvImpl1D_cpu, - {{DataType::Any, DataFormat::NCHW}, {DataType::Float16, DataFormat::NCHW}}, - {ProdConso::inPlaceModel, Aidge::ConvImpl1D_cpu_forward_kernel<half_float::half, half_float::half, half_float::half, half_float::half>, nullptr}); + {{DataType::Any, DataFormat::NCHW}, + {DataType::Float16, DataFormat::NCHW}}, + {ProdConso::inPlaceModel, + 
ConvImpl1D_cpu_forward_kernel<half_float::half, + half_float::half, + half_float::half, + half_float::half>, + ConvImpl1D_cpu_backward_kernel<half_float::half, + half_float::half, + half_float::half, + half_float::half>}); REGISTRAR(ConvImpl1D_cpu, - {{DataType::Any, DataFormat::NCHW}, {DataType::Int32, DataFormat::NCHW}}, - {ProdConso::inPlaceModel, Aidge::ConvImpl1D_cpu_forward_kernel<int32_t, int32_t, int32_t, int32_t>, nullptr}); + {{DataType::Any, DataFormat::NCHW}, + {DataType::Float64, DataFormat::NCHW}}, + {ProdConso::inPlaceModel, + ConvImpl1D_cpu_forward_kernel<double, double, double, double>, + ConvImpl1D_cpu_backward_kernel<double, double, double, double>}); REGISTRAR(ConvImpl1D_cpu, - {{DataType::Any, DataFormat::NCHW}, {DataType::Float64, DataFormat::NCHW}}, - {ProdConso::inPlaceModel, Aidge::ConvImpl1D_cpu_forward_kernel<double, double, double, double>, nullptr}); - + {{DataType::Any, DataFormat::NCHW}, + {DataType::Int32, DataFormat::NCHW}}, + {ProdConso::inPlaceModel, + ConvImpl1D_cpu_forward_kernel<std::int32_t, + std::int32_t, + std::int32_t, + std::int32_t>, + ConvImpl1D_cpu_backward_kernel<std::int32_t, + std::int32_t, + std::int32_t, + std::int32_t>}); /** * @brief Forward kernel for 2D Convolution on CPU backend. 
@@ -256,21 +570,380 @@ void ConvImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims, } } +/** + * @brief perform backpropagation for the input + * @note INPUT & OUTPUT convention is the same as in the + * forward function + * @note formula : + * for i in 0..input_size: + * for n in 0..weight_size: + * dL dYn dL + * ---- = ---- ---- + * dXi dXi Yn + * with : dYn / dXi = w_k + * for each input value + * for each weight + * for each output + * multiply the weight with the associated value + * @note kernel & stride are passed as single integers as they are just arrays + * of length 1 + * @note reminder that kernel dimensions are + * {outChannels, inChannels, {kernelDims}} + * <=> {oDims[1], iDims[1], kernelDim} + * @tparam I Input data type. + * @tparam W Weight data type. + * @tparam O Output data type. + * @param[in] stride stride parameter of the convolution operator + * @param[in] dilation dilation parameter of the convolution operator + * @param[in] kDims dimension of the kernel + * @param[in] kStrides nb of elements contained per dimension of the kernel + * @param[in] weights weights values + * @param[in] oDims dimensions of the output + * @param[in] oStrides nb of elements contained per dimension of the output + * @param[in] oGrad output gradient + * @param[in] iDims input dimensions + * @param[in] iStrides nb of elements contained per dimension of the input + * @param[inout] iGrad gradients of the input to update + */ +template <class I, class W, class O> +void conv2DBackwardInput(const array<DimSize_t, 2> &stride, + const array<DimSize_t, 2> &dilation, + const array<DimSize_t, 2> &kDims, + const array<DimSize_t, 3> &kStrides, + const W *weights, + const array<DimSize_t, 4> &oDims, + const array<DimSize_t, 3> &oStrides, + const O *oGrad, + const array<DimSize_t, 4> &iDims, + const array<DimSize_t, 3> &iStrides, + I *iGrad) { + // records index offsets for each dimension that have a stride (== all + // dimension except the last) for every parsed 
tensor + array<DimSize_t, 3> kOffset{}; + array<DimSize_t, 3> iOffset{}; + array<DimSize_t, 3> oOffset{}; + + for (std::size_t batch = 0; batch < iDims[0]; ++batch) { + iOffset[0] = batch * iStrides[0]; + oOffset[0] = batch * oStrides[0]; + + for (DimSize_t oChannel = 0; oChannel < oDims[1]; oChannel++) { + oOffset[1] = (oChannel * oStrides[1]) + oOffset[0]; + kOffset[0] = (oChannel * kStrides[0]); + for (std::size_t iChannel = 0; iChannel < iDims[1]; ++iChannel) { + iOffset[1] = (iChannel * iStrides[1]) + iOffset[0]; + kOffset[1] = iChannel * kStrides[1] + kOffset[0]; + + for (DimSize_t oX = 0; oX < oDims[2]; ++oX) { + oOffset[2] = (oX * oStrides[2]) + oOffset[1]; + + auto iX = oX * stride[0]; + iOffset[2] = (iX * iStrides[2]) + iOffset[1]; + + for (DimSize_t oY = 0; oY < oDims[3]; ++oY) { + auto oIdx = oOffset[2] + oY; + + auto iY = oY * stride[1]; + auto iIdx = iOffset[2] + iY; + + for (DimSize_t kX = 0; kX < kDims[0]; ++kX) { + auto kDilX = kX * dilation[0]; + auto iDilKXOffset = kDilX * iStrides[2]; + + kOffset[2] = (kX * kStrides[2]) + kOffset[1]; + + for (DimSize_t kY = 0; kY < kDims[1]; ++kY) { + auto kDilY = kY * dilation[1]; + + iGrad[iIdx + iDilKXOffset + kDilY] += + weights[kOffset[2] + kY] * oGrad[oIdx]; + } + } + } + } + } + } + } +} + +/** + * @brief computes weight backpropagation for conv2D operation + * @note INPUT & OUTPUT convention is the same as in the + * forward function + * weight grad + * for i in 0..weight_size: + * for n in 0..output_size: + * dL dYn dL + * ---- = ---- ---- + * dwi dwi Yn + * with : dYn / dwi = x_k + * @tparam I input dtype + * @tparam W weight dtype + * @tparam O output dtype + * @param[in] iDims input data dimensions + * @param[in] iBatchStride nb element in each input data batch + * @param[in] iChannelStride nb element in each input data channel + * @param[in] input input data + * @param[in] oDims output data dimmensions + * @param[in] oBatchStride nb element in each output data batch + * @param[in] oChannelStride nb 
element in each output data channel + * @param[in] oGrad gradients of output data + * @param[in] stride + * @param[in] kernelDims + * @param[inout] weightsGrad gradients of the kernel weights + */ +template <class I, class W, class O> +void conv2DBackwardWeights(const array<DimSize_t, 4> &iDims, + const array<DimSize_t, 3> &iStrides, + const I *input, + const array<DimSize_t, 4> &oDims, + const array<DimSize_t, 3> &oStrides, + const O *oGrad, + const array<DimSize_t, 2> &kDim, + const array<DimSize_t, 3> &kStrides, + const array<DimSize_t, 2> &stride, + const array<DimSize_t, 2> &dilation, + W *weightsGrad) { + // records index offsets for each dimension that have a stride (== all + // dimension except the last) for every parsed tensor + array<DimSize_t, 3> iOffsets{0, 0, 0}; + array<DimSize_t, 3> oOffsets{0, 0, 0}; + array<DimSize_t, 3> kOffsets{0, 0, 0}; + + for (DimSize_t batchIdx = 0; batchIdx < oDims[0]; ++batchIdx) { + iOffsets[0] = batchIdx * iStrides[0]; + oOffsets[0] = batchIdx * oStrides[0]; + + for (DimSize_t iChannel = 0; iChannel < iDims[1]; ++iChannel) { + iOffsets[1] = iChannel * iStrides[1] + iOffsets[0]; + kOffsets[0] = iChannel * kStrides[0]; + + for (DimSize_t oChannel = 0; oChannel < oDims[1]; ++oChannel) { + oOffsets[1] = oChannel * oStrides[1] + oOffsets[0]; + kOffsets[1] = oChannel * kStrides[1] + kOffsets[0]; + + for (DimSize_t kX = 0; kX < kDim[0]; ++kX) { + kOffsets[2] = kX * kStrides[2] + kOffsets[1]; + for (DimSize_t kY = 0; kY < kDim[1]; ++kY) { + + for (DimSize_t oX = 0; oX < oDims[2]; ++oX) { + const DimSize_t iX = + oX * stride[0] + kX * dilation[0]; + + oOffsets[2] = oX * oStrides[2] + oOffsets[1]; + iOffsets[2] = iX * iStrides[2] + iOffsets[1]; + + for (DimSize_t oY = 0; oY < oDims[3]; ++oY) { + const DimSize_t iY = + oY * stride[1] + kY * dilation[1]; + + weightsGrad[kOffsets[2] + kY] += + input[iOffsets[2] + iY] * + oGrad[oOffsets[2] + oY]; + } + } + } + } + } + } + } +} + +/** + * @brief computes bias backpropagation for conv2D 
operation + * @note INPUT & OUTPUT convention is the same as in the + * forward function + * @note formula : + * Bias grad: + * for i in 0..bias_size: + * for n in 0..output_size: + * dL dYn dL + * ---- = ---- ---- + * dbi dbi Yn + * with : dYn / dbi = 1 + * + * Hence the partial derivative of the loss wrt bias is the + * output loss Hence the bias grad is just the sum of the + * loss values over the batch + * @tparam I Input data type. + * @tparam W Weight data type. + * @tparam B Bias data type. + * @tparam O Output data type. + * @param[in] oDims output tensor dimensions + * @param[in] oStrides nb of elements contained per dimension of the + * output + * @param[in] oGrad output tensor gradients + * @param[inout] biasesGrad biases gradients + */ +template <class B, class O> +static void conv2DBackwardBias(const array<DimSize_t, 4> &oDims, + const array<DimSize_t, 3> &oStrides, + const O *oGrad, + B *biasesGrad) { + // records all index offsets for output tensor + array<DimSize_t, 3> oOffsets{}; + for (DimSize_t batchIdx = 0; batchIdx < oDims[0]; ++batchIdx) { + oOffsets[0] = batchIdx * oStrides[0]; + + for (DimSize_t oChannel = 0; oChannel < oDims[1]; ++oChannel) { + oOffsets[1] = oChannel * oStrides[1] + oOffsets[0]; + + for (DimSize_t oX = 0; oX < oDims[2]; ++oX) { + oOffsets[2] = oX * oStrides[2] + oOffsets[1]; + + for (DimSize_t oY = 0; oY < oDims[3]; ++oY) { + biasesGrad[oChannel] += oGrad[oOffsets[2] + oY]; + } + } + } + } +} + +/** + * @brief Backward kernel for 2D Convolution on CPU backend. + * @note INPUT & OUTPUT convention is the same as in the + * forward function + * + * @tparam I Input data type. + * @tparam W Weight data type. + * @tparam B Bias data type. + * @tparam O Output data type. 
+ * @param[in] const stride attribute of conv operator + * @param[in] const dilation attribute of conv operator + * @param[in] const kernelDims + * @param[in] const iDims input data dimensions + * @param[in] const oDims output data dimmensions + * @param[in] const input_ input tensor. + * @param[in] const weights_ kernel tensor. + * @param[in] const oGrad_ output tensor gradient. + * @param[inout] iGrad_ input tensor gradient. + * @param[inout] weightsGrad_ kernel weights tensor gradients + * @param[inout] biasesGrad_ kernel biases tensor gradients + */ +template <class I, class W, class B, class O> +void ConvImpl2D_cpu_backward_kernel(const array<DimSize_t, 2> &stride, + const array<DimSize_t, 2> &dilation, + const array<DimSize_t, 2> &kernelDims, + const array<DimSize_t, 4> &inputDims, + const array<DimSize_t, 4> &outputDims, + const void *input_, + const void *weights_, + const void *oGrad_, + void *iGrad_, + void *weightsGrad_, + void *biasesGrad_) { + + const I *input = static_cast<const I *>(input_); + I *iGrad = static_cast<I *>(iGrad_); + const I *outputGrad = static_cast<const I *>(oGrad_); + const W *weights = static_cast<const W *>(weights_); + W *weightsGrad = static_cast<W *>(weightsGrad_); + + ////////////////////////////// + // COMPUTING STRIDES + ////////////////////////////// + // NOTE: The ...Stride var represent the number of values contained in + // each dimension they will be used to compute the index offset of + // values while iterating on each tensor + // NOTE: They are 1 item shorter than their corresponding tensor as the + // number of total elements is not used except for gradient initialization + + // {batch_stride, channel_stride, dim0_stride, dim1_stride} + const array<DimSize_t, 3> inputStrides{ + inputDims[1] * inputDims[2] * inputDims[3], + inputDims[2] * inputDims[3], + inputDims[3]}; + const DimSize_t nbEltsInput = inputDims[0] * inputStrides[0]; + + // {batch_stride, channel_stride, dim0_stride, dim1_stride} + const 
array<DimSize_t, 3> outputStrides{ + outputDims[1] * outputDims[2] * outputDims[3], + outputDims[2] * outputDims[3], + outputDims[3]}; + + // NOTE: kernel dims = {iChannel, oChannel, kernelDim0, kernelDim1} + // kernel_strides = {iChannel, oChannel, kernelDim0} + const array<DimSize_t, 3> kernelStrides{ + inputDims[1] * kernelDims[0] * kernelDims[1], + kernelDims[0] * kernelDims[1], + kernelDims[1]}; + + const DimSize_t nbEltsKernel = outputDims[1] * kernelStrides[0]; + + //////////////////////////// + // prepping gradient arrays + std::fill(iGrad, iGrad + nbEltsInput, I(0)); + std::fill(weightsGrad, weightsGrad + nbEltsKernel, W(0)); + + conv2DBackwardInput(stride, + dilation, + kernelDims, + kernelStrides, + weights, + outputDims, + outputStrides, + outputGrad, + inputDims, + inputStrides, + iGrad); + + conv2DBackwardWeights(inputDims, + inputStrides, + input, + outputDims, + outputStrides, + outputGrad, + kernelDims, + kernelStrides, + stride, + dilation, + weightsGrad); + + if (biasesGrad_ != nullptr) { + B *biasesGrad = static_cast<B *>(biasesGrad_); + std::fill(biasesGrad, biasesGrad + outputDims[1], B(0)); + conv2DBackwardBias(outputDims, outputStrides, outputGrad, biasesGrad); + } +} // Kernels registration to implementation entry point REGISTRAR(ConvImpl2D_cpu, - {{DataType::Any, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}}, - {ProdConso::inPlaceModel, Aidge::ConvImpl2D_cpu_forward_kernel<float, float, float, float>, nullptr}); -REGISTRAR(ConvImpl2D_cpu, - {{DataType::Any, DataFormat::NCHW}, {DataType::Float16, DataFormat::NCHW}}, - {ProdConso::inPlaceModel, Aidge::ConvImpl2D_cpu_forward_kernel<half_float::half, half_float::half, half_float::half, half_float::half>, nullptr}); -REGISTRAR(ConvImpl2D_cpu, - {{DataType::Any, DataFormat::NCHW}, {DataType::Int32, DataFormat::NCHW}}, - {ProdConso::inPlaceModel, Aidge::ConvImpl2D_cpu_forward_kernel<int32_t, int32_t, int32_t, int32_t>, nullptr}); + {{DataType::Any, DataFormat::NCHW}, + 
{DataType::Float32, DataFormat::NCHW}}, + {ProdConso::inPlaceModel, + Aidge::ConvImpl2D_cpu_forward_kernel<float, float, float, float>, + Aidge::ConvImpl2D_cpu_backward_kernel<float, float, float, float>}); REGISTRAR(ConvImpl2D_cpu, + {{DataType::Any, DataFormat::NCHW}, + {DataType::Float16, DataFormat::NCHW}}, + {ProdConso::inPlaceModel, + Aidge::ConvImpl2D_cpu_forward_kernel<half_float::half, + half_float::half, + half_float::half, + half_float::half>, + Aidge::ConvImpl2D_cpu_backward_kernel<half_float::half, + half_float::half, + half_float::half, + half_float::half>}); +REGISTRAR( + ConvImpl2D_cpu, {{DataType::Any, DataFormat::NCHW}, {DataType::Float64, DataFormat::NCHW}}, - {ProdConso::inPlaceModel, Aidge::ConvImpl2D_cpu_forward_kernel<double, double, double, double>, nullptr}); -} // namespace Aidge + {ProdConso::inPlaceModel, + Aidge::ConvImpl2D_cpu_forward_kernel<double, double, double, double>, + Aidge::ConvImpl2D_cpu_backward_kernel<double, double, double, double>}); +REGISTRAR(ConvImpl2D_cpu, + {{DataType::Any, DataFormat::NCHW}, + {DataType::Int32, DataFormat::NCHW}}, + {ProdConso::inPlaceModel, + ConvImpl2D_cpu_forward_kernel<std::int32_t, + std::int32_t, + std::int32_t, + std::int32_t>, + ConvImpl2D_cpu_backward_kernel<std::int32_t, + std::int32_t, + std::int32_t, + std::int32_t>}); +} // namespace Aidge #endif /* AIDGE_CPU_OPERATOR_CONVIMPL_KERNELS_H_ */ diff --git a/src/operator/ConvImpl.cpp b/src/operator/ConvImpl.cpp index fdfe19fbf4bf3e71c86aa28b966cfb21a1b5ba40..782a58d39b39f2bede1e79c9fb9c7a5807ef8f26 100644 --- a/src/operator/ConvImpl.cpp +++ b/src/operator/ConvImpl.cpp @@ -22,6 +22,8 @@ #include "aidge/operator/Conv.hpp" #include "aidge/utils/Types.h" +namespace Aidge { + template <> void Aidge::ConvImpl1D_cpu::forward() { const auto& op_ = static_cast<const Conv_Op<1>&>(mOp); @@ -55,9 +57,47 @@ void Aidge::ConvImpl1D_cpu::forward() { ); } -template <> -void Aidge::ConvImpl1D_cpu::backward() { - AIDGE_THROW_OR_ABORT(std::runtime_error, 
"Backward not yet implemented for Conv_Op<1> on backend cpu"); +template <> void ConvImpl1D_cpu::backward() { + const auto &op = dynamic_cast<const Conv1D_Op &>(mOp); + const auto &outputGrad = op.getOutput(0)->grad(); + AIDGE_ASSERT(outputGrad, "{}: missing ouput #0 gradient", op.type()); + AIDGE_ASSERT(op.getInput(0)->grad(), + "{}: missing data input(#0) gradient", + op.type()); + AIDGE_ASSERT(op.getInput(1)->grad(), + "{}: missing weight input(#1) gradient", + op.type()); + + std::shared_ptr<Tensor> inputDataGradFallback, inputWeightGradFallback, + inputBiasGradFallback; + const auto &inputDataGrad = + op.getInput(0)->grad()->refCastFrom(inputDataGradFallback, + *(op.getOutput(0))); + const auto &inputWeightGrad = + op.getInput(1)->grad()->refCastFrom(inputWeightGradFallback, + *(op.getOutput(0))); + const auto &inputBiasGrad = + (op.getInput(2) && op.getInput(2)->grad()) + ? op.getInput(2)->grad()->refCastFrom(inputBiasGradFallback, + *(op.getOutput(0))) + : Tensor(); + + // Call kernel + const auto impl = + Registrar<ConvImpl1D_cpu>::create(getBestMatch(getRequiredSpec())); + impl.backward( + op.strideDims(), + op.dilationDims(), + op.kernelDims(), + op.getInput(0)->template dims<3>(), + op.getOutput(0)->template dims<3>(), + + getCPUPtr(op.getInput(0)), + getCPUPtr(op.getInput(1)), + getCPUPtr(outputGrad), + inputDataGrad.getImpl()->rawPtr(), + inputWeightGrad.getImpl()->rawPtr(), + op.getInput(2) ? 
inputBiasGrad.getImpl()->rawPtr() : nullptr); } template <> @@ -93,7 +133,48 @@ void Aidge::ConvImpl2D_cpu::forward() { ); } -template <> -void Aidge::ConvImpl2D_cpu::backward() { - AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Conv_Op<2> on backend cpu"); + +template <> void ConvImpl2D_cpu::backward() { + const auto &op = dynamic_cast<const Conv2D_Op &>(mOp); + const auto &outputGrad = op.getOutput(0)->grad(); + AIDGE_ASSERT(outputGrad, "{}: missing ouput #0 gradient", op.type()); + AIDGE_ASSERT(op.getInput(0)->grad(), + "{}: missing data input(#0) gradient", + op.type()); + AIDGE_ASSERT(op.getInput(1)->grad(), + "{}: missing weight input(#1) gradient", + op.type()); + + std::shared_ptr<Tensor> inputDataGradFallback, inputWeightGradFallback, + inputBiasGradFallback; + const auto &inputDataGrad = + op.getInput(0)->grad()->refCastFrom(inputDataGradFallback, + *(op.getOutput(0))); + const auto &inputWeightGrad = + op.getInput(1)->grad()->refCastFrom(inputWeightGradFallback, + *(op.getOutput(0))); + const auto &inputBiasGrad = + (op.getInput(2) && op.getInput(2)->grad()) + ? op.getInput(2)->grad()->refCastFrom(inputBiasGradFallback, + *(op.getOutput(0))) + : Tensor(); + + // Call kernel + const auto impl = + Registrar<ConvImpl2D_cpu>::create(getBestMatch(getRequiredSpec())); + impl.backward( + op.strideDims(), + op.dilationDims(), + op.kernelDims(), + op.getInput(0)->template dims<4>(), + op.getOutput(0)->template dims<4>(), + + getCPUPtr(op.getInput(0)), + getCPUPtr(op.getInput(1)), + getCPUPtr(outputGrad), + inputDataGrad.getImpl()->rawPtr(), + inputWeightGrad.getImpl()->rawPtr(), + op.getInput(2) ? 
inputBiasGrad.getImpl()->rawPtr() : nullptr); } + +} // namespace Aidge diff --git a/unit_tests/operator/Test_ConvImpl.cpp b/unit_tests/operator/Test_ConvImpl.cpp index f7be338c0b9c5bb1d5af6bfa09ed7855c17fb6c0..69e806cba87dc91db2b5223c6ff4a7b00c6a4df6 100644 --- a/unit_tests/operator/Test_ConvImpl.cpp +++ b/unit_tests/operator/Test_ConvImpl.cpp @@ -1645,4 +1645,1000 @@ TEST_CASE("[cpu/operator] Conv(forward)", "[Conv][CPU]") { REQUIRE(approxEq<float>(*(conv_op.getOutput(0)),*expectedOutput, 1e-5f, 1e-6f)); } } -} \ No newline at end of file +} + +template <DimSize_t DIM> +std::shared_ptr<OperatorTensor> +setupTestConv(const DimSize_t batchSize, + const DimSize_t inChannels, + const DimSize_t outChannels, + const std::array<DimSize_t, DIM> kernelSize, + const std::array<DimSize_t, DIM> dataSize, + const std::array<DimSize_t, DIM> stride, + const std::array<DimSize_t, DIM> dilation, + const std::array<DimSize_t, 2 * DIM> padding, + const std::shared_ptr<Tensor> input, + const std::shared_ptr<Tensor> weights, + const std::shared_ptr<Tensor> biases) { + input->setBackend("cpu"); + weights->setBackend("cpu"); + biases->setBackend("cpu"); + std::shared_ptr<Node> convNode; + convNode = Conv(inChannels, + outChannels, + kernelSize, + "myconv", + std::array<DimSize_t, DIM>({stride}), + dilation); + auto op = + std::static_pointer_cast<OperatorTensor>(convNode->getOperator()); + + op->setDataType(DataType::Float32); + op->setBackend("cpu"); + + op->associateInput(0, input); + op->associateInput(1, weights); + op->associateInput(2, biases); + + REQUIRE_NOTHROW(op->forwardDims(true)); + + return op; +} + +TEST_CASE("[cpu/operator] Conv(backward)", "[Conv][CPU]") { + SECTION("1D") { + const std::size_t DIM = 1; + SECTION("no stride & no dilation, outChannels > inChannels") { + + const DimSize_t batchSize = 1; + const DimSize_t inChannels = 2; + const DimSize_t outChannels = 3; + const DimSize_t kernelSize = 4; + const DimSize_t inDataSize = 12; + + const DimSize_t stride = 1; + 
const DimSize_t dilation = 1; + const std::array<DimSize_t, 2 * DIM> padding({0, 0}); + + auto inputSize = + std::vector<DimSize_t>({batchSize, inChannels, inDataSize}); + + auto input = std::make_shared<Tensor>( + Array3D<float, batchSize, inChannels, inDataSize>( + {{{{1.000000, + 1.000000, + 1.000000, + 1.000000, + 1.000000, + 1.000000, + 1.000000, + 1.000000, + 1.000000, + 1.000000, + 1.000000, + 1.000000}, + {1.000000, + 1.000000, + 1.000000, + 1.000000, + 1.000000, + 1.000000, + 1.000000, + 1.000000, + 1.000000, + 1.000000, + 1.000000, + 1.000000}}}})); + + auto weights = std::make_shared<Tensor>( + Array3D<float, outChannels, inChannels, kernelSize>( + {{{{0.100000, 0.100000, 0.100000, 0.100000}, + {0.100000, 0.100000, 0.100000, 0.100000}}, + {{0.100000, 0.100000, 0.100000, 0.100000}, + {0.100000, 0.100000, 0.100000, 0.100000}}, + {{0.100000, 0.100000, 0.100000, 0.100000}, + {0.100000, 0.100000, 0.100000, 0.100000}}} + + })); + + auto biases = std::make_shared<Tensor>( + Array1D<float, outChannels>({0.010000, 0.010000, 0.010000})); + + auto op = setupTestConv<DIM>( + batchSize, + inChannels, + outChannels, + std::array<DimSize_t, DIM>({kernelSize}), + std::array<DimSize_t, DIM>({inDataSize}), + std::array<DimSize_t, DIM>({stride}), + std::array<DimSize_t, DIM>({dilation}), + padding, + input, + weights, + biases); + + //////////////////////////////////// + // setup gradients for backward + auto outputGrad = + std::make_shared<Tensor>(op->getOutput(0)->dims()); + outputGrad->setDataType(DataType::Float32); + outputGrad->setBackend("cpu"); + constantFiller(outputGrad, 1.f); + op->getOutput(0)->setGrad(outputGrad); + + //////////////////////////////////// + // run the backward pass + REQUIRE_NOTHROW(op->backward()); + + SECTION("Input Grad") { + auto expectedInputGrad = std::make_shared<Tensor>( + Array3D<float, batchSize, inChannels, inDataSize>( + {{{{0.3000, + 0.6000, + 0.9000, + 1.2000, + 1.2000, + 1.2000, + 1.2000, + 1.2000, + 1.2000, + 0.9000, +
0.6000, + 0.3000}, + {0.3000, + 0.6000, + 0.9000, + 1.2000, + 1.2000, + 1.2000, + 1.2000, + 1.2000, + 1.2000, + 0.9000, + 0.6000, + 0.3000}}}})); + CHECK(approxEq<float, float>(*op->getInput(0)->grad(), + *expectedInputGrad)); + } + SECTION("Weight grad") { + std::vector<DimSize_t> weightsSize( + {outChannels, inChannels, kernelSize}); + auto expectedWeightsGrad = + std::make_shared<Tensor>(weightsSize); + expectedWeightsGrad->setBackend("cpu"); + expectedWeightsGrad->setDataType(DataType::Float32); + constantFiller<float>(expectedWeightsGrad, 9.); + + CHECK(approxEq<float, float>(*op->getInput(1)->grad(), + *expectedWeightsGrad)); + } + SECTION("Bias Grad") { + std::vector<DimSize_t> biasesSize({outChannels}); + auto expectedBiasGrad = std::make_shared<Tensor>(biasesSize); + expectedBiasGrad->setBackend("cpu"); + expectedBiasGrad->setDataType(DataType::Float32); + constantFiller<float>(expectedBiasGrad, 9.); + CHECK(approxEq<float, float>(*op->getInput(2)->grad(), + *expectedBiasGrad)); + } + } + + SECTION("stride and no dilation, inChannel > outChannels") { + const DimSize_t batchSize = 2; + const DimSize_t inChannels = 3; + const DimSize_t outChannels = 1; + const DimSize_t kernelSize = 2; + const DimSize_t inDataSize = 8; + const DimSize_t stride = 3; + const DimSize_t dilation = 1; + const std::array<DimSize_t, 2 * DIM> padding({0, 0}); + + auto inputSize = + std::vector<DimSize_t>({batchSize, inChannels, inDataSize}); + + auto input = std::make_shared<Tensor>( + Array3D<float, batchSize, inChannels, inDataSize>( + {{{{1., 1., 1., 1., 1., 1., 1., 1.}, + {1., 1., 1., 1., 1., 1., 1., 1.}, + {1., 1., 1., 1., 1., 1., 1., 1.}}, + + {{1., 1., 1., 1., 1., 1., 1., 1.}, + {1., 1., 1., 1., 1., 1., 1., 1.}, + {1., 1., 1., 1., 1., 1., 1., 1.}}}})); + auto weights = std::make_shared<Tensor>( + Array3D<float, outChannels, inChannels, kernelSize>( + {{{{0.1000, 0.1000}, + {0.1000, 0.1000}, + {0.1000, 0.1000}}}})); + + auto biases = std::make_shared<Tensor>( + Array1D<float, 
outChannels>({0.060000})); + + auto op = setupTestConv<DIM>( + batchSize, + inChannels, + outChannels, + std::array<DimSize_t, DIM>({kernelSize}), + std::array<DimSize_t, DIM>({inDataSize}), + std::array<DimSize_t, DIM>({stride}), + std::array<DimSize_t, DIM>({dilation}), + padding, + input, + weights, + biases); + + //////////////////////////////////// + // setup gradients for backward + auto outputGrad = + std::make_shared<Tensor>(op->getOutput(0)->dims()); + outputGrad->setDataType(DataType::Float32); + outputGrad->setBackend("cpu"); + constantFiller(outputGrad, 1.f); + op->getOutput(0)->setGrad(outputGrad); + + //////////////////////////////////// + // run the backward pass + REQUIRE_NOTHROW(op->backward()); + + SECTION("Input Grad") { + auto expectedInputGrad = std::make_shared<Tensor>( + Array3D<float, batchSize, inChannels, inDataSize>( + {{{{0.1000, + 0.1000, + 0.0000, + 0.1000, + 0.1000, + 0.0000, + 0.1000, + 0.1000}, + {0.1000, + 0.1000, + 0.0000, + 0.1000, + 0.1000, + 0.0000, + 0.1000, + 0.1000}, + {0.1000, + 0.1000, + 0.0000, + 0.1000, + 0.1000, + 0.0000, + 0.1000, + 0.1000}}, + + {{0.1000, + 0.1000, + 0.0000, + 0.1000, + 0.1000, + 0.0000, + 0.1000, + 0.1000}, + {0.1000, + 0.1000, + 0.0000, + 0.1000, + 0.1000, + 0.0000, + 0.1000, + 0.1000}, + {0.1000, + 0.1000, + 0.0000, + 0.1000, + 0.1000, + 0.0000, + 0.1000, + 0.1000}}}})); + CHECK(approxEq<float, float>(*op->getInput(0)->grad(), + *expectedInputGrad)); + } + SECTION("Weight grad") { + auto expectedWeightsGrad = std::make_shared<Tensor>( + Array3D<float, outChannels, inChannels, kernelSize>( + {{{{6., 6.}, {6., 6.}, {6., 6.}}}})); + CHECK(approxEq<float, float>(*op->getInput(1)->grad(), + *expectedWeightsGrad)); + } + SECTION("Bias Grad") { + auto expectedBiasesGrad = std::make_shared<Tensor>( + Array1D<float, outChannels>({6.})); + CHECK(approxEq<float, float>(*op->getInput(2)->grad(), + *expectedBiasesGrad)); + } + } + + SECTION("dilation, no stride") { + const DimSize_t batchSize = 2; +
const DimSize_t inChannels = 3; + const DimSize_t outChannels = 1; + const DimSize_t kernelSize = 2; + const DimSize_t inDataSize = 8; + + const DimSize_t stride = 1; + const DimSize_t dilation = 2; + const std::array<DimSize_t, 2 * DIM> padding({0, 0}); + + auto inputSize = + std::vector<DimSize_t>({batchSize, inChannels, inDataSize}); + + auto input = std::make_shared<Tensor>( + Array3D<float, batchSize, inChannels, inDataSize>( + {{{{1., 1., 1., 1., 1., 1., 1., 1.}, + {1., 1., 1., 1., 1., 1., 1., 1.}, + {1., 1., 1., 1., 1., 1., 1., 1.}}, + + {{1., 1., 1., 1., 1., 1., 1., 1.}, + {1., 1., 1., 1., 1., 1., 1., 1.}, + {1., 1., 1., 1., 1., 1., 1., 1.}}}})); + auto weights = std::make_shared<Tensor>( + Array3D<float, outChannels, inChannels, kernelSize>( + {{{{0.1000, 0.1000}, + {0.1000, 0.1000}, + {0.1000, 0.1000}}}})); + + auto biases = std::make_shared<Tensor>( + Array1D<float, outChannels>({0.060000})); + + auto op = setupTestConv<DIM>( + batchSize, + inChannels, + outChannels, + std::array<DimSize_t, DIM>({kernelSize}), + std::array<DimSize_t, DIM>({inDataSize}), + std::array<DimSize_t, DIM>({stride}), + std::array<DimSize_t, DIM>({dilation}), + padding, + input, + weights, + biases); + + //////////////////////////////////// + // setup gradients for backward + auto outputGrad = + std::make_shared<Tensor>(op->getOutput(0)->dims()); + outputGrad->setDataType(DataType::Float32); + outputGrad->setBackend("cpu"); + constantFiller(outputGrad, 1.f); + op->getOutput(0)->setGrad(outputGrad); + + //////////////////////////////////// + // run the backward pass + REQUIRE_NOTHROW(op->backward()); + + SECTION("Input Grad") { + auto expectedInputGrad = std::make_shared<Tensor>( + Array3D<float, batchSize, inChannels, inDataSize>( + {{{{0.1000, + 0.1000, + 0.2000, + 0.2000, + 0.2000, + 0.2000, + 0.1000, + 0.1000}, + {0.1000, + 0.1000, + 0.2000, + 0.2000, + 0.2000, + 0.2000, + 0.1000, + 0.1000}, + {0.1000, + 0.1000, + 0.2000, + 0.2000, + 0.2000, + 0.2000, + 0.1000, +
0.1000}}, + + {{0.1000, + 0.1000, + 0.2000, + 0.2000, + 0.2000, + 0.2000, + 0.1000, + 0.1000}, + {0.1000, + 0.1000, + 0.2000, + 0.2000, + 0.2000, + 0.2000, + 0.1000, + 0.1000}, + {0.1000, + 0.1000, + 0.2000, + 0.2000, + 0.2000, + 0.2000, + 0.1000, + 0.1000}}}})); + CHECK(approxEq<float, float>(*op->getInput(0)->grad(), + *expectedInputGrad)); + } + SECTION("Weight grad") { + auto expectedWeightsGrad = std::make_shared<Tensor>( + Array3D<float, outChannels, inChannels, kernelSize>( + {{{{12., 12.}, {12., 12.}, {12., 12.}}}})); + CHECK(approxEq<float, float>(*op->getInput(1)->grad(), + *expectedWeightsGrad)); + } + SECTION("Bias Grad") { + auto expectedBiasesGrad = std::make_shared<Tensor>( + Array1D<float, outChannels>({12.})); + CHECK(approxEq<float, float>(*op->getInput(2)->grad(), + *expectedBiasesGrad)); + } + } + SECTION("stride & dilation") { + const DimSize_t batchSize = 1; + const DimSize_t inChannels = 4; + const DimSize_t outChannels = 4; + const DimSize_t kernelSize = 3; + const DimSize_t inDataSize = 13; + + const DimSize_t stride = 4; + const DimSize_t dilation = 3; + const std::array<DimSize_t, 2 * DIM> padding({0, 0}); + + auto inputSize = + std::vector<DimSize_t>({batchSize, inChannels, inDataSize}); + + auto input = std::make_shared< + Tensor>(Array3D<float, batchSize, inChannels, inDataSize>( + {{{{1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.}, + {1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.}, + {1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.}, + {1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.}}}})); + auto weights = std::make_shared<Tensor>( + Array3D<float, outChannels, inChannels, kernelSize>( + {{{{0.1000, 0.1000, 0.1000}, + {0.1000, 0.1000, 0.1000}, + {0.1000, 0.1000, 0.1000}, + {0.1000, 0.1000, 0.1000}}, + + {{0.1000, 0.1000, 0.1000}, + {0.1000, 0.1000, 0.1000}, + {0.1000, 0.1000, 0.1000}, + {0.1000, 0.1000, 0.1000}}, + + {{0.1000, 0.1000, 0.1000}, + {0.1000, 0.1000, 0.1000}, + {0.1000, 0.1000, 0.1000}, + {0.1000, 0.1000, 
0.1000}}, + + {{0.1000, 0.1000, 0.1000}, + {0.1000, 0.1000, 0.1000}, + {0.1000, 0.1000, 0.1000}, + {0.1000, 0.1000, 0.1000}}}})); + + auto biases = std::make_shared<Tensor>(Array1D<float, outChannels>( + {{0.0100, 0.0100, 0.0100, 0.0100}})); + + auto op = setupTestConv<DIM>( + batchSize, + inChannels, + outChannels, + std::array<DimSize_t, DIM>({kernelSize}), + std::array<DimSize_t, DIM>({inDataSize}), + std::array<DimSize_t, DIM>({stride}), + std::array<DimSize_t, DIM>({dilation}), + padding, + input, + weights, + biases); + + //////////////////////////////////// + // setup gradients for backward + auto outputGrad = + std::make_shared<Tensor>(op->getOutput(0)->dims()); + outputGrad->setDataType(DataType::Float32); + outputGrad->setBackend("cpu"); + constantFiller(outputGrad, 1.f); + op->getOutput(0)->setGrad(outputGrad); + + //////////////////////////////////// + // run the backward pass + REQUIRE_NOTHROW(op->backward()); + + SECTION("Input Grad") { + auto expectedInputGrad = std::make_shared<Tensor>( + Array3D<float, batchSize, inChannels, inDataSize>( + {{{{0.4000, + 0.0000, + 0.0000, + 0.4000, + 0.4000, + 0.0000, + 0.4000, + 0.4000, + 0.0000, + 0.0000, + 0.4000, + 0.0000, + 0.0000}, + {0.4000, + 0.0000, + 0.0000, + 0.4000, + 0.4000, + 0.0000, + 0.4000, + 0.4000, + 0.0000, + 0.0000, + 0.4000, + 0.0000, + 0.0000}, + {0.4000, + 0.0000, + 0.0000, + 0.4000, + 0.4000, + 0.0000, + 0.4000, + 0.4000, + 0.0000, + 0.0000, + 0.4000, + 0.0000, + 0.0000}, + {0.4000, + 0.0000, + 0.0000, + 0.4000, + 0.4000, + 0.0000, + 0.4000, + 0.4000, + 0.0000, + 0.0000, + 0.4000, + 0.0000, + 0.0000}}}})); + CHECK(approxEq<float, float>(*op->getInput(0)->grad(), + *expectedInputGrad)); + } + SECTION("Weight grad") { + auto expectedWeightsGrad = std::make_shared<Tensor>( + Array3D<float, outChannels, inChannels, kernelSize>( + {{{{2., 2., 2.}, + {2., 2., 2.}, + {2., 2., 2.}, + {2., 2., 2.}}, + + {{2., 2., 2.}, + {2., 2., 2.}, + {2., 2., 2.}, + {2., 2., 2.}}, + + {{2., 2., 2.}, + {2., +
2., 2.}, + {2., 2., 2.}, + {2., 2., 2.}}, + + {{2., 2., 2.}, + {2., 2., 2.}, + {2., 2., 2.}, + {2., 2., 2.}}}})); + CHECK(approxEq<float, float>(*op->getInput(1)->grad(), + *expectedWeightsGrad)); + } + SECTION("Bias Grad") { + auto expectedBiasesGrad = std::make_shared<Tensor>( + Array1D<float, outChannels>({{2., 2., 2., 2.}})); + CHECK(approxEq<float, float>(*op->getInput(2)->grad(), + *expectedBiasesGrad)); + } + } + + // Harder to read, look at previous tests in case of issue + SECTION("Sequential values") { + const DimSize_t batchSize = 1; + const DimSize_t inChannels = 2; + const DimSize_t outChannels = 2; + const DimSize_t kernelSize = 3; + const DimSize_t inDataSize = 8; + + const DimSize_t stride = 2; + const DimSize_t dilation = 2; + const std::array<DimSize_t, 2 * DIM> padding({0, 0}); + + const DimSize_t outDataSize = 2; + + auto inputSize = + std::vector<DimSize_t>({batchSize, inChannels, inDataSize}); + + auto input = std::make_shared<Tensor>( + Array3D<float, batchSize, inChannels, inDataSize>( + {{{{1., 2., 3., 4., 5., 6., 7., 8.}, + {9., 10., 11., 12., 13., 14., 15., 16.}}}})); + auto weights = std::make_shared<Tensor>( + Array3D<float, outChannels, inChannels, kernelSize>( + {{{{0.1000, 0.2000, 0.3000}, {0.4000, 0.5000, 0.6000}}, + + {{0.7000, 0.8000, 0.9000}, {1.0000, 1.1000, 1.2000}}}})); + + auto biases = std::make_shared<Tensor>( + Array1D<float, outChannels>({{0.0100, 0.0200}})); + + auto outputGrad = std::make_shared<Tensor>( + Array3D<float, batchSize, outChannels, outDataSize>( + {{{{1., 2.}, {3., 4.}}}})); + + auto op = setupTestConv<DIM>( + batchSize, + inChannels, + outChannels, + std::array<DimSize_t, DIM>({kernelSize}), + std::array<DimSize_t, DIM>({inDataSize}), + std::array<DimSize_t, DIM>({stride}), + std::array<DimSize_t, DIM>({dilation}), + padding, + input, + weights, + biases); + + //////////////////////////////////// + // setup gradients for backward + op->getOutput(0)->setGrad(outputGrad); + + REQUIRE_NOTHROW(op->backward()); 
+ + SECTION("Input Grad") { + auto expectedInputGrad = std::make_shared<Tensor>( + Array3D<float, batchSize, inChannels, inDataSize>( + {{{{2.2000, + 0.0000, + 5.6000, + 0.0000, + 6.6000, + 0.0000, + 4.2000, + 0.0000}, + {3.4000, + 0.0000, + 8.6000, + 0.0000, + 9.6000, + 0.0000, + 6.0000, + 0.0000}}}})); + CHECK(approxEq<float, float>(*op->getInput(0)->grad(), + *expectedInputGrad)); + } + SECTION("Weight grad") { + auto expectedWeightsGrad = std::make_shared<Tensor>( + Array3D<float, outChannels, inChannels, kernelSize>( + {{{{7., 13., 19.}, {31., 37., 43.}}, + + {{15., 29., 43.}, {71., 85., 99.}}}})); + CHECK(approxEq<float, float>(*op->getInput(1)->grad(), + *expectedWeightsGrad)); + } + SECTION("Bias Grad") { + auto expectedBiasesGrad = std::make_shared<Tensor>( + Array1D<float, outChannels>({{3., 7.}})); + CHECK(approxEq<float, float>(*op->getInput(2)->grad(), + *expectedBiasesGrad)); + } + } + SECTION("random values testing") { + const DimSize_t batchSize = 1; + const DimSize_t inChannels = 4; + const DimSize_t outChannels = 4; + const DimSize_t kernelSize = 3; + const DimSize_t inDataSize = 13; + const DimSize_t outDataSize = 2; + + const DimSize_t stride = 4; + const DimSize_t dilation = 3; + const std::array<DimSize_t, 2 * DIM> padding({0, 0}); + + auto inputSize = + std::vector<DimSize_t>({batchSize, inChannels, inDataSize}); + + auto input = std::make_shared<Tensor>( + Array3D<float, batchSize, inChannels, inDataSize>( + {{{{0.180772, + -0.069988, + -0.359623, + -0.915204, + 0.625765, + 0.025510, + 0.954514, + 0.064349, + 0.361151, + 1.167878, + -1.349893, + -0.510177, + 0.235958}, + {-0.239778, + -0.921115, + 1.543297, + 1.348826, + -0.139642, + 0.285797, + 0.965120, + -2.037150, + 0.493136, + 1.486999, + 0.591033, + 0.126030, + -1.562687}, + {-1.160103, + -0.334841, + 0.447772, + -0.801645, + 1.523611, + 2.508587, + -0.663096, + -0.251275, + 1.010145, + 0.121547, + -1.510835, + 2.104773, + 2.762959}, + {-1.746529, + 0.410919, + -0.242185, + 0.420812, + 
0.277596, + 0.778898, + 1.533269, + 1.609736, + -0.403228, + -0.274928, + 1.473840, + 0.068826, + 1.332708}}}})); + auto weights = std::make_shared<Tensor>( + Array3D<float, outChannels, inChannels, kernelSize>( + {{{{0.587285, 0.286069, 0.008287}, + {-0.252325, -1.324722, 0.189178}, + {0.021100, 0.940420, -0.557690}, + {-0.693927, -0.325247, 1.243933}}, + + {{-1.167186, -0.409124, 1.260062}, + {-1.563006, 1.134614, -0.082384}, + {0.289316, 0.835773, -0.244991}, + {0.271223, 0.093636, -0.883432}}, + + {{-0.327417, 0.078394, -0.380766}, + {0.377508, 0.111912, 2.314279}, + {-0.798906, -0.564303, -1.134660}, + {0.170527, 0.994665, 1.262572}}, + + {{1.621816, 1.077471, 0.594781}, + {-1.529087, 2.043707, -0.165627}, + {0.087070, -0.527656, -0.100288}, + {1.053922, -0.623074, -1.590572}}}})); + + auto biases = std::make_shared<Tensor>(Array1D<float, outChannels>( + {{1.285940, -0.051787, -0.968103, -0.586324}})); + + auto op = setupTestConv<DIM>( + batchSize, + inChannels, + outChannels, + std::array<DimSize_t, DIM>({kernelSize}), + std::array<DimSize_t, DIM>({inDataSize}), + std::array<DimSize_t, DIM>({stride}), + std::array<DimSize_t, DIM>({dilation}), + padding, + input, + weights, + biases); + + //////////////////////////////////// + // setup gradients for backward + auto outputGrad = std::make_shared<Tensor>( + Array3D<float, batchSize, outChannels, outDataSize>( + {{{{0.053156, 1.189073}, + {0.100228, 1.042344}, + {-1.468991, 0.581337}, + {1.330418, 0.487802}}}})); + op->getOutput(0)->setGrad(outputGrad); + + //////////////////////////////////// + // run the backward pass + REQUIRE_NOTHROW(op->backward()); + + SECTION("Input Grad") { + auto expectedInputGrad = std::make_shared<Tensor>( + Array3D<float, batchSize, inChannels, inDataSize>( + {{{{2.552898, + 0.000000, + 0.000000, + 1.292528, + 0.082501, + 0.000000, + 1.477383, + 0.484875, + 0.000000, + 0.000000, + 1.392054, + 0.000000, + 0.000000}, + {-2.758950, + 0.000000, + 0.000000, + 2.597889, + -2.455656,
+ 0.000000, + -3.618210, + 0.669449, + 0.000000, + 0.000000, + 1.403657, + 0.000000, + 0.000000}, + {1.319545, + 0.000000, + 0.000000, + 0.260710, + -0.095303, + 0.000000, + 1.479181, + 1.403949, + 0.000000, + 0.000000, + -1.627040, + 0.000000, + 0.000000}, + {1.141951, + 0.000000, + 0.000000, + -2.298007, + 0.070817, + 0.000000, + -3.993255, + -0.014843, + 0.000000, + 0.000000, + 0.516383, + 0.000000, + 0.000000}}}})); + CHECK(approxEq<float, float>(*op->getInput(0)->grad(), + *expectedInputGrad, + 1e-5, + 1e-6)); + } + SECTION("Weight grad") { + auto expectedWeightsGrad = std::make_shared<Tensor>( + Array3D<float, outChannels, inChannels, kernelSize>( + {{{{0.753690, 0.027866, -1.554383}, + {-0.178790, -2.350622, 0.754084}, + {1.750019, -0.341397, -1.831741}, + {0.237243, 1.936463, 1.834007}}, + + {{0.670381, -0.024656, -1.311384}, + {-0.169587, -1.988220, 0.712792}, + {1.471852, -0.342263, -1.641270}, + {0.114300, 1.720076, 1.689925}}, + + {{0.098228, 1.381835, -2.186914}, + {0.271054, -3.165683, -1.074165}, + {2.589912, 1.031534, 0.095779}, + {2.727013, 0.317630, -1.395561}}, + + {{0.545751, -1.186215, 0.611421}, + {-0.387123, 0.800776, 1.572321}, + {-0.800201, -1.189095, -1.619183}, + {-2.188202, 1.345088, 2.758830}}} + + })); + CHECK(approxEq<float, float>(*op->getInput(1)->grad(), + *expectedWeightsGrad, + 1e-5, + 1e-6)); + } + SECTION("Bias Grad") { + auto expectedBiasesGrad = + std::make_shared<Tensor>(Array1D<float, outChannels>( + {{1.242230, 1.142572, -0.887655, 1.818220}})); + CHECK(approxEq<float, float>(*op->getInput(2)->grad(), + *expectedBiasesGrad)); + } + } + } + SECTION("2D") { + const DimSize_t DIM = 2; + SECTION("Sequential values") { + constexpr DimSize_t batchSize = 1; + constexpr DimSize_t inChannels = 1; + constexpr DimSize_t outChannels = 2; + constexpr std::array<DimSize_t, DIM> kernelSize = {1, 2}; + constexpr std::array<DimSize_t, DIM> inDataSize = {3, 4}; + + constexpr std::array<DimSize_t, DIM> stride = {1, 2}; + constexpr 
std::array<DimSize_t, DIM> dilation = {1, 2}; + constexpr std::array<DimSize_t, 2 * DIM> padding({0, 0}); + + constexpr std::array<DimSize_t, DIM> outDataSize = {3, 1}; + + auto inputSize = std::vector<DimSize_t>( + {batchSize, inChannels, inDataSize[0], inDataSize[1]}); + + auto input = std::make_shared<Tensor>( + Array4D<float, + batchSize, + inChannels, + inDataSize[0], + inDataSize[1]>({{{{{1., 2., 3., 4.}, + {5., 6., 7., 8.}, + {9., 10., 11., 12.}}}}})); + auto weights = std::make_shared<Tensor>( + Array4D<float, + outChannels, + inChannels, + kernelSize[0], + kernelSize[1]>({{{{{1., 2.}}}, {{{3., 4.}}}}})); + + auto biases = std::make_shared<Tensor>( + Array1D<float, outChannels>({{1., 2.}})); + + auto outputGrad = std::make_shared<Tensor>(Array4D<float, + batchSize, + outChannels, + outDataSize[0], + outDataSize[1]>( + {{{{{1.}, {2.}, {3.}}, {{4.}, {5.}, {6.}}}}})); + + auto op = setupTestConv<DIM>(batchSize, + inChannels, + outChannels, + kernelSize, + inDataSize, + stride, + dilation, + padding, + input, + weights, + biases); + + //////////////////////////////////// + // setup gradients for backward + op->getOutput(0)->setGrad(outputGrad); + + REQUIRE_NOTHROW(op->backward()); + + SECTION("Input Grad") { + auto expectedInputGrad = std::make_shared<Tensor>( + Array4D<float, + batchSize, + inChannels, + inDataSize[0], + inDataSize[1]>({{{{{13., 0., 18., 0.}, + {17., 0., 24., 0.}, + {21., 0., 30., 0.}}}}})); + CHECK(approxEq<float, float>(*op->getInput(0)->grad(), + *expectedInputGrad)); + } + SECTION("Weight grad") { + auto expectedWeightsGrad = + std::make_shared<Tensor>(Array4D<float, + outChannels, + inChannels, + kernelSize[0], + kernelSize[1]>( + {{{{{38., 50.}}}, {{{83., 113.}}}}})); + CHECK(approxEq<float, float>(*op->getInput(1)->grad(), + *expectedWeightsGrad)); + } + SECTION("Bias Grad") { + auto expectedBiasesGrad = std::make_shared<Tensor>( + Array1D<float, outChannels>({{6., 15.}})); + CHECK(approxEq<float, float>(*op->getInput(2)->grad(), + 
*expectedBiasesGrad)); + } + } + } +}