From e9e8c07e5b6dae1730180f4f555ca1c6362afaf5 Mon Sep 17 00:00:00 2001
From: NAUD Maxence <maxence.naud@cea.fr>
Date: Thu, 28 Mar 2024 09:55:55 +0000
Subject: [PATCH] Update ReLU, LeakyReLU and FC backward functions

---
 include/aidge/backend/cpu/operator/FCImpl.hpp |   5 +-
 .../cpu/operator/FCImpl_backward_kernels.hpp  |  84 +++++++++++++++
 .../LeakyReLUImpl_backward_kernels.hpp        |   2 +-
 .../operator/ReLUImpl_backward_kernels.hpp    |   2 +-
 src/operator/FCImpl.cpp                       | 102 +++++++++---------
 src/operator/LeakyReLUImpl.cpp                |  10 +-
 src/operator/ReLUImpl.cpp                     |   7 +-
 7 files changed, 154 insertions(+), 58 deletions(-)
 create mode 100644 include/aidge/backend/cpu/operator/FCImpl_backward_kernels.hpp

diff --git a/include/aidge/backend/cpu/operator/FCImpl.hpp b/include/aidge/backend/cpu/operator/FCImpl.hpp
index 71fdf8e2..fedd8b38 100644
--- a/include/aidge/backend/cpu/operator/FCImpl.hpp
+++ b/include/aidge/backend/cpu/operator/FCImpl.hpp
@@ -48,6 +48,8 @@ class FCImplBackward_cpu : public Registrable<FCImplBackward_cpu,
                                                   const void *,
                                                   const void *,
                                                   const void *,
+                                                  void *,
+                                                  void *,
                                                   void *)> {};
 
 class FCImpl_cpu : public OperatorImpl {
@@ -58,7 +60,8 @@ public:
         return std::make_unique<FCImpl_cpu>(op);
     }
 
-    void forward() override;
+    void forward() override final;
+    void backward() override final;
 };
 
 namespace {
diff --git a/include/aidge/backend/cpu/operator/FCImpl_backward_kernels.hpp b/include/aidge/backend/cpu/operator/FCImpl_backward_kernels.hpp
new file mode 100644
index 00000000..50fb5f49
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/FCImpl_backward_kernels.hpp
@@ -0,0 +1,84 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_FCIMPL_BACKWARD_KERNEL_H_
+#define AIDGE_CPU_OPERATOR_FCIMPL_BACKWARD_KERNEL_H_
+
+#include "aidge/utils/Registrar.hpp"
+#include <algorithm>
+
+#include "aidge/backend/cpu/operator/FCImpl.hpp"
+
+namespace Aidge {
+template <class I, class O, class W, class B>
+void FCImpl_cpu_backward_kernel(const FC_Op::Attrs& attrs, const DimSize_t batchSize, const DimSize_t oneInputSize,
+                                const void* input_, const void* originalInput_, const void* weight_, void* output_, void* weightGrad_, void* biasesGrad_) {
+    // FIXME: missing FC attributes as arguments
+    const I* input = static_cast<const I*>(input_);
+    const I* originalInput = static_cast<const I*>(originalInput_);
+    const W* weight = static_cast<const W*>(weight_);
+    O* output = static_cast<O*>(output_);
+    W* weightGrad = static_cast<W*>(weightGrad_);
+    B* biasesGrad = static_cast<B*>(biasesGrad_);
+
+
+    // bias grad
+    if (std::get<1>(attrs)) { // no bias
+        std::fill(biasesGrad, biasesGrad + std::get<0>(attrs), B(0));
+    } else {
+        for (std::size_t o = 0; o < std::get<0>(attrs); ++o) { // nb outputs
+            B sum{0};
+            for (std::size_t b = 0; b < batchSize; ++b) {
+                sum += input[b*std::get<0>(attrs) + o];
+            }
+            biasesGrad[o] = sum;
+        }
+    }
+
+    // weight grad
+    for (std::size_t o = 0; o < std::get<0>(attrs); ++o) {
+        for (std::size_t c = 0; c < oneInputSize; ++c) {
+            W sum{0};
+            for (std::size_t b = 0; b < batchSize; ++b) {
+                sum += originalInput[b*oneInputSize + c]*input[b*std::get<0>(attrs) + o];
+            }
+            weightGrad[o*oneInputSize + c] = sum;
+        }
+    }
+
+    // input grad
+    for (std::size_t b = 0; b < batchSize; ++b) {
+        for (std::size_t c = 0; c < oneInputSize; ++c) {
+            O sum{0};
+            for (std::size_t o = 0; o < std::get<0>(attrs); ++o) {
+                sum += weight[o*oneInputSize + c] * input[b*std::get<0>(attrs) + o];
+            }
+            output[b*oneInputSize + c] = sum;
+        }
+    }
+}
+
+
+namespace {
+static Registrar<FCImplBackward_cpu> registrarFCImpl2DBackward_cpu_Float32(
+        {DataType::Float32, DataType::Float32, DataType::Float32, DataType::Float32},
+        Aidge::FCImpl_cpu_backward_kernel<float, float, float, float>);
+static Registrar<FCImplBackward_cpu> registrarFCImpl2DBackward_cpu_Int32(
+        {DataType::Int32, DataType::Int32, DataType::Int32, DataType::Int32},
+        Aidge::FCImpl_cpu_backward_kernel<int, int, int, int>);
+static Registrar<FCImplBackward_cpu> registrarFCImpl2DBackward_cpu_Float64(
+        {DataType::Float64, DataType::Float64, DataType::Float64, DataType::Float64},
+        Aidge::FCImpl_cpu_backward_kernel<double, double, double, double>);
+} // namespace
+
+} // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_FCIMPL_BACKWARD_KERNEL_H_ */
diff --git a/include/aidge/backend/cpu/operator/LeakyReLUImpl_backward_kernels.hpp b/include/aidge/backend/cpu/operator/LeakyReLUImpl_backward_kernels.hpp
index 0e2fc400..949e6af6 100644
--- a/include/aidge/backend/cpu/operator/LeakyReLUImpl_backward_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/LeakyReLUImpl_backward_kernels.hpp
@@ -28,7 +28,7 @@ void LeakyReLUImpl_cpu_backward_kernel(const LeakyReLU_Op::Attrs& attrs,
     I negativeSlope = static_cast<I>(std::get<0>(attrs));
 
     for (std::size_t i = 0; i < inputLenght; ++i) {
-        output[i] = input[i] > 0 ? 1 : negativeSlope;
+        output[i] = input[i] > 0 ? input[i] : negativeSlope*input[i];
     }
 }
 
diff --git a/include/aidge/backend/cpu/operator/ReLUImpl_backward_kernels.hpp b/include/aidge/backend/cpu/operator/ReLUImpl_backward_kernels.hpp
index 47d95ac4..b68ea076 100644
--- a/include/aidge/backend/cpu/operator/ReLUImpl_backward_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ReLUImpl_backward_kernels.hpp
@@ -28,7 +28,7 @@ void ReLUImpl_cpu_backward_kernel(const std::size_t inputLenght,
     O* output = static_cast<O*>(output_);
 
     for (std::size_t i = 0; i < inputLenght; ++i) {
-        output[i] = (input[i] > I(0)) ? O(1) : O(0);
+        output[i] = (input[i] > I(0)) ? static_cast<O>(input[i]) : O(0);
     }
 }
 
diff --git a/src/operator/FCImpl.cpp b/src/operator/FCImpl.cpp
index 8b0ffca8..eecff38a 100644
--- a/src/operator/FCImpl.cpp
+++ b/src/operator/FCImpl.cpp
@@ -9,25 +9,27 @@
  *
  ********************************************************************************/
 
-#include <cassert>
-#include <chrono> // std::chrono::milliseconds
-#include <numeric> // std::accumulate
-#include <thread> // std::this_thread::sleep_for
-#include <vector>
+#include "aidge/backend/cpu/operator/FCImpl.hpp"
+
+#include <cstddef> // std::size_t
+#include <functional>
+#include <memory>
+#include <tuple>
 
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+#include "aidge/backend/cpu/operator/FCImpl_backward_kernels.hpp"
+#include "aidge/backend/cpu/operator/FCImpl_forward_kernels.hpp"
 #include "aidge/operator/FC.hpp"
+#include "aidge/utils/ErrorHandling.hpp"
 #include "aidge/utils/Types.h"
-#include "aidge/backend/cpu/data/GetCPUPtr.h"
-#include "aidge/backend/cpu/operator/FCImpl.hpp"
-#include "aidge/backend/cpu/operator/FCImpl_forward_kernels.hpp"
 
 
 void Aidge::FCImpl_cpu::forward()
 {
     const FC_Op& op_ = dynamic_cast<const FC_Op&>(mOp);
-    assert((op_.getInput(0)) && "missing input #0");
-    assert((op_.getInput(1)) && "missing input #1");
-    assert((op_.getInput(2)) && "missing input #2");
+    AIDGE_ASSERT(op_.getInput(0), "missing input #0");
+    AIDGE_ASSERT(op_.getInput(1), "missing input #1");
+    AIDGE_ASSERT(op_.getInput(2), "missing input #2");
 
     // Find the correct kernel type
     const auto outputDataType = op_.getOutput(0)->dataType();
@@ -66,44 +68,48 @@ void Aidge::FCImpl_cpu::forward()
         getCPUPtr(mOp.getRawOutput(0)));
 }
 
-// void Aidge::FCImpl_cpu::backward()
-// {
-//     const FC_Op& op_ = dynamic_cast<const FC_Op&>(mOp);
-//     const auto& fc_grad = op_.getOutput(0)->grad();
-//     assert(fc_grad && "missing ouput #0 gradient");
+void Aidge::FCImpl_cpu::backward()
+{
+    const FC_Op& op_ = dynamic_cast<const FC_Op&>(mOp);
+    const auto& fc_grad = op_.getOutput(0)->grad();
+    AIDGE_ASSERT(fc_grad, "missing output #0 gradient");
 
-//     // Find the correct kernel type
-//     const Registrar<FCImplBackward_cpu>::registrar_key registrarKey = {
-//         op_.getInput(0)->grad()->dataType(),
-//         op_.getInput(1)->grad()->dataType(),
-//         op_.getInput(2)->grad()->dataType(),
-//         fc_grad->dataType()};
+    // Find the correct kernel type
+    const Registrar<FCImplBackward_cpu>::registrar_key registrarKey = {
+        fc_grad->dataType(),
+        op_.getInput(0)->grad()->dataType(),
+        op_.getInput(1)->grad()->dataType(),
+        op_.getInput(2)->grad()->dataType()};
 
-//     Registrar<FCImplBackward_cpu>::registrar_type kernelFunc;
-//     if (Registrar<FCImplBackward_cpu>::exists(registrarKey)) {
-//         // One exists with the right inputs/output types
-//         kernelFunc = Registrar<FCImplBackward_cpu>::create(registrarKey);
-//     }
-//     else {
-//         // Otherwise, fallback to the kernel with all types matching output type
-//         kernelFunc = Registrar<FCImplBackward_cpu>::create({
-//             fc_grad->dataType(), fc_grad->dataType(), fc_grad->dataType(), fc_grad->dataType()});
-//     }
+    Registrar<FCImplBackward_cpu>::registrar_type kernelFunc;
+    if (Registrar<FCImplBackward_cpu>::exists(registrarKey)) {
+        // One exists with the right inputs/output types
+        kernelFunc = Registrar<FCImplBackward_cpu>::create(registrarKey);
+    }
+    else {
+        // Otherwise, fallback to the kernel with all types matching output type
+        kernelFunc = Registrar<FCImplBackward_cpu>::create({
+            fc_grad->dataType(), fc_grad->dataType(), fc_grad->dataType(), fc_grad->dataType()});
+    }
 
-//     // Convert input data (no overhead if not needed!)
-//     // TODO: right now, if needed, memory will be allocated/deallocated at each
-//     // call to forward(). We might put the following shared_ptr as members of
-//     // this class to avoid that.
-//     std::shared_ptr<Tensor> input0gradFallback, input1gradFallback, input2gradFallback;
-//     const auto& input0grad = op_.getInput(0)->grad()->refCastFrom(input0gradFallback, *(op_.getOutput(0)));
-//     const auto& input1grad = op_.getInput(1)->grad()->refCastFrom(input1gradFallback, *(op_.getOutput(0)));
-//     const auto& input2grad = op_.getInput(2)->grad()->refCastFrom(input2gradFallback, *(op_.getOutput(0)));
+    // Convert input data (no overhead if not needed!)
+    // TODO: right now, if needed, memory will be allocated/deallocated at each
+    // call to backward(). We might put the following shared_ptr as members of
+    // this class to avoid that.
+    std::shared_ptr<Tensor> input0gradFallback, input1gradFallback, input2gradFallback;
+    const auto& input0grad = op_.getInput(0)->grad()->refCastFrom(input0gradFallback, *(op_.getOutput(0)));
+    const auto& input1grad = op_.getInput(1)->grad()->refCastFrom(input1gradFallback, *(op_.getOutput(0)));
+    const auto& input2grad = op_.getInput(2)->grad()->refCastFrom(input2gradFallback, *(op_.getOutput(0)));
 
-//     // Call kernel
-//     const auto batchSize = (input0.dims().size() > 1) ? input0.dims()[0] : 1;
-//     kernelFunc(dynamic_cast<const FC_Op&>(mOp).getStaticAttributes(),
-//         batchSize,
-//         input0.size() / batchSize,
-//         input0.getImpl()->rawPtr(), input1.getImpl()->rawPtr(), input2.getImpl()->rawPtr(),
-//         getCPUPtr(mOp.getRawOutput(0)));
-// }
+    // Call kernel
+    const auto batchSize = (input0grad.dims().size() > 1) ? input0grad.dims()[0] : 1;
+    kernelFunc(dynamic_cast<const FC_Op&>(mOp).getStaticAttributes(),
+               batchSize,
+               input0grad.size() / batchSize,
+               getCPUPtr(fc_grad),
+               getCPUPtr(op_.getInput(0)),
+               getCPUPtr(mOp.getRawInput(1)),
+               input0grad.getImpl()->rawPtr(),
+               input1grad.getImpl()->rawPtr(),
+               input2grad.getImpl()->rawPtr());
+}
diff --git a/src/operator/LeakyReLUImpl.cpp b/src/operator/LeakyReLUImpl.cpp
index 4ffb230d..67847429 100644
--- a/src/operator/LeakyReLUImpl.cpp
+++ b/src/operator/LeakyReLUImpl.cpp
@@ -28,8 +28,9 @@ Aidge::NbElts_t Aidge::LeakyReLUImpl_cpu::getNbRequiredProtected(const Aidge::IO
 }
 
 void Aidge::LeakyReLUImpl_cpu::forward() {
-    std::shared_ptr<Tensor> in0 = std::static_pointer_cast<Tensor>(mOp.getRawInput(0));
-    std::shared_ptr<Tensor> out0 = std::static_pointer_cast<Tensor>(mOp.getRawOutput(0));
+    const LeakyReLU_Op& op_ = dynamic_cast<const LeakyReLU_Op&>(mOp);
+    std::shared_ptr<Tensor> in0 = op_.getInput(0);
+    std::shared_ptr<Tensor> out0 = op_.getOutput(0);
     AIDGE_ASSERT(in0, "missing input #0");
 
     // Find the correct kernel type
@@ -46,8 +47,9 @@ void Aidge::LeakyReLUImpl_cpu::forward() {
 
 void Aidge::LeakyReLUImpl_cpu::backward() {
     // reversing in and out Data for backprop
-    std::shared_ptr<Tensor> in0 = std::static_pointer_cast<Tensor>(mOp.getRawOutput(0));
-    std::shared_ptr<Tensor> out0 = std::static_pointer_cast<Tensor>(mOp.getRawInput(0));
+    const LeakyReLU_Op& op_ = dynamic_cast<const LeakyReLU_Op&>(mOp);
+    std::shared_ptr<Tensor> in0 = op_.getOutput(0)->grad();
+    std::shared_ptr<Tensor> out0 = op_.getInput(0)->grad();
     AIDGE_ASSERT(in0, "missing input #0");
 
     // Find the correct kernel type
diff --git a/src/operator/ReLUImpl.cpp b/src/operator/ReLUImpl.cpp
index 84bb1045..00552146 100644
--- a/src/operator/ReLUImpl.cpp
+++ b/src/operator/ReLUImpl.cpp
@@ -44,9 +44,10 @@ void Aidge::ReLUImpl_cpu::forward() {
 
 void Aidge::ReLUImpl_cpu::backward() {
     // reversing in and out Tensors
-    std::shared_ptr<Tensor> in0 = std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->grad();
-    std::shared_ptr<Tensor> out0 = std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->grad();
-    AIDGE_ASSERT(out0, "missing input #0");
+    const ReLU_Op& op_ = dynamic_cast<const ReLU_Op&>(mOp);
+    std::shared_ptr<Tensor> in0 = op_.getOutput(0)->grad();
+    std::shared_ptr<Tensor> out0 = op_.getInput(0)->grad();
+    AIDGE_ASSERT(out0, "current {} operator output#0 has no gradient Tensor.", op_.type());
 
     // Find the correct kernel type
     auto kernelFunc = Registrar<ReLUImplBackward_cpu>::create({
-- 
GitLab
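
Note on the gradient math (not part of the patch itself): in FCImpl_cpu_backward_kernel above, `input` holds the output gradient dY (batchSize x nbOutputs), `originalInput` holds the forward input X (batchSize x oneInputSize) and `weight` holds W (nbOutputs x oneInputSize), so the kernel computes biasesGrad[o] = sum_b dY[b,o], weightGrad[o,c] = sum_b X[b,c]*dY[b,o], and the propagated input gradient output[b,c] = sum_o W[o,c]*dY[b,o]. The following standalone C++ sketch reproduces the same arithmetic on hypothetical sizes and values, independent of the Aidge API, for anyone who wants to sanity-check the loops:

// Standalone sketch of the FC gradient formulas used by the new backward kernel.
// Sizes and values are hypothetical; row-major layout matches the kernel's indexing.
#include <array>
#include <cstddef>
#include <cstdio>

int main() {
    constexpr std::size_t batchSize = 2, inSize = 3, outSize = 2;
    const std::array<float, batchSize * inSize>  X  {1.f, 2.f, 3.f, 4.f, 5.f, 6.f};  // forward input
    const std::array<float, outSize * inSize>    W  {.1f, .2f, .3f, .4f, .5f, .6f};  // weights
    const std::array<float, batchSize * outSize> dY {1.f, -1.f, .5f, 2.f};           // output gradient

    std::array<float, outSize>            dB{};  // bias gradient:   dB[o]   = sum_b dY[b,o]
    std::array<float, outSize * inSize>   dW{};  // weight gradient: dW[o,c] = sum_b X[b,c] * dY[b,o]
    std::array<float, batchSize * inSize> dX{};  // input gradient:  dX[b,c] = sum_o W[o,c] * dY[b,o]

    for (std::size_t o = 0; o < outSize; ++o)
        for (std::size_t b = 0; b < batchSize; ++b)
            dB[o] += dY[b * outSize + o];

    for (std::size_t o = 0; o < outSize; ++o)
        for (std::size_t c = 0; c < inSize; ++c)
            for (std::size_t b = 0; b < batchSize; ++b)
                dW[o * inSize + c] += X[b * inSize + c] * dY[b * outSize + o];

    for (std::size_t b = 0; b < batchSize; ++b)
        for (std::size_t c = 0; c < inSize; ++c)
            for (std::size_t o = 0; o < outSize; ++o)
                dX[b * inSize + c] += W[o * inSize + c] * dY[b * outSize + o];

    std::printf("dB    = [%g %g]\n", dB[0], dB[1]);              // expected: [1.5 1]
    std::printf("dW[0] = [%g %g %g]\n", dW[0], dW[1], dW[2]);    // expected: [3 4.5 6]
    std::printf("dX[0] = [%g %g %g]\n", dX[0], dX[1], dX[2]);    // expected: [-0.3 -0.3 -0.3]
    return 0;
}

The kernel itself fuses each of these reductions into the corresponding triple loop and, when the NoBias attribute (std::get<1>(attrs)) is set, zeroes biasesGrad instead of accumulating it.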