From e9e8c07e5b6dae1730180f4f555ca1c6362afaf5 Mon Sep 17 00:00:00 2001
From: NAUD Maxence <maxence.naud@cea.fr>
Date: Thu, 28 Mar 2024 09:55:55 +0000
Subject: [PATCH] Update ReLU, LeakyReLU and FC backward functions
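
Implement FCImpl_cpu::backward() with a dedicated CPU kernel computing the
gradients of the fully-connected layer from the output gradient dY, the
forward input X and the weight W:
  - bias gradient:   db[o]   = sum_b dY[b,o]
  - weight gradient: dW[o,c] = sum_b X[b,c] * dY[b,o]
  - input gradient:  dX[b,c] = sum_o W[o,c] * dY[b,o]
The ReLU and LeakyReLU backward kernels now apply the derivative to the
incoming tensor instead of returning a bare 0/1 (or negative-slope) mask,
and their implementations operate on the input and output gradient Tensors.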

---
 include/aidge/backend/cpu/operator/FCImpl.hpp |   5 +-
 .../cpu/operator/FCImpl_backward_kernels.hpp  |  84 +++++++++++++++
 .../LeakyReLUImpl_backward_kernels.hpp        |   2 +-
 .../operator/ReLUImpl_backward_kernels.hpp    |   2 +-
 src/operator/FCImpl.cpp                       | 102 +++++++++---------
 src/operator/LeakyReLUImpl.cpp                |  10 +-
 src/operator/ReLUImpl.cpp                     |   7 +-
 7 files changed, 154 insertions(+), 58 deletions(-)
 create mode 100644 include/aidge/backend/cpu/operator/FCImpl_backward_kernels.hpp

diff --git a/include/aidge/backend/cpu/operator/FCImpl.hpp b/include/aidge/backend/cpu/operator/FCImpl.hpp
index 71fdf8e2..fedd8b38 100644
--- a/include/aidge/backend/cpu/operator/FCImpl.hpp
+++ b/include/aidge/backend/cpu/operator/FCImpl.hpp
@@ -48,6 +48,8 @@ class FCImplBackward_cpu : public Registrable<FCImplBackward_cpu,
                                               const void *,
                                               const void *,
                                               const void *,
+                                              void *,
+                                              void *,
                                               void *)> {};
 
 class FCImpl_cpu : public OperatorImpl {
@@ -58,7 +60,8 @@ public:
         return std::make_unique<FCImpl_cpu>(op);
     }
 
-    void forward() override;
+    void forward() override final;
+    void backward() override final;
 };
 
 namespace {
diff --git a/include/aidge/backend/cpu/operator/FCImpl_backward_kernels.hpp b/include/aidge/backend/cpu/operator/FCImpl_backward_kernels.hpp
new file mode 100644
index 00000000..50fb5f49
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/FCImpl_backward_kernels.hpp
@@ -0,0 +1,84 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_FCIMPL_BACKWARD_KERNEL_H_
+#define AIDGE_CPU_OPERATOR_FCIMPL_BACKWARD_KERNEL_H_
+
+#include "aidge/utils/Registrar.hpp"
+#include <algorithm>
+
+#include "aidge/backend/cpu/operator/FCImpl.hpp"
+
+namespace Aidge {
+template <class I, class O, class W, class B>
+void FCImpl_cpu_backward_kernel(const FC_Op::Attrs& attrs, const DimSize_t batchSize, const DimSize_t oneInputSize,
+                                   const void* input_, const void* originalInput_, const void* weight_, void* output_, void* weightGrad_, void* biasesGrad_) {
+    // Note: 'input' is the gradient w.r.t. the FC output, 'originalInput' the forward input, 'output' the gradient w.r.t. the FC input
+    const I* input  = static_cast<const I*>(input_);
+    const I* originalInput  = static_cast<const I*>(originalInput_);
+    const W* weight = static_cast<const W*>(weight_);
+    O* output       = static_cast<O*>(output_);
+    W* weightGrad   = static_cast<W*>(weightGrad_);
+    B* biasesGrad   = static_cast<B*>(biasesGrad_);
+
+
+    // bias grad: db[o] = sum_b dY[b,o]
+    if (std::get<1>(attrs)) { // no bias
+        std::fill(biasesGrad, biasesGrad + std::get<0>(attrs), B(0));
+    } else {
+        for (std::size_t o = 0; o < std::get<0>(attrs); ++o) { // nb outputs
+            B sum{0};
+            for (std::size_t b = 0; b < batchSize; ++b) {
+                sum += input[b*std::get<0>(attrs) + o];
+            }
+            biasesGrad[o] = sum;
+        }
+    }
+
+    // weight grad: dW[o,c] = sum_b X[b,c] * dY[b,o]
+    for (std::size_t o = 0; o < std::get<0>(attrs); ++o) {
+        for (std::size_t c = 0; c < oneInputSize; ++c) {
+            W sum{0};
+            for (std::size_t b = 0; b < batchSize; ++b) {
+                sum += originalInput[b*oneInputSize + c]*input[b*std::get<0>(attrs) + o];
+            }
+            weightGrad[o*oneInputSize + c] = sum;
+        }
+    }
+
+    // input grad: dX[b,c] = sum_o W[o,c] * dY[b,o]
+    for (std::size_t b = 0; b < batchSize; ++b) {
+        for (std::size_t c = 0; c < oneInputSize; ++c) {
+            O sum{0};
+            for (std::size_t o = 0; o < std::get<0>(attrs); ++o) {
+                sum += weight[o*oneInputSize + c] * input[b*std::get<0>(attrs) + o];
+            }
+            output[b*oneInputSize + c] = sum;
+        }
+    }
+}
+
+
+namespace {
+static Registrar<FCImplBackward_cpu> registrarFCImpl2DBackward_cpu_Float32(
+        {DataType::Float32, DataType::Float32, DataType::Float32, DataType::Float32},
+        Aidge::FCImpl_cpu_backward_kernel<float, float, float, float>);
+static Registrar<FCImplBackward_cpu> registrarFCImpl2DBackward_cpu_Int32(
+        {DataType::Int32, DataType::Int32, DataType::Int32, DataType::Int32},
+        Aidge::FCImpl_cpu_backward_kernel<int, int, int, int>);
+static Registrar<FCImplBackward_cpu> registrarFCImpl2DBackward_cpu_Float64(
+        {DataType::Float64, DataType::Float64, DataType::Float64, DataType::Float64},
+        Aidge::FCImpl_cpu_backward_kernel<double, double, double, double>);
+}  // namespace
+
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_FCIMPL_BACKWARD_KERNEL_H_ */
diff --git a/include/aidge/backend/cpu/operator/LeakyReLUImpl_backward_kernels.hpp b/include/aidge/backend/cpu/operator/LeakyReLUImpl_backward_kernels.hpp
index 0e2fc400..949e6af6 100644
--- a/include/aidge/backend/cpu/operator/LeakyReLUImpl_backward_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/LeakyReLUImpl_backward_kernels.hpp
@@ -28,7 +28,7 @@ void LeakyReLUImpl_cpu_backward_kernel(const LeakyReLU_Op::Attrs& attrs,
     I negativeSlope = static_cast<I>(std::get<0>(attrs));
 
     for (std::size_t i = 0; i < inputLenght; ++i) {
-        output[i] = input[i] > 0 ? 1 : negativeSlope;
+        output[i] = input[i] > 0 ? input[i] : negativeSlope*input[i];
     }
 }
 
diff --git a/include/aidge/backend/cpu/operator/ReLUImpl_backward_kernels.hpp b/include/aidge/backend/cpu/operator/ReLUImpl_backward_kernels.hpp
index 47d95ac4..b68ea076 100644
--- a/include/aidge/backend/cpu/operator/ReLUImpl_backward_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ReLUImpl_backward_kernels.hpp
@@ -28,7 +28,7 @@ void ReLUImpl_cpu_backward_kernel(const std::size_t inputLenght,
     O* output = static_cast<O*>(output_);
 
     for (std::size_t i = 0; i < inputLenght; ++i) {
-        output[i] = (input[i] > I(0)) ? O(1) : O(0);
+        output[i] = (input[i] > I(0)) ? static_cast<O>(input[i]) : O(0);
     }
 }
 
diff --git a/src/operator/FCImpl.cpp b/src/operator/FCImpl.cpp
index 8b0ffca8..eecff38a 100644
--- a/src/operator/FCImpl.cpp
+++ b/src/operator/FCImpl.cpp
@@ -9,25 +9,27 @@
  *
  ********************************************************************************/
 
-#include <cassert>
-#include <chrono>  // std::chrono::milliseconds
-#include <numeric> // std::accumulate
-#include <thread>  // std::this_thread::sleep_for
-#include <vector>
+#include "aidge/backend/cpu/operator/FCImpl.hpp"
+
+#include <cstddef>  // std::size_t
+#include <functional>
+#include <memory>
+#include <tuple>
 
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+#include "aidge/backend/cpu/operator/FCImpl_backward_kernels.hpp"
+#include "aidge/backend/cpu/operator/FCImpl_forward_kernels.hpp"
 #include "aidge/operator/FC.hpp"
+#include "aidge/utils/ErrorHandling.hpp"
 #include "aidge/utils/Types.h"
-#include "aidge/backend/cpu/data/GetCPUPtr.h"
 
-#include "aidge/backend/cpu/operator/FCImpl.hpp"
-#include "aidge/backend/cpu/operator/FCImpl_forward_kernels.hpp"
 
 void Aidge::FCImpl_cpu::forward()
 {
     const FC_Op& op_ = dynamic_cast<const FC_Op&>(mOp);
-    assert((op_.getInput(0)) && "missing input #0");
-    assert((op_.getInput(1)) && "missing input #1");
-    assert((op_.getInput(2)) && "missing input #2");
+    AIDGE_ASSERT(op_.getInput(0), "missing input #0");
+    AIDGE_ASSERT(op_.getInput(1), "missing input #1");
+    AIDGE_ASSERT(op_.getInput(2), "missing input #2");
 
     // Find the correct kernel type
     const auto outputDataType = op_.getOutput(0)->dataType();
@@ -66,44 +68,48 @@ void Aidge::FCImpl_cpu::forward()
         getCPUPtr(mOp.getRawOutput(0)));
 }
 
-// void Aidge::FCImpl_cpu::backward()
-// {
-//     const FC_Op& op_ = dynamic_cast<const FC_Op&>(mOp);
-//     const auto& fc_grad = op_.getOutput(0)->grad();
-//     assert(fc_grad && "missing ouput #0 gradient");
+void Aidge::FCImpl_cpu::backward()
+{
+    const FC_Op& op_ = dynamic_cast<const FC_Op&>(mOp);
+    const auto& fc_grad = op_.getOutput(0)->grad();
+    AIDGE_ASSERT(fc_grad, "missing output #0 gradient");
 
-//     // Find the correct kernel type
-//     const Registrar<FCImplBackward_cpu>::registrar_key registrarKey = {
-//         op_.getInput(0)->grad()->dataType(),
-//         op_.getInput(1)->grad()->dataType(),
-//         op_.getInput(2)->grad()->dataType(),
-//         fc_grad->dataType()};
+    // Find the correct kernel type
+    const Registrar<FCImplBackward_cpu>::registrar_key registrarKey = {
+        fc_grad->dataType(),
+        op_.getInput(0)->grad()->dataType(),
+        op_.getInput(1)->grad()->dataType(),
+        op_.getInput(2)->grad()->dataType()};
 
-//     Registrar<FCImplBackward_cpu>::registrar_type kernelFunc;
-//     if (Registrar<FCImplBackward_cpu>::exists(registrarKey)) {
-//         // One exists with the right inputs/output types
-//         kernelFunc = Registrar<FCImplBackward_cpu>::create(registrarKey);
-//     }
-//     else {
-//         // Otherwise, fallback to the kernel with all types matching output type
-//         kernelFunc = Registrar<FCImplBackward_cpu>::create({
-//             fc_grad->dataType(), fc_grad->dataType(), fc_grad->dataType(), fc_grad->dataType()});
-//     }
+    Registrar<FCImplBackward_cpu>::registrar_type kernelFunc;
+    if (Registrar<FCImplBackward_cpu>::exists(registrarKey)) {
+        // One exists with the right inputs/output types
+        kernelFunc = Registrar<FCImplBackward_cpu>::create(registrarKey);
+    }
+    else {
+        // Otherwise, fallback to the kernel with all types matching output type
+        kernelFunc = Registrar<FCImplBackward_cpu>::create({
+            fc_grad->dataType(), fc_grad->dataType(), fc_grad->dataType(), fc_grad->dataType()});
+    }
 
-//     // Convert input data (no overhead if not needed!)
-//     // TODO: right now, if needed, memory will be allocated/deallocated at each
-//     // call to forward(). We might put the following shared_ptr as members of
-//     // this class to avoid that.
-//     std::shared_ptr<Tensor> input0gradFallback, input1gradFallback, input2gradFallback;
-//     const auto& input0grad = op_.getInput(0)->grad()->refCastFrom(input0gradFallback, *(op_.getOutput(0)));
-//     const auto& input1grad = op_.getInput(1)->grad()->refCastFrom(input1gradFallback, *(op_.getOutput(0)));
-//     const auto& input2grad = op_.getInput(2)->grad()->refCastFrom(input2gradFallback, *(op_.getOutput(0)));
+    // Convert input data (no overhead if not needed!)
+    // TODO: right now, if needed, memory will be allocated/deallocated at each
+    // call to backward(). We might put the following shared_ptr as members of
+    // this class to avoid that.
+    std::shared_ptr<Tensor> input0gradFallback, input1gradFallback, input2gradFallback;
+    const auto& input0grad = op_.getInput(0)->grad()->refCastFrom(input0gradFallback, *(op_.getOutput(0)));
+    const auto& input1grad = op_.getInput(1)->grad()->refCastFrom(input1gradFallback, *(op_.getOutput(0)));
+    const auto& input2grad = op_.getInput(2)->grad()->refCastFrom(input2gradFallback, *(op_.getOutput(0)));
 
-//     // Call kernel
-//     const auto batchSize = (input0.dims().size() > 1) ? input0.dims()[0] : 1;
-//     kernelFunc(dynamic_cast<const FC_Op&>(mOp).getStaticAttributes(),
-//         batchSize,
-//         input0.size() / batchSize,
-//         input0.getImpl()->rawPtr(), input1.getImpl()->rawPtr(), input2.getImpl()->rawPtr(),
-//         getCPUPtr(mOp.getRawOutput(0)));
-// }
+    // Call kernel
+    const auto batchSize = (input0grad.dims().size() > 1) ? input0grad.dims()[0] : 1;
+    kernelFunc(dynamic_cast<const FC_Op&>(mOp).getStaticAttributes(),
+        batchSize,
+        input0grad.size() / batchSize,
+        getCPUPtr(fc_grad),                 // gradient w.r.t. the output
+        getCPUPtr(op_.getInput(0)),         // original input
+        getCPUPtr(op_.getInput(1)),         // weight
+        input0grad.getImpl()->rawPtr(),     // gradient w.r.t. the input
+        input1grad.getImpl()->rawPtr(),     // gradient w.r.t. the weight
+        input2grad.getImpl()->rawPtr());    // gradient w.r.t. the bias
+}
diff --git a/src/operator/LeakyReLUImpl.cpp b/src/operator/LeakyReLUImpl.cpp
index 4ffb230d..67847429 100644
--- a/src/operator/LeakyReLUImpl.cpp
+++ b/src/operator/LeakyReLUImpl.cpp
@@ -28,8 +28,9 @@ Aidge::NbElts_t Aidge::LeakyReLUImpl_cpu::getNbRequiredProtected(const Aidge::IO
 }
 
 void Aidge::LeakyReLUImpl_cpu::forward() {
-    std::shared_ptr<Tensor> in0 = std::static_pointer_cast<Tensor>(mOp.getRawInput(0));
-    std::shared_ptr<Tensor> out0 = std::static_pointer_cast<Tensor>(mOp.getRawOutput(0));
+    const LeakyReLU_Op& op_ = dynamic_cast<const LeakyReLU_Op&>(mOp);
+    std::shared_ptr<Tensor> in0 = op_.getInput(0);
+    std::shared_ptr<Tensor> out0 = op_.getOutput(0);
     AIDGE_ASSERT(in0, "missing input #0");
 
     // Find the correct kernel type
@@ -46,8 +47,9 @@ void Aidge::LeakyReLUImpl_cpu::forward() {
 
 void Aidge::LeakyReLUImpl_cpu::backward() {
     // reversing in and out Data for backprop
-    std::shared_ptr<Tensor> in0 = std::static_pointer_cast<Tensor>(mOp.getRawOutput(0));
-    std::shared_ptr<Tensor> out0 = std::static_pointer_cast<Tensor>(mOp.getRawInput(0));
+    const LeakyReLU_Op& op_ = dynamic_cast<const LeakyReLU_Op&>(mOp);
+    std::shared_ptr<Tensor> in0  = op_.getOutput(0)->grad();
+    std::shared_ptr<Tensor> out0 = op_.getInput(0)->grad();
     AIDGE_ASSERT(in0, "missing input #0");
 
     // Find the correct kernel type
diff --git a/src/operator/ReLUImpl.cpp b/src/operator/ReLUImpl.cpp
index 84bb1045..00552146 100644
--- a/src/operator/ReLUImpl.cpp
+++ b/src/operator/ReLUImpl.cpp
@@ -44,9 +44,10 @@ void Aidge::ReLUImpl_cpu::forward() {
 
 void Aidge::ReLUImpl_cpu::backward() {
     // reversing in and out Tensors
-    std::shared_ptr<Tensor> in0 = std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->grad();
-    std::shared_ptr<Tensor> out0 = std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->grad();
-    AIDGE_ASSERT(out0, "missing input #0");
+    const ReLU_Op& op_ = dynamic_cast<const ReLU_Op&>(mOp);
+    std::shared_ptr<Tensor> in0  = op_.getOutput(0)->grad();
+    std::shared_ptr<Tensor> out0 = op_.getInput(0)->grad();
+    AIDGE_ASSERT(out0, "current {} operator input #0 has no gradient Tensor.", op_.type());
 
     // Find the correct kernel type
     auto kernelFunc = Registrar<ReLUImplBackward_cpu>::create({
-- 
GitLab