From 650873eb903bbc2a0a31aac584fdaf74b7c08106 Mon Sep 17 00:00:00 2001
From: Adam MARONI <amaroni@lrtechnologies.fr>
Date: Wed, 12 Mar 2025 14:39:56 +0100
Subject: [PATCH] [Issue #251]: Softmax backward implementation for CPU

---
 .../backend/cpu/operator/SoftmaxImpl.hpp      |   3 +-
 .../cpu/operator/SoftmaxImpl_kernels.hpp      |  90 +++++++-
 src/operator/SoftmaxImpl.cpp                  |  16 +-
 unit_tests/operator/Test_SoftmaxImpl.cpp      | 206 +++++++++++++++++-
 4 files changed, 302 insertions(+), 13 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/SoftmaxImpl.hpp b/include/aidge/backend/cpu/operator/SoftmaxImpl.hpp
index ec2c2696..fb17d04c 100644
--- a/include/aidge/backend/cpu/operator/SoftmaxImpl.hpp
+++ b/include/aidge/backend/cpu/operator/SoftmaxImpl.hpp
@@ -23,7 +23,8 @@ namespace Aidge {
 // Operator implementation entry point for the backend
 using SoftmaxImpl_cpu = OperatorImpl_cpu<Softmax_Op,
-    void(std::size_t, const std::vector<DimSize_t>&, const void*, void*)>;
+    void(std::size_t, const std::vector<DimSize_t>&, const void*, void*),
+    void(std::size_t, const std::vector<DimSize_t>&, const void*, const void*, void*)>;
 
 // Implementation entry point registration to Operator
 REGISTRAR(Softmax_Op, "cpu", Aidge::SoftmaxImpl_cpu::create);
diff --git a/include/aidge/backend/cpu/operator/SoftmaxImpl_kernels.hpp b/include/aidge/backend/cpu/operator/SoftmaxImpl_kernels.hpp
index 07486a48..3f110672 100644
--- a/include/aidge/backend/cpu/operator/SoftmaxImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/SoftmaxImpl_kernels.hpp
@@ -22,8 +22,13 @@
 #include "aidge/backend/cpu/operator/SoftmaxImpl.hpp"
 
 namespace Aidge {
+
 template <class I, class O>
-void SoftmaxImpl_cpu_forward_kernel(std::size_t axisIdx, const std::vector<DimSize_t>& inputDims, const void* input_, void* output_)
+void SoftmaxImpl_cpu_forward_kernel(
+    std::size_t axisIdx,
+    const std::vector<DimSize_t>& inputDims,
+    const void* input_,
+    void* output_)
 {
     const I* input = static_cast<const I*>(input_);
     O* output = static_cast<O*>(output_);
@@ -41,35 +46,100 @@ void SoftmaxImpl_cpu_forward_kernel(std::size_t axisIdx, const std::vector<DimSi
         for (std::size_t j = 0; j < postAxisElems; ++j) {
             I maxVal = input[i * inputDims[axisIdx] * postAxisElems + j];
             for (std::size_t k = 1; k < inputDims[axisIdx]; ++k) {
-                std::size_t inIdx = i * inputDims[axisIdx] * postAxisElems + k * postAxisElems + j;
+                std::size_t inIdx =
+                    i * inputDims[axisIdx] * postAxisElems + k * postAxisElems + j;
                 maxVal = std::max(maxVal, input[inIdx]);
             }
 
             // Calculate sum of exponentials within the axis
             I sumExp = 0;
             for (std::size_t k = 0; k < inputDims[axisIdx]; ++k) {
-                std::size_t inIdx = i * inputDims[axisIdx] * postAxisElems + k * postAxisElems + j;
+                std::size_t inIdx =
+                    i * inputDims[axisIdx] * postAxisElems + k * postAxisElems + j;
                 sumExp += std::exp(input[inIdx] - maxVal);
             }
 
             // Calculate softmax for the current slice along the axis
             for (std::size_t k = 0; k < inputDims[axisIdx]; ++k) {
-                std::size_t inIdx = i * inputDims[axisIdx] * postAxisElems + k * postAxisElems + j;
+                std::size_t inIdx =
+                    i * inputDims[axisIdx] * postAxisElems + k * postAxisElems + j;
                 output[inIdx] = std::exp(input[inIdx] - maxVal) / sumExp;
             }
         }
     }
 }
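+// Editor's note (illustrative derivation, not part of the original patch):
+// with softmax s_k = exp(x_k) / sum_j exp(x_j) and a cross-entropy loss
+// L = -sum_k t_k * log(s_k), the chain rule collapses to
+//     dL/dx_k = s_k * (sum_m t_m) - t_k = s_k - t_k
+// for a one-hot target t (sum_m t_m = 1). The backward kernel below
+// therefore needs only the forward output and the target, element-wise:
+//
+//     // sketch, assuming s[] = softmax(x) and t[] = one-hot target
+//     // for (std::size_t k = 0; k < n; ++k) grad[k] = s[k] - t[k];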
+/**
+ * @brief Backward kernel for Softmax on CPU. Assumes a cross-entropy
+ * loss, for which the gradient w.r.t. the logits reduces to
+ * softmax(x) - target.
+ * @tparam I Data type of the computed input gradient.
+ * @tparam O Data type of the forward output and of the target.
+ * @param[in] axisIdx Index of the axis along which softmax was applied.
+ * @param[in] inputDims Array of input dimensions.
+ * @param[in] softmaxOut_ Softmax forward output tensor.
+ * @param[in] target_ One-hot encoded target tensor.
+ * @param[out] gradientLoss_ Computed gradient of the loss w.r.t. the
+ * softmax input.
+ */
+template <class I, class O>
+void SoftmaxImpl_cpu_backward_kernel(
+    std::size_t axisIdx,
+    const std::vector<DimSize_t>& inputDims,
+    const void* softmaxOut_,
+    const void* target_,
+    void* gradientLoss_)
+{
+    const O* softmaxOut = static_cast<const O*>(softmaxOut_);
+    const O* target = static_cast<const O*>(target_);
+    I* dL = static_cast<I*>(gradientLoss_);
+
+    // Compute the number of elements after the softmax axis (post-axis size)
+    std::size_t postAxisElems = 1;
+    for (std::size_t i = axisIdx + 1; i < inputDims.size(); ++i) {
+        postAxisElems *= inputDims[i];
+    }
+
+    // Compute the number of elements before the softmax axis (pre-axis size)
+    std::size_t preAxisElems = 1;
+    for (std::size_t i = 0; i < axisIdx; ++i) {
+        preAxisElems *= inputDims[i];
+    }
+
+    // Iterate over batches (pre-axis elements)
+    for (std::size_t i = 0; i < preAxisElems; ++i) {
+        for (std::size_t j = 0; j < postAxisElems; ++j) {
+            for (std::size_t k = 0; k < inputDims[axisIdx]; ++k) {
+                std::size_t inIdx = i * inputDims[axisIdx] * postAxisElems
+                                    + k * postAxisElems + j;
+                // Fused softmax + cross-entropy gradient
+                dL[inIdx] = softmaxOut[inIdx] - target[inIdx];
+            }
+        }
+    }
+}
+
 REGISTRAR(SoftmaxImpl_cpu,
-    {DataType::Float32},
-    {ProdConso::inPlaceModel, Aidge::SoftmaxImpl_cpu_forward_kernel<float, float>, nullptr});
+    { DataType::Float32 },
+    {
+        ProdConso::inPlaceModel,
+        Aidge::SoftmaxImpl_cpu_forward_kernel<float, float>,
+        Aidge::SoftmaxImpl_cpu_backward_kernel<float, float>
+    }
+);
 REGISTRAR(SoftmaxImpl_cpu,
-    {DataType::Float64},
-    {ProdConso::inPlaceModel, Aidge::SoftmaxImpl_cpu_forward_kernel<double, double>, nullptr});
+    { DataType::Float64 },
+    {
+        ProdConso::inPlaceModel,
+        Aidge::SoftmaxImpl_cpu_forward_kernel<double, double>,
+        Aidge::SoftmaxImpl_cpu_backward_kernel<double, double>
+    }
+);
 REGISTRAR(SoftmaxImpl_cpu,
-    {DataType::Int32},
-    {ProdConso::inPlaceModel, Aidge::SoftmaxImpl_cpu_forward_kernel<int32_t, int32_t>, nullptr});
+    { DataType::Int32 },
+    {
+        ProdConso::inPlaceModel,
+        Aidge::SoftmaxImpl_cpu_forward_kernel<int32_t, int32_t>,
+        Aidge::SoftmaxImpl_cpu_backward_kernel<int32_t, int32_t>
+    }
+);
+
 }  // namespace Aidge
 
 #endif /* AIDGE_CPU_OPERATOR_SOFTMAXIMPL_KERNELS_H_ */
diff --git a/src/operator/SoftmaxImpl.cpp b/src/operator/SoftmaxImpl.cpp
index 8b6933f2..3f3bb75c 100644
--- a/src/operator/SoftmaxImpl.cpp
+++ b/src/operator/SoftmaxImpl.cpp
@@ -40,5 +40,19 @@ void Aidge::SoftmaxImpl_cpu::forward() {
 
 template <>
 void Aidge::SoftmaxImpl_cpu::backward() {
-    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Softmax_Op on backend cpu");
+    const auto& op_ = dynamic_cast<const Softmax_Op&>(mOp);
+
+    AIDGE_ASSERT(!op_.getInput(0)->empty(), "Softmax input empty");
+    // Normalize a possibly negative axis to its positive index
+    std::int32_t axis = (op_.axis() >= 0) ?
+        op_.axis() : op_.getInput(0)->nbDims() + op_.axis();
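+    // Example (editor's illustration): for a 4-D input and axis() == -1,
+    // the line above yields axis = 4 + (-1) = 3, i.e. the last dimension.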
+
+    // Find the kernel matching the required input/output specs
+    const auto impl = Registrar<SoftmaxImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+
+    // Call kernel. Note: the gradient set on the output is interpreted as
+    // the one-hot target (cross-entropy convention, see the kernel doc).
+    impl.backward(static_cast<std::size_t>(axis),                  // axisIdx
+                  op_.getInput(0)->dims(),                         // inputDims
+                  op_.getOutput(0)->getImpl()->rawPtr(),           // softmaxOut_
+                  op_.getOutput(0)->grad()->getImpl()->rawPtr(),   // target_
+                  op_.getInput(0)->grad()->getImpl()->rawPtr()     // gradientLoss_
+    );
 }
diff --git a/unit_tests/operator/Test_SoftmaxImpl.cpp b/unit_tests/operator/Test_SoftmaxImpl.cpp
index bc452a40..d949e393 100644
--- a/unit_tests/operator/Test_SoftmaxImpl.cpp
+++ b/unit_tests/operator/Test_SoftmaxImpl.cpp
@@ -111,4 +111,208 @@ TEST_CASE("[cpu/operator] Softmax(forward)", "[Softmax][CPU]") {
         REQUIRE(approxEq<float>(*(op->getOutput(0)), expectedOutput, 1e-5f, 1e-8f));
     }
-}
\ No newline at end of file
+}
+
+TEST_CASE("[cpu/operator] Softmax(backward)", "[Softmax][CPU]") {
+    SECTION("1D Tensor") {
+        std::shared_ptr<Softmax_Op> op = std::make_shared<Softmax_Op>(0);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cpu");
+
+        std::shared_ptr<Tensor> softMaxForwardInputTensor =
+            std::make_shared<Tensor>(Array1D<float, 3>{ {3.0, 1.0, 0.2} });
+        op->associateInput(0, softMaxForwardInputTensor);
+        op->forward();
+
+        // One-hot encoded targets; expected gradient = softmax(input) - target
+        std::shared_ptr<Tensor> target1 =
+            std::make_shared<Tensor>(Array1D<float, 3>{ {1, 0, 0} });
+        Tensor expectedGrad1 = Array1D<float, 3>{
+            {-0.163981, 0.113143, 0.050838}
+        };
+        op->getOutput(0)->setGrad(target1);
+        op->backward();
+        REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), expectedGrad1,
+                                1e-5f, 1e-8f));
+
+        std::shared_ptr<Tensor> target2 =
+            std::make_shared<Tensor>(Array1D<float, 3>{ {0, 1, 0} });
+        Tensor expectedGrad2 = Array1D<float, 3>{
+            {0.836019, -0.886857, 0.050838}
+        };
+        op->getOutput(0)->setGrad(target2);
+        op->backward();
+        REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), expectedGrad2,
+                                1e-5f, 1e-8f));
+
+        std::shared_ptr<Tensor> target3 =
+            std::make_shared<Tensor>(Array1D<float, 3>{ {0, 0, 1} });
+        Tensor expectedGrad3 = Array1D<float, 3>{
+            {0.836019, 0.113143, -0.949162}
+        };
+        op->getOutput(0)->setGrad(target3);
+        op->backward();
+        REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), expectedGrad3,
+                                1e-5f, 1e-8f));
+    }
+    SECTION("2D Tensor") {
+        std::shared_ptr<Softmax_Op> op = std::make_shared<Softmax_Op>(1);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cpu");
+
+        std::shared_ptr<Tensor> softMaxForwardInputTensor =
+            std::make_shared<Tensor>(Array2D<float, 2, 3>{
+                {
+                    {2.0, 1.0, 0.1},
+                    {1.0, 3.0, 0.2}
+                }
+            });
+
+        op->associateInput(0, softMaxForwardInputTensor);
+        op->forward();
+
+        std::shared_ptr<Tensor> target1 =
+            std::make_shared<Tensor>(Array2D<float, 2, 3>{
+                {
+                    {1, 0, 0},
+                    {0, 1, 0}
+                }
+            });
+        Tensor expectedGrad1 = Array2D<float, 2, 3>{
+            {
+                {-0.34099886, 0.24243297, 0.09856589},
+                {0.11314284, -0.1639812, 0.05083836}
+            }
+        };
+        op->getOutput(0)->setGrad(target1);
+        op->backward();
+        REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), expectedGrad1,
+                                1e-5f, 1e-8f));
+    }
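+    // Editor's note: the expected values above can be checked by hand,
+    // e.g. softmax({3.0, 1.0, 0.2}) ~= {0.836019, 0.113143, 0.050838},
+    // so with target {1, 0, 0} the gradient softmax - target is
+    // {-0.163981, 0.113143, 0.050838}; see also the standalone sketch
+    // appended after the patch.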
+    SECTION("4D Tensor") {
+        std::shared_ptr<Softmax_Op> op = std::make_shared<Softmax_Op>(1);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cpu");
+
+        std::shared_ptr<Tensor> softMaxForwardInputTensor =
+            std::make_shared<Tensor>(Array4D<float, 2, 3, 3, 3>{
+                {
+                    {
+                        {{8.28257084e-01, 7.99335480e-01, 7.36702740e-01},
+                         {2.36729562e-01, 8.61912668e-01, 9.93067741e-01},
+                         {1.63514376e-01, 8.95773172e-02, 2.96533108e-01}},
+
+                        {{2.20776618e-01, 5.89067876e-01, 2.03930080e-01},
+                         {1.31294072e-01, 7.10182846e-01, 1.08420849e-04},
+                         {7.21750259e-01, 4.38212037e-01, 5.08823872e-01}},
+
+                        {{4.30953979e-01, 1.51903450e-01, 3.76343548e-01},
+                         {8.07861805e-01, 7.79679358e-01, 5.01209974e-01},
+                         {9.31280375e-01, 9.94207084e-01, 1.74868107e-03}}
+                    },
+                    {
+                        {{6.22058094e-01, 2.32256651e-02, 6.18222237e-01},
+                         {9.58304763e-01, 2.11395025e-02, 4.95614648e-01},
+                         {2.50825584e-01, 4.50860739e-01, 3.80362332e-01}},
+
+                        {{9.91703272e-02, 5.06073236e-01, 4.88969564e-01},
+                         {1.12059772e-01, 7.64178872e-01, 7.60362148e-01},
+                         {2.84135342e-02, 4.29610193e-01, 1.27862811e-01}},
+
+                        {{9.57209170e-01, 8.22797656e-01, 1.91352129e-01},
+                         {9.52722490e-01, 6.35501027e-01, 5.67592978e-02},
+                         {2.00799644e-01, 4.00822222e-01, 9.14380193e-01}}
+                    }
+                }
+            });
+
+        op->associateInput(0, softMaxForwardInputTensor);
+        op->forward();
+
+        std::shared_ptr<Tensor> target1 =
+            std::make_shared<Tensor>(Array4D<float, 2, 3, 3, 3>{{
+                {
+                    {
+                        { 1.0f, 0.0f, 0.0f },
+                        { 0.0f, 1.0f, 0.0f },
+                        { 0.0f, 0.0f, 1.0f }
+                    },
+                    {
+                        { 1.0f, 0.0f, 0.0f },
+                        { 0.0f, 1.0f, 0.0f },
+                        { 0.0f, 0.0f, 1.0f }
+                    },
+                    {
+                        { 1.0f, 0.0f, 0.0f },
+                        { 0.0f, 1.0f, 0.0f },
+                        { 0.0f, 0.0f, 1.0f }
+                    }
+                },
+                {
+                    {
+                        { 1.0f, 0.0f, 0.0f },
+                        { 0.0f, 1.0f, 0.0f },
+                        { 0.0f, 0.0f, 1.0f }
+                    },
+                    {
+                        { 1.0f, 0.0f, 0.0f },
+                        { 0.0f, 1.0f, 0.0f },
+                        { 0.0f, 0.0f, 1.0f }
+                    },
+                    {
+                        { 1.0f, 0.0f, 0.0f },
+                        { 0.0f, 1.0f, 0.0f },
+                        { 0.0f, 0.0f, 1.0f }
+                    }
+                }
+            }});
+
+        Tensor expectedGrad1 = Array4D<float, 2, 3, 3, 3>{
+            {
+                {{{-0.548909903, 0.428493917, 0.437751532},
+                  { 0.272464514, -0.640323639, 0.504549026},
+                  { 0.203976154, 0.204576448, -0.664564550}},
+
+                 {{-0.754281461, 0.347237468, 0.256949306},
+                  { 0.245199680, -0.690958738, 0.186924666},
+                  { 0.356466055, 0.289911687, -0.585231602}},
+
+                 {{-0.696808696, 0.224268600, 0.305299193},
+                  { 0.482335806, -0.668717623, 0.308526367},
+                  { 0.439557761, 0.505511820, -0.750203848}}},
+
+                {{{-0.665658951, 0.206386790, 0.395053923},
+                  { 0.412633836, -0.798012137, 0.339227289},
+                  { 0.363399804, 0.341277540, -0.712860584}},
+
+                 {{-0.801800549, 0.334487498, 0.347154379},
+                  { 0.177029371, -0.575357676, 0.442047715},
+                  { 0.290932596, 0.334101707, -0.776933849}},
+
+                 {{-0.532540560, 0.459125668, 0.257791758},
+                  { 0.410336822, -0.626630187, 0.218724951},
+                  { 0.345667630, 0.324620724, -0.510205626}}}
+            }
+        };
+        op->getOutput(0)->setGrad(target1);
+        op->backward();
+        REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), expectedGrad1,
+                                1e-5f, 1e-8f));
+    }
+}
--
GitLab
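Editor's addendum (not part of the patch): a minimal, self-contained
sketch that cross-checks the 1-D test vector. It mirrors the kernel's
math (numerically stable softmax, then grad = softmax - target) without
depending on Aidge; the names and hard-coded values are illustrative only.

    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    int main() {
        const double x[3] = {3.0, 1.0, 0.2};   // forward input
        const double t[3] = {1.0, 0.0, 0.0};   // one-hot target
        double s[3];
        double sum = 0.0;

        // Numerically stable softmax: subtract the max before exp()
        double maxVal = x[0];
        for (int k = 1; k < 3; ++k) maxVal = std::max(maxVal, x[k]);
        for (int k = 0; k < 3; ++k) { s[k] = std::exp(x[k] - maxVal); sum += s[k]; }
        for (int k = 0; k < 3; ++k) s[k] /= sum;

        // Fused softmax + cross-entropy backward: grad = softmax - target
        for (int k = 0; k < 3; ++k)
            std::printf("grad[%d] = %f\n", k, s[k] - t[k]);
        // Prints approximately -0.163981, 0.113143, 0.050838,
        // matching expectedGrad1 in the 1D unit test above.
        return 0;
    }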