diff --git a/include/aidge/backend/cpu/operator/SubImpl.hpp b/include/aidge/backend/cpu/operator/SubImpl.hpp
index eed26ddcc9f57b3bb7796049a62f3f6be7de4eb5..1f94ff139c319916fed68120317c5f1931619495 100644
--- a/include/aidge/backend/cpu/operator/SubImpl.hpp
+++ b/include/aidge/backend/cpu/operator/SubImpl.hpp
@@ -15,15 +15,23 @@
 #include "aidge/backend/cpu/operator/OperatorImpl.hpp"
 #include "aidge/operator/Sub.hpp"
 #include "aidge/utils/Registrar.hpp"
-#include "aidge/utils/Types.h"
-#include "aidge/backend/cpu/data/GetCPUPtr.h"
-#include <memory>
+
 #include <vector>
 
 namespace Aidge {
 // Operator implementation entry point for the backend
 using SubImpl_cpu = OperatorImpl_cpu<Sub_Op,
-    void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*,void*)>;
+    void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*,void*),
+    void(const std::size_t,
+         const std::size_t,
+         const std::size_t,
+         const std::vector<std::size_t>&,
+         const std::vector<std::size_t>&,
+         const std::vector<std::size_t>&,
+         const void*,
+         void*,
+         void*)
+>;
 
 // Implementation entry point registration to Operator
 REGISTRAR(Sub_Op, "cpu", Aidge::SubImpl_cpu::create);
diff --git a/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp b/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp
index 1d789c3c8886d35ce6597d5704c76060bad196c1..8d3d80e93e2e2f95d00026b9f3d1a1ff48b8d17a 100644
--- a/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp
@@ -42,6 +42,7 @@ void sub_contiguous_arrays(const std::size_t input1size,
 
 namespace Aidge {
 
+
 template <class I1, class I2, class O>
 void SubImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
                                 std::vector<std::size_t> dims1,
@@ -149,10 +150,55 @@ void SubImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
     }
 }
 
+template <class I1, class I2, class O>
+void SubImpl_cpu_backward_kernel(const std::size_t input0Length,
+                                 const std::size_t input1Length,
+                                 const std::size_t gradOutputLength,
+                                 const std::vector<std::size_t>& dims0,
+                                 const std::vector<std::size_t>& dims1,
+                                 const std::vector<std::size_t>& outputDims,
+                                 const void* grad_output_,
+                                 void* gradientInput0_,
+                                 void* gradientInput1_)
+{
+    const O* grad_output = static_cast<const O*>(grad_output_);
+    auto* grad_input_0 = static_cast<I1*>(gradientInput0_);
+    auto* grad_input_1 = static_cast<I2*>(gradientInput1_);
+
+    std::fill_n(grad_input_0, input0Length, static_cast<I1>(0));
+    std::fill_n(grad_input_1, input1Length, static_cast<I2>(0));
+
+    auto broadcastedDims0 = getBroadcastedDims(outputDims, dims0);
+    auto broadcastedDims1 = getBroadcastedDims(outputDims, dims1);
+
+    for (std::size_t i = 0; i < gradOutputLength; ++i) {
+        auto idxOutputGrad = getMultiDimIndices(outputDims, i);
+        std::vector<std::size_t> idxInput0(broadcastedDims0.size());
+        std::vector<std::size_t> idxInput1(broadcastedDims1.size());
+
+        for (std::size_t dimension = 0; dimension < broadcastedDims0.size(); ++dimension) {
+            idxInput0[dimension] = (broadcastedDims0[dimension] == 1) ? 0 : idxOutputGrad[dimension];
+        }
+
+        for (std::size_t dimension = 0; dimension < broadcastedDims1.size(); ++dimension) {
+            idxInput1[dimension] = (broadcastedDims1[dimension] == 1) ? 0 : idxOutputGrad[dimension];
+        }
+
+        auto idx0 = getFlattenedIndex(broadcastedDims0, idxInput0);
+        auto idx1 = getFlattenedIndex(broadcastedDims1, idxInput1);
+
+        // For subtraction: gradient of first input is 1 * grad_output
+        grad_input_0[idx0] += static_cast<I1>(grad_output[i]);
+        // For subtraction: gradient of second input is -1 * grad_output
+        grad_input_1[idx1] += static_cast<I2>(-grad_output[i]);
+    }
+}
+
+
 // Kernels registration to implementation entry point
 REGISTRAR(SubImpl_cpu,
     {DataType::Float32},
-    {ProdConso::inPlaceModel, Aidge::SubImpl_cpu_forward_kernel<float, float, float>, nullptr});
+    {ProdConso::inPlaceModel, Aidge::SubImpl_cpu_forward_kernel<float, float, float>, Aidge::SubImpl_cpu_backward_kernel<float,float,float>});
 REGISTRAR(SubImpl_cpu,
     {DataType::Float64},
     {ProdConso::inPlaceModel, Aidge::SubImpl_cpu_forward_kernel<double, double, double>, nullptr});
diff --git a/src/operator/SubImpl.cpp b/src/operator/SubImpl.cpp
index e36abe2a9d68a2b56ab1777aa04b0e911df514c8..7f57bf2f1e16bbe6a6dd510f39463b611d925220 100644
--- a/src/operator/SubImpl.cpp
+++ b/src/operator/SubImpl.cpp
@@ -41,5 +41,27 @@ void Aidge::SubImpl_cpu::forward() {
 
 template <>
 void Aidge::SubImpl_cpu::backward() {
-    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Sub_Op on backend cpu");
+
+    const Sub_Op& op_ = dynamic_cast<const Sub_Op&>(mOp);
+
+    auto in0 = op_.getInput(0);
+    auto in1 = op_.getInput(1);
+    auto in0grad = op_.getInput(0)->grad();
+    auto in1grad = op_.getInput(1)->grad();
+    auto out0grad = op_.getOutput(0)->grad();
+
+    // Find the correct kernel type
+    const auto impl = Registrar<SubImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+
+    // Call kernel
+    impl.backward(/* input0Length */ in0grad->size(),
+                  /* input1Length */ in1grad->size(),
+                  /* grad0Length  */ out0grad->size(),
+                  /* input0Dims   */ in0->dims(),
+                  /* input1Dims   */ in1->dims(),
+                  /* outputDims   */ out0grad->dims(),
+                  /* gradOutput   */ getCPUPtr(out0grad),
+                  /* gradInput0   */ getCPUPtr(in0grad),
+                  /* gradInput1   */ getCPUPtr(in1grad));
+
 }
diff --git a/unit_tests/operator/Test_MetaOperator.cpp b/unit_tests/operator/Test_MetaOperator.cpp
index da1a4873136241cddea351996331c489f7476bdd..0c4a64bb062bb8e9219f26c27910b72439ed8c5c 100644
--- a/unit_tests/operator/Test_MetaOperator.cpp
+++ b/unit_tests/operator/Test_MetaOperator.cpp
@@ -774,7 +774,7 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
     const auto nbTimeSteps = dims[0];
     const auto beta = betaDist(gen);
 
-    auto myLeaky = Leaky(nbTimeSteps, beta, 1.0, LeakyReset::Subtraction,"leaky");
+    auto myLeaky = Leaky(nbTimeSteps, beta, 1.0, LeakyReset::Subtraction, "leaky");
 
     auto op = std::static_pointer_cast<MetaOperator_Op>(myLeaky->getOperator());
     // auto stack = Stack(2);
diff --git a/unit_tests/operator/Test_SubImpl.cpp b/unit_tests/operator/Test_SubImpl.cpp
index 1317e88a371e9a6e7a3deae5b7f662a9cd879a60..d9b6207b2c47f1004097509c853e69a8160fc22a 100644
--- a/unit_tests/operator/Test_SubImpl.cpp
+++ b/unit_tests/operator/Test_SubImpl.cpp
@@ -322,4 +322,165 @@ TEST_CASE("[cpu/operator] Sub", "[Sub][CPU]") {
         }
     }
 }
+
+
+TEST_CASE("[CPU/Operator] Sub(Backward)", "[Sub][CPU][Backward]") {
+    std::shared_ptr<Node> mySub = Sub();
+    auto op = std::static_pointer_cast<OperatorTensor>(mySub->getOperator());
+    op->setDataType(DataType::Float32);
+    op->setBackend("cpu");
+
+    SECTION("Case 1: 1D and 2D Tensors") {
+        const auto T0 = std::make_shared<Tensor>(
+            Array2D<float, 2, 3>({{{1, 2, 3}, {4, 5, 6}}}));
+
+        const auto T1 =
+            std::make_shared<Tensor>(Array1D<float, 3>({0.1, 0.2, 0.3}));
+
+        T0->setDataType(DataType::Float32);
+        T0->setBackend("cpu");
+        T1->setDataType(DataType::Float32);
+        T1->setBackend("cpu");
+
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+        op->getOutput(0)->setGrad(std::make_shared<Tensor>(
+            Array2D<float, 2, 3>({{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}})));
+        op->forwardDims();
+
+        mySub->backward();
+
+        // For subtraction: grad_input0 = grad_output
+        const auto expectedGrad0 = std::make_shared<Tensor>(
+            Array2D<float, 2, 3>({{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}}));
+
+        // For subtraction: grad_input1 = -grad_output (summed across broadcast dimensions)
+        const auto expectedGrad1 =
+            std::make_shared<Tensor>(Array1D<float, 3>({-2, -2, -2}));
+
+        REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0));
+        REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedGrad1));
+    }
+
+    SECTION("Case 2: 3D and 1D tensors") {
+        const auto T0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
+            {{{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}},
+              {{7.0, 8.0, 9.0}, {10.0, 11.0, 12.0}}}}));
+
+        const auto T1 =
+            std::make_shared<Tensor>(Array1D<float, 3>({0.3, 0.2, 0.1}));
+
+        const auto newGrad = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
+            {{{{1, 1, 1}, {1, 1, 1}}, {{1, 1, 1}, {1, 1, 1}}}}));
+
+        const auto expectedGrad0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
+            {{{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}},
+              {{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}}}));
+
+        const auto expectedGrad1 =
+            std::make_shared<Tensor>(Array1D<float, 3>({-4.0, -4.0, -4.0}));
+
+        for (auto T : {T0, T1, newGrad, expectedGrad0, expectedGrad1}) {
+            T->setBackend("cpu");
+            T->setDataType(DataType::Float32);
+        }
+
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+        op->getOutput(0)->setGrad(newGrad);
+        op->forwardDims();
+
+        mySub->backward();
+
+        REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0));
+        REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedGrad1));
+    }
+
+    SECTION("Case 3: Random values with broadcasting") {
+        // Use random values
+        std::vector<std::size_t> dims0 = {5, 2, 1, 7}; // First tensor
+        std::vector<std::size_t> dims1 = {2, 6, 7};    // Second tensor
+        std::vector<std::size_t> outputDims = {5, 2, 6, 7};
+
+        const auto input0Size = 5 * 2 * 1 * 7;
+        const auto input1Size = 2 * 6 * 7;
+        const auto outputSize = 5 * 2 * 6 * 7;
+
+        std::random_device rd;
+        std::mt19937 gen(rd());
+        std::uniform_real_distribution<float> dist(0.1f, 1.0f);
+
+        std::vector<float> input0Data(input0Size);
+        std::vector<float> input1Data(input1Size);
+        std::vector<float> gradOutputData(outputSize);
+
+        // Fill with random values
+        for (auto &val : input0Data) val = dist(gen);
+        for (auto &val : input1Data) val = dist(gen);
+        for (auto &val : gradOutputData) val = dist(gen);
+
+        auto T0 = std::make_shared<Tensor>();
+        auto T1 = std::make_shared<Tensor>();
+
+        T0->setDataType(DataType::Float32);
+        T0->setBackend("cpu");
+        T0->resize(dims0);
+        T0->getImpl()->setRawPtr(input0Data.data(), input0Size);
+
+        T1->setDataType(DataType::Float32);
+        T1->setBackend("cpu");
+        T1->resize(dims1);
+        T1->getImpl()->setRawPtr(input1Data.data(), input1Size);
+
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+
+        // Set gradient of output
+        op->getOutput(0)->setGrad(std::make_shared<Tensor>());
+        op->getOutput(0)->grad()->resize(outputDims);
+        op->getOutput(0)->grad()->getImpl()->setRawPtr(gradOutputData.data(), outputSize);
+
+        op->forwardDims();
+
+        // Compute reference gradients
+        std::vector<float> expectedGrad0(input0Size, 0.0f);
+        std::vector<float> expectedGrad1(input1Size, 0.0f);
+
+        for (std::size_t n = 0; n < 5; ++n) {
+            for (std::size_t c = 0; c < 2; ++c) {
+                for (std::size_t h = 0; h < 6; ++h) {
+                    for (std::size_t w = 0; w < 7; ++w) {
+                        std::size_t outIdx = w + 7 * (h + 6 * (c + 2 * n));
+                        std::size_t in0Idx = w + 7 * (0 + 1 * (c + 2 * n));
+                        std::size_t in1Idx = w + 7 * (h + 6 * c);
+
+                        // Gradient for input0: grad_output
+                        expectedGrad0[in0Idx] += gradOutputData[outIdx];
+                        // Gradient for input1: -grad_output
+                        expectedGrad1[in1Idx] += -gradOutputData[outIdx];
+                    }
+                }
+            }
+        }
+
+        // Perform backward pass
+        mySub->backward();
+
+        auto expectedGrad0Tensor = std::make_shared<Tensor>();
+        expectedGrad0Tensor->resize(T0->dims());
+        expectedGrad0Tensor->setBackend("cpu");
+        expectedGrad0Tensor->setDataType(DataType::Float32);
+        expectedGrad0Tensor->getImpl()->setRawPtr(expectedGrad0.data(), expectedGrad0.size());
+
+        auto expectedGrad1Tensor = std::make_shared<Tensor>();
+        expectedGrad1Tensor->resize(T1->dims());
+        expectedGrad1Tensor->setBackend("cpu");
+        expectedGrad1Tensor->setDataType(DataType::Float32);
+        expectedGrad1Tensor->getImpl()->setRawPtr(expectedGrad1.data(), expectedGrad1.size());
+
+        // Verify backward pass
+        REQUIRE(approxEq<float>(*T0->grad(), *expectedGrad0Tensor));
+        REQUIRE(approxEq<float>(*T1->grad(), *expectedGrad1Tensor));
+    }
+}
 } // namespace Aidge
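
Note on the gradient rule exercised above: for `out = a - b`, the backward kernel copies `grad_out` into the gradient of the first input and accumulates `-grad_out` into the gradient of the second input, summing over any axis where an input was broadcast. The following is a minimal standalone sketch of that reduction, using only the standard library (it is not the Aidge API); the shapes and expected values mirror "Case 1: 1D and 2D Tensors" from the test above, and the names (`gradOut`, `gradA`, `gradB`) are illustrative only.

```cpp
#include <array>
#include <cstddef>
#include <iostream>

int main() {
    // out = a - b, with a of shape (2, 3) and b of shape (3,) broadcast over rows.
    constexpr std::size_t rows = 2, cols = 3;

    std::array<float, rows * cols> gradOut{};
    gradOut.fill(1.0f);                      // same seed gradient as the unit test

    std::array<float, rows * cols> gradA{};  // d(out)/d(a) = +1, same shape as a
    std::array<float, cols> gradB{};         // d(out)/d(b) = -1, reduced over the broadcast (row) axis

    for (std::size_t r = 0; r < rows; ++r) {
        for (std::size_t c = 0; c < cols; ++c) {
            const float g = gradOut[r * cols + c];
            gradA[r * cols + c] += g;        // grad_input0 = grad_output
            gradB[c] += -g;                  // grad_input1 = -grad_output, summed over rows
        }
    }

    // Matches the expectations of "Case 1": gradA is all 1, gradB is {-2, -2, -2}.
    for (float v : gradB) std::cout << v << ' ';
    std::cout << '\n';
    return 0;
}
```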