From 1cd0d0715b5b55c092cf0229c2c5ce593a240f53 Mon Sep 17 00:00:00 2001 From: Jerome Hue <jerome.hue@cea.fr> Date: Fri, 6 Sep 2024 17:03:46 +0200 Subject: [PATCH] Add backward pass CPU implementation for Mul operator --- .../aidge/backend/cpu/operator/MulImpl.hpp | 24 +- .../cpu/operator/MulImpl_backward_kernels.hpp | 92 ++++ src/operator/MulImpl.cpp | 31 ++ unit_tests/operator/Test_MulImpl.cpp | 412 +++++++++++++++++- 4 files changed, 534 insertions(+), 25 deletions(-) create mode 100644 include/aidge/backend/cpu/operator/MulImpl_backward_kernels.hpp diff --git a/include/aidge/backend/cpu/operator/MulImpl.hpp b/include/aidge/backend/cpu/operator/MulImpl.hpp index 2d42194c..008edf17 100644 --- a/include/aidge/backend/cpu/operator/MulImpl.hpp +++ b/include/aidge/backend/cpu/operator/MulImpl.hpp @@ -25,11 +25,25 @@ namespace Aidge { // compute kernel registry for forward and backward class MulImplForward_cpu - : public Registrable<MulImplForward_cpu, std::tuple<DataType, DataType, DataType>, void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*,void*)> { -}; + : public Registrable<MulImplForward_cpu, std::tuple<DataType, DataType, DataType>, void(const std::vector<std::size_t>&, + const std::vector<std::size_t>&, + const std::vector<std::size_t>&, + const void*, + const void*, + void*)> {}; + class MulImplBackward_cpu - : public Registrable<MulImplBackward_cpu, std::tuple<DataType, DataType, DataType>, void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*, void*)> { -}; + : public Registrable<MulImplBackward_cpu, std::tuple<DataType, DataType, DataType>, void(const std::size_t, + const std::size_t, + const std::size_t, + const std::vector<std::size_t>, + const std::vector<std::size_t>, + const void*, + const void*, + const void*, + void*, + void*)> {}; + class MulImpl_cpu : public OperatorImpl { public: @@ -40,7 +54,9 @@ 
public: } Elts_t getNbRequiredProtected(const IOIndex_t inputIdx) const override final; + void forward() override; + void backward() override; }; namespace { diff --git a/include/aidge/backend/cpu/operator/MulImpl_backward_kernels.hpp b/include/aidge/backend/cpu/operator/MulImpl_backward_kernels.hpp new file mode 100644 index 00000000..db4cf81f --- /dev/null +++ b/include/aidge/backend/cpu/operator/MulImpl_backward_kernels.hpp @@ -0,0 +1,92 @@ +#ifndef AIDGE_CPU_OPERATOR_MULIMPL_BACKWARD_KERNEL_H_ +#define AIDGE_CPU_OPERATOR_MULIMPL_BACKWARD_KERNEL_H_ + + +#include "aidge/utils/Registrar.hpp" + +#include <cstdint> // std::int32_t, std::int64_t +#include <algorithm> + +#include "aidge/backend/cpu/data/Broadcasting.hpp" +#include "aidge/backend/cpu/operator/MulImpl.hpp" + + +namespace Aidge { + +template <class I1, class I2, class O> +void MulImpl_cpu_backward_kernel(const std::size_t input0Length, + const std::size_t input1Length, + const std::size_t grad0Length, + const std::vector<std::size_t> input0Dims, + const std::vector<std::size_t> input1Dims, + const void* input0_, + const void* input1_, + const void* grad_output_, + void* gradientInput0, + void* gradientInput1) +{ + const auto* input0 = static_cast<const I1*>(input0_); + const auto* input1 = static_cast<const I2*>(input1_); + const auto* grad_output = static_cast<const O*>(grad_output_); + auto* grad_input_0 = static_cast<I1*>(gradientInput0); + auto* grad_input_1 = static_cast<I2*>(gradientInput1); + + + if(input0Dims.size() >= input1Dims.size()) + { + AIDGE_ASSERT(input0Length == grad0Length, "Incorrect dimensions between Mul input and output tensors"); + + for(auto i = 0U; i < input0Length; ++i) + { + const auto indices = getMultiDimIndices(input1Dims, i); + const auto flattenedIndex = getFlattenedIndex(input1Dims, indices); + + grad_input_0[i] = input1[flattenedIndex] * grad_output[i]; + } + + for(std::size_t i = 0 ; i < grad0Length; ++i) + { + const auto indices = getMultiDimIndices(input1Dims, i); + 
const auto flattenedIndex = getFlattenedIndex(input1Dims, indices); + + grad_input_1[flattenedIndex] += input0[i] * grad_output[i]; + } + + } else { + AIDGE_ASSERT(input1Length == grad0Length, "Incorrect dimensions between Mul input and output tensors"); + + for(auto i = 0U; i < input1Length; ++i) + { + const auto indices = getMultiDimIndices(input0Dims, i); + const auto flattenedIndex = getFlattenedIndex(input0Dims, indices); + + grad_input_1[i] = input0[flattenedIndex] * grad_output[i]; + } + + for(std::size_t i = 0 ; i < grad0Length; ++i) + { + const auto indices = getMultiDimIndices(input0Dims, i); + const auto flattenedIndex = getFlattenedIndex(input0Dims, indices); + + grad_input_0[flattenedIndex] += input1[i] * grad_output[i]; + } + } +} + + +namespace { +static Registrar<MulImplBackward_cpu> registrarMulImplBackward_cpu_Float32( + {DataType::Float32, DataType::Float32, DataType::Float32}, + Aidge::MulImpl_cpu_backward_kernel<float, float, float>); +static Registrar<MulImplBackward_cpu> registrarMulImplBackward_cpu_Float64( + {DataType::Float64, DataType::Float64, DataType::Float64}, + Aidge::MulImpl_cpu_backward_kernel<double, double, double>); +static Registrar<MulImplBackward_cpu> registrarMulImplBackward_cpu_Int32( + {DataType::Int32, DataType::Int32, DataType::Int32}, + Aidge::MulImpl_cpu_backward_kernel<std::int32_t, std::int32_t, std::int32_t>); +static Registrar<MulImplBackward_cpu> registrarMulImplBackward_cpu_Int64( + {DataType::Int64, DataType::Int64, DataType::Int64}, + Aidge::MulImpl_cpu_backward_kernel<std::int64_t, std::int64_t, std::int64_t>); +} // namespace +} // namespace Aidge +#endif diff --git a/src/operator/MulImpl.cpp b/src/operator/MulImpl.cpp index d7feb9b7..e5fd911c 100644 --- a/src/operator/MulImpl.cpp +++ b/src/operator/MulImpl.cpp @@ -22,6 +22,7 @@ #include "aidge/backend/cpu/operator/MulImpl.hpp" #include "aidge/backend/cpu/operator/MulImpl_forward_kernels.hpp" +#include "aidge/backend/cpu/operator/MulImpl_backward_kernels.hpp" 
Aidge::Elts_t Aidge::MulImpl_cpu::getNbRequiredProtected(const Aidge::IOIndex_t /*inputIdx*/) const { // this implementation can be in-place @@ -40,6 +41,7 @@ void Aidge::MulImpl_cpu::forward() { const std::vector<std::size_t> inputDims1 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(), std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dims()); + // Call kernel kernelFunc(inputDims0, inputDims1, @@ -48,3 +50,32 @@ void Aidge::MulImpl_cpu::forward() { getCPUPtr(mOp.getRawInput(1)), getCPUPtr(mOp.getRawOutput(0))); } + +void Aidge::MulImpl_cpu::backward() { + + const Mul_Op& op_ = dynamic_cast<const Mul_Op&>(mOp); + + auto in0 = op_.getInput(0); + auto in1 = op_.getInput(1); + auto in0grad = op_.getInput(0)->grad(); + auto in1grad = op_.getInput(1)->grad(); + auto out0grad = op_.getOutput(0)->grad(); + + // Find kernel function + auto kernelFunc = Registrar<MulImplBackward_cpu>::create({ + out0grad->dataType(), + in0grad->dataType(), + in1grad->dataType()}); + + kernelFunc(/* input0Length */ in0grad->size(), + /* input1Length */ in1grad->size(), + /* grad0Length */ out0grad->size(), + /* input0Dims */ in0->dims(), + /* input1Dims */ in1->dims(), + getCPUPtr(in0), + getCPUPtr(in1), + getCPUPtr(out0grad), + getCPUPtr(in0grad), + getCPUPtr(in1grad)); +} + diff --git a/unit_tests/operator/Test_MulImpl.cpp b/unit_tests/operator/Test_MulImpl.cpp index 9d592d31..3378861d 100644 --- a/unit_tests/operator/Test_MulImpl.cpp +++ b/unit_tests/operator/Test_MulImpl.cpp @@ -24,6 +24,337 @@ namespace Aidge { + TEST_CASE("[CPU/Operator] Mul Backward", "[Mul][CPU][Backward]") + { + std::shared_ptr<Node> myMul = Mul(); + auto op = std::static_pointer_cast<OperatorTensor>(myMul->getOperator()); + op->setDataType(DataType::Float32); + op->setBackend("cpu"); + + SECTION("Case 1: 2D and 1D tensors") { + const auto T0 = std::make_shared<Tensor>(Array2D<float,2,3>( + { + { + {1,2,3},{4,5,6} + } + } + )); + + const auto T1 = 
std::make_shared<Tensor>(Array1D<float,3>( + {0.1,0.2,0.3} + )); + + T0->setDataType(DataType::Float32); + T0->setBackend("cpu"); + T1->setDataType(DataType::Float32); + T1->setBackend("cpu"); + + op->getOutput(0)->setGrad(std::make_shared<Tensor>(Array2D<float,2,3>({{{1.0,1.0,1.0},{1.0,1.0,1.0}}}))); + + op->associateInput(0,T0); + op->associateInput(1,T1); + op->forwardDims(); + + myMul->forward(); + myMul->backward(); + + auto T0Grad = std::make_shared<Tensor>(Array2D<float, 2,3>({{{0.1,0.2,0.3},{0.1, 0.2, 0.3}}})); + auto T1Grad = std::make_shared<Tensor>(Array1D<float, 3>({5,7,9})); + + REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *T0Grad)); + REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *T1Grad)); + } + + SECTION("Case 2: 3D and 1D tensors") { + const auto T0 = std::make_shared<Tensor>(Array3D<float,2,2,3>( + { + { + { + {1.0, 2.0, 3.0}, + {4.0, 5.0, 6.0} + }, + { + {7.0, 8.0, 9.0}, + {10.0, 11.0, 12.0} + } + } + } + )); + + const auto T1 = std::make_shared<Tensor>(Array1D<float, 3>({0.3,0.2,0.1})); + + const auto newGrad = std::make_shared<Tensor>(Array3D<float,2,2,3>( + { + { + { + {1, 1, 1}, + {1, 1, 1} + }, + { + {1, 1, 1}, + {1, 1, 1} + } + } + } + )); + + const auto expectedGrad0 = std::make_shared<Tensor>(Array3D<float,2,2,3>( + { + { + { + {0.3, 0.2, 0.1}, + {0.3, 0.2, 0.1} + }, + { + {0.3, 0.2, 0.1}, + {0.3, 0.2, 0.1} + } + } + } + )); + + const auto expectedGrad1 = std::make_shared<Tensor>(Array1D<float,3>( + {22.0, 26.0, 30.0} + )); + + for(auto T: {T0, T1, newGrad, expectedGrad0, expectedGrad1}) + { + T->setBackend("cpu") ; + T->setDataType(DataType::Float32); + } + + op->associateInput(0, T0); + op->associateInput(1, T1); + op->getOutput(0)->setGrad(newGrad); + op->forwardDims(); + + myMul->backward(); + + REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0)); + REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedGrad1)); + } + + SECTION("Case 3: 4D and 2D tensors") { + const auto T0 = 
std::make_shared<Tensor>(Array4D<float,2, 2, 3, 3>( + { + { + { + { + {1.0, 2.0, 3.0}, + {4.0, 5.0, 6.0}, + {7.0, 8.0, 9.0} + }, + { + {10.0, 11.0, 12.0}, + {13.0, 14.0, 15.0}, + {16.0, 17.0, 18.0} + } + }, + { + { + {19.0, 20.0, 21.0}, + {22.0, 23.0, 24.0}, + {25.0, 26.0, 27.0} + }, + { + {28.0, 29.0, 30.0}, + {31.0, 32.0, 33.0}, + {34.0, 35.0, 36.0} + } + } + } + } + )); + + const auto T1 = std::make_shared<Tensor>(Array2D<float, 3,3>( + { + { + {0.5,0.3,0.1}, + {0.4,0.2,0.6}, + {0.7,0.8,0.9} + } + } + )); + + const auto newGrad = std::make_shared<Tensor>(Array4D<float,2, 2, 3, 3>( + { + { + { + { + {1.0, 1.0, 1.0}, + {1.0, 1.0, 1.0}, + {1.0, 1.0, 1.0} + }, + { + {1.0, 1.0, 1.0}, + {1.0, 1.0, 1.0}, + {1.0, 1.0, 1.0} + } + }, + { + { + {1.0, 1.0, 1.0}, + {1.0, 1.0, 1.0}, + {1.0, 1.0, 1.0} + }, + { + {1.0, 1.0, 1.0}, + {1.0, 1.0, 1.0}, + {1.0, 1.0, 1.0} + } + } + } + } + )); + + const auto expectedGrad0 = std::make_shared<Tensor>(Array4D<float,2,2,3,3>( + { + { + { + { + {0.5, 0.3, 0.1}, + {0.4, 0.2, 0.6}, + {0.7, 0.8, 0.9} + }, + { + {0.5, 0.3, 0.1}, + {0.4, 0.2, 0.6}, + {0.7, 0.8, 0.9} + } + }, + { + { + {0.5, 0.3, 0.1}, + {0.4, 0.2, 0.6}, + {0.7, 0.8, 0.9} + }, + { + {0.5, 0.3, 0.1}, + {0.4, 0.2, 0.6}, + {0.7, 0.8, 0.9} + } + } + } + } + )); + + const auto expectedGrad1 = std::make_shared<Tensor>(Array2D<float,3, 3>( + { + { + {58.0, 62.0, 66.0}, + {70.0, 74.0, 78.0}, + {82.0, 86.0, 90.0} + } + } + )); + + for(const auto T: {T0, T1, newGrad, expectedGrad0, expectedGrad1}) + { + T->setBackend("cpu") ; + T->setDataType(DataType::Float32); + } + + op->associateInput(0, T0); + op->associateInput(1, T1); + op->getOutput(0)->setGrad(newGrad); + op->forwardDims(); + + myMul->backward(); + + REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0)); + REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedGrad1)); + } + + SECTION("Case 4: 3D and 2D tensors") { + const auto T0 = std::make_shared<Tensor>(Array3D<float, 2, 3, 4>( + { + { + { + {1.0, 2.0, 
3.0, 4.0}, + {5.0, 6.0, 7.0, 8.0}, + {9.0, 10.0, 11.0, 12.0}, + }, + { + {13.0, 14.0, 15.0, 16.0}, + {17.0, 18.0, 19.0, 20.0}, + {21.0, 22.0, 23.0, 24.0}, + } + } + } + )); + + const auto T1 = std::make_shared<Tensor>(Array2D<float, 3, 4>( + { + { + {0.1, 0.2, 0.3, 0.4}, + {0.5, 0.6, 0.7, 0.8}, + {0.9, 1.0, 1.1, 1.2} + } + } + )); + + const auto newGrad = std::make_shared<Tensor>(Array3D<float, 2,3,4>( + { + { + { + {1.0, 1.0, 1.0, 1.0}, + {1.0, 1.0, 1.0, 1.0}, + {1.0, 1.0, 1.0, 1.0}, + }, + { + {1.0, 1.0, 1.0, 1.0}, + {1.0, 1.0, 1.0, 1.0}, + {1.0, 1.0, 1.0, 1.0}, + } + } + } + )); + + const auto expectedGrad0 = std::make_shared<Tensor>(Array3D<float,2,3,4>( + { + { + { + {0.1, 0.2, 0.3, 0.4}, + {0.5, 0.6, 0.7, 0.8}, + {0.9, 1.0, 1.1, 1.2} + }, + { + {0.1, 0.2, 0.3, 0.4}, + {0.5, 0.6, 0.7, 0.8}, + {0.9, 1.0, 1.1, 1.2} + } + } + } + )); + + const auto expectedGrad1 = std::make_shared<Tensor>(Array2D<float,3, 4>( + { + { + {14.0, 16.0, 18.0, 20.0}, + {22.0, 24.0, 26.0, 28.0}, + {30.0, 32.0, 34.0, 36.0} + } + } + )); + + for(const auto T: {T0, T1, newGrad, expectedGrad0, expectedGrad1}) + { + T->setBackend("cpu") ; + T->setDataType(DataType::Float32); + } + + op->associateInput(0, T0); + op->associateInput(1, T1); + op->getOutput(0)->setGrad(newGrad); + op->forwardDims(); + + myMul->backward(); + + REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0)); + REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedGrad1)); + } + } + TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") { constexpr std::uint16_t NBTRIALS = 10; // Create a random number generator @@ -31,7 +362,7 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") { std::mt19937 gen(rd()); std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // Random float distribution between 0 and 1 std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10)); - std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5)); + 
std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(3)); std::uniform_int_distribution<int> boolDist(0,1); // Create MatMul Operator @@ -60,6 +391,7 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") { std::chrono::time_point<std::chrono::system_clock> end; std::chrono::duration<double, std::micro> duration{}; + SECTION("MulImpl_cpu::forward()") { SECTION("Scalar / Scalar") { @@ -68,16 +400,20 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") { } SECTION("+1-D Tensor / +1-D Tensor - same dimensions") { + std::size_t number_of_operation = 0; for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { + // generate 2 random Tensors - const std::size_t nbDims = nbDimsDist(gen); - std::vector<std::size_t> dims; + const auto nbDims = nbDimsDist(gen); + auto dims = std::vector<std::size_t>{}; + for (std::size_t i = 0; i < nbDims; ++i) { dims.push_back(dimSizeDist(gen)); } - const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); + + const auto nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>()); number_of_operation += nb_elements; // without broadcasting @@ -114,67 +450,101 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") { delete[] array0; delete[] array1; delete[] result; - - // with broadcasting } std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl; std::cout << "total time: " << duration.count() << "μs" << std::endl; } + SECTION("+1-D Tensor / +1-D Tensor - broadcasting") { std::size_t number_of_operation = 0; for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { + // generate 2 random Tensors // handle dimensions, replace some dimensions with '1' to get broadcasting + constexpr std::size_t nbDims = 4; - std::vector<std::size_t> dims; - for (std::size_t i = 0; i < nbDims; ++i) { - dims.push_back(dimSizeDist(gen)); + std::vector<std::size_t> dimensions; + + for 
(std::size_t i = 0; i < nbDims; ++i) + { + dimensions.push_back(dimSizeDist(gen)); } - std::vector<std::size_t> dims0 = dims; - std::vector<std::size_t> dims1 = dims; - std::vector<std::size_t> dimsOut = dims; - for (std::size_t i = 0; i < nbDims; ++i) { - if (boolDist(gen)) { + + auto dims0 = dimensions; + auto dims1 = dimensions; + auto dimsOut = dimensions; + + for (std::size_t i = 0; i < nbDims; ++i) + { + if (boolDist(gen)) + { dims0[i] = 1; } - if (boolDist(gen)) { + + if (boolDist(gen)) + { dims1[i] = 1; } + dimsOut[i] = (dims0[i] == 1) ? dims1[i] : dims0[i]; } + for(auto dim : dims0) + { + Log::info("Dimension of input 0 : {}", dim); + } + + for(auto dim : dims1) + { + Log::info("Dimension of input 1 : {}", dim); + } + // create arrays and fill them with random values float* array0 = new float[dims0[0]*dims0[1]*dims0[2]*dims0[3]]; float* array1 = new float[dims1[0]*dims1[1]*dims1[2]*dims1[3]]; float* result = new float[dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]]; - for (std::size_t i = 0; i < dims0[0]*dims0[1]*dims0[2]*dims0[3]; ++i) { + + for (std::size_t i = 0; i < dims0[0]*dims0[1]*dims0[2]*dims0[3]; ++i) + { array0[i] = valueDist(gen); } - for (std::size_t i = 0; i < dims1[0]*dims1[1]*dims1[2]*dims1[3]; ++i) { + + for (std::size_t i = 0; i < dims1[0]*dims1[1]*dims1[2]*dims1[3]; ++i) + { array1[i] = valueDist(gen); } // compute true result const std::size_t strides0[nbDims] = {dims0[1]*dims0[2]*dims0[3], dims0[2]*dims0[3], dims0[3], 1}; const std::size_t strides1[nbDims] = {dims1[1]*dims1[2]*dims1[3], dims1[2]*dims1[3], dims1[3], 1}; - for (std::size_t a = 0; a < dimsOut[0]; ++a) { - for (std::size_t b = 0; b < dimsOut[1]; ++b) { + + for (std::size_t a = 0; a < dimsOut[0]; ++a) + { + for (std::size_t b = 0; b < dimsOut[1]; ++b) + { const std::size_t idx0_0 = strides0[0] * ((dims0[0] > 1) ? a : 0) + strides0[1] * ((dims0[1] > 1) ? b : 0); + const std::size_t idx1_0 = strides1[0] * ((dims1[0] > 1) ? a : 0) + strides1[1] * ((dims1[1] > 1) ? 
b : 0); - for (std::size_t c = 0; c < dimsOut[2]; ++c) { + + for (std::size_t c = 0; c < dimsOut[2]; ++c) + { const std::size_t idx_out = dimsOut[3] * (c + dimsOut[2] * (b + dimsOut[1] * a)); - for (std::size_t d = 0; d < dimsOut[3]; ++d) { + + for (std::size_t d = 0; d < dimsOut[3]; ++d) + { std::size_t idx0 = idx0_0 + strides0[2] * ((dims0[2] > 1) ? c : 0) + ((dims0[3] > 1) ? d : 0); + std::size_t idx1 = idx1_0 + strides1[2] * ((dims1[2] > 1) ? c : 0) + ((dims1[3] > 1) ? d : 0); + result[idx_out + d] = array0[idx0] * array1[idx1]; // std::cout << "(" << idx0 << ", " << idx1 << ") -> " << array0[idx0] << " * " << array1[idx1] << " -> " << idx_out + d << std::endl; } -- GitLab