From d6ae2c5f1863764786d896a7e0145c9796292ac5 Mon Sep 17 00:00:00 2001
From: Jerome Hue <jerome.hue@cea.fr>
Date: Fri, 28 Feb 2025 15:10:49 +0100
Subject: [PATCH] Fix leaky test

The Leaky() helper now takes an explicit LeakyReset mode argument.
Update both call sites in Test_MetaOperator.cpp to pass
LeakyReset::Subtraction so the test builds against the new signature.
---
 unit_tests/operator/Test_MetaOperator.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
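
Note for reviewers: a minimal sketch of the updated call, assuming the new
signature is Leaky(nbTimeSteps, beta, threshold, resetMode, name) as the
hunks below suggest; the reset-semantics comment reflects common LIF
conventions rather than this repository's documentation:

    // Sketch only: signature and enum taken from the diff below.
    // LeakyReset selects the post-spike reset rule; Subtraction is the
    // usual "soft reset" (the threshold is subtracted from the membrane
    // potential), as opposed to resetting the potential to zero.
    auto lif1 = Leaky(nbTimeSteps + 1,          // +1 accounts for the init step
                      beta,                     // membrane decay factor
                      threshold,                // spiking threshold
                      LeakyReset::Subtraction,  // reset mode, now explicit
                      "leaky");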
diff --git a/unit_tests/operator/Test_MetaOperator.cpp b/unit_tests/operator/Test_MetaOperator.cpp
index 4fe39630..0c4a64bb 100644
--- a/unit_tests/operator/Test_MetaOperator.cpp
+++ b/unit_tests/operator/Test_MetaOperator.cpp
@@ -705,7 +705,7 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
     auto fc2 = FC(outChannels, inChannels, true, "fc2");
     // NOTE: Account for init step by adding 1 to the max timestep
     // parameter.
-    auto lif1 = Leaky(nbTimeSteps + 1, beta, threshold, "leaky");
+    auto lif1 = Leaky(nbTimeSteps + 1, beta, threshold, LeakyReset::Subtraction, "leaky");
 
     // associateInput() does not work
     fc1->input(1).first->getOperator()->setOutput(0, myWeights);
@@ -774,7 +774,7 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
     const auto nbTimeSteps = dims[0];
     const auto beta = betaDist(gen);
 
-    auto myLeaky = Leaky(nbTimeSteps, beta, 1.0, "leaky");
+    auto myLeaky = Leaky(nbTimeSteps, beta, 1.0, LeakyReset::Subtraction, "leaky");
     auto op = std::static_pointer_cast<MetaOperator_Op>(myLeaky->getOperator());
 
     // auto stack = Stack(2);
-- 
GitLab