-
Olivier BICHLER authoredOlivier BICHLER authored
Code owners
Assign users and groups as approvers for specific file changes. Learn more.
ModImpl.cpp 6.01 KiB
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <memory>
#include <vector>
#include "aidge/backend/cpu/data/Broadcasting.hpp"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include "aidge/backend/cpu/operator/ModImpl.hpp"
#include "aidge/backend/cpu/operator/ModImpl_kernels.hpp"
#include "aidge/data/Tensor.hpp"
#include "aidge/utils/Types.h"
template <>
void Aidge::ModImpl_cpu::forward() {
// 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7]
// 2. Find the highest equal dimension -> 3
// Exception: if the first diverging dimension is the last one, then -> 4 (dims.size())
// 3. Compute the highest number of contiguous data -> 7
// 4. Compute stride and offset step for the broadcast mechanism
// 5. Call a simple kernel
const auto& opTensor = static_cast<const Mod_Op&>(mOp);
// Find the correct kernel type
const auto impl = Registrar<ModImpl_cpu>::create(getBestMatch(getRequiredSpec()));
// Compute compatible input dimensions
std::vector<std::size_t> dims0 = opTensor.getInput(0)->dims();
std::vector<std::size_t> dims1 = opTensor.getInput(1)->dims();
const std::vector<std::size_t>& outDims = opTensor.getOutput(0)->dims();
// special case for equal dimensions, the kernel is called with the entire arrays at once
if (dims0 == dims1) {
const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
impl.forward(opTensor.fmod(),
input0_contiguous_size, input0_contiguous_size, input0_contiguous_size,
getCPUPtr(mOp.getRawInput(0)),
getCPUPtr(mOp.getRawInput(1)),
getCPUPtr(mOp.getRawOutput(0)));
return;
}
// set dimensions to be of equal size by filling the smallest one with ones.
if (dims0.size() > dims1.size()) {
dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1));
}
else if (dims1.size() > dims0.size()) {
dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1));
}
const std::size_t nbDims = dims0.size();
// Find the highest equal dimension
// std::size_t contiguousIdx = nbDims - 1;
std::size_t contiguousIdx = nbDims;
while (contiguousIdx-- > 0) {
// for (; contiguousIdx+1 > 0; --contiguousIdx) {
if (dims0[contiguousIdx] != dims1[contiguousIdx]) {
if (contiguousIdx == (nbDims -1)) { // last dimensions of one of the input Tensor are of size 1
const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1;
while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) {
--contiguousIdx;
}
}
break;
}
}
++contiguousIdx;
// Compute the highest number of contiguous data for each Tensor
const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
const std::size_t output_contiguous_size = std::accumulate(outDims.cbegin()+contiguousIdx, outDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
// initialize strides to iterate through data because of broadcasting
std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx);
if (contiguousIdx > 0) {
stride_post0[contiguousIdx - 1] = 1;
stride_post1[contiguousIdx - 1] = 1;
for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) {
stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]);
stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]);
}
for (std::size_t i = 0; i != contiguousIdx; ++i) {
stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1;
stride_step1[i] = (dims1[i] == 1) ? 1 - stride_post1[i] : 1;
}
}
// variables for arrays offsets
std::size_t offsetIn0 = 0;
std::size_t offsetIn1 = 0;
std::size_t offsetOut = 0;
std::size_t dim = contiguousIdx - 1;
const std::size_t nbStacks = std::accumulate(outDims.cbegin(), outDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>());
for (std::size_t stack = 0; stack < nbStacks;) {
impl.forward(opTensor.fmod(), input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
getCPUPtr(mOp.getRawInput(0), offsetIn0*input0_contiguous_size),
getCPUPtr(mOp.getRawInput(1), offsetIn1*input1_contiguous_size),
getCPUPtr(mOp.getRawOutput(0), offsetOut*output_contiguous_size));
if (++stack < nbStacks) {
std::size_t tmp_stack = stack;
while(tmp_stack % outDims[dim] == 0) {
tmp_stack /= outDims[dim];
dim--;
}
offsetIn0 += stride_step0[dim];
offsetIn1 += stride_step1[dim];
++offsetOut;
dim = contiguousIdx - 1;
}
}
}
template <>
void Aidge::ModImpl_cpu::backward() {
AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Mod_Op on backend cpu");
}