Commit 740a27ba authored by Maxence Naud

Merge branch 'dev' into 'main'

v0.4.0

See merge request !118
parents a2477ff1 ed718fc0
Tags v0.4.0
Pipeline #61247 passed
Showing with 924 additions and 166 deletions
@@ -12,19 +12,14 @@ stages:
  - deploy

include:
  - project: 'eclipse/aidge/gitlab_shared_files'
    ref: 'main'
    file: # choose which jobs to run by including the corresponding files.
      - '.gitlab/ci/ubuntu_cpp.gitlab-ci.yml'
      - '.gitlab/ci/ubuntu_python.gitlab-ci.yml'
      - '.gitlab/ci/release/cibuildwheel_ubuntu.gitlab-ci.yml'
      - '.gitlab/ci/windows_cpp.gitlab-ci.yml'
      - '.gitlab/ci/windows_python.gitlab-ci.yml'
      - '.gitlab/ci/release/cibuildwheel_windows.gitlab-ci.yml'
# Version 0.4.0 (December 6, 2024)

# Version 0.2.2 (May 14, 2024)
* Remove implementation for Operators solely handling memory and format
......
@@ -22,6 +22,9 @@ execute_process(
message(STATUS "Latest git commit: ${GIT_COMMIT_HASH}")
add_definitions(-DGIT_COMMIT_HASH="${GIT_COMMIT_HASH}")

# helper for LSP users
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

# Note : project name is ${CMAKE_PROJECT_NAME} and python module name is also ${CMAKE_PROJECT_NAME}
set(module_name _${CMAKE_PROJECT_NAME}) # target name
set(pybind_module_name ${CMAKE_PROJECT_NAME}) # name of submodule for python bindings
......
@@ -13,9 +13,10 @@ class test_scheduler(unittest.TestCase):
        pass

    def test_relu_forward(self):
        t = aidge_core.Tensor(np.arange(6, dtype=np.int32) - 3)

        input_node = aidge_core.Producer(t)
        relu = aidge_core.ReLU()
        input_node.add_child(relu)
@@ -34,7 +35,7 @@ class test_scheduler(unittest.TestCase):
        out_tensor = relu.get_operator().get_output(0)
        expected_out = [0,0,0,0,1,2]
        for i in range(len(expected_out)):
            self.assertEqual(expected_out[i], out_tensor[i], f"On idx {i}")

    def test_sequential_scheduling(self):
        input_data = np.array([0]).astype(np.float32)
@@ -69,7 +70,7 @@ class test_scheduler(unittest.TestCase):
            aidge_core.Producer(input_tensor, "X"),
            aidge_core.FC(1, 50, name='0'),
            aidge_core.parallel([aidge_core.FC(50, 50, name='1'), aidge_core.FC(50, 50, name='3')]),
            aidge_core.Add(name='2'),
        ])

        EXPECTED_SCHEDULE = [['0', '1', '3', '2'], ['0', '3', '1', '2']] # Both scheduling are valid !
......
@@ -15,11 +15,14 @@
#include "aidge/backend/cpu/operator/AbsImpl.hpp"
#include "aidge/backend/cpu/operator/AddImpl.hpp"
#include "aidge/backend/cpu/operator/AndImpl.hpp"
#include "aidge/backend/cpu/operator/AtanImpl.hpp"
#include "aidge/backend/cpu/operator/ArgMaxImpl.hpp"
#include "aidge/backend/cpu/operator/AvgPoolingImpl.hpp"
#include "aidge/backend/cpu/operator/MaxPoolingImpl.hpp"
#include "aidge/backend/cpu/operator/BatchNormImpl.hpp"
#include "aidge/backend/cpu/operator/BitShiftImpl.hpp"
#include "aidge/backend/cpu/operator/ClipImpl.hpp"
#include "aidge/backend/cpu/operator/ConvDepthWiseImpl.hpp"
#include "aidge/backend/cpu/operator/ConvImpl.hpp"
#include "aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp"
@@ -28,15 +31,19 @@
#include "aidge/backend/cpu/operator/FCImpl.hpp"
#include "aidge/backend/cpu/operator/FoldImpl.hpp"
#include "aidge/backend/cpu/operator/GlobalAveragePoolingImpl.hpp"
#include "aidge/backend/cpu/operator/LRNImpl.hpp"
#include "aidge/backend/cpu/operator/LeakyReLUImpl.hpp"
#include "aidge/backend/cpu/operator/LnImpl.hpp"
#include "aidge/backend/cpu/operator/MatMulImpl.hpp"
#include "aidge/backend/cpu/operator/MulImpl.hpp"
#include "aidge/backend/cpu/operator/PadImpl.hpp"
#include "aidge/backend/cpu/operator/PaddedConvImpl.hpp"
#include "aidge/backend/cpu/operator/PowImpl.hpp"
#include "aidge/backend/cpu/operator/ReduceMeanImpl.hpp"
#include "aidge/backend/cpu/operator/ReduceSumImpl.hpp"
#include "aidge/backend/cpu/operator/ResizeImpl.hpp"
#include "aidge/backend/cpu/operator/ReLUImpl.hpp"
#include "aidge/backend/cpu/operator/RoundImpl.hpp"
#include "aidge/backend/cpu/operator/ScalingImpl.hpp"
#include "aidge/backend/cpu/operator/SigmoidImpl.hpp"
#include "aidge/backend/cpu/operator/SqrtImpl.hpp"
......
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_DATA_INTERPOLATION_H_
#define AIDGE_CPU_DATA_INTERPOLATION_H_
#include <vector>
#include <aidge/data/Interpolation.hpp>
#include <aidge/utils/Types.h>
namespace Aidge {
class InterpolationCPU : public Interpolation {
public:
/**
 * @brief Interpolates values given via input in the given mode.
 *
 * Values are contiguously arranged in a "square" shape around the point to
 * interpolate, depending on the interpolation mode.
 * The point to interpolate is located right in the middle of all points.
* Immediate neighbours :
* 1D interp :             2D interp :
*                           .  .  .  .  .  .
*   .  .  1  2  .  .        .  .  .  .  .  .
*                           .  .  1  2  .  .
*                           .  .  3  4  .  .
*                           .  .  .  .  .  .
*                           .  .  .  .  .  .
*
* 2 neighbours :
* 1D interp :                     2D interp :
*                                  .  .  .  .  .  .  .  .
*                                  .  .  .  .  .  .  .  .
*   .  .  1  2  3  4  .  .         .  .  1  2  3  4  .  .
*                                  .  .  5  6  7  8  .  .
*                                  .  .  9 10 11 12  .  .
*                                  .  . 13 14 15 16  .  .
*                                  .  .  .  .  .  .  .  .
*                                  .  .  .  .  .  .  .  .
*
* @param[in] originalCoords: coord of the point to interpolate in the
* original picture. These coords are generated with
* Interpolation::untransformCoords(coordsInInterpolatedTensor)
* @param[in] points : points to interpolate, arranged in a set of
* pairs ((point_coord), value) :
* [[[X1, X2, ..., XN], Xval], ...., [[A1, A2, ..., AN],Aval]].
* With :
* - N: the number of dimensions.
* - A: the number of points of the grid to interpolate.
* - All coordinates expressed in originalTensor frame.
* @param[in] interpMode: interpolation mode
* @return interpolated value
*/
template <typename T>
static T interpolate(const std::vector<float> &coordsToInterpolate,
const std::set<Point<T>> &points,
const Mode interpMode = Interpolation::Mode::Linear);
/**
* @brief performs linear interpolation on given points.
* @param[in] values: values to interpolate; since we only do an average of
* all values, their indices are not used.
* @return interpolated value
*/
template <typename T>
static T linear(const std::vector<float> &originalCoords,
const std::set<Point<T>> &points);
/**
* @brief performs nearest interpolation on given points.
* @note it is a wrapper around the linearRecurse() private method
* @param[in] coordsToInterpolate: coordinates to interpolate
* @param[in] points: points to interpolate
* @param[in] interpMode: interpolation method; must be one of the Nearest...
* modes, otherwise the function will throw an error.
* @return interpolated value
*/
template <typename T>
static T nearest(const std::vector<float> &coordsToInterpolate,
const std::set<Point<T>> &points,
const Interpolation::Mode nearestMode);
private:
/**
* @brief Actual linear interpolation function. It will:
* - Split the points along each dimension, depending on whether their coords
*   at idx alongDim are above or under coordsToInterpolate, until they are
*   1-to-1.
* - Perform interpolation on the 2 leftover points and return the
*   interpolated point to the parent call as a set of size 1.
* - Repeat until all dimensions have been interpolated.
* @param[in] coordsToInterpolate: coordinates to interpolate
* @param[in] points: points to interpolate
* @param[in] alongDim: dimension along which points are currently being
* segregated.
* @return set containing the interpolated point(s)
*/
template <typename T>
static std::set<Interpolation::Point<T>>
linearRecurse(const std::vector<float> &coordsToInterpolate,
const std::set<Point<T>> &points,
const DimIdx_t alongDim = 0);
};
} // namespace Aidge
#endif // AIDGE_CPU_DATA_INTERPOLATION_H_
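The recursive splitting described in the linearRecurse() documentation above can be sketched outside of Aidge with plain standard containers. The snippet below is only an illustration of the idea (a unit-spaced grid, points pre-sorted so that consecutive pairs differ along the dimension being reduced); it is not the backend's implementation, and the GridPoint/linearSketch names are made up for the example.

#include <cstddef>
#include <iostream>
#include <utility>
#include <vector>

// One point of a regular unit grid: integer coordinates plus its value.
using GridPoint = std::pair<std::vector<std::size_t>, float>;

// Reduce the point set one dimension at a time, starting from the last one,
// mirroring the splitting strategy described for linearRecurse() above.
float linearSketch(const std::vector<float>& coords,
                   std::vector<GridPoint> points,
                   std::size_t dim) {
    if (points.size() == 1) { return points.front().second; }
    std::vector<GridPoint> reduced;
    for (std::size_t i = 0; i + 1 < points.size(); i += 2) {
        const GridPoint& low  = points[i];
        const GridPoint& high = points[i + 1];
        // fractional position between the two neighbours along `dim`
        const float t = coords[dim] - static_cast<float>(low.first[dim]);
        reduced.push_back({low.first, low.second + t * (high.second - low.second)});
    }
    return linearSketch(coords, std::move(reduced), dim - 1);
}

int main() {
    // 2D example: the 4 immediate neighbours of the point (0.25, 0.75)
    const std::vector<GridPoint> pts = {
        {{0, 0}, 1.f}, {{0, 1}, 2.f},
        {{1, 0}, 3.f}, {{1, 1}, 4.f}};
    std::cout << linearSketch({0.25f, 0.75f}, pts, 1) << '\n';  // bilinear result: 2.25
}

The real InterpolationCPU::linear() works on a std::set<Point<T>> and handles arbitrary dimension counts, but the pairing-and-reducing step is the same idea.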
@@ -25,7 +25,7 @@
namespace Aidge {
// Operator implementation entry point for the backend
using AddImpl_cpu = OperatorImpl_cpu<Add_Op,
    void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*, void*)>;

// Implementation entry point registration to Operator
REGISTRAR(Add_Op, "cpu", Aidge::AddImpl_cpu::create);
......
@@ -14,31 +14,137 @@
#include "aidge/utils/Registrar.hpp"

#include <cstddef>   // std::size_t

#include "aidge/backend/cpu/data/Broadcasting.hpp"
#include "aidge/backend/cpu/operator/AddImpl.hpp"

namespace Aidge {

namespace {
// suppose values are contiguous in memory
template <class I, class O>
void add_contiguous_arrays(const std::size_t input1size,
                           const std::size_t input2size,
                           const std::size_t output1size,
                           const I* input1,
                           const I* input2,
                           O* output)
{
    for (std::size_t i = 0; i < output1size; ++i)
    {
        const std::size_t in1_id = (input1size != 1) ? i : 0;
        const std::size_t in2_id = (input2size != 1) ? i : 0;
        output[i] = static_cast<O>(input1[in1_id] + input2[in2_id]);
    }
}
}

template <class I, class O>
void AddImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
                                std::vector<std::size_t> dims1,
                                const std::vector<std::size_t>& outputDims,
                                const void* input0_,
                                const void* input1_,
                                void* output_) {

    const I* input_0 = static_cast<const I*>(input0_);
    const I* input_1 = static_cast<const I*>(input1_);
    O* output = static_cast<O*>(output_);

    // [5,2,1,7] & [2,6,7]
    // 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7]
    // 2. Find the highest equal dimension -> 3
    //    Exception: if the first diverging dimension is the last one, then -> 4 (dims.size())
    // 3. Compute the highest number of contiguous data -> 7
    // 4. Compute stride and offset step for the broadcast mechanism
    // 5. Call a simple kernel

    // special case for equal dimensions, the kernel is called with the entire arrays at once
if (dims0 == dims1) {
const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
for (std::size_t i = 0; i < input0_contiguous_size; ++i)
{
output[i] = static_cast<O>(input_0[i] + input_1[i]);
}
return;
}
// set dimensions to be of equal size by filling the smallest one with ones.
if (dims0.size() > dims1.size()) {
dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1));
}
else if (dims1.size() > dims0.size()) {
dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1));
}
const std::size_t nbDims = dims0.size();
// Find the highest equal dimension
// std::size_t contiguousIdx = nbDims - 1;
std::size_t contiguousIdx = nbDims;
while (contiguousIdx-- > 0) {
// for (; contiguousIdx+1 > 0; --contiguousIdx) {
if (dims0[contiguousIdx] != dims1[contiguousIdx]) {
if (contiguousIdx == (nbDims -1)) { // last dimensions of one of the input Tensor are of size 1
const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1;
while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) {
--contiguousIdx;
}
}
break;
}
}
++contiguousIdx;
// Compute the highest number of contiguous data for each Tensor
const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
// initialize strides to iterate through data because of broadcasting
std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx);
if (contiguousIdx > 0) {
stride_post0[contiguousIdx - 1] = 1;
stride_post1[contiguousIdx - 1] = 1;
for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) {
stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]);
stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]);
}
for (std::size_t i = 0; i != contiguousIdx; ++i) {
stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1;
stride_step1[i] = (dims1[i] == 1) ? 1 - stride_post1[i] : 1;
}
}
// variables for arrays offsets
std::size_t offsetIn0 = 0;
std::size_t offsetIn1 = 0;
std::size_t offsetOut = 0;
std::size_t dim = contiguousIdx - 1;
const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>());
for (std::size_t stack = 0; stack < nbStacks;) {
add_contiguous_arrays<I,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
input_0 + offsetIn0*input0_contiguous_size,
input_1 + offsetIn1*input1_contiguous_size,
output + offsetOut*output_contiguous_size);
if (++stack < nbStacks) {
std::size_t tmp_stack = stack;
while(tmp_stack % outputDims[dim] == 0) {
tmp_stack /= outputDims[dim];
dim--;
}
offsetIn0 += stride_step0[dim];
offsetIn1 += stride_step1[dim];
++offsetOut;
dim = contiguousIdx - 1;
}
}
}

// Kernels registration to implementation entry point
@@ -48,6 +154,12 @@ REGISTRAR(AddImpl_cpu,
REGISTRAR(AddImpl_cpu,
    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float64}},
    {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<double, double>, nullptr});
REGISTRAR(AddImpl_cpu,
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int8}},
{ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::int8_t, std::int8_t>, nullptr});
REGISTRAR(AddImpl_cpu,
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::UInt8}},
{ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::uint8_t, std::uint8_t>, nullptr});
REGISTRAR(AddImpl_cpu,
    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int32}},
    {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr});
......
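The numbered comment inside the Add kernel (steps 1 to 5) can be checked by hand on the shapes it quotes, [5,2,1,7] against [2,6,7]. The standalone snippet below is only a walk-through of steps 1 to 3 for that example (it skips the "first diverging dimension is the last one" exception and the stride computation); the names are local to the example, not part of the commit.

#include <cstddef>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
    std::vector<std::size_t> dims0{5, 2, 1, 7};
    std::vector<std::size_t> dims1{2, 6, 7};
    const std::vector<std::size_t> outputDims{5, 2, 6, 7};  // broadcast result of the two shapes

    // 1. Same number of dimensions: pad the shorter shape with leading 1s -> [1,2,6,7]
    dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1));

    // 2. Highest index from which all trailing dimensions are pairwise equal -> 3
    std::size_t contiguousIdx = dims0.size();
    while (contiguousIdx-- > 0 && dims0[contiguousIdx] == dims1[contiguousIdx]) {}
    ++contiguousIdx;

    // 3. Number of values handled contiguously per call to the simple kernel -> 7,
    //    and the number of such "stacks" the broadcast loop has to visit -> 5*2*6 = 60
    const std::size_t contiguous = std::accumulate(outputDims.cbegin() + contiguousIdx, outputDims.cend(),
                                                   std::size_t(1), std::multiplies<std::size_t>());
    const std::size_t nbStacks   = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx,
                                                   std::size_t(1), std::multiplies<std::size_t>());

    std::cout << contiguousIdx << ' ' << contiguous << ' ' << nbStacks << '\n';  // prints: 3 7 60
}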
@@ -23,7 +23,7 @@
namespace Aidge {
// Operator implementation entry point for the backend
using AndImpl_cpu = OperatorImpl_cpu<And_Op,
    void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*, void*)>;

// Implementation entry point registration to Operator
REGISTRAR(And_Op, "cpu", Aidge::AndImpl_cpu::create);
......
@@ -12,52 +12,152 @@
#ifndef AIDGE_CPU_OPERATOR_ANDIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_ANDIMPL_KERNELS_H_

#include "aidge/backend/cpu/data/Broadcasting.hpp"
#include "aidge/backend/cpu/operator/AndImpl.hpp"
#include "aidge/utils/Registrar.hpp"

namespace Aidge {

namespace {
// suppose values are contiguous in memory
template <class I, class O>
void equal_contiguous_arrays(const std::size_t input1size,
                             const std::size_t input2size,
                             const std::size_t output1size,
                             const I* input1,
                             const I* input2,
                             O* output)
{
    for (std::size_t i = 0; i < output1size; ++i)
    {
        const std::size_t in1_id = (input1size != 1) ? i : 0;
        const std::size_t in2_id = (input2size != 1) ? i : 0;
        output[i] = static_cast<O>(input1[in1_id] == input2[in2_id]);
    }
}
}

template <class I, class O>
void EqualImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
                                  std::vector<std::size_t> dims1,
                                  const std::vector<std::size_t>& outputDims,
                                  const void* input0_,
                                  const void* input1_,
                                  void* output_) {

    const I* input_0 = static_cast<const I*>(input0_);
    const I* input_1 = static_cast<const I*>(input1_);
    O* output = static_cast<O*>(output_);

    // [5,2,1,7] & [2,6,7]
    // 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7]
    // 2. Find the highest equal dimension -> 3
    //    Exception: if the first diverging dimension is the last one, then -> 4 (dims.size())
    // 3. Compute the highest number of contiguous data -> 7
    // 4. Compute stride and offset step for the broadcast mechanism
    // 5. Call a simple kernel

    // special case for equal dimensions, the kernel is called with the entire arrays at once
    if (dims0 == dims1) {
        const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
        for (std::size_t i = 0; i < input0_contiguous_size; ++i)
        {
            output[i] = static_cast<O>(input_0[i] == input_1[i]);
        }
        return;
    }

    // set dimensions to be of equal size by filling the smallest one with ones.
    if (dims0.size() > dims1.size()) {
        dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1));
    }
    else if (dims1.size() > dims0.size()) {
        dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1));
    }

    const std::size_t nbDims = dims0.size();

    // Find the highest equal dimension
    // std::size_t contiguousIdx = nbDims - 1;
    std::size_t contiguousIdx = nbDims;
while (contiguousIdx-- > 0) {
// for (; contiguousIdx+1 > 0; --contiguousIdx) {
if (dims0[contiguousIdx] != dims1[contiguousIdx]) {
if (contiguousIdx == (nbDims -1)) { // last dimensions of one of the input Tensor are of size 1
const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1;
while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) {
--contiguousIdx;
}
}
break;
}
}
++contiguousIdx;
// Compute the highest number of contiguous data for each Tensor
const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
// initialize strides to iterate through data because of broadcasting
std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx);
if (contiguousIdx > 0) {
stride_post0[contiguousIdx - 1] = 1;
stride_post1[contiguousIdx - 1] = 1;
for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) {
stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]);
stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]);
}
for (std::size_t i = 0; i != contiguousIdx; ++i) {
stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1;
stride_step1[i] = (dims1[i] == 1) ? 1 - stride_post1[i] : 1;
}
}
// variables for arrays offsets
std::size_t offsetIn0 = 0;
std::size_t offsetIn1 = 0;
std::size_t offsetOut = 0;
std::size_t dim = contiguousIdx - 1;
const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>());
for (std::size_t stack = 0; stack < nbStacks;) {
equal_contiguous_arrays<I,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
input_0 + offsetIn0*input0_contiguous_size,
input_1 + offsetIn1*input1_contiguous_size,
output + offsetOut*output_contiguous_size);
if (++stack < nbStacks) {
std::size_t tmp_stack = stack;
while(tmp_stack % outputDims[dim] == 0) {
tmp_stack /= outputDims[dim];
dim--;
}
offsetIn0 += stride_step0[dim];
offsetIn1 += stride_step1[dim];
++offsetOut;
dim = contiguousIdx - 1;
}
    }
}

// Kernels registration to implementation entry point
REGISTRAR(AndImpl_cpu,
    {DataType::Float32},
    {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<float, float>, nullptr});
REGISTRAR(AndImpl_cpu,
    {DataType::Float64},
    {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<double, double>, nullptr});
REGISTRAR(AndImpl_cpu,
    {DataType::Int32},
    {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr});
REGISTRAR(AndImpl_cpu,
    {DataType::Int64},
    {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, nullptr});
} // namespace Aidge

#endif /* AIDGE_CPU_OPERATOR_ANDIMPL_KERNELS_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_ATAN_H_
#define AIDGE_CPU_OPERATOR_ATAN_H_
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/Atan.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include <memory>
#include <vector>
namespace Aidge {
// Operator implementation entry point for the backend
using AtanImpl_cpu = OperatorImpl_cpu<Atan_Op,
void(const std::size_t, const void*, void*),
void(const std::size_t, const void*, const void*, void*)>;
// Implementation entry point registration to Operator
REGISTRAR(Atan_Op, "cpu", Aidge::AtanImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_ATAN_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_ATANIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_ATANIMPL_KERNELS_H_
#include "aidge/utils/Registrar.hpp"
#include "aidge/backend/cpu/operator/AtanImpl.hpp"
#include <cmath> // For atan()
namespace Aidge {
template <class I, class O>
void AtanImpl_cpu_forward_kernel(std::size_t inputLenght,
const void* input_,
void* output_) {
const I* input = static_cast<const I*>(input_);
O* output = static_cast<O*>(output_);
for (size_t i = 0; i < inputLenght; ++i) {
output[i] = static_cast<O>(atan(input[i]));
}
}
template <class O, class GI, class GO>
void AtanImpl_cpu_backward_kernel(const std::size_t inputLenght,
const void* output_, const void* grad_output_,
void* grad_input_) {
const O* output = static_cast<const O*>(output_);
const GO* grad_output = static_cast<const GO*>(grad_output_);
GI* grad_input = static_cast<GI*>(grad_input_);
// Apply the derivative of atan for each element in the input array
for (size_t i = 0; i < inputLenght; ++i) {
// dx = dy * (1 / (1 + x^2))
grad_input[i] = grad_output[i] * static_cast<O>(1.0 / (1.0 + output[i] * output[i]));
}
}
// Kernels registration to implementation entry point
REGISTRAR(AtanImpl_cpu,
{DataType::Float32},
{ProdConso::inPlaceModel, Aidge::AtanImpl_cpu_forward_kernel<float, float>, Aidge::AtanImpl_cpu_backward_kernel<float, float, float>});
REGISTRAR(AtanImpl_cpu,
{DataType::Float64},
{ProdConso::inPlaceModel, Aidge::AtanImpl_cpu_forward_kernel<double, double>, Aidge::AtanImpl_cpu_backward_kernel<double, double, double>});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_ATANIMPL_KERNELS_H_ */
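In math form, the forward and backward kernels above implement (as the in-code comment states, with L the loss, x the input and y the forward output):

y_i = \arctan(x_i), \qquad
\frac{\partial L}{\partial x_i} = \frac{\partial L}{\partial y_i} \cdot \frac{1}{1 + x_i^{2}}

Note that the backward kernel is handed the stored forward output (output_) rather than the original input, so the 1/(1 + (.)^2) factor is evaluated from those stored values.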
@@ -29,7 +29,7 @@ using BatchNorm2D_Op = BatchNorm_Op<2>;
using BatchNormImpl2D_cpu = OperatorImpl_cpu<BatchNorm_Op<2>,
    void(float,
        float,
        const std::vector<DimSize_t> &,
        const void *,
        const void *,
        const void *,
......
@@ -38,7 +38,7 @@ namespace Aidge {
 * @param output_ Output Tensor.
 */
template <class I, class P, class O>
void BatchNormImpl2D_cpu_forward_kernel(float epsilon, float momentum, const std::vector<DimSize_t> &dims,
                                       const void *input_, const void *scale_, const void *shift_, void *batchMean_, void *batchVar_, void *output_, const bool freeze) {
    // FIXME: missing convolution attributes as arguments
    const I *input = static_cast<const I *>(input_);
@@ -49,9 +49,8 @@ void BatchNormImpl2D_cpu_forward_kernel(float epsilon, float momentum, const std
    O *output = static_cast<O *>(output_);

    const DimSize_t nbBatch = dims[0];
    const DimSize_t nbChannels = (dims.size() > 1) ? dims[1] : 1;
    const DimSize_t featureMapSize = (dims.size() > 2) ? std::accumulate(dims.begin() + 2, dims.end(), 1, std::multiplies<DimSize_t>()) : 1;

    if ((freeze == true) || (momentum == 0.0f)) {
        for (std::size_t batch = 0; batch < nbBatch; ++batch) {
......
@@ -24,13 +24,13 @@ namespace Aidge {
// Operator implementation entry point for the backend
using BitShiftImpl_cpu = OperatorImpl_cpu<BitShift_Op,
    void(const BitShift_Op::BitShiftDirection,
        std::vector<std::size_t>,
        std::vector<std::size_t>,
        const std::vector<std::size_t>&,
        const void*,
        const void*,
        void*)>;

// Implementation entry point registration to Operator
REGISTRAR(BitShift_Op,"cpu",Aidge::BitShiftImpl_cpu::create);
} // namespace Aidge
......
@@ -12,47 +12,150 @@
#ifndef AIDGE_CPU_OPERATOR_BITSHIFTIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_BITSHIFTIMPL_KERNELS_H_

#include <cstdint>  // std::int32_t, std::int64_t
#include <cstddef>  // std::size_t

#include "aidge/backend/cpu/data/Broadcasting.hpp"
#include "aidge/backend/cpu/operator/BitShiftImpl.hpp"
#include "aidge/operator/BitShift.hpp"
#include "aidge/utils/Registrar.hpp"
namespace {
// suppose values are contiguous in memory
template <class I1, class I2, class O>
void bitshift_contiguous_arrays(
const Aidge::BitShift_Op::BitShiftDirection direction,
const std::size_t input1size,
const std::size_t input2size,
const std::size_t output1size,
const I1* input_1,
const I2* input_2,
O* output)
{
if(direction == Aidge::BitShift_Op::BitShiftDirection::right) {
for (std::size_t i = 0; i < output1size; ++i) {
const std::size_t idx1 = (input1size != 1) ? i : 0;
const std::size_t idx2 = (input2size != 1) ? i : 0;
output[i]= input_1[idx1] >> input_2[idx2];
}
} else {
for (std::size_t i = 0; i < output1size; ++i) {
const std::size_t idx1 = (input1size != 1) ? i : 0;
const std::size_t idx2 = (input2size != 1) ? i : 0;
output[i] = input_1[idx1] << input_2[idx2];
}
}
}
}
namespace Aidge {

template <class I1, class I2, class O>
void BitShiftImpl_cpu_forward_kernel(
    const BitShift_Op::BitShiftDirection direction,
    std::vector<std::size_t> dims0,
    std::vector<std::size_t> dims1,
    const std::vector<std::size_t>& outputDims,
    const void* input0_,
    const void* input1_,
    void* output_
    ) {

    const I1* input_0 = static_cast<const I1*>(input0_);
    const I2* input_1 = static_cast<const I2*>(input1_);
    O* output = static_cast<O*>(output_);

    // [5,2,1,7] & [2,6,7]
    // 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7]
    // 2. Find the highest equal dimension -> 3
    //    Exception: if the first diverging dimension is the last one, then -> 4 (dims.size())
    // 3. Compute the highest number of contiguous data -> 7
    // 4. Compute stride and offset step for the broadcast mechanism
    // 5. Call a simple kernel

    // ## Compute compatible input dimensions
    // special case for equal dimensions, the kernel is called with the entire arrays at once
    if (dims0 == dims1) {
const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
bitshift_contiguous_arrays(direction, input0_contiguous_size, input0_contiguous_size, input0_contiguous_size, input_0, input_1, output);
return;
}
// set dimensions to be of equal size by filling the smallest one with ones.
if (dims0.size() > dims1.size()) {
dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1));
}
else if (dims1.size() > dims0.size()) {
dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1));
}
const std::size_t nbDims = dims0.size();
// Find the highest equal dimension
// std::size_t contiguousIdx = nbDims - 1;
std::size_t contiguousIdx = nbDims;
while (contiguousIdx-- > 0) {
// for (; contiguousIdx+1 > 0; --contiguousIdx) {
if (dims0[contiguousIdx] != dims1[contiguousIdx]) {
if (contiguousIdx == (nbDims -1)) { // last dimensions of one of the input Tensor are of size 1
const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1;
while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) {
--contiguousIdx;
}
}
break;
        }
    }
    ++contiguousIdx;

// Compute the highest number of contiguous data for each Tensor
const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
// initialize strides to iterate through data because of broadcasting
std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx);
if (contiguousIdx > 0) {
stride_post0[contiguousIdx - 1] = 1;
stride_post1[contiguousIdx - 1] = 1;
for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) {
stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]);
stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]);
}
for (std::size_t i = 0; i != contiguousIdx; ++i) {
stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1;
stride_step1[i] = (dims1[i] == 1) ? 1 - stride_post1[i] : 1;
}
}
// variables for arrays offsets
std::size_t offsetIn0 = 0;
std::size_t offsetIn1 = 0;
std::size_t offsetOut = 0;
std::size_t dim = contiguousIdx - 1;
const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>());
for (std::size_t stack = 0; stack < nbStacks;) {
bitshift_contiguous_arrays<I1,I2,O>(direction, input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
input_0 + offsetIn0*input0_contiguous_size,
input_1 + offsetIn1*input1_contiguous_size,
output + offsetOut*output_contiguous_size);
if (++stack < nbStacks) {
std::size_t tmp_stack = stack;
while(tmp_stack % outputDims[dim] == 0) {
tmp_stack /= outputDims[dim];
dim--;
}
offsetIn0 += stride_step0[dim];
offsetIn1 += stride_step1[dim];
++offsetOut;
dim = contiguousIdx - 1;
        }
    }
}
......
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_CLIPIMPL_H_
#define AIDGE_CPU_OPERATOR_CLIPIMPL_H_
#include <cstddef> // std::size_t
#include <memory>
#include <tuple> // std::tuple
#include <vector>
#include <algorithm>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/Clip.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
namespace Aidge {
// Operator implementation entry point for the backend
using ClipImpl_cpu = OperatorImpl_cpu<Clip_Op,
void(float, //Forward Types
float,
const void*,
const std::size_t,
void*),
void(float,//Backward Types
float,
const std::size_t,
const void*,
const void*,
void*)>;
REGISTRAR(Clip_Op,"cpu",Aidge::ClipImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_CLIPIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_CLIPIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_CLIPIMPL_KERNELS_H_
#include "aidge/utils/Registrar.hpp"
#include "aidge/backend/cpu/operator/ClipImpl.hpp"
namespace Aidge {
template <class I, class O>
void ClipImpl_cpu_forward_kernel(
float min_,
float max_,
const void* input_,
const std::size_t length,
void* output_)
{
const I* input = static_cast<const I*>(input_);
O* output = static_cast<O*>(output_);
for (std::size_t i = 0; i < length; ++i) {
output[i] = std::min(std::max(static_cast<float>(input[i]), min_), max_);
}
}
template <class I, class GI, class GO>
void ClipImpl_cpu_backward_kernel(
float min_,
float max_,
const std::size_t length,
const void* input_,
const void* grad_output_,
void* grad_input_)
{
const I* input = static_cast<const I*>(input_);
const GO* grad_output = static_cast<const GO*>(grad_output_);
GI* grad_input = static_cast<GI*>(grad_input_);
for (std::size_t i = 0; i < length; ++i) {
grad_input[i] = ((input[i] > min_) && (input[i] < max_)) ? grad_output[i] : 0;
}
}
REGISTRAR(ClipImpl_cpu,
{DataType::Float32},
{ProdConso::inPlaceModel,
Aidge::ClipImpl_cpu_forward_kernel<float,float>,
Aidge::ClipImpl_cpu_backward_kernel<float,float,float>});
REGISTRAR(ClipImpl_cpu,
{DataType::Float64},
{ProdConso::inPlaceModel,
Aidge::ClipImpl_cpu_forward_kernel<double,double>,
Aidge::ClipImpl_cpu_backward_kernel<double,double,double>});
REGISTRAR(ClipImpl_cpu,
{DataType::Int32},
{ProdConso::inPlaceModel,
Aidge::ClipImpl_cpu_forward_kernel<std::int32_t,std::int32_t>,
Aidge::ClipImpl_cpu_backward_kernel<std::int32_t,std::int32_t,std::int32_t>});
REGISTRAR(ClipImpl_cpu,
{DataType::Int64},
{ProdConso::inPlaceModel,
Aidge::ClipImpl_cpu_forward_kernel<std::int64_t,std::int64_t>,
Aidge::ClipImpl_cpu_backward_kernel<std::int64_t,std::int64_t,std::int64_t>});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_CLIPIMPL_KERNELS_H_ */
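A minimal sketch of calling the clip forward kernel above directly, assuming the header added by this commit is available on the include path as aidge/backend/cpu/operator/ClipImpl_kernels.hpp (in normal use the registrar dispatches it through the Clip_Op implementation instead):

#include <iostream>
#include <vector>
#include "aidge/backend/cpu/operator/ClipImpl_kernels.hpp"  // assumed include path

int main() {
    const std::vector<float> in{-2.0f, 0.5f, 3.0f};
    std::vector<float> out(in.size());
    // clamp every element to the [-1, 1] range
    Aidge::ClipImpl_cpu_forward_kernel<float, float>(-1.0f, 1.0f, in.data(), in.size(), out.data());
    for (float v : out) { std::cout << v << ' '; }  // -1 0.5 1
}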
@@ -137,6 +137,7 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
    const std::size_t oxSize =
            static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - dilated_kernel_x + strideDims[0]) /
                            static_cast<float>(strideDims[0])));

    // output W size
    const DimSize_t dilated_kernel_y = dilationDims[1]*(kernelDims[1] - 1) + 1;
    const std::size_t oySize =
@@ -148,54 +149,106 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
    // input  (batch, ch, Xin, Yin)
    // weight (outCh, ch, kernelX, kernelY)
    // does not take Dilation attribute into account
    const std::size_t outChannels_s = oxSize * oySize;

    if (dilated_kernel_x ==3 && dilated_kernel_y == 3) {
        for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
            for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {

                B biasVal = (biases != nullptr) ? biases[ch] : B(0);

                std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3];
                const std::size_t wIndex = ch * 9;

                if (strideDims[0] == 1 && strideDims[1]==1) {
                    for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex-=inputDims[3]) {
                        for (std::size_t oy = 0; oy < oySize; ++oy) {
                            output[oIndex + oy] = biasVal + weights[wIndex+0]*input[iIndex+oy]+weights[wIndex+1]*input[iIndex+oy+1]+weights[wIndex+2]*input[iIndex+oy+2];
                        }
                        iIndex+=inputDims[3];
                        for (std::size_t oy = 0; oy < oySize; ++oy) {
                            output[oIndex + oy] += weights[wIndex+3]*input[iIndex+oy]+weights[wIndex+4]*input[iIndex+oy+1]+weights[wIndex+5]*input[iIndex+oy+2];
                        }
                        iIndex+=inputDims[3];
                        for (std::size_t oy = 0; oy < oySize; ++oy) {
                            output[oIndex + oy] += weights[wIndex+6]*input[iIndex+oy]+weights[wIndex+7]*input[iIndex+oy+1]+weights[wIndex+8]*input[iIndex+oy+2];
                        }
                    }
                } else {
                    for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex-=strideDims[0]*inputDims[3]) {
                        for (std::size_t oy = 0; oy < oySize; ++oy) {
                            output[oIndex + oy] += weights[wIndex+0]*input[iIndex+oy]+weights[wIndex+1]*input[iIndex+oy+strideDims[0]]+weights[wIndex+2]*input[iIndex+oy+strideDims[0]*2];
                        }
                        iIndex+=strideDims[0]*inputDims[3];
                        for (std::size_t oy = 0; oy < oySize; ++oy) {
                            output[oIndex + oy] += weights[wIndex+3]*input[iIndex+oy]+weights[wIndex+4]*input[iIndex+oy+strideDims[0]]+weights[wIndex+5]*input[iIndex+oy+strideDims[0]*2];
                        }
                        iIndex+=strideDims[0]*inputDims[3];
                        for (std::size_t oy = 0; oy < oySize; ++oy) {
                            output[oIndex + oy] += weights[wIndex+6]*input[iIndex+oy]+weights[wIndex+7]*input[iIndex+oy+strideDims[0]]+weights[wIndex+8]*input[iIndex+oy+strideDims[0]*2];
}
}
}
output += outChannels_s;
}
}
} else if (dilated_kernel_x == 1 && dilated_kernel_y == 1) {
std::size_t index = 0;
for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {
B biasVal = (biases != nullptr) ? biases[ch] : B(0);
const std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3];
const std::size_t wIndex = ch;
if (strideDims[0] == 1 && strideDims[1] == 1) {
for (; index < iIndex + oxSize*oySize; ++index) {
output[index] = biasVal + weights[wIndex] * input[index];
}
} else {
std::size_t oIndex = (ch + batch*inputDims[1]) * oxSize * oySize;
for (std::size_t ox = 0; ox < oxSize; ++ox, oIndex+=oySize) {
index = iIndex + strideDims[0]*inputDims[3];
for (std::size_t oy = 0, iy = 0; oy < oySize; ++oy, iy+=strideDims[1]) {
output[oIndex + oy] += weights[wIndex]*input[index+iy];
}
}
}
}
}
} else {
for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {
B biasVal = (biases != nullptr) ? biases[ch] : B(0);
std::fill(output, output+outChannels_s, biasVal);
const std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3];
const std::size_t wIndex = ch * kernelDims[0] * kernelDims[1];
for (std::size_t ox = 0; ox < oxSize; ++ox) {
for (std::size_t oy = 0; oy < oySize; ++oy) {
const std::size_t oIndexFull = ox*oySize + oy;
const std::size_t ix = ox * strideDims[0];
const std::size_t iy = oy * strideDims[1];
for (std::size_t sx = 0; sx*dilationDims[0] < dilated_kernel_x; ++sx) {
for (std::size_t sy = 0; sy*dilationDims[1] < dilated_kernel_y; ++sy) {
                                output[oIndexFull] += weights[wIndex + sx*kernelDims[1] + sy] *
                                                        input[iIndex + static_cast<std::size_t>(ix + sx*dilationDims[0])*inputDims[3] + static_cast<std::size_t>(iy + sy*dilationDims[1])];
                            }
                        }
                    }
                }
                output += outChannels_s;
            }
        }
    }
}

// Kernels registration to implementation entry point
REGISTRAR(ConvDepthWiseImpl2D_cpu,
    {{DataType::Any, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}},
......
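Both convolution hunks (the depthwise one above and the standard one below) derive the output spatial size for the no-padding case from the same expression, written here for the x axis (the y axis is analogous):

o_x = \left\lfloor \frac{H_{\mathrm{in}} - \bigl(d_x (k_x - 1) + 1\bigr) + s_x}{s_x} \right\rfloor

For example, with H_in = 32, k_x = 3, d_x = 1 and s_x = 2 (illustrative numbers, not taken from the diff) this gives floor((32 - 3 + 2) / 2) = 15 output rows.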
@@ -141,15 +141,15 @@ void ConvImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
    O *output = static_cast<O *>(output_);

    // output H size
    const DimSize_t dilated_kernel_x = dilationDims[0]*(kernelDims[0] - 1) + 1;
    const std::size_t oxSize =
            static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - dilated_kernel_x + strideDims[0]) /
                            static_cast<float>(strideDims[0])));
    // output W size
    const DimSize_t dilated_kernel_y = dilationDims[1]*(kernelDims[1] - 1) + 1;
    const std::size_t oySize =
            static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[3] - dilated_kernel_y + strideDims[1]) /
                            static_cast<float>(strideDims[1])));

    // TODO: kernel computation
@@ -157,57 +157,107 @@ void ConvImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
    // input  (batch, inCh, Xin, Yin)
    // weight (outCh, inCh, kernelX, kernelY)
    // does not take Dilation attribute into account
    const std::size_t outChannels_s = oxSize * oySize;

    if (dilated_kernel_x == 3 && dilated_kernel_y == 3) {
        for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
            for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
                // If bias = nullptr, set B(0)
                B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
                std::fill(output, output+outChannels_s, biasVal);
                for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
                    std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3];
                    const std::size_t wIndex = (inCh + outCh*inputDims[1]) * 9;
                    if (strideDims[0] == 1 && strideDims[1]==1) {
                        for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex-=inputDims[3]) {
                            for (std::size_t oy = 0; oy < oySize; ++oy) {
                                output[oIndex + oy] += weights[wIndex+0]*input[iIndex+oy]+weights[wIndex+1]*input[iIndex+oy+1]+weights[wIndex+2]*input[iIndex+oy+2];
                            }
                            iIndex+=inputDims[3];
                            for (std::size_t oy = 0; oy < oySize; ++oy) {
                                output[oIndex + oy] += weights[wIndex+3]*input[iIndex+oy]+weights[wIndex+4]*input[iIndex+oy+1]+weights[wIndex+5]*input[iIndex+oy+2];
                            }
                            iIndex+=inputDims[3];
                            for (std::size_t oy = 0; oy < oySize; ++oy) {
                                output[oIndex + oy] += weights[wIndex+6]*input[iIndex+oy]+weights[wIndex+7]*input[iIndex+oy+1]+weights[wIndex+8]*input[iIndex+oy+2];
                            }
                        }
                    } else {
                        for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex-=strideDims[0]*inputDims[3]) {
                            for (std::size_t oy = 0; oy < oySize; ++oy) {
                                output[oIndex + oy] += weights[wIndex+0]*input[iIndex+oy]+weights[wIndex+1]*input[iIndex+oy+strideDims[0]]+weights[wIndex+2]*input[iIndex+oy+strideDims[0]*2];
                            }
                            iIndex+=strideDims[0]*inputDims[3];
                            for (std::size_t oy = 0; oy < oySize; ++oy) {
                                output[oIndex + oy] += weights[wIndex+3]*input[iIndex+oy]+weights[wIndex+4]*input[iIndex+oy+strideDims[0]]+weights[wIndex+5]*input[iIndex+oy+strideDims[0]*2];
                            }
                            iIndex+=strideDims[0]*inputDims[3];
                            for (std::size_t oy = 0; oy < oySize; ++oy) {
                                output[oIndex + oy] += weights[wIndex+6]*input[iIndex+oy]+weights[wIndex+7]*input[iIndex+oy+strideDims[0]]+weights[wIndex+8]*input[iIndex+oy+strideDims[0]*2];
                            }
                        }
                    }
                }
                output += outChannels_s;
            }
        }
} else if (dilated_kernel_x == 1 && dilated_kernel_y == 1) {
for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
// If bias = nullptr, set B(0)
B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
std::fill(output, output+outChannels_s, biasVal);
for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3];
const std::size_t wIndex = (inCh + outCh*inputDims[1]);
if (strideDims[0] == 1 && strideDims[1] == 1) {
for (std::size_t oIndex = 0; oIndex < oxSize*oySize; ++oIndex, ++iIndex) {
output[oIndex] += weights[wIndex] * input[iIndex];
}
} else {
for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex+=inputDims[3]*strideDims[0]) {
for (std::size_t oy = 0, iy = 0; oy < oySize; ++oy, iy+=strideDims[1]) {
output[oIndex + oy] += weights[wIndex+0]*input[iIndex+iy];
}
}
}
}
output += outChannels_s;
}
}
} else {
for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
// If bias = nullptr, set B(0)
B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
std::fill(output, output+outChannels_s, biasVal);
for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
std::size_t iIndex_channel = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3];
const std::size_t wIndex = (inCh + outCh*inputDims[1]) * kernelDims[0] * kernelDims[1];
// loop over each output line
for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex_channel+=inputDims[3]*strideDims[0]) {
// loop over associated input line
for (std::size_t ky = 0, ix = 0; ky < kernelDims[0]; ++ky, ix += inputDims[3]*dilationDims[0]) {
// loop over the entire line
for (std::size_t oy = 0, iy = 0; oy < oySize; ++oy, iy+=strideDims[1]) {
const std::size_t iIndex = iIndex_channel + ix + iy;
// loop over elements associated with one output
for (std::size_t kx = 0; kx < kernelDims[0]; ++kx) {
output[oIndex + oy] += weights[wIndex+kernelDims[0]*ky+kx]*input[iIndex+kx*dilationDims[1]];
                            }
                        }
                    }
                }
            }
            output += outChannels_s;
        }
    }
}
}

// Kernels registration to implementation entry point
REGISTRAR(ConvImpl2D_cpu,
    {{DataType::Any, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}},
......
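For reference, the general (non-specialised) branch of the convolution kernel above accumulates the usual no-padding convolution sum; the 3x3 and 1x1 branches are unrolled special cases of the same expression:

\mathrm{out}[n, o, x, y] = b[o] + \sum_{c=0}^{C_{\mathrm{in}}-1} \sum_{k_x=0}^{K_x-1} \sum_{k_y=0}^{K_y-1}
    w[o, c, k_x, k_y] \cdot \mathrm{in}\bigl[n, c,\; x\, s_x + k_x d_x,\; y\, s_y + k_y d_y\bigr]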