Compare revisions

Commits on Source (254)
Showing 732 additions and 117 deletions
......@@ -4,6 +4,7 @@
# C++ Build
build*/
install*/
include/aidge/backend/cpu_version.h
# VSCode
.vscode
......
......@@ -12,19 +12,14 @@ stages:
- deploy
include:
- project: 'eclipse/aidge/gitlab_shared_files'
ref: 'main'
file:
# choose which jobs to run by including the corresponding files.
file: # choose which jobs to run by including the corresponding files.
- '.gitlab/ci/ubuntu_cpp.gitlab-ci.yml'
- '.gitlab/ci/ubuntu_python.gitlab-ci.yml'
- '.gitlab/ci/release/cibuildwheel_ubuntu.gitlab-ci.yml'
- '.gitlab/ci/windows_cpp.gitlab-ci.yml'
- '.gitlab/ci/windows_python.gitlab-ci.yml'
- '.gitlab/ci/release/cibuildwheel_windows.gitlab-ci.yml'
# Version 0.5.0 (January 31, 2025)
# Version 0.4.0 (December 6, 2024)
# Version 0.2.2 (May 14, 2024)
* Remove implementation for Operators solely handling memory and format
......
cmake_minimum_required(VERSION 3.18)
set(CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
file(STRINGS "${CMAKE_SOURCE_DIR}/version.txt" version)
# Parse version.txt to retrieve Major, Minor and Patch
string(REGEX MATCH "([0-9]+)\\.([0-9]+)\\.([0-9]+)" _ MATCHES ${version})
set(PROJECT_VERSION_MAJOR ${CMAKE_MATCH_1})
set(PROJECT_VERSION_MINOR ${CMAKE_MATCH_2})
set(PROJECT_VERSION_PATCH ${CMAKE_MATCH_3})
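# e.g. a version.txt containing "0.5.0" yields PROJECT_VERSION_MAJOR=0, PROJECT_VERSION_MINOR=5, PROJECT_VERSION_PATCH=0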
project(aidge_backend_cpu
VERSION ${version}
DESCRIPTION "CPU implementations of the operators of aidge framework."
LANGUAGES CXX)
message(STATUS "Project name: ${CMAKE_PROJECT_NAME}")
message(STATUS "Project version: ${version}")
add_definitions(-DPROJECT_VERSION="${version}")
# Retrieve latest git commit
execute_process(
COMMAND git rev-parse --short HEAD
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
......@@ -19,8 +25,13 @@ execute_process(
OUTPUT_STRIP_TRAILING_WHITESPACE
ERROR_QUIET
)
message(STATUS "Project name: ${CMAKE_PROJECT_NAME}")
message(STATUS "Project version: ${version}")
message(STATUS "Latest git commit: ${GIT_COMMIT_HASH}")
add_definitions(-DGIT_COMMIT_HASH="${GIT_COMMIT_HASH}")
# helper for LSP users
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
# Note : project name is ${CMAKE_PROJECT_NAME} and python module name is also ${CMAKE_PROJECT_NAME}
set(module_name _${CMAKE_PROJECT_NAME}) # target name
......@@ -53,6 +64,16 @@ if(NOT $ENV{AIDGE_INSTALL} STREQUAL "")
endif()
find_package(aidge_core REQUIRED)
find_package(OpenMP)
find_package(OpenSSL QUIET)
if(OpenSSL_FOUND)
message(STATUS "OpenSSL found: ${OPENSSL_VERSION}")
add_definitions(-DWITH_OPENSSL)
else()
message(WARNING "OpenSSL not found, SHA256 will not be available.")
endif()
##############################################
# Create target and set properties
file(GLOB_RECURSE src_files "src/*.cpp")
......@@ -61,10 +82,22 @@ file(GLOB_RECURSE inc_files "include/*.hpp")
add_library(${module_name} ${src_files} ${inc_files})
target_link_libraries(${module_name}
PRIVATE
fmt::fmt
PUBLIC
_aidge_core # _ is added because we link the exported target and not the project
)
if(OpenMP_CXX_FOUND)
target_link_libraries(${module_name} PRIVATE OpenMP::OpenMP_CXX)
set(AIDGE_REQUIRES_OPENMP TRUE)
endif()
# Add definition _USE_MATH_DEFINES to enable math constant definitions from math.h/cmath.
if (WIN32)
target_compile_definitions(${module_name} PRIVATE _USE_MATH_DEFINES)
endif()
#Set target properties
set_property(TARGET ${module_name} PROPERTY POSITION_INDEPENDENT_CODE ON)
......@@ -99,6 +132,12 @@ target_include_directories(${module_name}
${CMAKE_CURRENT_SOURCE_DIR}/src
)
set(AIDGE_REQUIRES_OPENSSL FALSE)
if(OpenSSL_FOUND)
target_link_libraries(${module_name} PRIVATE OpenSSL::SSL OpenSSL::Crypto)
set(AIDGE_REQUIRES_OPENSSL TRUE)
endif()
target_compile_features(${module_name} PRIVATE cxx_std_14)
target_compile_options(${module_name} PRIVATE
......@@ -112,6 +151,13 @@ if(CMAKE_COMPILER_IS_GNUCXX AND COVERAGE)
append_coverage_compiler_flags()
endif()
message(STATUS "Creating ${CMAKE_CURRENT_SOURCE_DIR}/include/aidge/backend/cpu_version.h")
# Generate version.h file from config file version.h.in
configure_file(
"${CMAKE_CURRENT_SOURCE_DIR}/include/aidge/backend/version.h.in"
"${CMAKE_CURRENT_SOURCE_DIR}/include/aidge/backend/cpu_version.h"
)
##############################################
# Installation instructions
include(GNUInstallDirs)
......
......@@ -2,6 +2,14 @@
include(CMakeFindDependencyMacro)
find_dependency(aidge_core)
set(AIDGE_REQUIRES_OPENMP @AIDGE_REQUIRES_OPENMP@)
if (AIDGE_REQUIRES_OPENMP)
find_dependency(OpenMP)
endif()
set(AIDGE_REQUIRES_OPENSSL @AIDGE_REQUIRES_OPENSSL@)
if (AIDGE_REQUIRES_OPENSSL)
find_dependency(OpenSSL)
endif()
include(CMakeFindDependencyMacro)
......
import aidge_core
from aidge_backend_cpu.aidge_backend_cpu import * # import so generated by PyBind
from ._version import *
from . import benchmark
import time
import numpy as np
import aidge_core
def prepare_model_scheduler_inputs(model: aidge_core.GraphView, input_data: list[str, np.ndarray]) -> tuple[aidge_core.GraphView, aidge_core.SequentialScheduler]:
# update model and inputs backend
model.set_backend("cpu")
ordered_inputs = [aidge_core.Tensor(i[1]) for i in input_data]
for ordered_input in ordered_inputs:
ordered_input.set_backend("cpu")
scheduler = aidge_core.SequentialScheduler(model)
scheduler.generate_scheduling()
return model, scheduler, ordered_inputs
def measure_inference_time(model: aidge_core.GraphView, input_data: list[str, np.ndarray], nb_warmup: int = 10, nb_iterations: int = 50) -> list[float]:
model, scheduler, ordered_inputs = prepare_model_scheduler_inputs(model, input_data)
timings = []
# Warm-up runs.
for i in range(nb_warmup + nb_iterations):
if i < nb_warmup:
scheduler.forward(forward_dims=False, data=ordered_inputs)
else:
start = time.process_time()
scheduler.forward(forward_dims=False, data=ordered_inputs)
end = time.process_time()
timings.append((end - start))
return timings
def compute_output(model: aidge_core.GraphView, input_data: list[str, np.ndarray]) -> list[np.ndarray]:
model, scheduler, ordered_inputs = prepare_model_scheduler_inputs(model, input_data)
scheduler.forward(forward_dims=False, data=ordered_inputs)
return [np.array(t[0].get_operator().get_output(t[1])) for t in model.get_ordered_outputs()]
\ No newline at end of file
......@@ -13,16 +13,17 @@ class test_scheduler(unittest.TestCase):
pass
def test_relu_forward(self):
values = np.arange(6) - 3
input_node = aidge_core.Producer(aidge_core.Tensor(values), "Input")
t = aidge_core.Tensor(np.arange(6, dtype=np.int32) - 3)
input_node = aidge_core.Producer(t)
relu = aidge_core.ReLU()
input_node.add_child(relu)
gv = aidge_core.GraphView()
gv.add(relu)
gv.add(input_node)
input_node.add_child(relu)
gv.set_datatype(aidge_core.dtype.int32)
gv.set_backend("cpu")
......@@ -34,7 +35,7 @@ class test_scheduler(unittest.TestCase):
out_tensor = relu.get_operator().get_output(0)
expected_out = [0,0,0,0,1,2]
for i in range(len(expected_out)):
self.assertEqual(expected_out[i], out_tensor[i])
self.assertEqual(expected_out[i], out_tensor[i], f"On idx {i}")
def test_sequential_scheduling(self):
input_data = np.array([0]).astype(np.float32)
......@@ -56,9 +57,9 @@ class test_scheduler(unittest.TestCase):
scheduler = aidge_core.SequentialScheduler(graph_view)
scheduler.generate_scheduling()
self.assertEqual(len(scheduler.get_static_scheduling()), 10)
self.assertEqual(len(scheduler.get_sequential_static_scheduling()), 10)
# Do not care about the order of execution of the producers
self.assertListEqual([i.name() for i in scheduler.get_static_scheduling()[-3:]], EXPECTED_SCHEDULE)
self.assertListEqual([i.name() for i in scheduler.get_sequential_static_scheduling()[-3:]], EXPECTED_SCHEDULE)
def test_parallel_scheduling(self):
......@@ -69,7 +70,7 @@ class test_scheduler(unittest.TestCase):
aidge_core.Producer(input_tensor, "X"),
aidge_core.FC(1, 50, name='0'),
aidge_core.parallel([aidge_core.FC(50, 50, name='1'), aidge_core.FC(50, 50, name='3')]),
aidge_core.Add(2, name='2'),
aidge_core.Add(name='2'),
])
EXPECTED_SCHEDULE = [['0', '1', '3', '2'], ['0', '3', '1', '2']] # Both schedules are valid!
......@@ -82,9 +83,9 @@ class test_scheduler(unittest.TestCase):
scheduler = aidge_core.SequentialScheduler(graph_view)
scheduler.generate_scheduling()
self.assertEqual(len(scheduler.get_static_scheduling()), 11)
self.assertEqual(len(scheduler.get_sequential_static_scheduling()), 11)
# Do not care about the order of execution of the producers
self.assertTrue([i.name() for i in scheduler.get_static_scheduling()[-4:]] in EXPECTED_SCHEDULE)
self.assertTrue([i.name() for i in scheduler.get_sequential_static_scheduling()[-4:]] in EXPECTED_SCHEDULE)
if __name__ == '__main__':
unittest.main()
function(generate_python_binding pybind_module_name target_to_bind)
find_package(Python COMPONENTS Interpreter Development.Module)
Include(FetchContent)
set(PYBIND_VERSION v2.10.4)
set(PYBIND_VERSION v2.13.6)
message(STATUS "Retrieving pybind ${PYBIND_VERSION} from git")
FetchContent_Declare(
......
......@@ -12,40 +12,58 @@
#ifndef AIDGE_CPU_IMPORTS_H_
#define AIDGE_CPU_IMPORTS_H_
#include "aidge/backend/cpu_version.h"
#include "aidge/backend/cpu/operator/AbsImpl.hpp"
#include "aidge/backend/cpu/operator/AddImpl.hpp"
#include "aidge/backend/cpu/operator/AndImpl.hpp"
#include "aidge/backend/cpu/operator/AtanImpl.hpp"
#include "aidge/backend/cpu/operator/ArgMaxImpl.hpp"
#include "aidge/backend/cpu/operator/AvgPoolingImpl.hpp"
#include "aidge/backend/cpu/operator/MaxPoolingImpl.hpp"
#include "aidge/backend/cpu/operator/BatchNormImpl.hpp"
#include "aidge/backend/cpu/operator/BitShiftImpl.hpp"
#include "aidge/backend/cpu/operator/ClipImpl.hpp"
#include "aidge/backend/cpu/operator/ConvDepthWiseImpl.hpp"
#include "aidge/backend/cpu/operator/ConvImpl.hpp"
#include "aidge/backend/cpu/operator/ConvTransposeImpl.hpp"
#include "aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp"
#include "aidge/backend/cpu/operator/CryptoHashImpl.hpp"
#include "aidge/backend/cpu/operator/DivImpl.hpp"
#include "aidge/backend/cpu/operator/DropoutImpl.hpp"
#include "aidge/backend/cpu/operator/EqualImpl.hpp"
#include "aidge/backend/cpu/operator/ErfImpl.hpp"
#include "aidge/backend/cpu/operator/ExpandImpl.hpp"
#include "aidge/backend/cpu/operator/FCImpl.hpp"
#include "aidge/backend/cpu/operator/FoldImpl.hpp"
#include "aidge/backend/cpu/operator/GlobalAveragePoolingImpl.hpp"
#include "aidge/backend/cpu/operator/HardmaxImpl.hpp"
#include "aidge/backend/cpu/operator/HeavisideImpl.hpp"
#include "aidge/backend/cpu/operator/LRNImpl.hpp"
#include "aidge/backend/cpu/operator/LeakyReLUImpl.hpp"
#include "aidge/backend/cpu/operator/LnImpl.hpp"
#include "aidge/backend/cpu/operator/MatMulImpl.hpp"
#include "aidge/backend/cpu/operator/MaxPoolingImpl.hpp"
#include "aidge/backend/cpu/operator/ModImpl.hpp"
#include "aidge/backend/cpu/operator/MulImpl.hpp"
#include "aidge/backend/cpu/operator/PadImpl.hpp"
#include "aidge/backend/cpu/operator/PaddedConvImpl.hpp"
#include "aidge/backend/cpu/operator/PowImpl.hpp"
#include "aidge/backend/cpu/operator/ReduceMeanImpl.hpp"
#include "aidge/backend/cpu/operator/ReduceSumImpl.hpp"
#include "aidge/backend/cpu/operator/ResizeImpl.hpp"
#include "aidge/backend/cpu/operator/ReLUImpl.hpp"
#include "aidge/backend/cpu/operator/ScalingImpl.hpp"
#include "aidge/backend/cpu/operator/RoundImpl.hpp"
#include "aidge/backend/cpu/operator/SigmoidImpl.hpp"
#include "aidge/backend/cpu/operator/SqrtImpl.hpp"
#include "aidge/backend/cpu/operator/SliceImpl.hpp"
#include "aidge/backend/cpu/operator/SoftmaxImpl.hpp"
#include "aidge/backend/cpu/operator/SubImpl.hpp"
#include "aidge/backend/cpu/operator/TopKImpl.hpp"
#include "aidge/backend/cpu/operator/TanhImpl.hpp"
#include "aidge/backend/cpu/operator/WeightInterleavedImpl.hpp"
#include "aidge/backend/cpu/data/TensorImpl.hpp"
#endif /* AIDGE_CPU_IMPORTS_H_ */
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_DATA_INTERPOLATION_H_
#define AIDGE_CPU_DATA_INTERPOLATION_H_
#include <vector>
#include <aidge/data/Interpolation.hpp>
#include <aidge/utils/Types.h>
namespace Aidge {
class InterpolationCPU : public Interpolation {
public:
/**
* @brief Interpolates values given via input in the given mode.
*
* Values are contiguously arranged in a "square" shape around the point to
* interpolate; the extent of this neighbourhood depends on the interpolation
* mode. The point that will be interpolated is located right in the
* middle of all points.
* Immediate neighbours :
* 1D interp : 2D interp :
* . . . . . .
* . . 1 2 . . . . . . . .
* . . 1 2 . .
* . . 3 4 . .
* . . . . . .
* . . . . . .
*
* 2 neighbours :
* 1D interp : 2D interp :
* . . . . . . . .
* . . . . . . . .
* . . 1 2 3 4 . . . . 1 2 3 4 . .
* . . 5 6 7 8 . .
* . . 9 10 11 12 . .
* . . 13 14 15 16 . .
* . . . . . . . .
* . . . . . . . .
*
* @param[in] originalCoords: coord of the point to interpolate in the
* original picture. These coords are generated with
* Interpolation::untransformCoords(coordsInInterpolatedTensor)
* @param[in] points : points to interpolate, arranged in a set of
* pairs ((point_coord), value) :
* [[[X1, X2, ..., XN], Xval], ...., [[A1, A2, ..., AN],Aval]].
* With :
* - N: the number of dimensions.
* - A: the number of points of the grid to interpolate.
* - All coordinates expressed in originalTensor frame.
* @param[in] interpMode: interpolation mode
* @return interpolated value
*/
template <typename T>
static T interpolate(const std::vector<float> &coordsToInterpolate,
const std::set<Point<T>> &points,
const Mode interpMode = Interpolation::Mode::Linear);
/**
* @brief performs linear interpolation on given points.
* @param[in] points: values to interpolate; since we only do an average of
* all values, their indices aren't used.
* @return interpolated value
*/
template <typename T>
static T linear(const std::vector<float> &originalCoords,
const std::set<Point<T>> &points);
/**
* @brief performs nearest interpolation on given points.
* @note it is a wrapper for linearRecurse() private method
* @param[in] coordsToInterpolate: coordinates to interpolate
* @param[in] points: points to interpolate
* @param[in] interpMode: interpolation method; must be one of the Nearest...
* modes, otherwise the function will throw an error.
* @return interpolated value
*/
template <typename T>
static T nearest(const std::vector<float> &coordsToInterpolate,
const std::set<Point<T>> &points,
const Interpolation::Mode nearestMode);
private:
/**
* @brief actual linear interpolation function.
* It will:
* - Split all points along each dimension depending on whether their coord at
* idx alongDim is above or below coordsToInterpolate, until they are
* matched 1-to-1.
* - Perform interpolation on the 2 leftover points and return the interpolated
* point to the parent call as a set of size 1.
* - Repeat until all dimensions have been interpolated.
* @param[in] coordsToInterpolate: coordinates to interpolate
* @param[in] points: points to interpolate
* @param[in] alongDim: dimension along which the points are currently being
* split.
* @return set containing the interpolated point(s)
*/
template <typename T>
static std::set<Interpolation::Point<T>>
linearRecurse(const std::vector<float> &coordsToInterpolate,
const std::set<Point<T>> &points,
const DimIdx_t alongDim = 0);
};
} // namespace Aidge
#endif // AIDGE_CPU_DATA_INTERPOLATION_H_
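A minimal usage sketch of the interpolate() entry point declared above; it assumes Point<T> is a ((coords), value) pair ordered in a std::set, as the doc comment describes:
// Hypothetical example: bilinear interpolation at the centre of a 2x2 grid.
std::set<Aidge::Interpolation::Point<float>> points = {
    {{0, 0}, 1.0f}, {{0, 1}, 2.0f}, {{1, 0}, 3.0f}, {{1, 1}, 4.0f}};
const float value = Aidge::InterpolationCPU::interpolate<float>(
    {0.5f, 0.5f},                        // coords in the original tensor frame
    points,
    Aidge::Interpolation::Mode::Linear); // expected result: 2.5f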
......@@ -20,14 +20,14 @@
namespace Aidge {
template <class I, class O>
void AbsImpl_cpu_forward_kernel(std::size_t inputLenght,
void AbsImpl_cpu_forward_kernel(std::size_t inputLength,
const void* input_,
void* output_) {
const I* input = static_cast<const I*>(input_);
O* output = static_cast<O*>(output_);
for (std::size_t i = 0; i < inputLenght; ++i) {
for (std::size_t i = 0; i < inputLength; ++i) {
output[i] = std::abs(input[i]);
}
}
......
......@@ -25,7 +25,17 @@
namespace Aidge {
// Operator implementation entry point for the backend
using AddImpl_cpu = OperatorImpl_cpu<Add_Op,
void(const std::vector<const void*>, const std::vector<std::vector<std::size_t>>&, const std::size_t, const std::vector<std::size_t>&, void*)>;
void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*, void*),
void(const std::size_t,
const std::size_t,
const std::size_t,
const std::vector<std::size_t>&,
const std::vector<std::size_t>&,
const std::vector<std::size_t>&,
const void*,
void*,
void*)
>;
// Implementation entry point registration to Operator
REGISTRAR(Add_Op, "cpu", Aidge::AddImpl_cpu::create);
......
......@@ -14,46 +14,204 @@
#include "aidge/utils/Registrar.hpp"
#include <cstdint> // std::int32_t, std::int64_t
#include <cstddef> // std::size_t
#include "aidge/backend/cpu/data/Broadcasting.hpp"
#include "aidge/backend/cpu/operator/AddImpl.hpp"
namespace Aidge {
namespace {
// suppose values are contiguous in memory
template <class I, class O>
void AddImpl_cpu_forward_kernel(const std::vector<const void*> inputs_, const std::vector<std::vector<std::size_t>>& inputDims, const std::size_t outputLength, const std::vector<std::size_t>& outDims, void* output_) {
// FIXME: missing Add attributes as arguments
std::vector<const I*> inputs;
for (const auto& input_ : inputs_) {
inputs.push_back(static_cast<const I*>(input_));
void add_contiguous_arrays(const std::size_t input1size,
const std::size_t input2size,
const std::size_t output1size,
const I* input1,
const I* input2,
O* output)
{
for (std::size_t i = 0; i < output1size; ++i)
{
const std::size_t in1_id = (input1size != 1) ? i : 0;
const std::size_t in2_id = (input2size != 1) ? i : 0;
output[i] = static_cast<O>(input1[in1_id] + input2[in2_id]);
}
}
}
template <class I, class O>
void AddImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
std::vector<std::size_t> dims1,
const std::vector<std::size_t>& outputDims,
const void* input0_,
const void* input1_,
void* output_) {
const I* input_0 = static_cast<const I*>(input0_);
const I* input_1 = static_cast<const I*>(input1_);
O* output = static_cast<O*>(output_);
for (std::size_t oIndex = 0; oIndex < outputLength; ++oIndex)
{
output[oIndex] = 0;
std::vector<size_t> indexes = getMultiDimIndices(outDims, oIndex);
for(std::size_t iIndex = 0; iIndex < inputs.size(); ++iIndex) {
std::size_t idx = getFlattenedIndex(inputDims[iIndex], indexes);
output[oIndex] += inputs[iIndex][idx];
}
}
// [5,2,1,7] & [2,6,7]
// 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7]
// 2. Find the highest equal dimension -> 3
// Exception: if the first diverging dimension is the last one, then -> 4 (dims.size())
// 3. Compute the highest number of contiguous data -> 7
// 4. Compute stride and offset step for the broadcast mechanism
// 5. Call a simple kernel
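// Worked example for [5,2,1,7] & [2,6,7]:
//   padded dims0 = [5,2,1,7], dims1 = [1,2,6,7], outputDims = [5,2,6,7]
//   contiguousIdx = 3, contiguous chunk sizes: 7 (input0), 7 (input1), 7 (output)
//   nbStacks = 5*2*6 = 60 calls to add_contiguous_arrays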
// special case for equal dimensions, the kernel is called with the entire arrays at once
if (dims0 == dims1) {
const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
for (std::size_t i = 0; i < input0_contiguous_size; ++i)
{
output[i] = static_cast<O>(input_0[i] + input_1[i]);
}
return;
}
// set dimensions to be of equal size by filling the smallest one with ones.
if (dims0.size() > dims1.size()) {
dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1));
}
else if (dims1.size() > dims0.size()) {
dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1));
}
const std::size_t nbDims = dims0.size();
// Find the highest equal dimension
// std::size_t contiguousIdx = nbDims - 1;
std::size_t contiguousIdx = nbDims;
while (contiguousIdx-- > 0) {
// for (; contiguousIdx+1 > 0; --contiguousIdx) {
if (dims0[contiguousIdx] != dims1[contiguousIdx]) {
if (contiguousIdx == (nbDims -1)) { // last dimensions of one of the input Tensor are of size 1
const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1;
while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) {
--contiguousIdx;
}
}
break;
}
}
++contiguousIdx;
// Compute the highest number of contiguous data for each Tensor
const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
// initialize strides to iterate through data because of broadcasting
std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx);
if (contiguousIdx > 0) {
stride_post0[contiguousIdx - 1] = 1;
stride_post1[contiguousIdx - 1] = 1;
for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) {
stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]);
stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]);
}
for (std::size_t i = 0; i != contiguousIdx; ++i) {
stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1;
stride_step1[i] = (dims1[i] == 1) ? 1 - stride_post1[i] : 1;
}
}
// variables for arrays offsets
std::size_t offsetIn0 = 0;
std::size_t offsetIn1 = 0;
std::size_t offsetOut = 0;
std::size_t dim = contiguousIdx - 1;
const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>());
for (std::size_t stack = 0; stack < nbStacks;) {
add_contiguous_arrays<I,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
input_0 + offsetIn0*input0_contiguous_size,
input_1 + offsetIn1*input1_contiguous_size,
output + offsetOut*output_contiguous_size);
if (++stack < nbStacks) {
std::size_t tmp_stack = stack;
while(tmp_stack % outputDims[dim] == 0) {
tmp_stack /= outputDims[dim];
dim--;
}
offsetIn0 += stride_step0[dim];
offsetIn1 += stride_step1[dim];
++offsetOut;
dim = contiguousIdx - 1;
}
}
}
template <class I, class O>
void AddImpl_cpu_backward_kernel(const std::size_t input0Length,
const std::size_t input1Length,
const std::size_t gradOutputLength,
const std::vector<std::size_t>& dims0,
const std::vector<std::size_t>& dims1,
const std::vector<std::size_t>& outputDims,
const void* grad_output_,
void* gradientInput0_,
void* gradientInput1_)
{
// TODO: Remove input0/1 from the function
const O* gradOutput = static_cast<const O*>(grad_output_);
auto* gradInput0 = static_cast<I*>(gradientInput0_);
auto* gradInput1 = static_cast<I*>(gradientInput1_);
std::fill_n(gradInput0, input0Length, static_cast<I>(0));
std::fill_n(gradInput1, input1Length, static_cast<I>(0));
auto broadcastedDims0 = getBroadcastedDims(outputDims, dims0);
auto broadcastedDims1 = getBroadcastedDims(outputDims, dims1);
for (std::size_t i = 0; i < gradOutputLength; ++i) {
auto idxOutputGrad = getMultiDimIndices(outputDims, i);
std::vector<std::size_t> idxInput0(broadcastedDims0.size());
std::vector<std::size_t> idxInput1(broadcastedDims1.size());
for (std::size_t dimension = 0; dimension < broadcastedDims0.size(); ++dimension) {
idxInput0[dimension] = (broadcastedDims0[dimension] == 1) ? 0 : idxOutputGrad[dimension];
}
for (std::size_t dimension = 0; dimension < broadcastedDims1.size(); ++dimension) {
idxInput1[dimension] = (broadcastedDims1[dimension] == 1) ? 0 : idxOutputGrad[dimension];
}
auto idx0 = getFlattenedIndex(broadcastedDims0, idxInput0);
auto idx1 = getFlattenedIndex(broadcastedDims1, idxInput1);
// For addition: gradient of both inputs is just the output gradient
// (unlike multiplication where we need to multiply by the other input,
// or subtraction where we need to negate one of them)
gradInput0[idx0] += static_cast<I>(gradOutput[i]);
gradInput1[idx1] += static_cast<I>(gradOutput[i]);
}
}
// Kernels registration to implementation entry point
REGISTRAR(AddImpl_cpu,
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float32}},
{ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<float, float>, nullptr});
{ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<float, float>, Aidge::AddImpl_cpu_backward_kernel<float, float>});
REGISTRAR(AddImpl_cpu,
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float64}},
{ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<double, double>, nullptr});
{ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<double, double>, Aidge::AddImpl_cpu_backward_kernel<double, double>});
REGISTRAR(AddImpl_cpu,
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int8}},
{ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::int8_t, std::int8_t>, Aidge::AddImpl_cpu_backward_kernel<std::int8_t, std::int8_t>});
REGISTRAR(AddImpl_cpu,
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::UInt8}},
{ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::uint8_t, std::uint8_t>, Aidge::AddImpl_cpu_backward_kernel<std::uint8_t, std::uint8_t>});
REGISTRAR(AddImpl_cpu,
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int32}},
{ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr});
{ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, Aidge::AddImpl_cpu_backward_kernel<std::int32_t, std::int32_t>});
REGISTRAR(AddImpl_cpu,
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int64}},
{ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, nullptr});
{ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, Aidge::AddImpl_cpu_backward_kernel<std::int64_t, std::int64_t>});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_ADDIMPL_CPU_KERNELS_H_ */
\ No newline at end of file
#endif /* AIDGE_CPU_OPERATOR_ADDIMPL_CPU_KERNELS_H_ */
......@@ -23,7 +23,7 @@
namespace Aidge {
// Operator implementation entry point for the backend
using AndImpl_cpu = OperatorImpl_cpu<And_Op,
void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*,void*)>;
void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*, void*)>;
// Implementation entry point registration to Operator
REGISTRAR(And_Op, "cpu", Aidge::AndImpl_cpu::create);
......
......@@ -12,52 +12,151 @@
#ifndef AIDGE_CPU_OPERATOR_ANDIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_ANDIMPL_KERNELS_H_
#include "aidge/backend/cpu/data/Broadcasting.hpp"
#include "aidge/backend/cpu/operator/AndImpl.hpp"
#include "aidge/utils/Registrar.hpp"
namespace Aidge {
template <class I1, class I2, class O>
void AndImpl_cpu_forward_kernel(const std::vector<std::size_t>& input1Dims,
const std::vector<std::size_t>& input2Dims,
namespace {
// suppose values are contiguous in memory
template <class I, class O>
void and_contiguous_arrays(const std::size_t input1size,
const std::size_t input2size,
const std::size_t output1size,
const I* input1,
const I* input2,
O* output)
{
for (std::size_t i = 0; i < output1size; ++i)
{
const std::size_t in1_id = (input1size != 1) ? i : 0;
const std::size_t in2_id = (input2size != 1) ? i : 0;
output[i] = static_cast<O>(input1[in1_id] && input2[in2_id]);
}
}
}
template <class I, class O>
void AndImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
std::vector<std::size_t> dims1,
const std::vector<std::size_t>& outputDims,
const void* input0_,
const void* input1_,
const void* input2_,
void* output_) {
const I1* input_1 = static_cast<const I1*>(input1_);
const I2* input_2 = static_cast<const I2*>(input2_);
const I* input_0 = static_cast<const I*>(input0_);
const I* input_1 = static_cast<const I*>(input1_);
O* output = static_cast<O*>(output_);
size_t totalElements = 1;
for (size_t dimSize : outputDims) {
totalElements *= dimSize;
// [5,2,1,7] & [2,6,7]
// 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7]
// 2. Find the highest equal dimension -> 3
// Exception: if the first diverging dimension is the last one, then -> 4 (dims.size())
// 3. Compute the highest number of contiguous data -> 7
// 4. Compute stride and offset step for the broadcast mechanism
// 5. Call a simple kernel
// special case for equal dimensions, the kernel is called with the entire arrays at once
if (dims0 == dims1) {
const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
for (std::size_t i = 0; i < input0_contiguous_size; ++i) {
output[i] = static_cast<O>(input_0[i] && input_1[i]);
}
return;
}
for (std::size_t oIndex = 0; oIndex < totalElements; ++oIndex)
{
std::vector<size_t> indexes = getMultiDimIndices(outputDims, oIndex);
// set dimensions to be of equal size by filling the smallest one with ones.
if (dims0.size() > dims1.size()) {
dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1));
}
else if (dims1.size() > dims0.size()) {
dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1));
}
std::size_t idx1 = getFlattenedIndex(input1Dims, indexes);
std::size_t idx2 = getFlattenedIndex(input2Dims, indexes);
const std::size_t nbDims = dims0.size();
output[oIndex] = static_cast<O>(input_1[idx1] == input_2[idx2]);
// Find the highest equal dimension
// std::size_t contiguousIdx = nbDims - 1;
std::size_t contiguousIdx = nbDims;
while (contiguousIdx-- > 0) {
// for (; contiguousIdx+1 > 0; --contiguousIdx) {
if (dims0[contiguousIdx] != dims1[contiguousIdx]) {
if (contiguousIdx == (nbDims -1)) { // last dimensions of one of the input Tensor are of size 1
const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1;
while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) {
--contiguousIdx;
}
}
break;
}
}
++contiguousIdx;
// Compute the highest number of contiguous data for each Tensor
const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
// initialize strides to iterate through data because of broadcasting
std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx);
if (contiguousIdx > 0) {
stride_post0[contiguousIdx - 1] = 1;
stride_post1[contiguousIdx - 1] = 1;
for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) {
stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]);
stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]);
}
for (std::size_t i = 0; i != contiguousIdx; ++i) {
stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1;
stride_step1[i] = (dims1[i] == 1) ? 1 - stride_post1[i] : 1;
}
}
// variables for arrays offsets
std::size_t offsetIn0 = 0;
std::size_t offsetIn1 = 0;
std::size_t offsetOut = 0;
std::size_t dim = contiguousIdx - 1;
const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>());
for (std::size_t stack = 0; stack < nbStacks;) {
and_contiguous_arrays<I,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
input_0 + offsetIn0*input0_contiguous_size,
input_1 + offsetIn1*input1_contiguous_size,
output + offsetOut*output_contiguous_size);
if (++stack < nbStacks) {
std::size_t tmp_stack = stack;
while(tmp_stack % outputDims[dim] == 0) {
tmp_stack /= outputDims[dim];
dim--;
}
offsetIn0 += stride_step0[dim];
offsetIn1 += stride_step1[dim];
++offsetOut;
dim = contiguousIdx - 1;
}
}
}
// Kernels registration to implementation entry point
REGISTRAR(AndImpl_cpu,
{DataType::Float32},
{ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<float, float, float>, nullptr});
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float32}},
{ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<float, float>, nullptr});
REGISTRAR(AndImpl_cpu,
{DataType::Float64},
{ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<double, double, double>, nullptr});
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float64}},
{ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<double, double>, nullptr});
REGISTRAR(AndImpl_cpu,
{DataType::Int32},
{ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t>, nullptr});
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int32}},
{ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr});
REGISTRAR(AndImpl_cpu,
{DataType::Int64},
{ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<std::int64_t, std::int64_t, std::int64_t>, nullptr});
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int64}},
{ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_ANDIMPL_KERNELS_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_ATAN_H_
#define AIDGE_CPU_OPERATOR_ATAN_H_
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/Atan.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include <memory>
#include <vector>
namespace Aidge {
// Operator implementation entry point for the backend
using AtanImpl_cpu = OperatorImpl_cpu<Atan_Op,
void(const std::size_t, const void*, void*),
void(const std::size_t, const void*, const void*, void*)>;
// Implementation entry point registration to Operator
REGISTRAR(Atan_Op, "cpu", Aidge::AtanImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_ATAN_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_ATANIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_ATANIMPL_KERNELS_H_
#include "aidge/utils/Registrar.hpp"
#include "aidge/backend/cpu/operator/AtanImpl.hpp"
#include <cmath> // For atan()
namespace Aidge {
template <class I, class O>
void AtanImpl_cpu_forward_kernel(std::size_t inputLength,
const void* input_,
void* output_) {
const I* input = static_cast<const I*>(input_);
O* output = static_cast<O*>(output_);
for (size_t i = 0; i < inputLength; ++i) {
output[i] = static_cast<O>(atan(input[i]));
}
}
template <class O, class GI, class GO>
void AtanImpl_cpu_backward_kernel(const std::size_t inputLength,
const void* output_, const void* grad_output_,
void* grad_input_) {
const O* output = static_cast<const O*>(output_);
const GO* grad_output = static_cast<const GO*>(grad_output_);
GI* grad_input = static_cast<GI*>(grad_input_);
// Apply the derivative of atan for each element in the input array
for (size_t i = 0; i < inputLength; ++i) {
// dx = dy * (1 / (1 + x^2))
grad_input[i] += grad_output[i] * static_cast<O>(1.0 / (1.0 + output[i] * output[i]));
}
}
// Kernels registration to implementation entry point
REGISTRAR(AtanImpl_cpu,
{DataType::Float32},
{ProdConso::inPlaceModel, Aidge::AtanImpl_cpu_forward_kernel<float, float>, Aidge::AtanImpl_cpu_backward_kernel<float, float, float>});
REGISTRAR(AtanImpl_cpu,
{DataType::Float64},
{ProdConso::inPlaceModel, Aidge::AtanImpl_cpu_forward_kernel<double, double>, Aidge::AtanImpl_cpu_backward_kernel<double, double, double>});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_ATANIMPL_KERNELS_H_ */
......@@ -28,8 +28,11 @@ namespace Aidge {
using AvgPooling2D_Op = AvgPooling_Op<2>;
using AvgPoolingImpl2D_cpu = OperatorImpl_cpu<AvgPooling_Op<2>,
void(const std::array<DimSize_t, 2>&,
const std::array<DimSize_t, 2>&,
const std::array<DimSize_t, 2>&,
const std::array<DimSize_t, 4>&,
bool,
RoundingMode,
const void *,
void *)>;
......
......@@ -23,6 +23,22 @@
#include "aidge/utils/Types.h"
namespace Aidge {
template <typename T>
using Acc_T = typename std::conditional<std::is_floating_point<T>::value, T, double>::type;
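// e.g. Acc_T<float> == float, Acc_T<std::int32_t> == double: integer pooling
// sums are accumulated in double before being rounded back to the output type.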
template <typename T>
typename std::enable_if<std::is_floating_point<T>::value, T>::type
castFromFloat(T value, RoundingMode /*roundingMode*/) {
return value;
}
template <typename T>
typename std::enable_if<!std::is_floating_point<T>::value, T>::type
castFromFloat(double value, RoundingMode roundingMode) {
return static_cast<T>(round(value, roundingMode));
}
/**
* @brief Forward kernel for 2D AvgPooling on CPU backend.
* @tparam I Input data type.
......@@ -35,66 +51,72 @@ namespace Aidge {
template <class I, class O>
void AvgPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
const std::array<DimSize_t, 2>& kernelDims,
const std::array<DimSize_t, 2>& dilations,
const std::array<DimSize_t, 4> &dims,
bool ceilMode,
RoundingMode roundingMode,
const void *input_,
void *output_) {
// FIXME: missing convolution attributes as arguments
const I *input = static_cast<const I *>(input_);
O *output = static_cast<O *>(output_);
// output H size
const std::size_t oxSize =
static_cast<std::size_t>(std::floor(static_cast<float>(dims[2] - kernelDims[0] + strideDims[0]) /
static_cast<float>(strideDims[0])));
const std::size_t oxSize =
ceilMode
? static_cast<std::size_t>(std::ceil(static_cast<float>(dims[2] - (kernelDims[0] - 1) * dilations[0] - 1 + strideDims[0]) /
static_cast<float>(strideDims[0])))
: static_cast<std::size_t>(std::floor(static_cast<float>(dims[2] - (kernelDims[0] - 1) * dilations[0] - 1 + strideDims[0]) /
static_cast<float>(strideDims[0])));
// output W size
const std::size_t oySize =
static_cast<std::size_t>(std::floor(static_cast<float>(dims[3] - kernelDims[1] + strideDims[1]) /
static_cast<float>(strideDims[1])));
// TODO: kernel computation
// output (batch, outCh, Xout, Yout)
// input (batch, ch, Xin, Yin)
// weight (outCh, ch, kernelX, kernelY)
// does not take Dilation attribute into account
const std::size_t oySize =
ceilMode
? static_cast<std::size_t>(std::ceil(static_cast<float>(dims[3] - (kernelDims[1] - 1) * dilations[1] - 1 + strideDims[1]) /
static_cast<float>(strideDims[1])))
: static_cast<std::size_t>(std::floor(static_cast<float>(dims[3] - (kernelDims[1] - 1) * dilations[1] - 1 + strideDims[1]) /
static_cast<float>(strideDims[1])));
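// Example with hypothetical values: dims[3] = 6, kernelDims[1] = 3, strideDims[1] = 2, dilations[1] = 1
// -> (6 - 2 - 1 + 2) / 2 = 2.5, i.e. oySize = 2 in floor mode and 3 in ceil mode.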
using signedsize = std::make_signed<std::size_t>::type;
for (std::size_t batch = 0; batch < dims[0]; ++batch) {
for (std::size_t ch = 0; ch < dims[1]; ++ch) {
const std::size_t oIndex = (ch + batch*dims[1]) * oxSize * oySize;
const std::size_t iIndex = (ch + batch*dims[1]) * dims[2] * dims[3];
std::fill(output + oIndex, output+(oIndex+oxSize*oySize), 0);
#ifdef _OPENMP
#pragma omp parallel for collapse(2) if (dims[0] * dims[1] >= 16)
#endif
for (int batch = 0; batch < static_cast<int>(dims[0]); ++batch) {
for (int ch = 0; ch < static_cast<int>(dims[1]); ++ch) {
const std::size_t oIndex = (ch + batch * dims[1]) * oxSize * oySize;
const std::size_t iIndex = (ch + batch * dims[1]) * dims[2] * dims[3];
for (std::size_t ox = 0; ox < oxSize; ++ox) {
const signedsize difx = static_cast<signedsize>(- ox * strideDims[0]);
const signedsize difx = static_cast<signedsize>(-ox * strideDims[0]);
const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0)));
const std::size_t sxMax = (static_cast<signedsize>(dims[2]) + difx) < 0 ? 0 : ((dims[2] + difx) > kernelDims[0] ? kernelDims[0] : dims[2] + difx);
for (std::size_t oy = 0; oy < oySize; ++oy) {
const signedsize dify = static_cast<signedsize>(- oy * strideDims[1]);
const signedsize dify = static_cast<signedsize>(-oy * strideDims[1]);
const std::size_t syMin = static_cast<std::size_t>(std::max(dify, signedsize(0)));
const std::size_t syMax = (static_cast<signedsize>(dims[3]) + dify) < 0 ? 0 : ((dims[3] + dify) > kernelDims[1] ? kernelDims[1] : dims[3] + dify);
const std::size_t oIndexFull = oIndex + ox*oySize + oy;
const std::size_t oIndexFull = oIndex + ox * oySize + oy;
const std::size_t ix = ox * strideDims[0];
const std::size_t iy = oy * strideDims[1];
if (sxMin == 0 && syMin == 0 && sxMax == 3 && syMax == 3) {
output[oIndexFull] += static_cast<O>(
input[iIndex + (ix+0)*dims[3] + (iy+0)] +
input[iIndex + (ix+0)*dims[3] + (iy+1)] +
input[iIndex + (ix+0)*dims[3] + (iy+2)] +
input[iIndex + (ix+1)*dims[3] + (iy+0)] +
input[iIndex + (ix+1)*dims[3] + (iy+1)] +
input[iIndex + (ix+1)*dims[3] + (iy+2)] +
input[iIndex + (ix+2)*dims[3] + (iy+0)] +
input[iIndex + (ix+2)*dims[3] + (iy+1)] +
input[iIndex + (ix+2)*dims[3] + (iy+2)]) / O(9);
} else {
for (std::size_t sx = sxMin; sx < sxMax; ++sx) {
for (std::size_t sy = syMin; sy < syMax; ++sy) {
output[oIndexFull] += input[iIndex + (ix+sx)*dims[3] + (iy+sy)];
Acc_T<I> sum = static_cast<Acc_T<I>>(0);
std::size_t count = 0;
for (unsigned int sy = syMin; sy < syMax; ++sy) {
for (unsigned int sx = sxMin; sx < sxMax; ++sx) {
// Apply dilation factor
const std::size_t dilated_sx = sx * dilations[0];
const std::size_t dilated_sy = sy * dilations[1];
// Ensure within bounds
if ((ix + dilated_sx) < dims[2] && (iy + dilated_sy) < dims[3]) {
sum += static_cast<Acc_T<I>>(input[iIndex + (ix + dilated_sx) * dims[3] + (iy + dilated_sy)]);
++count;
}
}
// padding not used
output[oIndexFull] /= (sxMax - sxMin) * (syMax - syMin);
}
output[oIndexFull] = count > 0 ? castFromFloat<O>(sum / count, roundingMode) : 0;
}
}
}
......