diff --git a/include/aidge/backend/cpu.hpp b/include/aidge/backend/cpu.hpp index 760fc71a4b659e1bffe28e2796c7bb400e8ec1a2..694275067b8b9708bab868da83688716f34e4fae 100644 --- a/include/aidge/backend/cpu.hpp +++ b/include/aidge/backend/cpu.hpp @@ -22,6 +22,7 @@ #include "aidge/backend/cpu/operator/MaxPoolingImpl.hpp" #include "aidge/backend/cpu/operator/BatchNormImpl.hpp" #include "aidge/backend/cpu/operator/BitShiftImpl.hpp" +#include "aidge/backend/cpu/operator/ClipImpl.hpp" #include "aidge/backend/cpu/operator/ConvDepthWiseImpl.hpp" #include "aidge/backend/cpu/operator/ConvImpl.hpp" #include "aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp" diff --git a/include/aidge/backend/cpu/operator/ClipImpl.hpp b/include/aidge/backend/cpu/operator/ClipImpl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..c83836d5aa1d6aae27e3fdce1bbb9561b70ec31e --- /dev/null +++ b/include/aidge/backend/cpu/operator/ClipImpl.hpp @@ -0,0 +1,46 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_CPU_OPERATOR_CLIPIMPL_H_ +#define AIDGE_CPU_OPERATOR_CLIPIMPL_H_ + +#include <cstddef> // std::size_t +#include <memory> +#include <tuple> // std::tuple +#include <vector> +#include <algorithm> + +#include "aidge/backend/cpu/operator/OperatorImpl.hpp" +#include "aidge/operator/Clip.hpp" +#include "aidge/utils/Registrar.hpp" +#include "aidge/utils/Types.h" +#include "aidge/backend/cpu/data/GetCPUPtr.h" + + +namespace Aidge { +// Operator implementation entry point for the backend + using ClipImpl_cpu = OperatorImpl_cpu<Clip_Op, + void(float, //Forward Types + float, + const void*, + const std::size_t, + void*), + void(float,//Backward Types + float, + const std::size_t, + const void*, + const void*, + void*)>; + + REGISTRAR(Clip_Op,"cpu",Aidge::ClipImpl_cpu::create); +} // namespace Aidge + +#endif /* AIDGE_CPU_OPERATOR_CLIPIMPL_H_ */ diff --git a/include/aidge/backend/cpu/operator/ClipImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ClipImpl_kernels.hpp new file mode 100644 index 0000000000000000000000000000000000000000..1afac4698be2a63790ebac671ecc1e59166c5f94 --- /dev/null +++ b/include/aidge/backend/cpu/operator/ClipImpl_kernels.hpp @@ -0,0 +1,77 @@ + +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
/// Element-wise clip: output[i] = min(max(input[i], min_), max_).
///
/// Arithmetic is carried out in double so that double inputs (the Float64
/// registration) and integer inputs up to 2^53 keep their precision; the
/// previous version funnelled every value through static_cast<float>,
/// silently rounding Float64/Int64 data. Note: int64 values above 2^53 can
/// still round — TODO confirm whether exact int64 clipping is required.
///
/// @param min_    lower clipping bound
/// @param max_    upper clipping bound (applied last, so it wins when min_ > max_)
/// @param input_  raw pointer to `length` elements of type I
/// @param length  number of elements to process
/// @param output_ raw pointer to `length` elements of type O
template <class I, class O>
void ClipImpl_cpu_forward_kernel(
    float min_,
    float max_,
    const void* input_,
    const std::size_t length,
    void* output_)
{
    const I* input = static_cast<const I*>(input_);
    O* output = static_cast<O*>(output_);

    const double lo = static_cast<double>(min_);
    const double hi = static_cast<double>(max_);
    for (std::size_t i = 0; i < length; ++i) {
        // min(max(...)) rather than std::clamp: std::clamp has undefined
        // behavior when lo > hi, while this form deliberately returns hi.
        output[i] = static_cast<O>(
            std::min(std::max(static_cast<double>(input[i]), lo), hi));
    }
}
grad_output[i] : 0; + } +} + +REGISTRAR(ClipImpl_cpu, +{DataType::Float32}, +{ProdConso::inPlaceModel, +Aidge::ClipImpl_cpu_forward_kernel<float,float>, +Aidge::ClipImpl_cpu_backward_kernel<float,float,float>}); +REGISTRAR(ClipImpl_cpu, +{DataType::Float64}, +{ProdConso::inPlaceModel, +Aidge::ClipImpl_cpu_forward_kernel<double,double>, +Aidge::ClipImpl_cpu_backward_kernel<double,double,double>}); +REGISTRAR(ClipImpl_cpu, +{DataType::Int32}, +{ProdConso::inPlaceModel, +Aidge::ClipImpl_cpu_forward_kernel<std::int32_t,std::int32_t>, +Aidge::ClipImpl_cpu_backward_kernel<std::int32_t,std::int32_t,std::int32_t>}); +REGISTRAR(ClipImpl_cpu, +{DataType::Int64}, +{ProdConso::inPlaceModel, +Aidge::ClipImpl_cpu_forward_kernel<std::int64_t,std::int64_t>, +Aidge::ClipImpl_cpu_backward_kernel<std::int64_t,std::int64_t,std::int64_t>}); + +} // namespace Aidge + +#endif /* AIDGE_CPU_OPERATOR_CLIPIMPL_KERNELS_H_ */ diff --git a/src/operator/ClipImpl.cpp b/src/operator/ClipImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..931d25426a8f6e08363bfc08d23f1714e934634c --- /dev/null +++ b/src/operator/ClipImpl.cpp @@ -0,0 +1,67 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <memory> +#include <vector> + +#include "aidge/data/Tensor.hpp" +#include "aidge/operator/Clip.hpp" +#include "aidge/utils/Types.h" +#include "aidge/backend/cpu/data/GetCPUPtr.h" +#include "aidge/utils/ErrorHandling.hpp" + +#include "aidge/backend/cpu/operator/ClipImpl.hpp" +#include "aidge/backend/cpu/operator/ClipImpl_kernels.hpp" + +template<> +void Aidge::ClipImpl_cpu::forward() { + + const Clip_Op& op_ = dynamic_cast<const Clip_Op&>(mOp); + std::shared_ptr<Tensor> in0 = op_.getInput(0); + std::shared_ptr<Tensor> out0 = op_.getOutput(0); + AIDGE_ASSERT(in0, "missing input #0"); + /*AIDGE_ASSERT(in1, "missing input #1 -> Min value empty shape Tensor"); + AIDGE_ASSERT(in2, "missing input #2 -> Max value empty shape Tensor");*/ + // Find the correct kernel type + const auto impl = Registrar<ClipImpl_cpu>::create(getBestMatch(getRequiredSpec())); + + // Call kernel + impl.forward( + op_.min(), + op_.max(), + getCPUPtr(mOp.getRawInput(0)), + in0->size(), + getCPUPtr(mOp.getRawOutput(0)) + ); +} + +template<> +void Aidge::ClipImpl_cpu::backward() { + + const Clip_Op& op_ = dynamic_cast<const Clip_Op&>(mOp); + std::shared_ptr<Tensor> in0 = op_.getInput(0); + std::shared_ptr<Tensor> out0 = op_.getOutput(0); + std::shared_ptr<Tensor> gra_in0 = op_.getInput(0)->grad(); + std::shared_ptr<Tensor> gra_out0 = op_.getOutput(0)->grad(); + AIDGE_ASSERT(out0, "missing output #0 for current {} operator", op_.type()); + + // Find the correct kernel type + const auto impl = Registrar<ClipImpl_cpu>::create(getBestMatch(getRequiredSpec())); + // Call kernel + impl.backward( + op_.min(), + op_.max(), + gra_in0->size(), + getCPUPtr(in0), + getCPUPtr(gra_out0), + getCPUPtr(gra_in0) + ); +} diff --git a/unit_tests/operator/Test_ClipImpl.cpp b/unit_tests/operator/Test_ClipImpl.cpp new file mode 100644 index 
0000000000000000000000000000000000000000..45c8da5bf7ecc84fad6b3e694fe204540f579af3 --- /dev/null +++ b/unit_tests/operator/Test_ClipImpl.cpp @@ -0,0 +1,318 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <catch2/catch_test_macros.hpp> +#include <cstddef> // std::size_t +#include <cstdint> // std::uint16_t +#include <chrono> +#include <iostream> +#include <vector> +#include <algorithm> +#include <iomanip> +#include <memory> +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution + +#include "aidge/data/Tensor.hpp" +#include "aidge/operator/Clip.hpp" +#include "aidge/operator/OperatorTensor.hpp" +#include "aidge/utils/TensorUtils.hpp" +#include "aidge/backend/cpu.hpp" + +void ComputeClipBackward(const std::vector<float>& vec1, std::vector<float>& vec2, float min, float max) { + if (vec1.size() != vec2.size()) { + std::cerr << "Vectors should have the same sizes." 
<< std::endl; + return; + } + + for (size_t i = 0; i < vec1.size(); ++i) { + if (vec1[i] < min || vec1[i] > max) { + vec2[i] = 0.0f; + } + } +} +namespace Aidge +{ +TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]") + { + const std::uint16_t NBTRIALS = 10; + // Create a random number generator + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<float> dis(0.0, 10.0); + std::uniform_real_distribution<float> dismin(0.0, 4.5); + std::uniform_real_distribution<float> dismax(5.5, 10.0); + std::uniform_int_distribution<std::size_t> distDims(5,15); + std::uniform_int_distribution<std::size_t> distNbMatrix(1, 5); + + // Create MatMul Operator + std::shared_ptr<Node> myClip = Aidge::Clip("nop"); + auto op = std::static_pointer_cast<OperatorTensor>(myClip -> getOperator()); + + // To measure execution time of 'MatMul_Op::forward()' member function call + std::chrono::time_point<std::chrono::system_clock> start; + std::chrono::time_point<std::chrono::system_clock> end; + std::chrono::duration<double, std::micro> duration; + + SECTION("Simple clip test [Forward]") { + std::size_t totalComputation = 0; + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { + // generate Tensors dimensions + const std::size_t dim0 = distDims(gen); + const std::size_t dim1 = distDims(gen); + totalComputation += dim0*dim1; + + // Create and populate the array with random float values + float* Array = new float[dim0*dim1]; + for (int i = 0; i < dim0*dim1; ++i) { + Array[i] = dis(gen); // Generate random float value + } + + // Convert Input to Tensor + std::shared_ptr<Tensor> TInput = std::make_shared<Tensor>(DataType::Float32); + TInput -> resize({dim0,dim1}); + TInput -> setBackend("cpu"); + TInput -> getImpl() -> setRawPtr(Array, dim0*dim1); + + float min = dismin(gen); + std::shared_ptr<Tensor> Tmin = std::make_shared<Tensor>(DataType::Float32); + Tmin -> resize({}); + Tmin -> setBackend("cpu"); + Tmin -> getImpl() -> setRawPtr(&min,1); + + float max = 
dismax(gen); + std::shared_ptr<Tensor> Tmax = std::make_shared<Tensor>(DataType::Float32); + Tmax -> resize({}); + Tmax -> setBackend("cpu"); + Tmax -> getImpl() -> setRawPtr(&max,1); + // convert res to Tensordf + std::vector<float> GT(Array, Array + (dim0*dim1)); + for (float& val : GT) + { + val = std::max(min, std::min(val, max)); + } + std::shared_ptr<Tensor> Tres = std::make_shared<Tensor>(DataType::Float32); + Tres -> resize({dim0,dim1}); + Tres -> setBackend("cpu"); + Tres -> getImpl() -> setRawPtr(GT.data(), dim0*dim1); + + op->associateInput(0, TInput); + op->associateInput(1, Tmin); + op->associateInput(2, Tmax); + op->setDataType(DataType::Float32); + op->setBackend("cpu"); + op->forwardDims(true); + + start = std::chrono::system_clock::now(); + myClip->forward(); + end = std::chrono::system_clock::now(); + + duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + + REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres)); + } + std::cout << "multiplications over time spent: " << totalComputation/duration.count() << std::endl; + std::cout << "total time: " << duration.count() << std::endl; + } + SECTION("Clip test with min >= max [Forward]") { + std::size_t totalComputation = 0; + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { + // generate Tensors dimensions + const std::size_t dim0 = distDims(gen); + const std::size_t dim1 = distDims(gen); + totalComputation += dim0*dim1; + + // Create and populate the array with random float values + float* Array = new float[dim0*dim1]; + for (int i = 0; i < dim0*dim1; ++i) { + Array[i] = dis(gen); // Generate random float value + } + + // Convert Input to Tensor + std::shared_ptr<Tensor> TInput = std::make_shared<Tensor>(DataType::Float32); + TInput -> resize({dim0,dim1}); + TInput -> setBackend("cpu"); + TInput -> getImpl() -> setRawPtr(Array, dim0*dim1); + + float min = dismax(gen); + std::shared_ptr<Tensor> Tmin = std::make_shared<Tensor>(DataType::Float32); + Tmin -> resize({}); 
+ Tmin -> setBackend("cpu"); + Tmin -> getImpl() -> setRawPtr(&min,1); + + float max = dismin(gen); //We generate max and min so that max is always <= min + std::shared_ptr<Tensor> Tmax = std::make_shared<Tensor>(DataType::Float32); + Tmax -> resize({}); + Tmax -> setBackend("cpu"); + Tmax -> getImpl() -> setRawPtr(&max,1); + // convert res to Tensor + std::vector<float> GT(Array, Array + (dim0*dim1)); + for (float& val : GT) + { + val = max; + } + std::shared_ptr<Tensor> Tres = std::make_shared<Tensor>(DataType::Float32); + Tres -> resize({dim0,dim1}); + Tres -> setBackend("cpu"); + Tres -> getImpl() -> setRawPtr(GT.data(), dim0*dim1); + + op->associateInput(0, TInput); + op->associateInput(1, Tmin); + op->associateInput(2, Tmax); + op->setDataType(DataType::Float32); + op->setBackend("cpu"); + op->forwardDims(true); + + start = std::chrono::system_clock::now(); + myClip->forward(); + end = std::chrono::system_clock::now(); + + duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + + REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres)); + } + std::cout << "multiplications over time spent: " << totalComputation/duration.count() << std::endl; + std::cout << "total time: " << duration.count() << std::endl; + } + SECTION("Clip with Clip Attr [Forward]") + { + std::size_t totalComputation = 0; + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) + { + + float min = dismin(gen); + float max = dismax(gen); + std::shared_ptr<Node> myCl = Aidge::Clip("",min,max); + auto op = std::static_pointer_cast<OperatorTensor>(myCl -> getOperator()); + + + // generate Tensors dimensions + const std::size_t dim0 = 3; + const std::size_t dim1 = 3; + totalComputation += dim0*dim1; + + // Create and populate the array with random float values + float* Array = new float[dim0*dim1]; + for (int i = 0; i < dim0*dim1; ++i) { + Array[i] = dis(gen); // Generate random float value + } + // Convert Input to Tensor + std::shared_ptr<Tensor> TInput = 
std::make_shared<Tensor>(DataType::Float32); + TInput -> resize({dim0,dim1}); + TInput -> setBackend("cpu"); + TInput -> getImpl() -> setRawPtr(Array, dim0*dim1); + + // convert res to Tensordf + std::vector<float> GT(Array, Array + (dim0*dim1)); + for (float& val : GT) + { + val = std::max(min, std::min(val, max)); + } + std::shared_ptr<Tensor> Tres = std::make_shared<Tensor>(DataType::Float32); + Tres -> resize({dim0,dim1}); + Tres -> setBackend("cpu"); + Tres -> getImpl() -> setRawPtr(GT.data(), dim0*dim1); + op->associateInput(0, TInput); + op->setDataType(DataType::Float32); + op->setBackend("cpu"); + op->forwardDims(true); + start = std::chrono::system_clock::now(); + myCl->forward(); + end = std::chrono::system_clock::now(); + + duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + + REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres)); + } + std::cout << "multiplications over time spent: " << totalComputation/duration.count() << std::endl; + std::cout << "total time: " << duration.count() << std::endl; + } + SECTION("Simple clip test [Backward]") { + std::size_t totalComputation = 0; + duration = std::chrono::duration<double, std::micro>::zero(); + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { + std::size_t totalComputation = 0; + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { + // generate Tensors dimensions + const std::size_t dim0 = distDims(gen); + const std::size_t dim1 = distDims(gen); + + totalComputation += dim0*dim1; + + // Create and populate the array with random float values + float* Array = new float[dim0*dim1]; + float* gradArray = new float[dim0*dim1]; + for (int i = 0; i < dim0*dim1; ++i) { + Array[i] = dis(gen); // Generate random float value + gradArray[i] = dis(gen); + } + + std::shared_ptr<Tensor> TGrad = std::make_shared<Tensor>(DataType::Float32); + TGrad -> resize({dim0,dim1}); + TGrad -> setBackend("cpu"); + TGrad -> getImpl() -> setRawPtr(gradArray, dim0*dim1); + + // Convert 
Input to Tensor + std::shared_ptr<Tensor> TInput = std::make_shared<Tensor>(DataType::Float32); + TInput -> resize({dim0,dim1}); + TInput -> setBackend("cpu"); + TInput -> getImpl() -> setRawPtr(Array, dim0*dim1); + + float min = dismin(gen); + std::shared_ptr<Tensor> Tmin = std::make_shared<Tensor>(DataType::Float32); + Tmin -> resize({}); + Tmin -> setBackend("cpu"); + Tmin -> getImpl() -> setRawPtr(&min,1); + + float max = dismax(gen); + std::shared_ptr<Tensor> Tmax = std::make_shared<Tensor>(DataType::Float32); + Tmax -> resize({}); + Tmax -> setBackend("cpu"); + Tmax -> getImpl() -> setRawPtr(&max,1); + // convert res to Tensor + std::vector<float> GT(Array, Array + (dim0*dim1)); + for (float& val : GT) + { + val = std::max(min, std::min(val, max));//Clip operation + } + std::shared_ptr<Tensor> Tres = std::make_shared<Tensor>(DataType::Float32); + Tres -> resize({dim0,dim1}); + Tres -> setBackend("cpu"); + Tres -> getImpl() -> setRawPtr(GT.data(), dim0*dim1); + + op->associateInput(0, TInput); + op->associateInput(1, Tmin); + op->associateInput(2, Tmax); + op->setDataType(DataType::Float32); + op->setBackend("cpu"); + op->forwardDims(true); + myClip->forward(); + + op->getOutput(0)->setGrad(TGrad); + + start = std::chrono::system_clock::now(); + REQUIRE_NOTHROW(myClip->backward()); + end = std::chrono::system_clock::now(); + + auto GradTensor = op->getInput(0)->grad(); + float* BackwardTensor = (float*)GradTensor->getImpl()->rawPtr(); + std::vector<float> GT0(Array,Array+(dim0*dim1)); + std::vector<float> GT1(gradArray,gradArray+(dim0*dim1)); + std::vector<float> BackwardTensorVec(BackwardTensor,BackwardTensor+(dim0*dim1)); + ComputeClipBackward(GT0,GT1,min,max); + duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + REQUIRE(GT1 == BackwardTensorVec); + } + std::cout << "multiplications over time spent: " << totalComputation/duration.count() << std::endl; + std::cout << "total time: " << duration.count() << std::endl; + } + } +} // 
namespace Aidge +} \ No newline at end of file