Commit 2b4d63d1 authored by Maxence Naud

Merge remote-tracking branch 'origin/convert' into dev

parents ed45d0c4 869b863d
1 merge request: !29 "Temporary master branch"
Pipeline #36597 failed
@@ -3,8 +3,11 @@
 #include "aidge/backend/TensorImpl.hpp"
 #include "aidge/data/Tensor.hpp"
+#include "aidge/data/half.hpp"
 #include "aidge/utils/Registrar.hpp"
 #include "aidge/utils/Types.h"
+#include "aidge/utils/ErrorHandling.hpp"
+#include "aidge/utils/future_std/span.hpp"

 namespace Aidge {
 template <class T>
@@ -12,7 +15,10 @@ class TensorImpl_cpu : public TensorImpl {
 private:
     const Tensor &mTensor; // Impl needs to access Tensor information, but is not
                            // supposed to change it!
-    std::vector<T> mData;
+    /// Pointer to the data and its capacity
+    future_std::span<T> mData;
+    /// If this instance own the data, std::unique_ptr manages it
+    std::unique_ptr<T[]> mDataOwner;

 public:
     static constexpr const char *Backend = "cpu";
@@ -20,9 +26,12 @@ class TensorImpl_cpu : public TensorImpl {
     TensorImpl_cpu(const Tensor &tensor) : TensorImpl(Backend), mTensor(tensor) {}

     bool operator==(const TensorImpl &otherImpl) const override final {
+        const auto& typedOtherImpl = reinterpret_cast<const TensorImpl_cpu<T> &>(otherImpl);
+        AIDGE_INTERNAL_ASSERT(typedOtherImpl.data().size() >= mTensor.size());
+
         std::size_t i = 0;
         for (; i < mTensor.size() &&
-               mData[i] == reinterpret_cast<const TensorImpl_cpu<T> &>(otherImpl).data()[i];
+               mData[i] == typedOtherImpl.data()[i];
             ++i) {
         }
         return i == mTensor.size();
@@ -33,36 +42,134 @@ class TensorImpl_cpu : public TensorImpl {
     }

     // native interface
-    const std::vector<T> &data() const { return mData; }
+    const future_std::span<T>& data() const { return mData; }
+
+    std::size_t size() const override { return mData.size(); }
     std::size_t scalarSize() const override { return sizeof(T); }

-    void copy(const void *src, NbElts_t length) override {
+    void setDevice(int device) override {
+        AIDGE_ASSERT(device == 0, "device cannot be != 0 for CPU backend");
+    }
+
+    void copy(const void *src, NbElts_t length, NbElts_t offset = 0) override {
+        AIDGE_ASSERT(length <= mData.size() || length <= mTensor.size(), "copy length is above capacity");
         std::copy(static_cast<const T *>(src), static_cast<const T *>(src) + length,
-                  static_cast<T *>(rawPtr()));
+                  static_cast<T *>(rawPtr()) + offset);
+    }
+
+    void copyCast(const void *src, NbElts_t length, const DataType srcDt) override {
+        if (length == 0) {
+            return;
+        }
+
+        AIDGE_ASSERT(length <= mData.size() || length <= mTensor.size(), "copy length is above capacity");
+        if (srcDt == DataType::Float64) {
+            std::copy(static_cast<const double*>(src), static_cast<const double*>(src) + length,
+                      static_cast<T *>(rawPtr()));
+        }
+        else if (srcDt == DataType::Float32) {
+            std::copy(static_cast<const float*>(src), static_cast<const float*>(src) + length,
+                      static_cast<T *>(rawPtr()));
+        }
+        else if (srcDt == DataType::Float16) {
+            std::copy(static_cast<const half_float::half*>(src), static_cast<const half_float::half*>(src) + length,
+                      static_cast<T *>(rawPtr()));
+        }
+        else if (srcDt == DataType::Int64) {
+            std::copy(static_cast<const int64_t*>(src), static_cast<const int64_t*>(src) + length,
+                      static_cast<T *>(rawPtr()));
+        }
+        else if (srcDt == DataType::UInt64) {
+            std::copy(static_cast<const uint64_t*>(src), static_cast<const uint64_t*>(src) + length,
+                      static_cast<T *>(rawPtr()));
+        }
+        else if (srcDt == DataType::Int32) {
+            std::copy(static_cast<const int32_t*>(src), static_cast<const int32_t*>(src) + length,
+                      static_cast<T *>(rawPtr()));
+        }
+        else if (srcDt == DataType::UInt32) {
+            std::copy(static_cast<const uint32_t*>(src), static_cast<const uint32_t*>(src) + length,
+                      static_cast<T *>(rawPtr()));
+        }
+        else if (srcDt == DataType::Int16) {
+            std::copy(static_cast<const int16_t*>(src), static_cast<const int16_t*>(src) + length,
+                      static_cast<T *>(rawPtr()));
+        }
+        else if (srcDt == DataType::UInt16) {
+            std::copy(static_cast<const uint16_t*>(src), static_cast<const uint16_t*>(src) + length,
+                      static_cast<T *>(rawPtr()));
+        }
+        else if (srcDt == DataType::Int8) {
+            std::copy(static_cast<const int8_t*>(src), static_cast<const int8_t*>(src) + length,
+                      static_cast<T *>(rawPtr()));
+        }
+        else if (srcDt == DataType::UInt8) {
+            std::copy(static_cast<const uint8_t*>(src), static_cast<const uint8_t*>(src) + length,
+                      static_cast<T *>(rawPtr()));
+        }
+        else {
+            AIDGE_THROW_OR_ABORT(std::runtime_error, "Unsupported data type.");
+        }
+    }
+
+    void copyFromDevice(const void *src, NbElts_t length, const std::pair<std::string, int>& device) override {
+        AIDGE_ASSERT(device.first == Backend, "backend must match");
+        AIDGE_ASSERT(device.second == 0, "device cannot be != 0 for CPU backend");
+        copy(src, length);
+    }
+
+    void copyFromHost(const void *src, NbElts_t length) override {
+        copy(src, length);
+    }
+
+    void copyToHost(void *dst, NbElts_t length) const override {
+        AIDGE_ASSERT(length <= mData.size() || length <= mTensor.size(), "copy length is above capacity");
+        const T* src = static_cast<const T*>(rawPtr());
+        std::copy(static_cast<const T *>(src), static_cast<const T *>(src) + length,
+                  static_cast<T *>(dst));
     }

     void *rawPtr() override {
-        lazyInit(mData);
+        lazyInit();
         return mData.data();
     };

-    void* getRaw(std::size_t idx){
-        return static_cast<void*>(static_cast<T *>(rawPtr()) + idx);
-    };
+    const void *rawPtr() const override {
+        AIDGE_ASSERT(mData.size() >= mTensor.size(), "accessing uninitialized const rawPtr");
+        return mData.data();
+    };

-    virtual ~TensorImpl_cpu() = default;
+    void *hostPtr() override {
+        lazyInit();
+        return mData.data();
+    };
+
+    const void *hostPtr() const override {
+        AIDGE_ASSERT(mData.size() >= mTensor.size(), "accessing uninitialized const hostPtr");
+        return mData.data();
+    };
+
+    void* getRawPtr(NbElts_t idx) override final {
+        AIDGE_ASSERT(idx < mData.size(), "idx out of range");
+        return static_cast<void*>(static_cast<T*>(rawPtr()) + idx);
+    };

-    void setRawPtr(void *ptr) override final {
-        T *newPtr = static_cast<T *>(ptr);
-        mData = std::vector<T>(newPtr, newPtr + mTensor.size());
+    void setRawPtr(void *ptr, NbElts_t length) override final {
+        AIDGE_ASSERT(length >= mTensor.size(), "trying to set raw pointer of insufficient capacity");
+        mData = future_std::span<T>(static_cast<T *>(ptr), length);
+        mDataOwner.reset();
     };

+    virtual ~TensorImpl_cpu() = default;
+
 private:
-    void lazyInit(std::vector<T> &data) {
-        assert(mTensor.dataType() == NativeType<T>::type);
-
-        if (data.size() != mTensor.size()) data.resize(mTensor.size());
+    void lazyInit() {
+        if (mData.size() < mTensor.size()) {
+            // Need more data, a re-allocation will occur
+            AIDGE_ASSERT(mData.empty() || mDataOwner != nullptr, "trying to enlarge non-owned data");
+            mDataOwner.reset(new T[mTensor.size()]);
+            mData = future_std::span<T>(mDataOwner.get(), mTensor.size());
+        }
     }
 };
@@ -71,6 +178,8 @@ static Registrar<Tensor> registrarTensorImpl_cpu_Float64(
     {"cpu", DataType::Float64}, Aidge::TensorImpl_cpu<double>::create);
 static Registrar<Tensor> registrarTensorImpl_cpu_Float32(
     {"cpu", DataType::Float32}, Aidge::TensorImpl_cpu<float>::create);
+static Registrar<Tensor> registrarTensorImpl_cpu_Float16(
+    {"cpu", DataType::Float16}, Aidge::TensorImpl_cpu<half_float::half>::create);
 static Registrar<Tensor> registrarTensorImpl_cpu_Int32(
     {"cpu", DataType::Int32}, Aidge::TensorImpl_cpu<int>::create);
 } // namespace
......
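Note on the storage change above: the owning std::vector<T> is replaced by a future_std::span<T> view plus an optional std::unique_ptr<T[]> owner. setRawPtr() makes the implementation borrow caller-provided memory, while lazyInit() allocates owned memory only when the current view is smaller than the tensor, and enlarging is only legal when the data is owned. A minimal standalone sketch of that borrow-or-own pattern in plain C++ (class and method names below are illustrative only, not the Aidge API):

#include <cassert>
#include <cstddef>
#include <memory>

// Illustrative only: a buffer that either borrows external memory or owns its own.
template <class T>
class BorrowOrOwnBuffer {
    T* mData = nullptr;           // view on the data (owned or not)
    std::size_t mCapacity = 0;    // number of elements currently viewed
    std::unique_ptr<T[]> mOwner;  // non-null only when the buffer owns the data

public:
    // Borrow caller-provided memory; any previously owned memory is released.
    void setRawPtr(T* ptr, std::size_t length) {
        mData = ptr;
        mCapacity = length;
        mOwner.reset();
    }

    // Allocate (or re-allocate) owned memory only if the current view is too small.
    T* lazyInit(std::size_t required) {
        if (mCapacity < required) {
            assert((mCapacity == 0 || mOwner) && "cannot enlarge non-owned data");
            mOwner.reset(new T[required]);
            mData = mOwner.get();
            mCapacity = required;
        }
        return mData;
    }
};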
@@ -39,19 +39,7 @@ public:
         return std::make_unique<AddImpl_cpu>(op);
     }

-public:
-    NbElts_t getNbRequiredData(const IOIndex_t inputIdx) const override final;
     NbElts_t getNbRequiredProtected(const IOIndex_t /*inputIdx*/) const override final;
-    NbElts_t getRequiredMemory(const IOIndex_t outputIdx, const std::vector<DimSize_t>& /*inputsSize*/) const override final;
-    NbElts_t getNbConsumedData(const IOIndex_t inputIdx) const override final;
-    NbElts_t getNbProducedData(const IOIndex_t outputIdx) const override final;
-    void updateConsummerProducer() override final;

     void forward() override;
 };
......
@@ -27,11 +27,12 @@ void AddImpl_cpu_forward_kernel(const std::size_t inputLength, const std::vector
     }

     O* output = static_cast<O*>(output_);
-    for (std::size_t iIndex = 0; iIndex < inputs.size(); ++iIndex) {
-        for (std::size_t oIndex = 0; oIndex < inputLength; ++oIndex) {
-            output[oIndex] += inputs[iIndex][oIndex];
-        }
-    }
+    for (std::size_t oIndex = 0; oIndex < inputLength; ++oIndex) {
+        output[oIndex] = 0;
+        for (std::size_t iIndex = 0; iIndex < inputs.size(); ++iIndex) {
+            output[oIndex] += inputs[iIndex][oIndex];
+        }
+    }
 }

 namespace {
......
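The reworked Add kernel above swaps the loop order so that each output element is reset to zero before being accumulated over all inputs; the kernel therefore no longer relies on the caller pre-clearing the output buffer. A small self-contained check of that behaviour (hypothetical driver, not part of this commit):

#include <cassert>
#include <cstddef>
#include <vector>

// Same accumulation pattern as the kernel above: zero, then sum over all inputs.
template <class T>
void addAll(std::size_t length, const std::vector<const T*>& inputs, T* output) {
    for (std::size_t o = 0; o < length; ++o) {
        output[o] = T(0);                   // safe even if output holds garbage
        for (std::size_t i = 0; i < inputs.size(); ++i) {
            output[o] += inputs[i][o];
        }
    }
}

int main() {
    const float a[3] = {1.f, 2.f, 3.f};
    const float b[3] = {10.f, 20.f, 30.f};
    float out[3] = {99.f, 99.f, 99.f};      // deliberately not zero-initialized
    addAll<float>(3, {a, b}, out);
    assert(out[2] == 33.f);                 // accumulation starts from 0, not 99
    return 0;
}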
@@ -61,6 +61,7 @@ void AvgPoolingImpl2D_cpu_forward_kernel(const AvgPooling_Op<2>::Attrs &attrs,
         for (std::size_t ch = 0; ch < dims[1]; ++ch) {
             const std::size_t oIndex = (ch + batch*dims[1]) * oxSize * oySize;
             const std::size_t iIndex = (ch + batch*dims[1]) * dims[2] * dims[3];
+            std::fill(output + oIndex, output+(oIndex+oxSize*oySize), 0);
             for (std::size_t ox = 0; ox < oxSize; ++ox) {
                 const signedsize difx = static_cast<signedsize>(- ox * std::get<0>(attrs)[0]);
                 const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0)));
......
@@ -14,6 +14,7 @@
 #include "aidge/utils/Registrar.hpp"
+#include "aidge/data/half.hpp"

 #include "aidge/backend/cpu/operator/ConvImpl.hpp"
 #include "aidge/utils/Types.h"
 #include "aidge/backend/cpu/data/GetCPUPtr.h"
@@ -151,6 +152,9 @@ namespace {
 static Registrar<ConvImpl2DForward_cpu> registrarConvImpl2DForward_cpu_Float32(
         {DataType::Float32, DataType::Float32, DataType::Float32, DataType::Float32},
         Aidge::ConvImpl2D_cpu_forward_kernel<float, float, float, float>);
+static Registrar<ConvImpl2DForward_cpu> registrarConvImpl2DForward_cpu_Float16(
+        {DataType::Float16, DataType::Float16, DataType::Float16, DataType::Float16},
+        Aidge::ConvImpl2D_cpu_forward_kernel<half_float::half, half_float::half, half_float::half, half_float::half>);
 static Registrar<ConvImpl2DForward_cpu> registrarConvImpl2DForward_cpu_Int32(
         {DataType::Int32, DataType::Int32, DataType::Int32, DataType::Int32},
         Aidge::ConvImpl2D_cpu_forward_kernel<int, int, int, int>);
......
@@ -21,46 +21,11 @@
 #include "aidge/backend/cpu/operator/AddImpl.hpp"
 #include "aidge/backend/cpu/operator/AddImpl_forward_kernels.hpp"

-Aidge::NbElts_t Aidge::AddImpl_cpu::getNbRequiredData(const Aidge::IOIndex_t inputIdx) const {
-    assert(mOp.getRawInput(inputIdx) && "requires valid input");
-
-    // Requires the whole tensors
-    const auto& inputDims = std::static_pointer_cast<Tensor>(mOp.getRawInput(inputIdx))->dims();
-    return std::accumulate(inputDims.begin(), inputDims.end(), NbElts_t(1), std::multiplies<NbElts_t>());
-}
-
 Aidge::NbElts_t Aidge::AddImpl_cpu::getNbRequiredProtected(const Aidge::IOIndex_t /*inputIdx*/) const {
-    // for the direct convolution algorithm, convolutions can be in-place, if there is no padding!
+    // this implementation can be in-place
     return 0;
 }

-Aidge::NbElts_t Aidge::AddImpl_cpu::getRequiredMemory(const Aidge::IOIndex_t outputIdx, const std::vector<Aidge::DimSize_t>& /*inputsSize*/) const {
-    // Requires the whole tensors, regardless of available data on inputs
-    assert(outputIdx == 0 && "operator has only one output");
-    (void) outputIdx;
-
-    const auto& outputDims = std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims();
-    return std::accumulate(outputDims.begin(), outputDims.end(), NbElts_t(1), std::multiplies<NbElts_t>());
-}
-
-Aidge::NbElts_t Aidge::AddImpl_cpu::getNbConsumedData(const Aidge::IOIndex_t inputIdx) const {
-    assert(inputIdx < mNbConsumedData.size());
-    return mNbConsumedData[inputIdx];
-}
-
-Aidge::NbElts_t Aidge::AddImpl_cpu::getNbProducedData(const Aidge::IOIndex_t outputIdx) const {
-    assert(outputIdx < mNbProducedData.size());
-    return mNbProducedData[outputIdx];
-}
-
-void Aidge::AddImpl_cpu::updateConsummerProducer() {
-    for (IOIndex_t inputIdx = 0; static_cast<NbElts_t>(inputIdx) < mNbConsumedData.size(); ++inputIdx)
-        mNbConsumedData[inputIdx]+= getNbRequiredData(inputIdx); // each input is consumed by the minimum amount for a forward pass
-
-    mNbProducedData[0]+= getRequiredMemory(0, {});
-}
-
 void Aidge::AddImpl_cpu::forward() {
     assert(mOp.getRawInput(0) && "missing input in Add operator");
     DataType datatypeFirstInput = std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dataType();
@@ -69,16 +34,36 @@ void Aidge::AddImpl_cpu::forward() {
         assert(std::static_pointer_cast<Tensor>(mOp.getRawInput(i))->dataType() == datatypeFirstInput);
     }

-    auto kernelFunc = Registrar<AddImplForward_cpu>::create({
+    // Find the correct kernel type
+    const auto outputDataType = std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType();
+    const Registrar<AddImplForward_cpu>::registrar_key registrarKey = {
         datatypeFirstInput,
-        std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()});
+        outputDataType};
+
+    Registrar<AddImplForward_cpu>::registrar_type kernelFunc;
+    if (Registrar<AddImplForward_cpu>::exists(registrarKey)) {
+        // One exists with the right inputs/output types
+        kernelFunc = Registrar<AddImplForward_cpu>::create(registrarKey);
+    }
+    else {
+        // Otherwise, fallback to the kernel with all types matching output type
+        kernelFunc = Registrar<AddImplForward_cpu>::create({
+            outputDataType, outputDataType});
+    }
+
+    // Convert input data (no overhead if not needed!)
+    // TODO: right now, if needed, memory will be allocated/deallocated at each
+    // call to forward(). We might put the following shared_ptr as members of
+    // this class to avoid that.
     std::vector<const void*> opInputs;
+    std::vector<std::shared_ptr<Tensor>> inputsFallback(mOp.nbInputs());
     for (IOIndex_t i = 0; i < mOp.nbInputs(); ++i) {
-        opInputs.push_back(getCPUPtr(mOp.getRawInput(i)));
+        const auto& input = std::static_pointer_cast<Tensor>(mOp.getRawInput(i))->refCastFrom(inputsFallback[i], *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)));
+        opInputs.push_back(input.getImpl()->rawPtr());
     }

+    // Call kernel
     kernelFunc(std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->size(),
         opInputs,
         getCPUPtr(mOp.getRawOutput(0)));
 }
\ No newline at end of file
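The new forward() above first looks up a kernel registered for the exact (input type, output type) key and, if none exists, falls back to the kernel whose types all match the output, converting the inputs beforehand with refCastFrom(). A minimal sketch of that lookup-with-fallback idea, using a plain std::map in place of the Aidge Registrar (hypothetical helper, for illustration only):

#include <functional>
#include <map>
#include <stdexcept>
#include <utility>

enum class DataType { Float32, Float64, Int32 };
using Key = std::pair<DataType, DataType>;   // {input type, output type}
using Kernel = std::function<void()>;        // stand-in for a forward kernel

Kernel selectKernel(const std::map<Key, Kernel>& registry, DataType in, DataType out) {
    // Preferred: a kernel matching the exact input/output types.
    if (auto it = registry.find({in, out}); it != registry.end()) {
        return it->second;
    }
    // Fallback: a kernel where every type matches the output type;
    // the inputs are expected to be cast to `out` before the call.
    if (auto it = registry.find({out, out}); it != registry.end()) {
        return it->second;
    }
    throw std::runtime_error("no kernel registered for this type combination");
}

The same selection pattern is repeated below for the Conv and FC implementations.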
@@ -34,14 +34,35 @@ void Aidge::ConvImpl2D_cpu::forward() {
     assert(mOp.getRawInput(2) && "missing input #2");

     // Find the correct kernel type
-    auto kernelFunc =
-        Registrar<ConvImpl2DForward_cpu>::create({std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dataType(),
-            std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dataType(),
-            std::static_pointer_cast<Tensor>(mOp.getRawInput(2))->dataType(),
-            std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()});
+    const auto outputDataType = std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType();
+    const Registrar<ConvImpl2DForward_cpu>::registrar_key registrarKey = {
+        std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dataType(),
+        std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dataType(),
+        std::static_pointer_cast<Tensor>(mOp.getRawInput(2))->dataType(),
+        outputDataType};
+
+    Registrar<ConvImpl2DForward_cpu>::registrar_type kernelFunc;
+    if (Registrar<ConvImpl2DForward_cpu>::exists(registrarKey)) {
+        // One exists with the right inputs/output types
+        kernelFunc = Registrar<ConvImpl2DForward_cpu>::create(registrarKey);
+    }
+    else {
+        // Otherwise, fallback to the kernel with all types matching output type
+        kernelFunc = Registrar<ConvImpl2DForward_cpu>::create({
+            outputDataType, outputDataType, outputDataType, outputDataType});
+    }
+
+    // Convert input data (no overhead if not needed!)
+    // TODO: right now, if needed, memory will be allocated/deallocated at each
+    // call to forward(). We might put the following shared_ptr as members of
+    // this class to avoid that.
+    std::shared_ptr<Tensor> input0Fallback, input1Fallback, input2Fallback;
+    const auto& input0 = std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->refCastFrom(input0Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)));
+    const auto& input1 = std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->refCastFrom(input1Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)));
+    const auto& input2 = std::static_pointer_cast<Tensor>(mOp.getRawInput(2))->refCastFrom(input2Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)));

     // Call kernel
     kernelFunc(dynamic_cast<const Conv_Op<2>&>(mOp).getStaticAttributes(), std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->template dims<4>(),
-        getCPUPtr(mOp.getRawInput(0)), getCPUPtr(mOp.getRawInput(1)),
-        getCPUPtr(mOp.getRawInput(2)), getCPUPtr(mOp.getRawOutput(0)));
+        input0.getImpl()->rawPtr(), input1.getImpl()->rawPtr(), input2.getImpl()->rawPtr(),
+        getCPUPtr(mOp.getRawOutput(0)));
 }
@@ -29,29 +29,37 @@ void Aidge::FCImpl_cpu::forward()
     assert(std::static_pointer_cast<Tensor>(mOp.getRawInput(2)) && "missing input #2");

     // Find the correct kernel type
-    auto kernelFunc = Registrar<FCImplForward_cpu>::create(
-        {std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dataType(),
-        std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dataType(),
-        std::static_pointer_cast<Tensor>(mOp.getRawInput(2))->dataType(),
-        std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()});
+    const auto outputDataType = std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType();
+    const Registrar<FCImplForward_cpu>::registrar_key registrarKey = {
+        std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dataType(),
+        std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dataType(),
+        std::static_pointer_cast<Tensor>(mOp.getRawInput(2))->dataType(),
+        outputDataType};
+
+    Registrar<FCImplForward_cpu>::registrar_type kernelFunc;
+    if (Registrar<FCImplForward_cpu>::exists(registrarKey)) {
+        // One exists with the right inputs/output types
+        kernelFunc = Registrar<FCImplForward_cpu>::create(registrarKey);
+    }
+    else {
+        // Otherwise, fallback to the kernel with all types matching output type
+        kernelFunc = Registrar<FCImplForward_cpu>::create({
+            outputDataType, outputDataType, outputDataType, outputDataType});
+    }
+
+    // Convert input data (no overhead if not needed!)
+    // TODO: right now, if needed, memory will be allocated/deallocated at each
+    // call to forward(). We might put the following shared_ptr as members of
+    // this class to avoid that.
+    std::shared_ptr<Tensor> input0Fallback, input1Fallback, input2Fallback;
+    const auto& input0 = std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->refCastFrom(input0Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)));
+    const auto& input1 = std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->refCastFrom(input1Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)));
+    const auto& input2 = std::static_pointer_cast<Tensor>(mOp.getRawInput(2))->refCastFrom(input2Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)));

     // Call kernel
-    // if (std::static_pointer_cast<Tensor>(mOp.getRawInput(0)->nbDims() == 4) {
-    //     kernelFunc(
-    //         mOp.getStaticAttributes(),
-    //         std::static_pointer_cast<Tensor>(std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->template dims<4>(),
-    //         getCPUPtr(mOp.getRawInput(0),
-    //         mOp.mInputs[1]->getImpl()->rawPtr(),
-    //         mOp.mInputs[2]->getImpl()->rawPtr(),
-    //         mOp.getOutput(0)->getImpl()->rawPtr());
-    // }
-    // else
-    kernelFunc(
-        dynamic_cast<const FC_Op&>(mOp).getStaticAttributes(),
-        std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims()[0],
-        std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->sizeM1(),
-        getCPUPtr(mOp.getRawInput(0)),
-        getCPUPtr(mOp.getRawInput(1)),
-        getCPUPtr(mOp.getRawInput(2)),
+    kernelFunc(dynamic_cast<const FC_Op&>(mOp).getStaticAttributes(),
+        input0.dims()[0],
+        input0.size() / input0.dims()[0],
+        input0.getImpl()->rawPtr(), input1.getImpl()->rawPtr(), input2.getImpl()->rawPtr(),
         getCPUPtr(mOp.getRawOutput(0)));
 }
@@ -47,7 +47,7 @@ void Aidge::MatMulImpl_cpu::forward()
     kernelFunc(
         dynamic_cast<const MatMul_Op&>(mOp).getStaticAttributes(),
         std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims()[0],
-        std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->sizeM1(),
+        std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->size() / std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims()[0],
         getCPUPtr(mOp.getRawInput(0)),
         getCPUPtr(mOp.getRawInput(1)),
         getCPUPtr(mOp.getRawOutput(0)));
......
@@ -38,7 +38,7 @@ void Aidge::SoftmaxImpl_cpu::forward() {
     DimSize_t batchSize = std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims()[0];
     DimSize_t channelSize = std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims()[1];
-    DimSize_t featureSize = std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->sizeM1()/channelSize;
+    DimSize_t featureSize = (std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->size()/batchSize)/channelSize;

     // Call kernel
     kernelFunc(batchSize,
         channelSize,
......
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <catch2/catch_test_macros.hpp>
#include "aidge/recipies/Recipies.hpp"
#include "aidge/operator/Conv.hpp"
#include "aidge/operator/Producer.hpp"
#include "aidge/graph/OpArgs.hpp"
#include <cstddef>
using namespace Aidge;
TEST_CASE("[ExplicitCastMove] conv") {
auto conv1 = Conv(3, 32, {3, 3}, "conv1");
auto conv2 = Conv(32, 64, {3, 3}, "conv2");
auto conv3 = Conv(64, 10, {1, 1}, "conv3", {2, 2});
auto g1 = Sequential({
Producer({16, 3, 224, 224}, "dataProvider"),
conv1,
conv2,
conv3
});
g1->setBackend("cpu");
conv1->getOperator()->setDataType(DataType::Int32);
conv3->getOperator()->setDataType(DataType::Float64);
g1->save("explicitCastMove_before");
REQUIRE(g1->getNodes().size() == 10);
g1->forwardDims();
explicitCastMove(g1);
g1->save("explicitCastMove_after");
REQUIRE(g1->getNodes().size() == 13);
}
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <catch2/catch_test_macros.hpp>
#include <memory>
#include <string>
#include "aidge/data/Tensor.hpp"
#include "aidge/utils/TensorUtils.hpp"
#include "aidge/graph/Node.hpp"
#include "aidge/graph/GraphView.hpp"
#include "aidge/graph/OpArgs.hpp"
#include "aidge/scheduler/Scheduler.hpp"
#include "aidge/recipies/Recipies.hpp"
#include "aidge/backend/cpu.hpp"
using namespace Aidge;
TEST_CASE("[cpu/castmove] CastMove(forward)") {
std::shared_ptr<Tensor> inputTensor =
std::make_shared<Tensor>(Array4D<int, 2, 1, 5, 5>{{{{{0, 1, 2, 3, 4},
{5, 6, 7, 8, 9},
{10, 11, 12, 13, 14},
{15, 16, 17, 18, 19},
{20, 21, 22, 23, 24}}},
{{{25, 26, 27, 28, 29},
{30, 31, 32, 33, 34},
{35, 36, 37, 38, 39},
{40, 41, 42, 43, 44},
{45, 46, 47, 48, 49}}}}});
std::shared_ptr<Tensor> weight1 = std::make_shared<Tensor>(
Array4D<int, 3, 1, 3, 3>{{{{{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}},
{{{10, 11, 12}, {13, 14, 15}, {16, 17, 18}}},
{{{19, 20, 21}, {22, 23, 24}, {25, 26, 27}}}}});
std::shared_ptr<Tensor> bias1 = std::make_shared<Tensor>(Array1D<int, 3>{{1, 2, 3}});
SECTION("Test implicit") {
std::shared_ptr<GraphView> g =
Sequential({
Conv(1, 3, {3, 3}, "conv1"),
Conv(3, 4, {1, 1}, "conv2"),
Conv(4, 3, {1, 1}, "conv3"),
FC(27, 5, false, "fc")});
g->getNode("conv1")->getOperator()->setInput(0, inputTensor);
g->getNode("conv1")->getOperator()->setInput(1, weight1);
g->getNode("conv1")->getOperator()->setInput(2, bias1);
std::shared_ptr<Tensor> weight2 =
std::make_shared<Tensor>(Array4D<int, 4, 3, 1, 1>{{{{{1}}, {{2}}, {{3}}},
{{{4}}, {{5}}, {{6}}},
{{{7}}, {{8}}, {{9}}},
{{{10}}, {{11}}, {{12}}}}});
std::shared_ptr<Tensor> bias2 = std::make_shared<Tensor>(Array1D<int, 4>{{1, 2, 3, 4}});
g->getNode("conv2")->getOperator()->setInput(1, weight2);
g->getNode("conv2")->getOperator()->setInput(2, bias2);
// *(g->getNode("conv2")->getOperator()->input(1, weight2);
std::shared_ptr<Tensor> weight3 = std::make_shared<Tensor>(
Array4D<int, 3, 4, 1, 1>{{{{{1}}, {{2}}, {{3}}, {{4}}},
{{{5}}, {{6}}, {{7}}, {{8}}},
{{{9}}, {{10}}, {{11}}, {{12}}}}});
std::shared_ptr<Tensor> bias3 = std::make_shared<Tensor>(Array1D<int, 3>{{1, 2, 3}});
g->getNode("conv3")->getOperator()->setInput(1, weight3);
g->getNode("conv3")->getOperator()->setInput(2, bias3);
std::shared_ptr<Tensor> weightfc = std::make_shared<Tensor>(
Array2D<int, 5, 27>{{{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12},
{13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9},
{10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6},
{7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5,
6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3},
{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2,
3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}}});
std::shared_ptr<Tensor> biasfc = std::make_shared<Tensor>(Array1D<int, 5>{{1, 2, 3, 4, 5}});
g->getNode("fc")->getOperator()->setInput(1, weightfc);
g->getNode("fc")->getOperator()->setInput(2, biasfc);
// input->addChild(g);
g->setDataType(Aidge::DataType::Int32);
g->getNode("conv1")->getOperator()->setDataType(DataType::Float32);
g->getNode("conv3")->getOperator()->setDataType(DataType::Float64);
g->setBackend("cpu");
g->forwardDims();
SequentialScheduler scheduler(g);
REQUIRE_NOTHROW(scheduler.forward());
scheduler.saveSchedulingDiagram("schedulingSequential");
std::shared_ptr<Tensor> expectedOutput1 = std::make_shared<Tensor>(Array4D<int, 2, 3, 3, 3>{
{{{{367, 412, 457}, {592, 637, 682}, {817, 862, 907}},
{{854, 980, 1106}, {1484, 1610, 1736}, {2114, 2240, 2366}},
{{1341, 1548, 1755}, {2376, 2583, 2790}, {3411, 3618, 3825}}},
{{{1492, 1537, 1582}, {1717, 1762, 1807}, {1942, 1987, 2032}},
{{4004, 4130, 4256}, {4634, 4760, 4886}, {5264, 5390, 5516}},
{{6516, 6723, 6930}, {7551, 7758, 7965}, {8586, 8793, 9000}}}}});
std::shared_ptr<Tensor> expectedOutput2 = std::make_shared<Tensor>(Array4D<int, 2, 4, 3, 3>{
{{{{6099, 7017, 7935}, {10689, 11607, 12525}, {15279, 16197, 17115}},
{{13786, 15838, 17890}, {24046, 26098, 28150}, {34306, 36358, 38410}},
{{21473, 24659, 27845}, {37403, 40589, 43775}, {53333, 56519, 59705}},
{{29160, 33480, 37800}, {50760, 55080, 59400}, {72360, 76680, 81000}}},
{{{29049, 29967, 30885}, {33639, 34557, 35475}, {38229, 39147, 40065}},
{{65086, 67138, 69190}, {75346, 77398, 79450}, {85606, 87658, 89710}},
{{101123, 104309, 107495}, {117053, 120239, 123425}, {132983, 136169, 139355}},
{{137160, 141480, 145800}, {158760, 163080, 167400}, {180360, 184680, 189000}}}}});
std::shared_ptr<Tensor> expectedOutput3 = std::make_shared<Tensor>(Array4D<int, 2, 3, 3, 3>{
{{{{214731, 246591, 278451}, {374031, 405891, 437751}, {533331, 565191, 597051}},
{{496804, 570568, 644332}, {865624, 939388, 1013152}, {1234444, 1308208, 1381972}},
{{778877, 894545, 1010213}, {1357217, 1472885, 1588553}, {1935557, 2051225, 2166893}}},
{{{1011231, 1043091, 1074951}, {1170531, 1202391, 1234251}, {1329831, 1361691, 1393551}},
{{2340904, 2414668, 2488432}, {2709724, 2783488, 2857252}, {3078544, 3152308, 3226072}},
{{3670577, 3786245, 3901913}, {4248917, 4364585, 4480253}, {4827257, 4942925, 5058593}}}}});
Tensor expectedOutput4 = Array2D<int, 2, 5>{
{{205050376, 198925904, 181355097, 196978090, 238868348},
{598467376, 561797804, 560823897, 593043790, 698672948}}};
std::shared_ptr<Tensor> other1 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv1")->getOperator())->getOutput(0);
REQUIRE(approxEq<float, int>(*other1, *expectedOutput1, 0.0, 1.0e-12));
std::shared_ptr<Tensor> other2 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv2")->getOperator())->getOutput(0);
REQUIRE(approxEq<int>(*other2, *expectedOutput2, 0.0, 1.0e-12));
std::shared_ptr<Tensor> other3 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv3")->getOperator())->getOutput(0);
REQUIRE(approxEq<double, int>(*other3, *expectedOutput3, 0.0, 1.0e-12));
std::shared_ptr<Tensor> other4 = std::static_pointer_cast<OperatorTensor>(g->getNode("fc")->getOperator())->getOutput(0);
REQUIRE(approxEq<int>(*other4, expectedOutput4, 0.0, 1.0e-12));
}
SECTION("Half") {
Tensor refTensor = Array2D<float, 3, 2>{{{0.0, 1.0},{2.1, 3.4},{5000.0, 1.0e5}}};
Tensor tensor(DataType::Float16);
tensor.copyCastFrom(refTensor);
REQUIRE(approxEq<float, half_float::half>(refTensor, tensor, 1.0e-3, 0.0));
}
SECTION("Test explicit") {
std::shared_ptr<GraphView> g =
Sequential({
Conv(1, 3, {3, 3}, "conv1"),
Conv(3, 4, {1, 1}, "conv2"),
Conv(4, 3, {1, 1}, "conv3"),
FC(27, 5, false, "fc")});
g->getNode("conv1")->getOperator()->setInput(0, inputTensor);
g->getNode("conv1")->getOperator()->setInput(1, weight1);
g->getNode("conv1")->getOperator()->setInput(2, bias1);
std::shared_ptr<Tensor> weight2 =
std::make_shared<Tensor>(Array4D<int, 4, 3, 1, 1>{{{{{1}}, {{2}}, {{3}}},
{{{4}}, {{5}}, {{6}}},
{{{7}}, {{8}}, {{9}}},
{{{10}}, {{11}}, {{12}}}}});
std::shared_ptr<Tensor> bias2 = std::make_shared<Tensor>(Array1D<int, 4>{{1, 2, 3, 4}});
g->getNode("conv2")->getOperator()->setInput(1, weight2);
g->getNode("conv2")->getOperator()->setInput(2, bias2);
// *(g->getNode("conv2")->getOperator()->input(1, weight2);
std::shared_ptr<Tensor> weight3 = std::make_shared<Tensor>(
Array4D<int, 3, 4, 1, 1>{{{{{1}}, {{2}}, {{3}}, {{4}}},
{{{5}}, {{6}}, {{7}}, {{8}}},
{{{9}}, {{10}}, {{11}}, {{12}}}}});
std::shared_ptr<Tensor> bias3 = std::make_shared<Tensor>(Array1D<int, 3>{{1, 2, 3}});
g->getNode("conv3")->getOperator()->setInput(1, weight3);
g->getNode("conv3")->getOperator()->setInput(2, bias3);
std::shared_ptr<Tensor> weightfc = std::make_shared<Tensor>(
Array2D<int, 5, 27>{{{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12},
{13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9},
{10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6},
{7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5,
6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3},
{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2,
3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}}});
std::shared_ptr<Tensor> biasfc = std::make_shared<Tensor>(Array1D<int, 5>{{1, 2, 3, 4, 5}});
g->getNode("fc")->getOperator()->setInput(1, weightfc);
g->getNode("fc")->getOperator()->setInput(2, biasfc);
// input->addChild(g);
g->setDataType(Aidge::DataType::Int32);
g->getNode("conv1")->getOperator()->setDataType(DataType::Float32);
g->getNode("conv3")->getOperator()->setDataType(DataType::Float64);
explicitCastMove(g);
g->setBackend("cpu");
g->forwardDims();
SequentialScheduler scheduler(g);
REQUIRE_NOTHROW(scheduler.forward());
scheduler.saveSchedulingDiagram("schedulingSequential");
std::shared_ptr<Tensor> expectedOutput1 = std::make_shared<Tensor>(Array4D<int, 2, 3, 3, 3>{
{{{{367, 412, 457}, {592, 637, 682}, {817, 862, 907}},
{{854, 980, 1106}, {1484, 1610, 1736}, {2114, 2240, 2366}},
{{1341, 1548, 1755}, {2376, 2583, 2790}, {3411, 3618, 3825}}},
{{{1492, 1537, 1582}, {1717, 1762, 1807}, {1942, 1987, 2032}},
{{4004, 4130, 4256}, {4634, 4760, 4886}, {5264, 5390, 5516}},
{{6516, 6723, 6930}, {7551, 7758, 7965}, {8586, 8793, 9000}}}}});
std::shared_ptr<Tensor> expectedOutput2 = std::make_shared<Tensor>(Array4D<int, 2, 4, 3, 3>{
{{{{6099, 7017, 7935}, {10689, 11607, 12525}, {15279, 16197, 17115}},
{{13786, 15838, 17890}, {24046, 26098, 28150}, {34306, 36358, 38410}},
{{21473, 24659, 27845}, {37403, 40589, 43775}, {53333, 56519, 59705}},
{{29160, 33480, 37800}, {50760, 55080, 59400}, {72360, 76680, 81000}}},
{{{29049, 29967, 30885}, {33639, 34557, 35475}, {38229, 39147, 40065}},
{{65086, 67138, 69190}, {75346, 77398, 79450}, {85606, 87658, 89710}},
{{101123, 104309, 107495}, {117053, 120239, 123425}, {132983, 136169, 139355}},
{{137160, 141480, 145800}, {158760, 163080, 167400}, {180360, 184680, 189000}}}}});
std::shared_ptr<Tensor> expectedOutput3 = std::make_shared<Tensor>(Array4D<int, 2, 3, 3, 3>{
{{{{214731, 246591, 278451}, {374031, 405891, 437751}, {533331, 565191, 597051}},
{{496804, 570568, 644332}, {865624, 939388, 1013152}, {1234444, 1308208, 1381972}},
{{778877, 894545, 1010213}, {1357217, 1472885, 1588553}, {1935557, 2051225, 2166893}}},
{{{1011231, 1043091, 1074951}, {1170531, 1202391, 1234251}, {1329831, 1361691, 1393551}},
{{2340904, 2414668, 2488432}, {2709724, 2783488, 2857252}, {3078544, 3152308, 3226072}},
{{3670577, 3786245, 3901913}, {4248917, 4364585, 4480253}, {4827257, 4942925, 5058593}}}}});
Tensor expectedOutput4 = Array2D<int, 2, 5>{
{{205050376, 198925904, 181355097, 196978090, 238868348},
{598467376, 561797804, 560823897, 593043790, 698672948}}};
std::shared_ptr<Tensor> other1 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv1")->getOperator())->getOutput(0);
REQUIRE(approxEq<float, int>(*other1, *expectedOutput1, 0.0, 1.0e-12));
std::shared_ptr<Tensor> other2 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv2")->getOperator())->getOutput(0);
REQUIRE(approxEq<int>(*other2, *expectedOutput2, 0.0, 1.0e-12));
std::shared_ptr<Tensor> other3 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv3")->getOperator())->getOutput(0);
REQUIRE(approxEq<double, int>(*other3, *expectedOutput3, 0.0, 1.0e-12));
std::shared_ptr<Tensor> other4 = std::static_pointer_cast<OperatorTensor>(g->getNode("fc")->getOperator())->getOutput(0);
REQUIRE(approxEq<int>(*other4, expectedOutput4, 0.0, 1.0e-12));
}
}