diff --git a/aidge_backend_cpu/unit_tests/test_recipies.py b/aidge_backend_cpu/unit_tests/test_recipies.py index 841c15590e9dac7596958b8392c99948978723c5..e343fad1aeda82555a57778a394a4590b1e8772e 100644 --- a/aidge_backend_cpu/unit_tests/test_recipies.py +++ b/aidge_backend_cpu/unit_tests/test_recipies.py @@ -31,7 +31,7 @@ class test_recipies(unittest.TestCase): input_node = aidge_core.Producer(input_tensor, "X") conv = aidge_core.Conv2D(1, 1, [3, 3], name="Conv0") - bn = aidge_core.BatchNorm2D(name="Add0") + bn = aidge_core.BatchNorm2D(1, name="Add0") graph_view = aidge_core.sequential([conv, bn]) diff --git a/include/aidge/backend/cpu/data/TensorImpl.hpp b/include/aidge/backend/cpu/data/TensorImpl.hpp index 590c83ed0cbf3b302e6b45c34ffe52e5f85963d1..39bfdd0a54bbaaab43c09ecc338f09723c30c1b6 100644 --- a/include/aidge/backend/cpu/data/TensorImpl.hpp +++ b/include/aidge/backend/cpu/data/TensorImpl.hpp @@ -3,8 +3,11 @@ #include "aidge/backend/TensorImpl.hpp" #include "aidge/data/Tensor.hpp" +#include "aidge/data/half.hpp" #include "aidge/utils/Registrar.hpp" #include "aidge/utils/Types.h" +#include "aidge/utils/ErrorHandling.hpp" +#include "aidge/utils/future_std/span.hpp" namespace Aidge { template <class T> @@ -12,7 +15,10 @@ class TensorImpl_cpu : public TensorImpl { private: const Tensor &mTensor; // Impl needs to access Tensor information, but is not // supposed to change it! - std::vector<T> mData; + /// Pointer to the data and its capacity + future_std::span<T> mData; + /// If this instance own the data, std::unique_ptr manages it + std::unique_ptr<T[]> mDataOwner; public: static constexpr const char *Backend = "cpu"; @@ -20,9 +26,12 @@ class TensorImpl_cpu : public TensorImpl { TensorImpl_cpu(const Tensor &tensor) : TensorImpl(Backend), mTensor(tensor) {} bool operator==(const TensorImpl &otherImpl) const override final { + const auto& typedOtherImpl = reinterpret_cast<const TensorImpl_cpu<T> &>(otherImpl); + AIDGE_INTERNAL_ASSERT(typedOtherImpl.data().size() >= mTensor.size()); + std::size_t i = 0; for (; i < mTensor.size() && - mData[i] == reinterpret_cast<const TensorImpl_cpu<T> &>(otherImpl).data()[i]; + mData[i] == typedOtherImpl.data()[i]; ++i) { } return i == mTensor.size(); @@ -33,36 +42,129 @@ class TensorImpl_cpu : public TensorImpl { } // native interface - const std::vector<T> &data() const { return mData; } + const future_std::span<T>& data() const { return mData; } + std::size_t size() const override { return mData.size(); } std::size_t scalarSize() const override { return sizeof(T); } - void copy(const void *src, NbElts_t length, std::size_t offset = 0) override { + void setDevice(DeviceIdx_t device) override { + AIDGE_ASSERT(device == 0, "device cannot be != 0 for CPU backend"); + } + + void copy(const void *src, NbElts_t length, NbElts_t offset = 0) override { + AIDGE_ASSERT(length <= mData.size() || length <= mTensor.size(), "copy length is above capacity"); + std::copy(static_cast<const T *>(src), static_cast<const T *>(src) + length, + static_cast<T *>(rawPtr()) + offset); + } + + void copyCast(const void *src, NbElts_t length, const DataType srcDt) override { + if (length == 0) { + return; + } + + AIDGE_ASSERT(length <= mData.size() || length <= mTensor.size(), "copy length is above capacity"); + if (srcDt == DataType::Float64) { + std::copy(static_cast<const double*>(src), static_cast<const double*>(src) + length, + static_cast<T *>(rawPtr())); + } + else if (srcDt == DataType::Float32) { + std::copy(static_cast<const float*>(src), static_cast<const float*>(src) + length, + static_cast<T *>(rawPtr())); + } + else if (srcDt == DataType::Float16) { + std::copy(static_cast<const half_float::half*>(src), static_cast<const half_float::half*>(src) + length, + static_cast<T *>(rawPtr())); + } + else if (srcDt == DataType::Int64) { + std::copy(static_cast<const int64_t*>(src), static_cast<const int64_t*>(src) + length, + static_cast<T *>(rawPtr())); + } + else if (srcDt == DataType::UInt64) { + std::copy(static_cast<const uint64_t*>(src), static_cast<const uint64_t*>(src) + length, + static_cast<T *>(rawPtr())); + } + else if (srcDt == DataType::Int32) { + std::copy(static_cast<const int32_t*>(src), static_cast<const int32_t*>(src) + length, + static_cast<T *>(rawPtr())); + } + else if (srcDt == DataType::UInt32) { + std::copy(static_cast<const uint32_t*>(src), static_cast<const uint32_t*>(src) + length, + static_cast<T *>(rawPtr())); + } + else if (srcDt == DataType::Int16) { + std::copy(static_cast<const int16_t*>(src), static_cast<const int16_t*>(src) + length, + static_cast<T *>(rawPtr())); + } + else if (srcDt == DataType::UInt16) { + std::copy(static_cast<const uint16_t*>(src), static_cast<const uint16_t*>(src) + length, + static_cast<T *>(rawPtr())); + } + else if (srcDt == DataType::Int8) { + std::copy(static_cast<const int8_t*>(src), static_cast<const int8_t*>(src) + length, + static_cast<T *>(rawPtr())); + } + else if (srcDt == DataType::UInt8) { + std::copy(static_cast<const uint8_t*>(src), static_cast<const uint8_t*>(src) + length, + static_cast<T *>(rawPtr())); + } + else { + AIDGE_THROW_OR_ABORT(std::runtime_error, "Unsupported data type."); + } + } + + void copyFromDevice(const void *src, NbElts_t length, const std::pair<std::string, DeviceIdx_t>& device) override { + AIDGE_ASSERT(device.first == Backend, "backend must match"); + AIDGE_ASSERT(device.second == 0, "device cannot be != 0 for CPU backend"); + copy(src, length); + } + + void copyFromHost(const void *src, NbElts_t length) override { + copy(src, length); + } + + void copyToHost(void *dst, NbElts_t length) const override { + AIDGE_ASSERT(length <= mData.size() || length <= mTensor.size(), "copy length is above capacity"); + const T* src = static_cast<const T*>(rawPtr()); std::copy(static_cast<const T *>(src), static_cast<const T *>(src) + length, - static_cast<T *>(rawPtr())+offset); + static_cast<T *>(dst)); } - void *rawPtr() override { - lazyInit(mData); - return mData.data(); + void *rawPtr(NbElts_t offset = 0) override { + lazyInit(); + return (mData.data() + offset); }; - void* getRaw(std::size_t idx){ - return static_cast<void*>(static_cast<T *>(rawPtr()) + idx); - }; + const void *rawPtr(NbElts_t offset = 0) const override { + AIDGE_ASSERT(mData.size() >= mTensor.size(), "accessing uninitialized const rawPtr"); + return (mData.data() + offset); + }; - virtual ~TensorImpl_cpu() = default; + void *hostPtr(NbElts_t offset = 0) override { + lazyInit(); + return (mData.data() + offset); + }; - void setRawPtr(void *ptr) override final { - T *newPtr = static_cast<T *>(ptr); - mData = std::vector<T>(newPtr, newPtr + mTensor.size()); + const void *hostPtr(NbElts_t offset = 0) const override { + AIDGE_ASSERT(mData.size() >= mTensor.size(), "accessing uninitialized const hostPtr"); + return (mData.data() + offset); }; - private: - void lazyInit(std::vector<T> &data) { - assert(mTensor.dataType() == NativeType<T>::type); + void setRawPtr(void *ptr, NbElts_t length) override final { + AIDGE_ASSERT(length >= mTensor.size(), "trying to set raw pointer of insufficient capacity"); + mData = future_std::span<T>(static_cast<T *>(ptr), length); + mDataOwner.reset(); + }; + + virtual ~TensorImpl_cpu() = default; - if (data.size() != mTensor.size()) data.resize(mTensor.size()); +private: + void lazyInit() { + if (mData.size() < mTensor.size()) { + // Need more data, a re-allocation will occur + AIDGE_ASSERT(mData.empty() || mDataOwner != nullptr, "trying to enlarge non-owned data"); + mDataOwner.reset(new T[mTensor.size()]); + mData = future_std::span<T>(mDataOwner.get(), mTensor.size()); + } } }; namespace { @@ -70,6 +172,8 @@ static Registrar<Tensor> registrarTensorImpl_cpu_Float64( {"cpu", DataType::Float64}, Aidge::TensorImpl_cpu<double>::create); static Registrar<Tensor> registrarTensorImpl_cpu_Float32( {"cpu", DataType::Float32}, Aidge::TensorImpl_cpu<float>::create); +static Registrar<Tensor> registrarTensorImpl_cpu_Float16( + {"cpu", DataType::Float16}, Aidge::TensorImpl_cpu<half_float::half>::create); static Registrar<Tensor> registrarTensorImpl_cpu_Int32( {"cpu", DataType::Int32}, Aidge::TensorImpl_cpu<int>::create); static Registrar<Tensor> registrarTensorImpl_cpu_Int16( diff --git a/include/aidge/backend/cpu/operator/AddImpl.hpp b/include/aidge/backend/cpu/operator/AddImpl.hpp index fa1b837902ee72f22c54afdec0ff897db3b39b76..0299148d086ae6e2be967232e8157c6a6229b0f7 100644 --- a/include/aidge/backend/cpu/operator/AddImpl.hpp +++ b/include/aidge/backend/cpu/operator/AddImpl.hpp @@ -39,19 +39,7 @@ public: return std::make_unique<AddImpl_cpu>(op); } -public: - NbElts_t getNbRequiredData(const IOIndex_t inputIdx) const override final; - NbElts_t getNbRequiredProtected(const IOIndex_t /*inputIdx*/) const override final; - - NbElts_t getRequiredMemory(const IOIndex_t outputIdx, const std::vector<DimSize_t>& /*inputsSize*/) const override final; - - NbElts_t getNbConsumedData(const IOIndex_t inputIdx) const override final; - - NbElts_t getNbProducedData(const IOIndex_t outputIdx) const override final; - - void updateConsummerProducer() override final; - void forward() override; }; diff --git a/include/aidge/backend/cpu/operator/AddImpl_forward_kernels.hpp b/include/aidge/backend/cpu/operator/AddImpl_forward_kernels.hpp index 198bcbacc395edf2709fa229828e2228554e6fd2..631ad44a562c17d41ad019a1da112dbf8a69185c 100644 --- a/include/aidge/backend/cpu/operator/AddImpl_forward_kernels.hpp +++ b/include/aidge/backend/cpu/operator/AddImpl_forward_kernels.hpp @@ -27,11 +27,12 @@ void AddImpl_cpu_forward_kernel(const std::size_t inputLength, const std::vector } O* output = static_cast<O*>(output_); - for (std::size_t iIndex = 0; iIndex < inputs.size(); ++iIndex) { - for (std::size_t oIndex = 0; oIndex < inputLength; ++oIndex) { - output[oIndex] += inputs[iIndex][oIndex]; - } - } + for (std::size_t oIndex = 0; oIndex < inputLength; ++oIndex) { + output[oIndex] = 0; + for (std::size_t iIndex = 0; iIndex < inputs.size(); ++iIndex) { + output[oIndex] += inputs[iIndex][oIndex]; + } + } } namespace { diff --git a/include/aidge/backend/cpu/operator/AvgPoolingImpl_forward_kernels.hpp b/include/aidge/backend/cpu/operator/AvgPoolingImpl_forward_kernels.hpp index 5598cc9cdfd463b6e40e6801b74203b911a318e6..d6950e11e935a3f6d5548148d1c393a5340af224 100644 --- a/include/aidge/backend/cpu/operator/AvgPoolingImpl_forward_kernels.hpp +++ b/include/aidge/backend/cpu/operator/AvgPoolingImpl_forward_kernels.hpp @@ -61,6 +61,7 @@ void AvgPoolingImpl2D_cpu_forward_kernel(const AvgPooling_Op<2>::Attrs &attrs, for (std::size_t ch = 0; ch < dims[1]; ++ch) { const std::size_t oIndex = (ch + batch*dims[1]) * oxSize * oySize; const std::size_t iIndex = (ch + batch*dims[1]) * dims[2] * dims[3]; + std::fill(output + oIndex, output+(oIndex+oxSize*oySize), 0); for (std::size_t ox = 0; ox < oxSize; ++ox) { const signedsize difx = static_cast<signedsize>(- ox * std::get<0>(attrs)[0]); const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0))); diff --git a/include/aidge/backend/cpu/operator/ConvImpl_forward_kernels.hpp b/include/aidge/backend/cpu/operator/ConvImpl_forward_kernels.hpp index cbd784698fcce5152c0bb42a192c327abb2b10dd..83607f280f53e5e477db7d8bbbbd1634dd9c584d 100644 --- a/include/aidge/backend/cpu/operator/ConvImpl_forward_kernels.hpp +++ b/include/aidge/backend/cpu/operator/ConvImpl_forward_kernels.hpp @@ -14,6 +14,7 @@ #include "aidge/utils/Registrar.hpp" +#include "aidge/data/half.hpp" #include "aidge/backend/cpu/operator/ConvImpl.hpp" #include "aidge/utils/Types.h" #include "aidge/backend/cpu/data/GetCPUPtr.h" @@ -151,6 +152,9 @@ namespace { static Registrar<ConvImpl2DForward_cpu> registrarConvImpl2DForward_cpu_Float32( {DataType::Float32, DataType::Float32, DataType::Float32, DataType::Float32}, Aidge::ConvImpl2D_cpu_forward_kernel<float, float, float, float>); +static Registrar<ConvImpl2DForward_cpu> registrarConvImpl2DForward_cpu_Float16( + {DataType::Float16, DataType::Float16, DataType::Float16, DataType::Float16}, + Aidge::ConvImpl2D_cpu_forward_kernel<half_float::half, half_float::half, half_float::half, half_float::half>); static Registrar<ConvImpl2DForward_cpu> registrarConvImpl2DForward_cpu_Int32( {DataType::Int32, DataType::Int32, DataType::Int32, DataType::Int32}, Aidge::ConvImpl2D_cpu_forward_kernel<int, int, int, int>); diff --git a/src/operator/AddImpl.cpp b/src/operator/AddImpl.cpp index 91d4533c4ee2754dce1b9b7ea9ca8c598f530a52..3b53eaf3b88fb418746ab5a7a2297a15606974d3 100644 --- a/src/operator/AddImpl.cpp +++ b/src/operator/AddImpl.cpp @@ -21,46 +21,11 @@ #include "aidge/backend/cpu/operator/AddImpl.hpp" #include "aidge/backend/cpu/operator/AddImpl_forward_kernels.hpp" -Aidge::NbElts_t Aidge::AddImpl_cpu::getNbRequiredData(const Aidge::IOIndex_t inputIdx) const { - assert(mOp.getRawInput(inputIdx) && "requires valid input"); - - // Requires the whole tensors - const auto& inputDims = std::static_pointer_cast<Tensor>(mOp.getRawInput(inputIdx))->dims(); - return std::accumulate(inputDims.begin(), inputDims.end(), NbElts_t(1), std::multiplies<NbElts_t>()); -} - Aidge::NbElts_t Aidge::AddImpl_cpu::getNbRequiredProtected(const Aidge::IOIndex_t /*inputIdx*/) const { - // for the direct convolution algorithm, convolutions can be in-place, if there is no padding! + // this implementation can be in-place return 0; } -Aidge::NbElts_t Aidge::AddImpl_cpu::getRequiredMemory(const Aidge::IOIndex_t outputIdx, const std::vector<Aidge::DimSize_t>& /*inputsSize*/) const { - // Requires the whole tensors, regardless of available data on inputs - assert(outputIdx == 0 && "operator has only one output"); - (void) outputIdx; - - const auto& outputDims = std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(); - return std::accumulate(outputDims.begin(), outputDims.end(), NbElts_t(1), std::multiplies<NbElts_t>()); -} - -Aidge::NbElts_t Aidge::AddImpl_cpu::getNbConsumedData(const Aidge::IOIndex_t inputIdx) const { - assert(inputIdx < mNbConsumedData.size()); - return mNbConsumedData[inputIdx]; -} - -Aidge::NbElts_t Aidge::AddImpl_cpu::getNbProducedData(const Aidge::IOIndex_t outputIdx) const { - assert(outputIdx < mNbProducedData.size()); - return mNbProducedData[outputIdx]; -} - -void Aidge::AddImpl_cpu::updateConsummerProducer() { - for (IOIndex_t inputIdx = 0; static_cast<NbElts_t>(inputIdx) < mNbConsumedData.size(); ++inputIdx) - mNbConsumedData[inputIdx]+= getNbRequiredData(inputIdx); // each input is consumed by the minimum amount for a forward pass - - mNbProducedData[0]+= getRequiredMemory(0, {}); - -} - void Aidge::AddImpl_cpu::forward() { assert(mOp.getRawInput(0) && "missing input in Add operator"); DataType datatypeFirstInput = std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dataType(); @@ -69,16 +34,36 @@ void Aidge::AddImpl_cpu::forward() { assert(std::static_pointer_cast<Tensor>(mOp.getRawInput(i))->dataType() == datatypeFirstInput); } - auto kernelFunc = Registrar<AddImplForward_cpu>::create({ + // Find the correct kernel type + const auto outputDataType = std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType(); + const Registrar<AddImplForward_cpu>::registrar_key registrarKey = { datatypeFirstInput, - std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()}); + outputDataType}; + Registrar<AddImplForward_cpu>::registrar_type kernelFunc; + if (Registrar<AddImplForward_cpu>::exists(registrarKey)) { + // One exists with the right inputs/output types + kernelFunc = Registrar<AddImplForward_cpu>::create(registrarKey); + } + else { + // Otherwise, fallback to the kernel with all types matching output type + kernelFunc = Registrar<AddImplForward_cpu>::create({ + outputDataType, outputDataType}); + } + + // Convert input data (no overhead if not needed!) + // TODO: right now, if needed, memory will be allocated/deallocated at each + // call to forward(). We might put the following shared_ptr as members of + // this class to avoid that. std::vector<const void*> opInputs; + std::vector<std::shared_ptr<Tensor>> inputsFallback(mOp.nbInputs()); for (IOIndex_t i = 0; i < mOp.nbInputs(); ++i) { - opInputs.push_back(getCPUPtr(mOp.getRawInput(i))); + const auto& input = std::static_pointer_cast<Tensor>(mOp.getRawInput(i))->refCastFrom(inputsFallback[i], *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); + opInputs.push_back(input.getImpl()->rawPtr()); } + // Call kernel kernelFunc(std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->size(), opInputs, getCPUPtr(mOp.getRawOutput(0))); -} \ No newline at end of file +} diff --git a/src/operator/ConvImpl.cpp b/src/operator/ConvImpl.cpp index d476f84717c0ed6f7bd45d68bd24b4d7ada6cbbd..b849142dd3abe0131fb0c6c448530a7669ce27dc 100644 --- a/src/operator/ConvImpl.cpp +++ b/src/operator/ConvImpl.cpp @@ -34,14 +34,35 @@ void Aidge::ConvImpl2D_cpu::forward() { assert(mOp.getRawInput(2) && "missing input #2"); // Find the correct kernel type - auto kernelFunc = - Registrar<ConvImpl2DForward_cpu>::create({std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dataType(), - std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dataType(), - std::static_pointer_cast<Tensor>(mOp.getRawInput(2))->dataType(), - std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()}); + const auto outputDataType = std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType(); + const Registrar<ConvImpl2DForward_cpu>::registrar_key registrarKey = { + std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dataType(), + std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dataType(), + std::static_pointer_cast<Tensor>(mOp.getRawInput(2))->dataType(), + outputDataType}; + + Registrar<ConvImpl2DForward_cpu>::registrar_type kernelFunc; + if (Registrar<ConvImpl2DForward_cpu>::exists(registrarKey)) { + // One exists with the right inputs/output types + kernelFunc = Registrar<ConvImpl2DForward_cpu>::create(registrarKey); + } + else { + // Otherwise, fallback to the kernel with all types matching output type + kernelFunc = Registrar<ConvImpl2DForward_cpu>::create({ + outputDataType, outputDataType, outputDataType, outputDataType}); + } + + // Convert input data (no overhead if not needed!) + // TODO: right now, if needed, memory will be allocated/deallocated at each + // call to forward(). We might put the following shared_ptr as members of + // this class to avoid that. + std::shared_ptr<Tensor> input0Fallback, input1Fallback, input2Fallback; + const auto& input0 = std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->refCastFrom(input0Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); + const auto& input1 = std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->refCastFrom(input1Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); + const auto& input2 = std::static_pointer_cast<Tensor>(mOp.getRawInput(2))->refCastFrom(input2Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); // Call kernel kernelFunc(dynamic_cast<const Conv_Op<2>&>(mOp).getStaticAttributes(), std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->template dims<4>(), - getCPUPtr(mOp.getRawInput(0)), getCPUPtr(mOp.getRawInput(1)), - getCPUPtr(mOp.getRawInput(2)), getCPUPtr(mOp.getRawOutput(0))); + input0.getImpl()->rawPtr(), input1.getImpl()->rawPtr(), input2.getImpl()->rawPtr(), + getCPUPtr(mOp.getRawOutput(0))); } diff --git a/src/operator/FCImpl.cpp b/src/operator/FCImpl.cpp index 14f59f6f7baff57602ad71c8c08023038963b5f0..bc4a7a7cab91049c623e9a9e95ee63367da00722 100644 --- a/src/operator/FCImpl.cpp +++ b/src/operator/FCImpl.cpp @@ -29,29 +29,37 @@ void Aidge::FCImpl_cpu::forward() assert(std::static_pointer_cast<Tensor>(mOp.getRawInput(2)) && "missing input #2"); // Find the correct kernel type - auto kernelFunc = Registrar<FCImplForward_cpu>::create( - {std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dataType(), - std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dataType(), - std::static_pointer_cast<Tensor>(mOp.getRawInput(2))->dataType(), - std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()}); + const auto outputDataType = std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType(); + const Registrar<FCImplForward_cpu>::registrar_key registrarKey = { + std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dataType(), + std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dataType(), + std::static_pointer_cast<Tensor>(mOp.getRawInput(2))->dataType(), + outputDataType}; + + Registrar<FCImplForward_cpu>::registrar_type kernelFunc; + if (Registrar<FCImplForward_cpu>::exists(registrarKey)) { + // One exists with the right inputs/output types + kernelFunc = Registrar<FCImplForward_cpu>::create(registrarKey); + } + else { + // Otherwise, fallback to the kernel with all types matching output type + kernelFunc = Registrar<FCImplForward_cpu>::create({ + outputDataType, outputDataType, outputDataType, outputDataType}); + } + + // Convert input data (no overhead if not needed!) + // TODO: right now, if needed, memory will be allocated/deallocated at each + // call to forward(). We might put the following shared_ptr as members of + // this class to avoid that. + std::shared_ptr<Tensor> input0Fallback, input1Fallback, input2Fallback; + const auto& input0 = std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->refCastFrom(input0Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); + const auto& input1 = std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->refCastFrom(input1Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); + const auto& input2 = std::static_pointer_cast<Tensor>(mOp.getRawInput(2))->refCastFrom(input2Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))); // Call kernel - // if (std::static_pointer_cast<Tensor>(mOp.getRawInput(0)->nbDims() == 4) { - // kernelFunc( - // mOp.getStaticAttributes(), - // std::static_pointer_cast<Tensor>(std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->template dims<4>(), - // getCPUPtr(mOp.getRawInput(0), - // mOp.mInputs[1]->getImpl()->rawPtr(), - // mOp.mInputs[2]->getImpl()->rawPtr(), - // mOp.getOutput(0)->getImpl()->rawPtr()); - // } - // else - kernelFunc( - dynamic_cast<const FC_Op&>(mOp).getStaticAttributes(), - std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims()[0], - std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->sizeM1(), - getCPUPtr(mOp.getRawInput(0)), - getCPUPtr(mOp.getRawInput(1)), - getCPUPtr(mOp.getRawInput(2)), + kernelFunc(dynamic_cast<const FC_Op&>(mOp).getStaticAttributes(), + input0.dims()[0], + input0.size() / input0.dims()[0], + input0.getImpl()->rawPtr(), input1.getImpl()->rawPtr(), input2.getImpl()->rawPtr(), getCPUPtr(mOp.getRawOutput(0))); } diff --git a/src/operator/MatMulImpl.cpp b/src/operator/MatMulImpl.cpp index 1abd75db070bbd3b197519318f5bf23c7b46ee5a..f02effb3172e2c0624c6c7532513a2b794ee3a89 100644 --- a/src/operator/MatMulImpl.cpp +++ b/src/operator/MatMulImpl.cpp @@ -47,7 +47,7 @@ void Aidge::MatMulImpl_cpu::forward() kernelFunc( dynamic_cast<const MatMul_Op&>(mOp).getStaticAttributes(), std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims()[0], - std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->sizeM1(), + std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->size() / std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims()[0], getCPUPtr(mOp.getRawInput(0)), getCPUPtr(mOp.getRawInput(1)), getCPUPtr(mOp.getRawOutput(0))); diff --git a/src/operator/SoftmaxImpl.cpp b/src/operator/SoftmaxImpl.cpp index 428d32fc7a4c1a2b639d4f78601c78ab41376b47..c3086d8f9067996b9b0a8546b6deb3e281c777b4 100644 --- a/src/operator/SoftmaxImpl.cpp +++ b/src/operator/SoftmaxImpl.cpp @@ -38,7 +38,7 @@ void Aidge::SoftmaxImpl_cpu::forward() { DimSize_t batchSize = std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims()[0]; DimSize_t channelSize = std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims()[1]; - DimSize_t featureSize = std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->sizeM1()/channelSize; + DimSize_t featureSize = (std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->size()/batchSize)/channelSize; // Call kernel kernelFunc(batchSize, channelSize, diff --git a/unit_tests/data/Test_TensorImpl.cpp b/unit_tests/data/Test_TensorImpl.cpp index ceedd8c10f22c2afb0331eccafa11c748628fd7d..de1c722da8bd7d12857512f6ffedab52bab7b7e6 100644 --- a/unit_tests/data/Test_TensorImpl.cpp +++ b/unit_tests/data/Test_TensorImpl.cpp @@ -45,7 +45,7 @@ TEST_CASE("Tensor creation") { REQUIRE(x.get<int>({0, 0, 1}) == 2); REQUIRE(x.get<int>({0, 1, 1}) == 4); REQUIRE(x.get<int>({1, 1, 0}) == 7); - x.get<int>({1, 1, 1}) = 36; + x.set<int>({1, 1, 1}, 36); REQUIRE(x.get<int>({1, 1, 1}) == 36); } diff --git a/unit_tests/operator/Test_BatchNormImpl.cpp b/unit_tests/operator/Test_BatchNormImpl.cpp index e6b7c3c655b865973028fc8c43323a7db3f4a5ef..a1a749d805a45361c671544f5c94aed3421e557d 100644 --- a/unit_tests/operator/Test_BatchNormImpl.cpp +++ b/unit_tests/operator/Test_BatchNormImpl.cpp @@ -20,7 +20,7 @@ using namespace Aidge; TEST_CASE("[cpu/operator] BatchNorm(forward)", "[BatchNorm][CPU]") { - std::shared_ptr<Node> myBatchNorm = BatchNorm<2>(0.00001F, 0.1F, "mybatchnorm"); + std::shared_ptr<Node> myBatchNorm = BatchNorm<2>(3, 0.00001F, 0.1F, "mybatchnorm"); auto op = std::static_pointer_cast<OperatorTensor>(myBatchNorm -> getOperator()); std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array1D<float,3> {{0.9044, 0.3028, 0.0218}}); std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float,3> {{0.1332, 0.7503, 0.0878}}); diff --git a/unit_tests/recipies/Test_ExplicitCastMove.cpp b/unit_tests/recipies/Test_ExplicitCastMove.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7d169ba9ba949ead0bf96f80e53a47e1ca6c24d9 --- /dev/null +++ b/unit_tests/recipies/Test_ExplicitCastMove.cpp @@ -0,0 +1,46 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <catch2/catch_test_macros.hpp> + +#include "aidge/recipies/Recipies.hpp" +#include "aidge/operator/Conv.hpp" +#include "aidge/operator/Producer.hpp" +#include "aidge/graph/OpArgs.hpp" +#include <cstddef> + +using namespace Aidge; + +TEST_CASE("[ExplicitCastMove] conv") { + auto conv1 = Conv(3, 32, {3, 3}, "conv1"); + auto conv2 = Conv(32, 64, {3, 3}, "conv2"); + auto conv3 = Conv(64, 10, {1, 1}, "conv3", {2, 2}); + + auto g1 = Sequential({ + Producer({16, 3, 224, 224}, "dataProvider"), + conv1, + conv2, + conv3 + }); + + g1->setBackend("cpu"); + conv1->getOperator()->setDataType(DataType::Int32); + conv3->getOperator()->setDataType(DataType::Float64); + + g1->save("explicitCastMove_before"); + REQUIRE(g1->getNodes().size() == 10); + + g1->forwardDims(); + explicitCastMove(g1); + + g1->save("explicitCastMove_after"); + REQUIRE(g1->getNodes().size() == 13); +} diff --git a/unit_tests/recipies/Test_FuseBatchNorm.cpp b/unit_tests/recipies/Test_FuseBatchNorm.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c4b3bf18a5f5b68d0e41b9cd40966790a0cf7ff6 --- /dev/null +++ b/unit_tests/recipies/Test_FuseBatchNorm.cpp @@ -0,0 +1,118 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <catch2/catch_test_macros.hpp> +#include <memory> +#include <cmath> + +#include "aidge/graph/GraphView.hpp" +#include "aidge/graph/OpArgs.hpp" +#include "aidge/operator/Conv.hpp" +#include "aidge/operator/BatchNorm.hpp" +#include "aidge/operator/Producer.hpp" +#include "aidge/recipies/Recipies.hpp" +#include "aidge/scheduler/Scheduler.hpp" + +#include "aidge/data/Tensor.hpp" + +namespace Aidge { + +TEST_CASE("[core/recipies] FuseBatchNorm", "[recipies][FuseBatchNorm]") { + auto myProd = Producer({2, 3, 3, 3}, "dataProvider"); + auto myConv = Conv(3, 3, {1, 1}, "conv1"); + auto myBN = BatchNorm<2>(32, 1.0e-5F, 0.1F, "batchnorm1"); + + auto myProdOp = std::static_pointer_cast<Producer_Op>(myProd->getOperator()); + auto myConvOp = std::static_pointer_cast<Conv_Op<2>>(myConv->getOperator()); + auto myBNOp = std::static_pointer_cast<BatchNorm_Op<2>>(myBN->getOperator()); + + myProdOp->setOutput(0, std::make_shared<Tensor>(Array4D<float,2,3,3,3> { //NCHW + { + { + {{8.28257084e-01, 7.99335480e-01, 7.36702740e-01}, + {2.36729562e-01, 8.61912668e-01, 9.93067741e-01}, + {1.63514376e-01, 8.95773172e-02, 2.96533108e-01}}, + {{2.20776618e-01, 5.89067876e-01, 2.03930080e-01}, + {1.31294072e-01, 7.10182846e-01, 1.08420849e-04}, + {7.21750259e-01, 4.38212037e-01, 5.08823872e-01}}, + {{4.30953979e-01, 1.51903450e-01, 3.76343548e-01}, + {8.07861805e-01, 7.79679358e-01, 5.01209974e-01}, + {9.31280375e-01, 9.94207084e-01, 1.74868107e-03}} + }, + { + {{6.22058094e-01, 2.32256651e-02, 6.18222237e-01}, + {9.58304763e-01, 2.11395025e-02, 4.95614648e-01}, + {2.50825584e-01, 4.50860739e-01, 3.80362332e-01}}, + {{9.91703272e-02, 5.06073236e-01, 4.88969564e-01}, + {1.12059772e-01, 7.64178872e-01, 7.60362148e-01}, + {2.84135342e-02, 4.29610193e-01, 1.27862811e-01}}, + {{9.57209170e-01, 8.22797656e-01, 1.91352129e-01}, + {9.52722490e-01, 6.35501027e-01, 5.67592978e-02}, + {2.00799644e-01, 4.00822222e-01, 9.14380193e-01}} + } + } + })); + myConvOp -> setInput(1, std::make_shared<Tensor>(Array4D<float,3,3,1,1> { //NCHW + { + { + {{8.28257084e-01}}, + {{7.99335480e-01}}, + {{7.36702740e-01}} + }, + { + {{2.36729562e-01}}, + {{8.61912668e-01}}, + {{9.93067741e-01}} + }, + { + {{1.63514376e-01}}, + {{8.95773172e-02}}, + {{2.96533108e-01}} + } + } + })); + myConvOp -> setInput(2, std::make_shared<Tensor>(Array1D<float,3> {{0.4470, 0.3064, 0.7061}})); + myBNOp -> setInput(1, std::make_shared<Tensor>(Array1D<float,3> {{0.9044, 0.3028, 0.0218}})); + myBNOp -> setInput(2, std::make_shared<Tensor>(Array1D<float,3> {{0.1332, 0.7503, 0.0878}})); + myBNOp -> setInput(3, std::make_shared<Tensor>(Array1D<float,3> {{0.9931, 0.8421, 0.9936}})); + myBNOp -> setInput(4, std::make_shared<Tensor>(Array1D<float,3> {{0.4470, 0.3064, 0.7061}})); + + auto g1 = Sequential({ + myConv, + myBN + }); + g1 -> setName("fuseBNGraph"); + myProd -> addChild(myConv); // set graph input + + myProdOp -> setDataType(DataType::Float32); + myProdOp -> setBackend("cpu"); + g1 -> compile("cpu", DataType::Float32); + + auto s = SequentialScheduler(g1); + s.forward(); + std::shared_ptr<Tensor> res1 = std::make_shared<Tensor>(*(myBNOp -> getOutput(0))); + + fuseBatchNorm(g1); + + s.resetScheduling(); + s.forward(); + std::shared_ptr<Tensor> res2 = std::make_shared<Tensor>(*(myConvOp -> getOutput(0))); + + REQUIRE(g1 -> outputNodes().size() == 1); + REQUIRE(g1 -> inputNodes().size() == 1); + bool eq = true; + for (std::size_t i = 0; i < res1->size(); ++i) { + eq &= std::abs(res1->get<float>(i) - res2->get<float>(i)) < 1.0e-06; + } + REQUIRE(eq); + +} +} // namespace Aidge diff --git a/unit_tests/scheduler/Test_CastMove.cpp b/unit_tests/scheduler/Test_CastMove.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a52b2b06901818f01117273d181d5d5388348f95 --- /dev/null +++ b/unit_tests/scheduler/Test_CastMove.cpp @@ -0,0 +1,246 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <catch2/catch_test_macros.hpp> +#include <memory> +#include <string> + +#include "aidge/data/Tensor.hpp" +#include "aidge/utils/TensorUtils.hpp" +#include "aidge/graph/Node.hpp" +#include "aidge/graph/GraphView.hpp" +#include "aidge/graph/OpArgs.hpp" +#include "aidge/scheduler/Scheduler.hpp" +#include "aidge/recipies/Recipies.hpp" + +#include "aidge/backend/cpu.hpp" + +using namespace Aidge; + +TEST_CASE("[cpu/castmove] CastMove(forward)") { + std::shared_ptr<Tensor> inputTensor = + std::make_shared<Tensor>(Array4D<int, 2, 1, 5, 5>{{{{{0, 1, 2, 3, 4}, + {5, 6, 7, 8, 9}, + {10, 11, 12, 13, 14}, + {15, 16, 17, 18, 19}, + {20, 21, 22, 23, 24}}}, + {{{25, 26, 27, 28, 29}, + {30, 31, 32, 33, 34}, + {35, 36, 37, 38, 39}, + {40, 41, 42, 43, 44}, + {45, 46, 47, 48, 49}}}}}); + + std::shared_ptr<Tensor> weight1 = std::make_shared<Tensor>( + Array4D<int, 3, 1, 3, 3>{{{{{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}}, + {{{10, 11, 12}, {13, 14, 15}, {16, 17, 18}}}, + {{{19, 20, 21}, {22, 23, 24}, {25, 26, 27}}}}}); + + std::shared_ptr<Tensor> bias1 = std::make_shared<Tensor>(Array1D<int, 3>{{1, 2, 3}}); + + SECTION("Test implicit") { + std::shared_ptr<GraphView> g = + Sequential({ + Conv(1, 3, {3, 3}, "conv1"), + Conv(3, 4, {1, 1}, "conv2"), + Conv(4, 3, {1, 1}, "conv3"), + FC(27, 5, false, "fc")}); + + g->getNode("conv1")->getOperator()->setInput(0, inputTensor); + g->getNode("conv1")->getOperator()->setInput(1, weight1); + g->getNode("conv1")->getOperator()->setInput(2, bias1); + + std::shared_ptr<Tensor> weight2 = + std::make_shared<Tensor>(Array4D<int, 4, 3, 1, 1>{{{{{1}}, {{2}}, {{3}}}, + {{{4}}, {{5}}, {{6}}}, + {{{7}}, {{8}}, {{9}}}, + {{{10}}, {{11}}, {{12}}}}}); + std::shared_ptr<Tensor> bias2 = std::make_shared<Tensor>(Array1D<int, 4>{{1, 2, 3, 4}}); + g->getNode("conv2")->getOperator()->setInput(1, weight2); + g->getNode("conv2")->getOperator()->setInput(2, bias2); + // *(g->getNode("conv2")->getOperator()->input(1, weight2); + + std::shared_ptr<Tensor> weight3 = std::make_shared<Tensor>( + Array4D<int, 3, 4, 1, 1>{{{{{1}}, {{2}}, {{3}}, {{4}}}, + {{{5}}, {{6}}, {{7}}, {{8}}}, + {{{9}}, {{10}}, {{11}}, {{12}}}}}); + std::shared_ptr<Tensor> bias3 = std::make_shared<Tensor>(Array1D<int, 3>{{1, 2, 3}}); + g->getNode("conv3")->getOperator()->setInput(1, weight3); + g->getNode("conv3")->getOperator()->setInput(2, bias3); + + std::shared_ptr<Tensor> weightfc = std::make_shared<Tensor>( + Array2D<int, 5, 27>{{{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, + {13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9}, + {10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6}, + {7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3}, + {4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, + 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}}}); + std::shared_ptr<Tensor> biasfc = std::make_shared<Tensor>(Array1D<int, 5>{{1, 2, 3, 4, 5}}); + g->getNode("fc")->getOperator()->setInput(1, weightfc); + g->getNode("fc")->getOperator()->setInput(2, biasfc); + + // input->addChild(g); + g->setDataType(Aidge::DataType::Int32); + g->getNode("conv1")->getOperator()->setDataType(DataType::Float32); + g->getNode("conv3")->getOperator()->setDataType(DataType::Float64); + + g->setBackend("cpu"); + g->forwardDims(); + SequentialScheduler scheduler(g); + REQUIRE_NOTHROW(scheduler.forward()); + scheduler.saveSchedulingDiagram("schedulingSequential"); + + std::shared_ptr<Tensor> expectedOutput1 = std::make_shared<Tensor>(Array4D<int, 2, 3, 3, 3>{ + {{{{367, 412, 457}, {592, 637, 682}, {817, 862, 907}}, + {{854, 980, 1106}, {1484, 1610, 1736}, {2114, 2240, 2366}}, + {{1341, 1548, 1755}, {2376, 2583, 2790}, {3411, 3618, 3825}}}, + {{{1492, 1537, 1582}, {1717, 1762, 1807}, {1942, 1987, 2032}}, + {{4004, 4130, 4256}, {4634, 4760, 4886}, {5264, 5390, 5516}}, + {{6516, 6723, 6930}, {7551, 7758, 7965}, {8586, 8793, 9000}}}}}); + + std::shared_ptr<Tensor> expectedOutput2 = std::make_shared<Tensor>(Array4D<int, 2, 4, 3, 3>{ + {{{{6099, 7017, 7935}, {10689, 11607, 12525}, {15279, 16197, 17115}}, + {{13786, 15838, 17890}, {24046, 26098, 28150}, {34306, 36358, 38410}}, + {{21473, 24659, 27845}, {37403, 40589, 43775}, {53333, 56519, 59705}}, + {{29160, 33480, 37800}, {50760, 55080, 59400}, {72360, 76680, 81000}}}, + {{{29049, 29967, 30885}, {33639, 34557, 35475}, {38229, 39147, 40065}}, + {{65086, 67138, 69190}, {75346, 77398, 79450}, {85606, 87658, 89710}}, + {{101123, 104309, 107495}, {117053, 120239, 123425}, {132983, 136169, 139355}}, + {{137160, 141480, 145800}, {158760, 163080, 167400}, {180360, 184680, 189000}}}}}); + + std::shared_ptr<Tensor> expectedOutput3 = std::make_shared<Tensor>(Array4D<int, 2, 3, 3, 3>{ + {{{{214731, 246591, 278451}, {374031, 405891, 437751}, {533331, 565191, 597051}}, + {{496804, 570568, 644332}, {865624, 939388, 1013152}, {1234444, 1308208, 1381972}}, + {{778877, 894545, 1010213}, {1357217, 1472885, 1588553}, {1935557, 2051225, 2166893}}}, + {{{1011231, 1043091, 1074951}, {1170531, 1202391, 1234251}, {1329831, 1361691, 1393551}}, + {{2340904, 2414668, 2488432}, {2709724, 2783488, 2857252}, {3078544, 3152308, 3226072}}, + {{3670577, 3786245, 3901913}, {4248917, 4364585, 4480253}, {4827257, 4942925, 5058593}}}}}); + + Tensor expectedOutput4 = Array2D<int, 2, 5>{ + {{205050376, 198925904, 181355097, 196978090, 238868348}, + {598467376, 561797804, 560823897, 593043790, 698672948}}}; + std::shared_ptr<Tensor> other1 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv1")->getOperator())->getOutput(0); + REQUIRE(approxEq<float, int>(*other1, *expectedOutput1, 0.0, 1.0e-12)); + std::shared_ptr<Tensor> other2 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv2")->getOperator())->getOutput(0); + REQUIRE(approxEq<int>(*other2, *expectedOutput2, 0.0, 1.0e-12)); + std::shared_ptr<Tensor> other3 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv3")->getOperator())->getOutput(0); + REQUIRE(approxEq<double, int>(*other3, *expectedOutput3, 0.0, 1.0e-12)); + std::shared_ptr<Tensor> other4 = std::static_pointer_cast<OperatorTensor>(g->getNode("fc")->getOperator())->getOutput(0); + REQUIRE(approxEq<int>(*other4, expectedOutput4, 0.0, 1.0e-12)); + } + + SECTION("Half") { + Tensor refTensor = Array2D<float, 3, 2>{{{0.0, 1.0},{2.1, 3.4},{5000.0, 1.0e5}}}; + Tensor tensor(DataType::Float16); + tensor.copyCastFrom(refTensor); + REQUIRE(approxEq<float, half_float::half>(refTensor, tensor, 1.0e-3, 0.0)); + } + + SECTION("Test explicit") { + std::shared_ptr<GraphView> g = + Sequential({ + Conv(1, 3, {3, 3}, "conv1"), + Conv(3, 4, {1, 1}, "conv2"), + Conv(4, 3, {1, 1}, "conv3"), + FC(27, 5, false, "fc")}); + + g->getNode("conv1")->getOperator()->setInput(0, inputTensor); + g->getNode("conv1")->getOperator()->setInput(1, weight1); + g->getNode("conv1")->getOperator()->setInput(2, bias1); + + std::shared_ptr<Tensor> weight2 = + std::make_shared<Tensor>(Array4D<int, 4, 3, 1, 1>{{{{{1}}, {{2}}, {{3}}}, + {{{4}}, {{5}}, {{6}}}, + {{{7}}, {{8}}, {{9}}}, + {{{10}}, {{11}}, {{12}}}}}); + std::shared_ptr<Tensor> bias2 = std::make_shared<Tensor>(Array1D<int, 4>{{1, 2, 3, 4}}); + g->getNode("conv2")->getOperator()->setInput(1, weight2); + g->getNode("conv2")->getOperator()->setInput(2, bias2); + // *(g->getNode("conv2")->getOperator()->input(1, weight2); + + std::shared_ptr<Tensor> weight3 = std::make_shared<Tensor>( + Array4D<int, 3, 4, 1, 1>{{{{{1}}, {{2}}, {{3}}, {{4}}}, + {{{5}}, {{6}}, {{7}}, {{8}}}, + {{{9}}, {{10}}, {{11}}, {{12}}}}}); + std::shared_ptr<Tensor> bias3 = std::make_shared<Tensor>(Array1D<int, 3>{{1, 2, 3}}); + g->getNode("conv3")->getOperator()->setInput(1, weight3); + g->getNode("conv3")->getOperator()->setInput(2, bias3); + + std::shared_ptr<Tensor> weightfc = std::make_shared<Tensor>( + Array2D<int, 5, 27>{{{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, + {13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9}, + {10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6}, + {7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3}, + {4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, + 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}}}); + std::shared_ptr<Tensor> biasfc = std::make_shared<Tensor>(Array1D<int, 5>{{1, 2, 3, 4, 5}}); + g->getNode("fc")->getOperator()->setInput(1, weightfc); + g->getNode("fc")->getOperator()->setInput(2, biasfc); + + // input->addChild(g); + g->setDataType(Aidge::DataType::Int32); + g->getNode("conv1")->getOperator()->setDataType(DataType::Float32); + g->getNode("conv3")->getOperator()->setDataType(DataType::Float64); + + explicitCastMove(g); + g->setBackend("cpu"); + g->forwardDims(); + + SequentialScheduler scheduler(g); + REQUIRE_NOTHROW(scheduler.forward()); + scheduler.saveSchedulingDiagram("schedulingSequential"); + + std::shared_ptr<Tensor> expectedOutput1 = std::make_shared<Tensor>(Array4D<int, 2, 3, 3, 3>{ + {{{{367, 412, 457}, {592, 637, 682}, {817, 862, 907}}, + {{854, 980, 1106}, {1484, 1610, 1736}, {2114, 2240, 2366}}, + {{1341, 1548, 1755}, {2376, 2583, 2790}, {3411, 3618, 3825}}}, + {{{1492, 1537, 1582}, {1717, 1762, 1807}, {1942, 1987, 2032}}, + {{4004, 4130, 4256}, {4634, 4760, 4886}, {5264, 5390, 5516}}, + {{6516, 6723, 6930}, {7551, 7758, 7965}, {8586, 8793, 9000}}}}}); + + std::shared_ptr<Tensor> expectedOutput2 = std::make_shared<Tensor>(Array4D<int, 2, 4, 3, 3>{ + {{{{6099, 7017, 7935}, {10689, 11607, 12525}, {15279, 16197, 17115}}, + {{13786, 15838, 17890}, {24046, 26098, 28150}, {34306, 36358, 38410}}, + {{21473, 24659, 27845}, {37403, 40589, 43775}, {53333, 56519, 59705}}, + {{29160, 33480, 37800}, {50760, 55080, 59400}, {72360, 76680, 81000}}}, + {{{29049, 29967, 30885}, {33639, 34557, 35475}, {38229, 39147, 40065}}, + {{65086, 67138, 69190}, {75346, 77398, 79450}, {85606, 87658, 89710}}, + {{101123, 104309, 107495}, {117053, 120239, 123425}, {132983, 136169, 139355}}, + {{137160, 141480, 145800}, {158760, 163080, 167400}, {180360, 184680, 189000}}}}}); + + std::shared_ptr<Tensor> expectedOutput3 = std::make_shared<Tensor>(Array4D<int, 2, 3, 3, 3>{ + {{{{214731, 246591, 278451}, {374031, 405891, 437751}, {533331, 565191, 597051}}, + {{496804, 570568, 644332}, {865624, 939388, 1013152}, {1234444, 1308208, 1381972}}, + {{778877, 894545, 1010213}, {1357217, 1472885, 1588553}, {1935557, 2051225, 2166893}}}, + {{{1011231, 1043091, 1074951}, {1170531, 1202391, 1234251}, {1329831, 1361691, 1393551}}, + {{2340904, 2414668, 2488432}, {2709724, 2783488, 2857252}, {3078544, 3152308, 3226072}}, + {{3670577, 3786245, 3901913}, {4248917, 4364585, 4480253}, {4827257, 4942925, 5058593}}}}}); + + Tensor expectedOutput4 = Array2D<int, 2, 5>{ + {{205050376, 198925904, 181355097, 196978090, 238868348}, + {598467376, 561797804, 560823897, 593043790, 698672948}}}; + std::shared_ptr<Tensor> other1 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv1")->getOperator())->getOutput(0); + REQUIRE(approxEq<float, int>(*other1, *expectedOutput1, 0.0, 1.0e-12)); + std::shared_ptr<Tensor> other2 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv2")->getOperator())->getOutput(0); + REQUIRE(approxEq<int>(*other2, *expectedOutput2, 0.0, 1.0e-12)); + std::shared_ptr<Tensor> other3 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv3")->getOperator())->getOutput(0); + REQUIRE(approxEq<double, int>(*other3, *expectedOutput3, 0.0, 1.0e-12)); + std::shared_ptr<Tensor> other4 = std::static_pointer_cast<OperatorTensor>(g->getNode("fc")->getOperator())->getOutput(0); + REQUIRE(approxEq<int>(*other4, expectedOutput4, 0.0, 1.0e-12)); + } +}