Merge branch 'dataloader' of gitlab.eclipse.org:eclipse/aidge/aidge_core into dataloader

60d053c0 · Maxence Naud · 9f7c89aa · efb68e6a · 60d053c0 · 60d053c0
Commit 60d053c0 authored 1 year ago by Maxence Naud
--- a/include/aidge/aidge.hpp
+++ b/include/aidge/aidge.hpp
@@ -14,10 +14,11 @@
 #include "aidge/backend/OperatorImpl.hpp"
 #include "aidge/backend/TensorImpl.hpp"
+#include "aidge/backend/StimuliImpl.hpp"
 #include "aidge/data/Data.hpp"
 #include "aidge/data/Tensor.hpp"
+#include "aidge/data/Database.hpp"
+#include "aidge/data/DataProvider.hpp"
 #include "aidge/graph/Connector.hpp"
 #include "aidge/graph/GraphView.hpp"
 #include "aidge/graph/Node.hpp"
@@ -58,6 +59,7 @@
 #include "aidge/operator/Sub.hpp"
 #include "aidge/operator/Transpose.hpp"
 #include "aidge/scheduler/Scheduler.hpp"
+#include "aidge/stimuli/Stimuli.hpp"
 #include "aidge/recipies/Recipies.hpp"
@@ -66,7 +68,5 @@
 #include "aidge/utils/DynamicAttributes.hpp"
 #include "aidge/utils/Registrar.hpp"
 #include "aidge/utils/Types.h"
-//#include "aidge/utilsParsing/AstNode.hpp"
-//#include "aidge/utilsParsing/ParsingToken.hpp"
 #endif /* AIDGE_IMPORTS_H_ */
--- a/include/aidge/backend/StimuliImpl.hpp
+++ b/include/aidge/backend/StimuliImpl.hpp
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+#ifndef AIDGE_STIMULIIMPL_H_
+#define AIDGE_STIMULIIMPL_H_
+#include <memory>
+#include "aidge/data/Data.hpp"
+#include "aidge/data/Tensor.hpp"
+namespace Aidge {
+/**
+ * @brief StimuliImpl. Base class to implement data loading functions.
+ * @details Backend-specific loaders derive from this class and override load()
+ * to produce the tensor of one data sample.
+ */
+class StimuliImpl {
+public:
+    StimuliImpl() = default;
+    /**
+     * @brief Load the data sample handled by this implementation.
+     * @return the loaded tensor. The base implementation has nothing to load and
+     * returns nullptr; derived classes are expected to override it.
+     */
+    virtual std::shared_ptr<Tensor> load() {
+        // A non-void function must return a value: flowing off the end is undefined behaviour.
+        return nullptr;
+    }
+    virtual ~StimuliImpl() = default;
+};
+} // namespace Aidge
+#endif /* AIDGE_STIMULIIMPL_H_ */
--- a/include/aidge/data/DataProvider.hpp
+++ b/include/aidge/data/DataProvider.hpp
+#ifndef DATAPROVIDER_H_
+#define DATAPROVIDER_H_
+#include "aidge/data/Database.hpp"
+#include "aidge/data/Data.hpp"
+namespace Aidge{
+/**
+ * @brief Data Provider. Takes in a database and composes batches by fetching data from the given database.
+ * @details The provider only stores a reference to the database: the database object must outlive the provider.
+ * @todo Implement Drop last batch option. Currently returns the last batch with fewer elements in the batch.
+ * @todo Implement readRandomBatch to compose batches from the database with a random sampling strategy. Necessary for training.
+ */
+class DataProvider {
+public:
+    /**
+     * @brief Constructor of Data Provider.
+     * @param database database from which to load the data. Held by reference, not copied.
+     * @param batchSize number of data samples per batch.
+     */
+    DataProvider(Database& database, std::size_t batchSize);
+    /**
+     * @brief Create a batch for each data modality in the database. The returned batches contain the data as sorted in the database.
+     * @param startIndex the starting index in the database to start the batch from.
+     * @return a vector of tensors. Each tensor is a batch corresponding to one modality.
+     */
+    std::vector<std::shared_ptr<Tensor>> readBatch(std::size_t startIndex);
+protected:
+    // Database providing the data to the dataProvider (non-owning reference)
+    Database& mDatabase;
+    // Number of modalities (tensors) in one database item
+    std::size_t mNumberModality;
+    // Dimensions of one sample, for each modality
+    std::vector<std::vector<std::size_t>> mDataSizes;
+    // Backend name, for each modality
+    std::vector<std::string> mDataBackends;
+    // Data type, for each modality
+    std::vector<DataType> mDataTypes;
+    // Desired size of the produced batches
+    std::size_t mBatchSize;
+};
+}
+#endif /* DATAPROVIDER_H_ */
\ No newline at end of file
--- a/include/aidge/data/Database.hpp
+++ b/include/aidge/data/Database.hpp
+#ifndef Database_H_
+#define Database_H_
+#include <cstring>
+#include <memory>
+#include <vector>
+#include <tuple>
+#include "aidge/data/Tensor.hpp"
+namespace Aidge {
+/**
+ * @brief Database. An abstract class representing a map from a key to data.
+ * @details All databases should inherit from this class. Every member function below is pure virtual:
+ * subclasses must override getItem() to fetch the data mapped to a given index, as well as
+ * getLen() and getNbModalities().
+ */
+class Database {
+public:
+    Database() = default;
+    virtual ~Database() = default; 
+    /**
+     * @brief Fetch an item of the database.
+     * @param index index of the item.
+     * @return vector of data mapped to index.
+     */
+    virtual std::vector<std::shared_ptr<Tensor>> getItem(std::size_t index) = 0;
+    /**
+     * @brief Get the number of items in the database.
+     *
+     * @return std::size_t number of items.
+     */
+    virtual std::size_t getLen() = 0;
+    /**
+     * @brief Get the number of modalities in one database item.
+     *
+     * @return std::size_t number of modalities.
+     */
+    virtual std::size_t getNbModalities() = 0;
+};
+} // namespace Aidge
+#endif /* Database_H_ */
\ No newline at end of file
--- a/include/aidge/data/Tensor.hpp
+++ b/include/aidge/data/Tensor.hpp
@@ -55,6 +55,20 @@ class Tensor : public Data,
    {
        // ctor
    }
+    /**
+     * @brief Construct a new Tensor object from dimensions.
+     * @details Records the dimensions and the data type, then calls computeSize().
+     * NOTE(review): no backend or implementation appears to be set by this
+     * constructor; confirm against the rest of the class.
+     *
+     * @param dims dimensions of the tensor
+     * @param dataType datatype of the tensor (default = DataType::Float32)
+     */
+    Tensor(std::vector<DimSize_t> dims, DataType dataType = DataType::Float32)
+        : Data(Type),
+          mDataType(dataType),
+          mDims(dims)
+    {
+        computeSize();
+    }
    /**
     * @brief Construct a new Tensor object copied from another one.

--- a/include/aidge/graph/GraphView.hpp
+++ b/include/aidge/graph/GraphView.hpp
@@ -209,7 +209,7 @@ public:
     * @brief Compute dimensions of input/output Tensors for each Operator of the
     * GraphView object's Nodes.
+    * @param dims Optional dimensions of the graph inputs, one entry per graph input.
+    * When left empty (default), no input dimensions are set by this call.
     */
-    void forwardDims();
+    void forwardDims(const std::vector<std::vector<DimSize_t>> dims = {});
    /** @brief Set the same backend for each Operator of the GraphView object's Nodes. */
    void setBackend(const std::string &backend, DeviceIdx_t device = 0);

--- a/include/aidge/scheduler/Scheduler.hpp
+++ b/include/aidge/scheduler/Scheduler.hpp
@@ -18,6 +18,8 @@
 #include <string>
 #include <vector>
+#include "aidge/data/Tensor.hpp"
 namespace Aidge {
 class Node;
 class GraphView;
@@ -49,11 +51,17 @@ public:
        mScheduling.clear();
        mStaticSchedule.clear();
    }
+    /**
+     * @brief Set the given data tensors as the inputs of the graphView. In case of multiple data input tensors, they are mapped to the graph inputs in the order given by the graph.
+     * @details The number of tensors must match the number of graph inputs.
+     *
+     * @param data data input tensors
+     */
+    void connectInputs(std::vector<std::shared_ptr<Aidge::Tensor>> data);
    /**
     * @brief Run the provided Computational Graph with a batch of data
+    * @param forwardDims when true, the dimensions are forwarded through the graph before running it.
+    * @param verbose verbosity flag (presumably enables execution traces; confirm in Scheduler.cpp).
+    * @param data optional input tensors; when not empty, they are connected to the graph inputs first.
     */
-    void forward(bool forwardDims = true, bool verbose = false);
+    void forward(bool forwardDims = true, bool verbose = false, std::vector<std::shared_ptr<Aidge::Tensor>> data = {});
    /**
     * @brief Save in a Markdown file the order of layers execution.

--- a/include/aidge/stimuli/Stimuli.hpp
+++ b/include/aidge/stimuli/Stimuli.hpp
+#ifndef STIMULI_H
+#define STIMULI_H
+#include <cassert>
+#include <cstring>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <tuple>
+#include "aidge/backend/StimuliImpl.hpp"
+#include "aidge/data/Tensor.hpp"
+#include "aidge/utils/Registrar.hpp"
+namespace Aidge {
+/**
+ * @brief Stimuli. A class wrapping a data sample. Stimuli has two functioning modes. The first mode enables to load data samples from a dataPath and optionally store the data in-memory. The second mode enables to store a data sample that was already loaded in memory.
+ * @details When Stimuli is used in the first mode, the loading function is determined automatically based on the backend and the file extension.
+ */
+class Stimuli : public Registrable<Stimuli, std::tuple<std::string, std::string>, std::unique_ptr<StimuliImpl>(const std::string&)> {
+public:
+    Stimuli() = delete;
+    /**
+     * @brief Construct a new Stimuli object based on a dataPath to load the data.
+     * @details The file extension is the text following the last '.' of dataPath.
+     * No load implementation is created here: call setBackend() before load().
+     *
+     * @param dataPath path to the data to be loaded.
+     * @param loadDataInMemory when true, keep the data in memory once loaded
+     */
+    Stimuli(const std::string& dataPath,
+            bool loadDataInMemory = false)
+        : mDataPath(dataPath),
+          mLoadDataInMemory(loadDataInMemory)
+    {
+        const std::string::size_type dotPos = dataPath.find_last_of('.');
+        assert(dotPos != std::string::npos && "Cannot find extension");
+        // When assertions are compiled out, npos + 1 wraps to 0 and the whole path
+        // would be taken as the extension: leave the extension empty instead.
+        if (dotPos != std::string::npos) {
+            mFileExtension = dataPath.substr(dotPos + 1);
+        }
+    }
+    /**
+     * @brief Construct a new Stimuli object copied from another one.
+     * @details The stored tensor pointer is shared with the source. If the source has a
+     * load implementation, a new one is created for the same backend and file extension.
+     * @param otherStimuli
+     */
+    Stimuli(const Stimuli& otherStimuli)
+        : mDataPath(otherStimuli.mDataPath),
+          mFileExtension(otherStimuli.mFileExtension),
+          mLoadDataInMemory(otherStimuli.mLoadDataInMemory),
+          mData(otherStimuli.mData),
+          mBackend(otherStimuli.mBackend)
+    {
+        if (otherStimuli.mImpl) {
+            // mBackend is only empty when the implementation was not installed through
+            // setBackend(); keep the historical "opencv" default in that case.
+            const std::string backend = mBackend.empty() ? std::string("opencv") : mBackend;
+            mImpl = Registrar<Stimuli>::create({backend, mFileExtension})(mDataPath);
+        }
+    }
+    /**
+     * @brief Construct a new Stimuli object based on a tensor that is already loaded in memory.
+     *
+     * @param data the data tensor.
+     */
+    Stimuli(const std::shared_ptr<Tensor> data)
+        : mLoadDataInMemory(true),
+          mData(data) {}
+    virtual ~Stimuli() = default;
+    /**
+     * @brief Set the backend of the stimuli associated load implementation
+     * @details Create and initialize an implementation, looked up in the registrar
+     * with the backend name and the file extension.
+     * @param name name of the backend.
+     */
+    inline void setBackend(const std::string &name) {
+        mImpl = Registrar<Stimuli>::create({name, mFileExtension})(mDataPath);
+        // Remembered so that copies re-create their implementation for the same backend.
+        mBackend = name;
+    }
+    /**
+     * @brief Get the data tensor associated to the stimuli. The data is either loaded from a datapath or passed from an in-memory tensor.
+     * @details When data is kept in memory, the load implementation is called at most once
+     * and the result is reused afterwards. Otherwise the data is loaded again on each call.
+     *
+     * @return std::shared_ptr<Tensor> the data tensor.
+     */
+    virtual std::shared_ptr<Tensor> load() {
+        assert((mImpl!=nullptr || mData!=nullptr) && "No load implementation and No stored data");
+        if (mImpl == nullptr) {
+            // No loader to call: hand back the stored tensor (nullptr if there is none)
+            // rather than dereferencing a null implementation.
+            return mData;
+        }
+        if (!mLoadDataInMemory) {
+            return mImpl->load();
+        }
+        if (mData == nullptr) {
+            mData = mImpl->load();
+        }
+        return mData;
+    }
+protected:
+    // Implementation of the Stimuli
+    std::unique_ptr<StimuliImpl> mImpl;
+    /// Stimuli data path
+    std::string mDataPath;
+    std::string mFileExtension;
+    /// When true, the loaded tensor is kept in mData
+    bool mLoadDataInMemory = false;
+    /// Stimuli data ptr
+    std::shared_ptr<Tensor> mData;
+    /// Name of the backend last selected with setBackend() (empty if none)
+    std::string mBackend;
+};
+} // namespace Aidge
+#endif // STIMULI_H
--- a/python_binding/data/pybind_DataProvider.cpp
+++ b/python_binding/data/pybind_DataProvider.cpp
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include "aidge/data/DataProvider.hpp"
+#include "aidge/data/Database.hpp"
+namespace py = pybind11;
+namespace Aidge {
+/**
+ * @brief Register the DataProvider class in the given Python module.
+ * @param m Python module receiving the binding.
+ */
+void init_DataProvider(py::module& m){
+    py::class_<DataProvider, std::shared_ptr<DataProvider>>(m, "DataProvider")
+          // DataProvider stores a reference to the database: keep_alive<1, 2> keeps the
+          // database argument (2) alive at least as long as the constructed provider (1).
+          .def(py::init<Database&, std::size_t>(), py::arg("database"), py::arg("batchSize"), py::keep_alive<1, 2>())
+          .def("read_batch", &DataProvider::readBatch, py::arg("start_index"),
+          R"mydelimiter(
+          Return a batch of each data modality.
+          :param start_index: Database starting index to read the batch from
+          :type start_index: int
+          )mydelimiter");
+}
+}
--- a/python_binding/data/pybind_Database.cpp
+++ b/python_binding/data/pybind_Database.cpp
+#include <pybind11/pybind11.h>
+#include "aidge/data/Database.hpp"
+namespace py = pybind11;
+namespace Aidge {
+/// Expose the Database class to Python under the name "Database" (no method is bound).
+void init_Database(py::module& m){
+    using DatabaseBinding = py::class_<Database, std::shared_ptr<Database>>;
+    DatabaseBinding(m, "Database");
+}
+}
--- a/python_binding/graph/pybind_GraphView.cpp
+++ b/python_binding/graph/pybind_GraphView.cpp
@@ -100,7 +100,7 @@ void init_GraphView(py::module& m) {
          .def("get_nodes", &GraphView::getNodes)
          .def("get_node", &GraphView::getNode, py::arg("node_name"))
-          .def("forward_dims", &GraphView::forwardDims)
+          .def("forward_dims", &GraphView::forwardDims, py::arg("dims")=std::vector<std::vector<DimSize_t>>())
          .def("compile", &GraphView::compile, py::arg("backend"), py::arg("datatype"), py::arg("device") = 0)
          .def("__call__", &GraphView::operator(), py::arg("connectors"))
          .def("set_datatype", &GraphView::setDataType, py::arg("datatype"))

--- a/python_binding/pybind_core.cpp
+++ b/python_binding/pybind_core.cpp
@@ -15,6 +15,8 @@ namespace py = pybind11;
 namespace Aidge {
 void init_Data(py::module&);
+void init_Database(py::module&);
+void init_DataProvider(py::module&);
 void init_Tensor(py::module&);
 void init_OperatorImpl(py::module&);
 void init_Attributes(py::module&);
@@ -66,6 +68,8 @@ void init_TensorUtils(py::module&);
 void init_Aidge(py::module& m){
    init_Data(m);
+    init_Database(m);
+    init_DataProvider(m);
    init_Tensor(m);
    init_Node(m);

--- a/python_binding/scheduler/pybind_Scheduler.cpp
+++ b/python_binding/scheduler/pybind_Scheduler.cpp
@@ -13,13 +13,14 @@
 #include <pybind11/stl.h>
 #include "aidge/scheduler/Scheduler.hpp"
 #include "aidge/graph/GraphView.hpp"
+#include "aidge/data/Tensor.hpp"
 namespace py = pybind11;
 namespace Aidge {
 void init_Scheduler(py::module& m){
    py::class_<SequentialScheduler, std::shared_ptr<SequentialScheduler>>(m, "SequentialScheduler")
    .def(py::init<std::shared_ptr<GraphView>&>(), py::arg("graph_view"))
-    .def("forward", &SequentialScheduler::forward, py::arg("forward_dims")=true, py::arg("verbose")=false)
+    .def("forward", &SequentialScheduler::forward, py::arg("forward_dims")=true, py::arg("verbose")=false, py::arg("data")=std::vector<Tensor>())
    .def("save_scheduling_diagram", &SequentialScheduler::saveSchedulingDiagram, py::arg("file_name"))
    .def("resetScheduling", &SequentialScheduler::resetScheduling)
    .def("generate_scheduling", &SequentialScheduler::generateScheduling, py::arg("verbose")=false)

--- a/src/data/DataProvider.cpp
+++ b/src/data/DataProvider.cpp
+#include <cassert>
+#include "aidge/data/DataProvider.hpp"
+using namespace Aidge; 
+/**
+ * @brief Build a provider on top of a database.
+ * @details The first item of the database (index 0) is fetched and used as the reference:
+ * its number of tensors, their dimensions and their data types are recorded and later
+ * checked against every item read by readBatch().
+ * @param database database from which to load the data (stored by reference).
+ * @param batchSize number of data samples per batch.
+ */
+DataProvider::DataProvider(Database& database, std::size_t batchSize)
+    :
+    mDatabase(database),
+    mBatchSize(batchSize)
+{
+    // Get the tensor dimensions, datatype and backend of each modality to ensure each data have the same
+    const auto item = mDatabase.getItem(0);
+    mNumberModality = item.size();
+    mDataSizes.reserve(mNumberModality);
+    mDataTypes.reserve(mNumberModality);
+    // Iterating on each data modality in the database
+    for (const auto& modality : item) {
+        // Diagnose a null tensor instead of dereferencing it
+        assert(modality != nullptr && "DataProvider constructor : null tensor in database item");
+        mDataSizes.push_back(modality->dims());
+        // assert(std::strcmp(item[i]->getImpl()->backend(), "cpu") == 0 && "DataProvider currently only supports cpu backend tensors");
+        // mDataBackends.push_back(item[i]->getImpl()->backend());
+        mDataTypes.push_back(modality->dataType());
+    }
+}
+/**
+ * @brief Compose one batch per modality from consecutive database items.
+ * @details Items startIndex, startIndex+1, ... are fetched and copied one after the other
+ * into batch tensors whose first dimension is the batch size. The last batch of the
+ * database may hold fewer than mBatchSize samples.
+ * @param startIndex index of the first database item of the batch.
+ * @return one batch tensor per modality.
+ */
+std::vector<std::shared_ptr<Tensor>> DataProvider::readBatch(std::size_t startIndex)
+{
+    // Query the database length once
+    const std::size_t databaseLen = mDatabase.getLen();
+    assert((startIndex) <= databaseLen && " DataProvider readBatch : database fetch out of bounds");
+    // Determine the batch size (may differ for the last batch).
+    // Computed from the number of remaining items so that neither startIndex + mBatchSize
+    // can overflow nor databaseLen - startIndex underflow when assertions are disabled.
+    const std::size_t remaining = (startIndex < databaseLen) ? (databaseLen - startIndex) : 0;
+    const std::size_t current_batch_size = (mBatchSize < remaining) ? mBatchSize : remaining;
+    // Create batch tensors (dimensions, backends, datatype) for each modality
+    std::vector<std::shared_ptr<Tensor>> batchTensors;
+    batchTensors.reserve(mNumberModality);
+    for (std::size_t i = 0; i < mNumberModality; ++i) {
+        // Batch dimensions = [batch size, sample dimensions...]
+        auto batchDims = mDataSizes[i];
+        batchDims.insert(batchDims.begin(), current_batch_size);
+        auto batchData = std::make_shared<Tensor>();
+        batchData->resize(batchDims);
+        // batchData->setBackend(mDataBackends[i]);
+        batchData->setBackend("cpu");
+        batchData->setDataType(mDataTypes[i]);
+        batchTensors.push_back(batchData);
+    }
+    // Call each database item and concatenate each data modality in the batch tensors
+    for (std::size_t i = 0; i < current_batch_size; ++i){
+        const auto dataItem = mDatabase.getItem(startIndex+i);
+        // assert same number of modalities
+        assert(dataItem.size() == mNumberModality && "DataProvider readBatch : item from database have inconsistent number of modality.");
+        // Browse each modality in the database item
+        for (std::size_t j = 0; j < mNumberModality; ++j) {
+            const auto& dataSample = dataItem[j];
+            // Assert tensor sizes
+            assert(dataSample->dims() == mDataSizes[j] && "DataProvider readBatch : corrupted Data size");
+            // Assert implementation backend
+            // assert(dataSample->getImpl()->backend() == mDataBackends[j] && "DataProvider readBatch : corrupted data backend");
+            // Assert DataType
+            assert(dataSample->dataType() == mDataTypes[j] && "DataProvider readBatch : corrupted data DataType");
+            // Concatenate into the batch tensor: sample i starts at element offset i * sample size
+            batchTensors[j]->getImpl()->copy(dataSample->getImpl()->rawPtr(), dataSample->size(), i*dataSample->size());
+        }
+    }
+    return batchTensors;
+}
\ No newline at end of file
--- a/src/graph/GraphView.cpp
+++ b/src/graph/GraphView.cpp
@@ -265,10 +265,18 @@ void Aidge::GraphView::compile(const std::string& backend, const Aidge::DataType
    forwardDims();
 }
-void Aidge::GraphView::forwardDims() {
+void Aidge::GraphView::forwardDims(const std::vector<std::vector<Aidge::DimSize_t>> dims) {
    // setInputs
    // Link every tensor to the right pointer
    // following parent - children informations
+    if (!dims.empty()){
+      AIDGE_ASSERT(dims.size() == mInputNodes.size(), "GraphView forwardDims error - Inconsistent number of dimensions and graph inputs");
+      for (std::size_t i = 0; i < dims.size(); ++i){
+        auto tensor = std::make_shared<Tensor>(dims[i]);
+        mInputNodes[i].first->getOperator()->setInput(mInputNodes[i].second, tensor);
+      }
+    }
    for (std::shared_ptr<Node> nodePtr : getNodes()) {
        for (IOIndex_t i = 0; i < nodePtr->nbInputs(); ++i) {
            // assess if the input was not already set and is a Tensor then link it to parent output

--- a/src/scheduler/Scheduler.cpp
+++ b/src/scheduler/Scheduler.cpp
@@ -174,8 +174,28 @@ void Aidge::SequentialScheduler::generateScheduling(bool verbose) {
 }
+/// Bind each given tensor to the graph input of the same rank, following the
+/// order returned by GraphView::getOrderedInputs().
+void Aidge::SequentialScheduler::connectInputs(std::vector<std::shared_ptr<Aidge::Tensor>> data){
+    // Only tensor inputs are handled by this version.
+    const auto graphInputs = mGraphView->getOrderedInputs();
+    // One tensor is expected per graph input
+    assert(data.size() == graphInputs.size()  && "Scheduler connectInput error - Inconsistent number of graph inputs and inputs passed to the graph");
+    std::size_t slot = 0;
+    for (const auto& tensor : data) {
+        // TODO : maybe shallow copy instead of deepcopy
+        const auto& target = graphInputs[slot];
+        target.first->getOperator()->setInput(target.second, tensor);
+        ++slot;
+    }
+}
 // TODO: handle multiple inputs/outputs
-void Aidge::SequentialScheduler::forward(bool forwardDims, bool verbose) {
+void Aidge::SequentialScheduler::forward(bool forwardDims, bool verbose, std::vector<std::shared_ptr<Aidge::Tensor>> data) {
+    // Collect all data input of the graph (that are producers)
+    if (!data.empty()){
+        connectInputs(data);
+    }
    // Forward dims (if allowed)
    if (forwardDims) {mGraphView->forwardDims(); }