/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <catch2/catch_test_macros.hpp>
#include <cmath>
#include <cstdint>
#include <memory>
#include <string>
#include <vector>
#include "aidge/data/Tensor.hpp"
#include "aidge/graph/Node.hpp"
#include "aidge/graph/GraphView.hpp"
#include "aidge/graph/OpArgs.hpp"
#include "aidge/operator/GenericOperator.hpp"
#include "aidge/operator/Memorize.hpp"
#include "aidge/operator/Pop.hpp"
#include "aidge/operator/Stack.hpp"
#include "aidge/operator/Identity.hpp"
#include "aidge/operator/CryptoHash.hpp"
#include "aidge/operator/Mod.hpp"
#include "aidge/operator/Tanh.hpp"
#include "aidge/operator/Select.hpp"
#include "aidge/operator/MetaOperator.hpp"
#include "aidge/scheduler/SequentialScheduler.hpp"
#include "aidge/scheduler/ParallelScheduler.hpp"
#include "aidge/graph/Testing.hpp"
#include "aidge/backend/cpu/operator/FCImpl.hpp"
#include "aidge/backend/cpu/operator/ConvImpl.hpp"
#include "aidge/backend/cpu/operator/ReLUImpl.hpp"
#include "aidge/backend/cpu/operator/SqrtImpl.hpp"
#include "aidge/backend/cpu/operator/AddImpl.hpp"
#include "aidge/backend/cpu/operator/CryptoHashImpl.hpp"
#include "aidge/backend/cpu/operator/ModImpl.hpp"
#include "aidge/backend/cpu/operator/TanhImpl.hpp"
#include "aidge/recipes/GraphViewHelper.hpp"
namespace Aidge {
TEST_CASE("[cpu/scheduler] SequentialScheduler(forward)") {
std::shared_ptr<Tensor> inputTensor =
std::make_shared<Tensor>(Array4D<int, 2, 1, 5, 5>{{{{{0, 1, 2, 3, 4},
{5, 6, 7, 8, 9},
{10, 11, 12, 13, 14},
{15, 16, 17, 18, 19},
{20, 21, 22, 23, 24}}},
{{{25, 26, 27, 28, 29},
{30, 31, 32, 33, 34},
{35, 36, 37, 38, 39},
{40, 41, 42, 43, 44},
{45, 46, 47, 48, 49}}}}});
std::shared_ptr<Tensor> weight1 = std::make_shared<Tensor>(
Array4D<int, 3, 1, 3, 3>{{{{{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}},
{{{10, 11, 12}, {13, 14, 15}, {16, 17, 18}}},
{{{19, 20, 21}, {22, 23, 24}, {25, 26, 27}}}}});
std::shared_ptr<Tensor> bias1 = std::make_shared<Tensor>(Array1D<int, 3>{{1, 2, 3}});
SECTION("Test Sequential graph") {
std::shared_ptr<GraphView> g =
Sequential({
Conv(1, 3, {3, 3}, "conv1"),
Conv(3, 4, {1, 1}, "conv2"),
Conv(4, 3, {1, 1}, "conv3"),
FC(27, 5, false, "fc")});
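        // Set each layer's weights and biases to known integer values so that
        // every layer's output can be checked exactly against precomputed tensors.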
g->getNode("conv1")->getOperator()->setInput(0, inputTensor);
g->getNode("conv1")->getOperator()->setInput(1, weight1);
g->getNode("conv1")->getOperator()->setInput(2, bias1);
std::shared_ptr<Tensor> weight2 =
std::make_shared<Tensor>(Array4D<int, 4, 3, 1, 1>{{{{{1}}, {{2}}, {{3}}},
{{{4}}, {{5}}, {{6}}},
{{{7}}, {{8}}, {{9}}},
{{{10}}, {{11}}, {{12}}}}});
std::shared_ptr<Tensor> bias2 = std::make_shared<Tensor>(Array1D<int, 4>{{1, 2, 3, 4}});
g->getNode("conv2")->getOperator()->setInput(1, weight2);
g->getNode("conv2")->getOperator()->setInput(2, bias2);
std::shared_ptr<Tensor> weight3 = std::make_shared<Tensor>(
Array4D<int, 3, 4, 1, 1>{{{{{1}}, {{2}}, {{3}}, {{4}}},
{{{5}}, {{6}}, {{7}}, {{8}}},
{{{9}}, {{10}}, {{11}}, {{12}}}}});
std::shared_ptr<Tensor> bias3 = std::make_shared<Tensor>(Array1D<int, 3>{{1, 2, 3}});
g->getNode("conv3")->getOperator()->setInput(1, weight3);
g->getNode("conv3")->getOperator()->setInput(2, bias3);
std::shared_ptr<Tensor> weightfc = std::make_shared<Tensor>(
Array2D<int, 5, 27>{{{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12},
{13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9},
{10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6},
{7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5,
6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3},
{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2,
3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}}});
std::shared_ptr<Tensor> biasfc = std::make_shared<Tensor>(Array1D<int, 5>{{1, 2, 3, 4, 5}});
g->getNode("fc")->getOperator()->setInput(1, weightfc);
g->getNode("fc")->getOperator()->setInput(2, biasfc);
g->setDataType(Aidge::DataType::Int32);
g->setBackend("cpu");
g->forwardDims();
SequentialScheduler scheduler(g);
REQUIRE_NOTHROW(scheduler.forward());
scheduler.saveSchedulingDiagram("schedulingSequential");
std::shared_ptr<Tensor> expectedOutput1 = std::make_shared<Tensor>(Array4D<int, 2, 3, 3, 3>{
{{{{367, 412, 457}, {592, 637, 682}, {817, 862, 907}},
{{854, 980, 1106}, {1484, 1610, 1736}, {2114, 2240, 2366}},
{{1341, 1548, 1755}, {2376, 2583, 2790}, {3411, 3618, 3825}}},
{{{1492, 1537, 1582}, {1717, 1762, 1807}, {1942, 1987, 2032}},
{{4004, 4130, 4256}, {4634, 4760, 4886}, {5264, 5390, 5516}},
{{6516, 6723, 6930}, {7551, 7758, 7965}, {8586, 8793, 9000}}}}});
std::shared_ptr<Tensor> expectedOutput2 = std::make_shared<Tensor>(Array4D<int, 2, 4, 3, 3>{
{{{{6099, 7017, 7935}, {10689, 11607, 12525}, {15279, 16197, 17115}},
{{13786, 15838, 17890}, {24046, 26098, 28150}, {34306, 36358, 38410}},
{{21473, 24659, 27845}, {37403, 40589, 43775}, {53333, 56519, 59705}},
{{29160, 33480, 37800}, {50760, 55080, 59400}, {72360, 76680, 81000}}},
{{{29049, 29967, 30885}, {33639, 34557, 35475}, {38229, 39147, 40065}},
{{65086, 67138, 69190}, {75346, 77398, 79450}, {85606, 87658, 89710}},
{{101123, 104309, 107495}, {117053, 120239, 123425}, {132983, 136169, 139355}},
{{137160, 141480, 145800}, {158760, 163080, 167400}, {180360, 184680, 189000}}}}});
std::shared_ptr<Tensor> expectedOutput3 = std::make_shared<Tensor>(Array4D<int, 2, 3, 3, 3>{
{{{{214731, 246591, 278451}, {374031, 405891, 437751}, {533331, 565191, 597051}},
{{496804, 570568, 644332}, {865624, 939388, 1013152}, {1234444, 1308208, 1381972}},
{{778877, 894545, 1010213}, {1357217, 1472885, 1588553}, {1935557, 2051225, 2166893}}},
{{{1011231, 1043091, 1074951}, {1170531, 1202391, 1234251}, {1329831, 1361691, 1393551}},
{{2340904, 2414668, 2488432}, {2709724, 2783488, 2857252}, {3078544, 3152308, 3226072}},
{{3670577, 3786245, 3901913}, {4248917, 4364585, 4480253}, {4827257, 4942925, 5058593}}}}});
Tensor expectedOutput4 = Array2D<int, 2, 5>{
{{205050376, 198925904, 181355097, 196978090, 238868348},
{598467376, 561797804, 560823897, 593043790, 698672948}}};
std::shared_ptr<Tensor> other1 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv1")->getOperator())->getOutput(0);
bool equal1 = (*other1 == *expectedOutput1);
REQUIRE(equal1);
std::shared_ptr<Tensor> other2 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv2")->getOperator())->getOutput(0);
bool equal2 = (*other2 == *expectedOutput2);
REQUIRE(equal2);
std::shared_ptr<Tensor> other3 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv3")->getOperator())->getOutput(0);
bool equal3 = (*other3 == *expectedOutput3);
REQUIRE(equal3);
std::shared_ptr<Tensor> other4 = std::static_pointer_cast<OperatorTensor>(g->getNode("fc")->getOperator())->getOutput(0);
bool equal4 = (*other4 == expectedOutput4);
REQUIRE(equal4);
}
SECTION("Test Parallel graph") {
std::shared_ptr<GraphView> g =
Sequential({Conv(1, 3, {3, 3}, "inputConv"),
Parallel({
Sequential({
Parallel({
Conv(3, 3, {1, 1}, "conv1.1"),
Conv(3, 3, {1, 1}, "conv1.2")}),
Add("add1")}),
Conv(3, 3, {1, 1}, "conv1.3")}),
Add("add2"),
Conv(3, 2, {1, 1}, "conv2"),
FC(18, 5, false, "out")});
g->getNode("inputConv")->getOperator()->setInput(0, inputTensor);
g->getNode("inputConv")->getOperator()->setInput(1, weight1);
g->getNode("inputConv")->getOperator()->setInput(2, bias1);
std::shared_ptr<Tensor> conv11Weight = std::make_shared<Tensor>(Array4D<int, 3, 3, 1, 1>{
{{{{1}}, {{2}}, {{3}}}, {{{4}}, {{5}}, {{6}}}, {{{7}}, {{8}}, {{9}}}}});
g->getNode("conv1.1")->getOperator()->setInput(1, conv11Weight);
g->getNode("conv1.1")->getOperator()->setInput(2, bias1);
std::shared_ptr<Tensor> conv12Weight = std::make_shared<Tensor>(Array4D<int, 3, 3, 1, 1>{
{{{{11}}, {{12}}, {{13}}}, {{{14}}, {{15}}, {{16}}}, {{{17}}, {{18}}, {{19}}}}});
g->getNode("conv1.2")->getOperator()->setInput(1, conv12Weight);
g->getNode("conv1.2")->getOperator()->setInput(2, bias1);
std::shared_ptr<Tensor> conv13Weight = std::make_shared<Tensor>(Array4D<int, 3, 3, 1, 1>{
{{{{21}}, {{22}}, {{23}}}, {{{24}}, {{25}}, {{26}}}, {{{27}}, {{28}}, {{29}}}}});
g->getNode("conv1.3")->getOperator()->setInput(1, conv13Weight);
g->getNode("conv1.3")->getOperator()->setInput(2, bias1);
std::shared_ptr<Tensor> conv2Weight = std::make_shared<Tensor>(
Array4D<int, 2, 3, 1, 1>{{{{{1}}, {{2}}, {{3}}}, {{{4}}, {{5}}, {{6}}}}});
std::shared_ptr<Tensor> bias2 = std::make_shared<Tensor>(Array1D<int, 2>{{1, 2}});
g->getNode("conv2")->getOperator()->setInput(1, conv2Weight);
g->getNode("conv2")->getOperator()->setInput(2, bias2);
std::shared_ptr<Tensor> fcWeight = std::make_shared<Tensor>(
Array2D<int, 5, 18>{{{1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3},
{4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1},
{2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4},
{5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2},
{3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5}}});
std::shared_ptr<Tensor> fcBias = std::make_shared<Tensor>(Array1D<int, 5>{{1, 2, 3, 4, 5}});
g->getNode("out")->getOperator()->setInput(1, fcWeight);
g->getNode("out")->getOperator()->setInput(2, fcBias);
std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(
Array2D<int, 2, 5>{{{124324368, 130692907, 133325056, 125044620, 142843879},
{369195468, 394615207, 382643056, 379441320, 416291779}}});
g->setBackend("cpu");
g->setDataType(Aidge::DataType::Int32);
g->forwardDims();
SequentialScheduler scheduler(g);
REQUIRE_NOTHROW(scheduler.forward());
        scheduler.saveSchedulingDiagram("schedulingParallelGraph");
std::shared_ptr<Tensor> result =
std::static_pointer_cast<Tensor>(g->getNode("out")->getOperator()->getRawOutput(0));
bool equal = (*result == *expectedOutput);
REQUIRE(equal);
}
SECTION("Test Residual graph") {
}
SECTION("Test Recurrent graph (sequential)") {
std::shared_ptr<Tensor> in = std::make_shared<Tensor>(
Array2D<int, 2, 3>{{{1, 2, 3}, {4, 5, 6}}});
std::shared_ptr<Tensor> initTensor = std::make_shared<Tensor>(
Array2D<int, 2, 3>{{{0, 0, 0}, {1, 1, 1}}});
std::shared_ptr<Tensor> biasTensor = std::make_shared<Tensor>(
Array2D<int, 2, 3>{{{2, 0, 0}, {1, 0, 0}}});
auto add1 = Add("add1");
auto mem = Memorize(3, "mem1");
auto add2 = Add("add2");
auto bias = Producer(biasTensor, "bias");
auto init = Producer(initTensor, "init");
auto input = Producer(in, "input");
std::shared_ptr<GraphView> g = Sequential({add1, mem, add2});
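        // Wire the recurrence: Memorize's output #1 is the delayed state fed
        // back into add1, and init seeds the state (Memorize input #1), so the
        // loop computes state_t = input + state_{t-1} over 3 iterations.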
init->addChild(mem, 0, 1);
mem->addChild(add1, 1, 1);
bias->addChild(add2, 0, 1);
input->addChild(add1, 0, 0);
// Update GraphView inputs/outputs following previous connections:
g->add({mem, add1, add2, init, bias, input});
g->setBackend("cpu");
g->setDataType(Aidge::DataType::Int32);
g->save("graphRecurrent");
g->forwardDims();
SequentialScheduler scheduler(g);
REQUIRE_NOTHROW(scheduler.forward(true));
scheduler.saveStaticSchedulingDiagram("static_schedule");
scheduler.saveSchedulingDiagram("schedulingRecurrent_seq");
std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(
Array2D<int, 2, 3>{{{5, 6, 9}, {14, 16, 19}}});
std::shared_ptr<Tensor> result =
std::static_pointer_cast<Tensor>(g->getNode("add2")->getOperator()->getRawOutput(0));
result->print();
expectedOutput->print();
bool equal = (*result == *expectedOutput);
REQUIRE(equal);
}
SECTION("Test Recurrent graph (parallel)") {
std::shared_ptr<Tensor> in = std::make_shared<Tensor>(
Array2D<int, 2, 3>{{{1, 2, 3}, {4, 5, 6}}});
std::shared_ptr<Tensor> initTensor = std::make_shared<Tensor>(
Array2D<int, 2, 3>{{{0, 0, 0}, {1, 1, 1}}});
std::shared_ptr<Tensor> biasTensor = std::make_shared<Tensor>(
Array2D<int, 2, 3>{{{2, 0, 0}, {1, 0, 0}}});
auto add1 = Add("add1");
auto mem = Memorize(3, "mem1");
auto add2 = Add("add2");
auto bias = Producer(biasTensor, "bias");
auto init = Producer(initTensor, "init");
auto input = Producer(in, "input");
std::shared_ptr<GraphView> g = Sequential({add1, mem, add2});
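        // Same recurrent wiring and expected result as the sequential section
        // above; only the scheduler type (ParallelScheduler) differs.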
init->addChild(mem, 0, 1);
mem->addChild(add1, 1, 1);
bias->addChild(add2, 0, 1);
input->addChild(add1, 0, 0);
// Update GraphView inputs/outputs following previous connections:
g->add({mem, add1, add2, init, bias, input});
g->setBackend("cpu");
g->setDataType(Aidge::DataType::Int32);
g->save("graphRecurrent");
g->forwardDims();
ParallelScheduler scheduler(g);
REQUIRE_NOTHROW(scheduler.forward(true));
scheduler.saveSchedulingDiagram("schedulingRecurrent_par");
std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(
Array2D<int, 2, 3>{{{5, 6, 9}, {14, 16, 19}}});
std::shared_ptr<Tensor> result =
std::static_pointer_cast<Tensor>(g->getNode("add2")->getOperator()->getRawOutput(0));
result->print();
expectedOutput->print();
bool equal = (*result == *expectedOutput);
REQUIRE(equal);
}
SECTION("Test ConnectInput graph") {
std::shared_ptr<GraphView> g =
Sequential({
Conv(1, 3, {3, 3}, "conv1"),
Conv(3, 4, {1, 1}, "conv2"),
Conv(4, 3, {1, 1}, "conv3"),
FC(27, 5, false, "fc")});
// g->getNode("conv1")->getOperator()->setInput(0, inputTensor);
g->getNode("conv1")->getOperator()->setInput(1, weight1);
g->getNode("conv1")->getOperator()->setInput(2, bias1);
std::shared_ptr<Tensor> weight2 =
std::make_shared<Tensor>(Array4D<int, 4, 3, 1, 1>{{{{{1}}, {{2}}, {{3}}},
{{{4}}, {{5}}, {{6}}},
{{{7}}, {{8}}, {{9}}},
{{{10}}, {{11}}, {{12}}}}});
std::shared_ptr<Tensor> bias2 = std::make_shared<Tensor>(Array1D<int, 4>{{1, 2, 3, 4}});
g->getNode("conv2")->getOperator()->setInput(1, weight2);
g->getNode("conv2")->getOperator()->setInput(2, bias2);
std::shared_ptr<Tensor> weight3 = std::make_shared<Tensor>(
Array4D<int, 3, 4, 1, 1>{{{{{1}}, {{2}}, {{3}}, {{4}}},
{{{5}}, {{6}}, {{7}}, {{8}}},
{{{9}}, {{10}}, {{11}}, {{12}}}}});
std::shared_ptr<Tensor> bias3 = std::make_shared<Tensor>(Array1D<int, 3>{{1, 2, 3}});
g->getNode("conv3")->getOperator()->setInput(1, weight3);
g->getNode("conv3")->getOperator()->setInput(2, bias3);
std::shared_ptr<Tensor> weightfc = std::make_shared<Tensor>(
Array2D<int, 5, 27>{{{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12},
{13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9},
{10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6},
{7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5,
6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3},
{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2,
3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}}});
std::shared_ptr<Tensor> biasfc = std::make_shared<Tensor>(Array1D<int, 5>{{1, 2, 3, 4, 5}});
g->getNode("fc")->getOperator()->setInput(1, weightfc);
g->getNode("fc")->getOperator()->setInput(2, biasfc);
g->setDataType(Aidge::DataType::Int32);
g->setBackend("cpu");
std::vector<std::vector<Aidge::DimSize_t>> dims = {inputTensor->dims()};
g->forwardDims(dims);
SequentialScheduler scheduler(g);
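        // The input tensor is supplied directly to forward() rather than
        // through a Producer node connected to the graph.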
std::vector<std::shared_ptr<Aidge::Tensor>> dataIn = {inputTensor};
REQUIRE_NOTHROW(scheduler.forward(true, dataIn));
        scheduler.saveSchedulingDiagram("schedulingConnectInput");
std::shared_ptr<Tensor> expectedOutput1 = std::make_shared<Tensor>(Array4D<int, 2, 3, 3, 3>{
{{{{367, 412, 457}, {592, 637, 682}, {817, 862, 907}},
{{854, 980, 1106}, {1484, 1610, 1736}, {2114, 2240, 2366}},
{{1341, 1548, 1755}, {2376, 2583, 2790}, {3411, 3618, 3825}}},
{{{1492, 1537, 1582}, {1717, 1762, 1807}, {1942, 1987, 2032}},
{{4004, 4130, 4256}, {4634, 4760, 4886}, {5264, 5390, 5516}},
{{6516, 6723, 6930}, {7551, 7758, 7965}, {8586, 8793, 9000}}}}});
std::shared_ptr<Tensor> expectedOutput2 = std::make_shared<Tensor>(Array4D<int, 2, 4, 3, 3>{
{{{{6099, 7017, 7935}, {10689, 11607, 12525}, {15279, 16197, 17115}},
{{13786, 15838, 17890}, {24046, 26098, 28150}, {34306, 36358, 38410}},
{{21473, 24659, 27845}, {37403, 40589, 43775}, {53333, 56519, 59705}},
{{29160, 33480, 37800}, {50760, 55080, 59400}, {72360, 76680, 81000}}},
{{{29049, 29967, 30885}, {33639, 34557, 35475}, {38229, 39147, 40065}},
{{65086, 67138, 69190}, {75346, 77398, 79450}, {85606, 87658, 89710}},
{{101123, 104309, 107495}, {117053, 120239, 123425}, {132983, 136169, 139355}},
{{137160, 141480, 145800}, {158760, 163080, 167400}, {180360, 184680, 189000}}}}});
std::shared_ptr<Tensor> expectedOutput3 = std::make_shared<Tensor>(Array4D<int, 2, 3, 3, 3>{
{{{{214731, 246591, 278451}, {374031, 405891, 437751}, {533331, 565191, 597051}},
{{496804, 570568, 644332}, {865624, 939388, 1013152}, {1234444, 1308208, 1381972}},
{{778877, 894545, 1010213}, {1357217, 1472885, 1588553}, {1935557, 2051225, 2166893}}},
{{{1011231, 1043091, 1074951}, {1170531, 1202391, 1234251}, {1329831, 1361691, 1393551}},
{{2340904, 2414668, 2488432}, {2709724, 2783488, 2857252}, {3078544, 3152308, 3226072}},
{{3670577, 3786245, 3901913}, {4248917, 4364585, 4480253}, {4827257, 4942925, 5058593}}}}});
Tensor expectedOutput4 = Array2D<int, 2, 5>{
{{205050376, 198925904, 181355097, 196978090, 238868348},
{598467376, 561797804, 560823897, 593043790, 698672948}}};
std::shared_ptr<Tensor> other1 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv1")->getOperator())->getOutput(0);
bool equal1 = (*other1 == *expectedOutput1);
REQUIRE(equal1);
std::shared_ptr<Tensor> other2 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv2")->getOperator())->getOutput(0);
bool equal2 = (*other2 == *expectedOutput2);
REQUIRE(equal2);
std::shared_ptr<Tensor> other3 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv3")->getOperator())->getOutput(0);
bool equal3 = (*other3 == *expectedOutput3);
REQUIRE(equal3);
std::shared_ptr<Tensor> other4 = std::static_pointer_cast<OperatorTensor>(g->getNode("fc")->getOperator())->getOutput(0);
bool equal4 = (*other4 == expectedOutput4);
REQUIRE(equal4);
}
}
TEST_CASE("[cpu/scheduler] SequentialScheduler(backward)", "[scheduler][backward]") {
// create GraphView
    std::shared_ptr<GraphView> gv = Sequential({ReLU("relu0"), Sqrt("sqrt0"), ReLU("relu1")});
std::shared_ptr<Tensor> inputTensor =
std::make_shared<Tensor>(Array4D<float, 2, 1, 5, 5>{{{{{0.0f, 1.0f, 2.0f, 3.0f, 4.0f},
{5.0f, 6.0f, 7.0f, 8.0f, 9.0f},
{10.0f, 11.0f, 12.0f, 13.0f, 14.0f},
{15.0f, 16.0f, 17.0f, 18.0f, 19.0f},
{20.0f, 21.0f, 22.0f, 23.0f, 24.0f}}},
{{{25.0f, 26.0f, 27.0f, 28.0f, 29.0f},
{30.0f, 31.0f, 32.0f, 33.0f, 34.0f},
{35.0f, 36.0f, 37.0f, 38.0f, 39.0f},
{40.0f, 41.0f, 42.0f, 43.0f, 44.0f},
{45.0f, 46.0f, 47.0f, 48.0f, 49.0f}}}}});
    // implem already set to default
    auto myProd = Producer(inputTensor, "prod");
    myProd->addChild(gv);
    gv->compile("cpu", DataType::Float32);
SequentialScheduler scheduler(gv);
scheduler.forward();
auto outNode = gv->getOrderedOutputs()[0].first;
std::shared_ptr<Tensor> predictedOutput = std::dynamic_pointer_cast<OperatorTensor>(outNode->getOperator())->getOutput(0);
std::shared_ptr<Tensor> targetOutput =
std::make_shared<Tensor>(Array4D<float, 2, 1, 5, 5>{{{{{0.0f, 1.0f, 1.0f, 2.0f, 2.0f},
{2.0f, 2.0f, 3.0f, 3.0f, 3.0f},
{3.0f, 3.0f, 3.0f, 4.0f, 4.0f},
{4.0f, 4.0f, 4.0f, 4.0f, 4.0f},
{4.0f, 5.0f, 5.0f, 5.0f, 5.0f}}},
{{{5.0f, 5.0f, 5.0f, 5.0f, 5.0f},
{5.0f, 6.0f, 6.0f, 6.0f, 6.0f},
{6.0f, 6.0f, 6.0f, 6.0f, 6.0f},
{6.0f, 6.0f, 6.0f, 7.0f, 7.0f},
{7.0f, 7.0f, 7.0f, 7.0f, 7.0f}}}}});
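    // Seed the output gradient with the target tensor so backward() has a
    // starting point to propagate from.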
predictedOutput->setGrad(targetOutput);
REQUIRE_NOTHROW(scheduler.backward());
}
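// Helper building a meta-operator that accumulates its input over seqLength
// steps: a Memorize node holds the running sum and an Add node folds each new
// input into it (state_t = input_t + state_{t-1}).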
std::shared_ptr<Node> Accumulate(int seqLength, const std::string& name) {
auto input = Identity((!name.empty()) ? name + "_input" : "");
auto hiddenState = Memorize(seqLength, (!name.empty()) ? name + "_hidden_state" : "");
auto add = Add((!name.empty()) ? name + "_add" : "");
input->addChild(add, 0, 0);
    add->addChild(hiddenState, 0, 0);
hiddenState->addChild(/*otherNode=*/add, /*outId=*/1, /*otherInId=*/1);
std::shared_ptr<GraphView> microGraph = std::make_shared<GraphView>();
microGraph->add(input);
microGraph->add({hiddenState, add});
microGraph->setOrderedInputs({{input, 0}, {hiddenState, 1}});
microGraph->setOrderedOutputs({{hiddenState, 0}});
auto metaOp = MetaOperator("Accumulate", microGraph, {}, name);
return metaOp;
}
TEST_CASE("[cpu/scheduler] Accumulate", "[scheduler]") {
std::shared_ptr<Tensor> Input = std::make_shared<Tensor>(
Array3D<float, 2, 3, 2>{{{{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}},
{{2.0, 3.0}, {4.0, 5.0}, {6.0, 7.0}}}});
std::shared_ptr<Tensor> MemInit =
std::make_shared<Tensor>(Array2D<float, 3, 2>{
{{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}});
auto meta = Accumulate(2, "accumulate");
auto op = std::static_pointer_cast<MetaOperator_Op>(meta->getOperator());
auto pop_i = Pop("pop_input");
    auto pop_o = Identity("pop_output"); // NOTE: could be Identity, Stack, or any other node; its type is not what is being tested here
pop_i->getOperator()->associateInput(0, Input);
pop_i->addChild(op->getMicroGraph()->getOrderedInputs()[0].first, 0, 0);
op->getMicroGraph()->getOrderedOutputs()[0].first->addChild(pop_o, 0, 0);
//pop_i->addChild(meta, 0, 0);
//meta->addChild(pop_o, 0, 0);
//op->associateInput(1, MemInit);
op->getMicroGraph()->getNode("accumulate_hidden_state")->getOperator()->associateInput(1, MemInit);
// Build the graph.
auto myGraph = std::make_shared<GraphView>();
myGraph->add(pop_i);
myGraph->add(op->getMicroGraph());
//myGraph->add(meta);
myGraph->add(pop_o);
myGraph->compile("cpu", DataType::Float32);
myGraph->save("accumulate_graph", true);
// Schedule and run
auto scheduler = SequentialScheduler(myGraph);
scheduler.generateScheduling();
scheduler.saveStaticSchedulingDiagram("accumulate_scheduling");
REQUIRE_NOTHROW(scheduler.forward(true));
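    // Expected: element-wise sum of the two time steps of Input, e.g.
    // {1+2, 2+3} = {3, 5} up to {5+6, 6+7} = {11, 13}.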
std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(
Array2D<float, 3, 2>{{{3.0, 5.0}, {7.0, 9.0}, {11.0, 13.0}}});
std::shared_ptr<Tensor> output = std::static_pointer_cast<OperatorTensor>(pop_o->getOperator())->getOutput(0);
REQUIRE(*output == *expectedOutput);
}
TEST_CASE("[cpu/scheduler] Branch", "[scheduler]") {
std::shared_ptr<Tensor> in = std::make_shared<Tensor>(
Array2D<float, 2, 3>{{{1, 2, 3}, {4, 5, 6}}});
std::shared_ptr<GraphView> g = Sequential({
Producer(in, "input"),
Parallel({
Sequential({
GenericOperator("b0_op1", {InputCategory::Data}, 1),
GenericOperator("b0_op2", {InputCategory::Data}, 1),
GenericOperator("b0_op3", {InputCategory::Data}, 1),
GenericOperator("b0_op4", {InputCategory::Data}, 1),
GenericOperator("b0_op5", {InputCategory::Data}, 1)
}),
Sequential({
GenericOperator("b1_op1", {InputCategory::Data}, 1),
GenericOperator("b1_op2", {InputCategory::Data}, 1),
GenericOperator("b1_op3", {InputCategory::Data}, 1)
}),
Sequential({
GenericOperator("b2_op1", {InputCategory::Data}, 1)
})
}),
GenericOperator("op1", {InputCategory::Data, InputCategory::Data, InputCategory::Data}, 1),
GenericOperator("op2", {InputCategory::Data}, 1),
GenericOperator("op3", {InputCategory::Data}, 1)
});
g->save("branch_forwarded");
auto scheduler = SequentialScheduler(g);
scheduler.generateScheduling();
scheduler.saveStaticSchedulingDiagram("branch_scheduling");
    // The default scheduling order is not necessarily deterministic, but it is
    // guaranteed to be correct in every case.
    // This behavior might change in the future.
auto seqSchedule = scheduler.Scheduler::getSequentialStaticScheduling(0, Scheduler::SchedulingPolicy::Default);
fmt::println("seqSchedule = {}", seqSchedule);
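    // tagForkBranches() annotates the nodes of each fork branch so that the
    // branch-ordering policies checked below can distinguish them.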
scheduler.tagForkBranches();
g->save("branch_forwarded_tag");
seqSchedule = scheduler.Scheduler::getSequentialStaticScheduling(0, Scheduler::SchedulingPolicy::ShortestBranchFirst);
REQUIRE(nodePtrTo(seqSchedule, nodePtrToType) == std::vector<std::string>{
"Producer", "b2_op1", "b1_op1", "b1_op2", "b1_op3", "b0_op1", "b0_op2", "b0_op3", "b0_op4", "b0_op5", "op1", "op2", "op3"});
seqSchedule = scheduler.Scheduler::getSequentialStaticScheduling(0, Scheduler::SchedulingPolicy::LonguestBranchFirst);
REQUIRE(nodePtrTo(seqSchedule, nodePtrToType) == std::vector<std::string>{
"Producer", "b0_op1", "b0_op2", "b0_op3", "b0_op4", "b0_op5", "b1_op1", "b1_op2", "b1_op3", "b2_op1", "op1", "op2", "op3"});
}
#ifdef WITH_OPENSSL
TEST_CASE("[cpu/scheduler] Select", "[scheduler]") {
std::shared_ptr<Tensor> in = std::make_shared<Tensor>(
Array2D<float, 2, 3>{{{1, 2, 3}, {4, 5, 6}}});
std::shared_ptr<GraphView> g = Sequential({
Producer(in, "input"),
Parallel({
Sequential({
CryptoHash("hash"),
Mod("mod")
}),
ReLU("relu"),
Tanh("tanh"),
Sqrt("sqrt")
}),
Select(3, "select")
});
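    // The CryptoHash + Mod branch hashes the input and reduces it modulo 3;
    // Select uses the result to pick one of the ReLU, Tanh and Sqrt branches.
    // For this input it resolves to the Sqrt branch (index 2), as the expected
    // outputs below confirm.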
auto modProd = Producer(std::make_shared<Tensor>(Array1D<uint64_t, 1>{{3}}));
modProd->addChild(g->getNode("mod"), 0, 1);
g->add(modProd);
g->getNode("hash")->getOperator()->setDataType(DataType::UInt64);
g->getNode("mod")->getOperator()->setDataType(DataType::UInt64);
g->setBackend("cpu");
g->save("select");
auto scheduler = SequentialScheduler(g);
scheduler.generateScheduling();
scheduler.saveStaticSchedulingDiagram("select_scheduling");
REQUIRE_NOTHROW(scheduler.forward(true));
g->save("select_forwarded");
auto expectedOutputHash = std::make_shared<Tensor>(
Array1D<uint64_t, 4>{{0x1b7cf58dfe2dae24, 0x3bac903def4ce580, 0x5f5a347389d97f41, 0x2c2dc759abc6b61}});
auto outputHash = std::static_pointer_cast<OperatorTensor>(g->getNode("hash")->getOperator())->getOutput(0);
REQUIRE(*outputHash == *expectedOutputHash);
auto expectedOutputMod = std::make_shared<Tensor>(
Array1D<uint64_t, 4>{{2, 1, 1, 2}});
auto outputMod = std::static_pointer_cast<OperatorTensor>(g->getNode("mod")->getOperator())->getOutput(0);
REQUIRE(*outputMod == *expectedOutputMod);
auto expectedOutput = std::make_shared<Tensor>(
Array2D<float, 2, 3>{{{std::sqrt(1), std::sqrt(2), std::sqrt(3)}, {std::sqrt(4), std::sqrt(5), std::sqrt(6)}}});
auto output = std::static_pointer_cast<OperatorTensor>(g->getNode("select")->getOperator())->getOutput(0);
REQUIRE(*output == *expectedOutput);
scheduler.resetScheduling();
scheduler.tagConditionalNodes();
REQUIRE(g->getNode("relu")->attributes()->hasAttr("schedule.cond"));
REQUIRE(g->getNode("relu")->attributes()->getAttr<std::set<std::pair<NodePtr, size_t>>>("schedule.cond")
== std::set<std::pair<NodePtr, size_t>>{{g->getNode("select"), 0}});
REQUIRE(g->getNode("tanh")->attributes()->hasAttr("schedule.cond"));
REQUIRE(g->getNode("tanh")->attributes()->getAttr<std::set<std::pair<NodePtr, size_t>>>("schedule.cond")
== std::set<std::pair<NodePtr, size_t>>{{g->getNode("select"), 1}});
REQUIRE(g->getNode("sqrt")->attributes()->hasAttr("schedule.cond"));
REQUIRE(g->getNode("sqrt")->attributes()->getAttr<std::set<std::pair<NodePtr, size_t>>>("schedule.cond")
== std::set<std::pair<NodePtr, size_t>>{{g->getNode("select"), 2}});
REQUIRE(!g->getNode("input")->attributes()->hasAttr("schedule.cond"));
scheduler.generateScheduling();
scheduler.saveStaticSchedulingDiagram("select_scheduling_tag");
REQUIRE_NOTHROW(scheduler.forward(true));
}
#endif
} // namespace Aidge