/********************************************************************************
 * Copyright (c) 2023 CEA-List
 *
 * This program and the accompanying materials are made available under the
 * terms of the Eclipse Public License 2.0 which is available at
 * http://www.eclipse.org/legal/epl-2.0.
 *
 * SPDX-License-Identifier: EPL-2.0
 *
 ********************************************************************************/

#include <cassert>
#include <chrono>  // std::chrono::milliseconds
#include <numeric> // std::accumulate
#include <thread>  // std::this_thread::sleep_for
#include <vector>

#include "aidge/operator/FC.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include "aidge/backend/cpu/operator/FCImpl.hpp"
#include "aidge/backend/cpu/operator/FCImpl_forward_kernels.hpp"

void Aidge::FCImpl_cpu::forward()
{
    const FC_Op& op_ = dynamic_cast<const FC_Op&>(mOp);
    assert((op_.getInput(0)) && "missing input #0");
    assert((op_.getInput(1)) && "missing input #1");
    assert((op_.getInput(2)) && "missing input #2");

    // Find the correct kernel type
    const auto outputDataType = op_.getOutput(0)->dataType();
    const Registrar<FCImplForward_cpu>::registrar_key registrarKey = {
        op_.getInput(0)->dataType(),
        op_.getInput(1)->dataType(),
        op_.getInput(2)->dataType(),
        outputDataType};

    Registrar<FCImplForward_cpu>::registrar_type kernelFunc;
    if (Registrar<FCImplForward_cpu>::exists(registrarKey)) {
        // One exists with the right inputs/output types
        kernelFunc = Registrar<FCImplForward_cpu>::create(registrarKey);
    }
    else {
        // Otherwise, fallback to the kernel with all types matching output type
        kernelFunc = Registrar<FCImplForward_cpu>::create({
            outputDataType, outputDataType, outputDataType, outputDataType});
    }

    // Convert input data (no overhead if not needed!)
    // TODO: right now, if needed, memory will be allocated/deallocated at each
    // call to forward(). We might put the following shared_ptr as members of
    // this class to avoid that.
    std::shared_ptr<Tensor> input0Fallback, input1Fallback, input2Fallback;
    const auto& input0 = op_.getInput(0)->refCastFrom(input0Fallback, *(op_.getOutput(0)));
    const auto& input1 = op_.getInput(1)->refCastFrom(input1Fallback, *(op_.getOutput(0)));
    const auto& input2 = op_.getInput(2)->refCastFrom(input2Fallback, *(op_.getOutput(0)));
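    // Note: the kernel views the input as a 2-D [batchSize, inputFeatureSize]
    // matrix. An input with more than one dimension is flattened along its
    // trailing dimensions (inputFeatureSize = input0.size() / batchSize), and
    // a 1-D input is treated as a single batch element.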
    // Call kernel
    const auto batchSize = (input0.dims().size() > 1) ? input0.dims()[0] : 1;
    kernelFunc(dynamic_cast<const FC_Op&>(mOp).getStaticAttributes(),
        batchSize,
        input0.size() / batchSize,
        input0.getImpl()->rawPtr(), input1.getImpl()->rawPtr(), input2.getImpl()->rawPtr(),
        getCPUPtr(mOp.getRawOutput(0)));
}

// void Aidge::FCImpl_cpu::backward()
// {
//     const FC_Op& op_ = dynamic_cast<const FC_Op&>(mOp);
//     const auto& fc_grad = op_.getOutput(0)->grad();
//     assert(fc_grad && "missing output #0 gradient");
//
//     // Find the correct kernel type
//     const Registrar<FCImplBackward_cpu>::registrar_key registrarKey = {
//         op_.getInput(0)->grad()->dataType(),
//         op_.getInput(1)->grad()->dataType(),
//         op_.getInput(2)->grad()->dataType(),
//         fc_grad->dataType()};
//
//     Registrar<FCImplBackward_cpu>::registrar_type kernelFunc;
//     if (Registrar<FCImplBackward_cpu>::exists(registrarKey)) {
//         // One exists with the right inputs/output types
//         kernelFunc = Registrar<FCImplBackward_cpu>::create(registrarKey);
//     }
//     else {
//         // Otherwise, fallback to the kernel with all types matching output type
//         kernelFunc = Registrar<FCImplBackward_cpu>::create({
//             fc_grad->dataType(), fc_grad->dataType(), fc_grad->dataType(), fc_grad->dataType()});
//     }
//
//     // Convert input data (no overhead if not needed!)
//     // TODO: right now, if needed, memory will be allocated/deallocated at each
//     // call to backward(). We might put the following shared_ptr as members of
//     // this class to avoid that.
//     std::shared_ptr<Tensor> input0gradFallback, input1gradFallback, input2gradFallback;
//     const auto& input0grad = op_.getInput(0)->grad()->refCastFrom(input0gradFallback, *(op_.getOutput(0)));
//     const auto& input1grad = op_.getInput(1)->grad()->refCastFrom(input1gradFallback, *(op_.getOutput(0)));
//     const auto& input2grad = op_.getInput(2)->grad()->refCastFrom(input2gradFallback, *(op_.getOutput(0)));
//
//     // Call kernel
//     const auto batchSize = (input0grad.dims().size() > 1) ? input0grad.dims()[0] : 1;
//     kernelFunc(dynamic_cast<const FC_Op&>(mOp).getStaticAttributes(),
//         batchSize,
//         input0grad.size() / batchSize,
//         input0grad.getImpl()->rawPtr(), input1grad.getImpl()->rawPtr(), input2grad.getImpl()->rawPtr(),
//         getCPUPtr(mOp.getRawOutput(0)));
// }
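// For reference, the exists()/create() lookup in forward() resolves kernels
// registered against a key of {input0, input1, input2, output} data types.
// A minimal registration sketch (assuming the pattern used in
// FCImpl_forward_kernels.hpp; exact registrar and kernel names may differ):
//
//     namespace {
//     static Registrar<FCImplForward_cpu> registrarFCImplForward_cpu_Float32(
//             {DataType::Float32, DataType::Float32, DataType::Float32, DataType::Float32},
//             Aidge::FCImpl_cpu_forward_kernel<float, float, float, float>);
//     }
//
// With such an entry in place, a Float32 FC node dispatches to this kernel
// directly; otherwise forward() falls back to the all-output-type variant.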