/********************************************************************************
 * Copyright (c) 2023 CEA-List
 *
 * This program and the accompanying materials are made available under the
 * terms of the Eclipse Public License 2.0 which is available at
 * http://www.eclipse.org/legal/epl-2.0.
 *
 * SPDX-License-Identifier: EPL-2.0
 *
 ********************************************************************************/

#include "aidge/backend/cpu/operator/ConvDepthWiseImpl.hpp"

#include <memory>
#include <vector>

#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include "aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp"
#include "aidge/data/Tensor.hpp"
#include "aidge/operator/ConvDepthWise.hpp"
#include "aidge/utils/Log.hpp"
#include "aidge/utils/Types.h"

/**
 * @brief Forward pass of the 1D depthwise convolution for the CPU backend.
 *
 * Requires inputs #0 (data) and #1 (weight) to be set and input #0 to have
 * exactly 3 dimensions. Input #2 (bias) is optional: when it is absent, a null
 * pointer is handed to the kernel instead of a bias buffer.
 */
template <>
void Aidge::ConvDepthWiseImpl1D_cpu::forward() {
    const auto& op_ = dynamic_cast<const ConvDepthWise_Op<1>&>(mOp);

    AIDGE_ASSERT(op_.getInput(0), "missing input #0 in ConvDepthWise Operator");
    AIDGE_ASSERT(op_.getInput(1), "missing input #1 in ConvDepthWise Operator");

    // The kernel is called with dims<3>() below, so the rank must be exactly 3.
    AIDGE_ASSERT((op_.getInput(0)->nbDims() == 3), "support for 3-dimensions tensors only");

    // Find the correct kernel type
    const auto impl = Registrar<ConvDepthWiseImpl1D_cpu>::create(getBestMatch(getRequiredSpec()));

    // Convert input data (no overhead if not needed!)
    // TODO: right now, if needed, memory will be allocated/deallocated at each
    // call to forward(). We might put the following shared_ptr as members of
    // this class to avoid that.
    std::shared_ptr<Tensor> input0Fallback, input1Fallback, input2Fallback;
    const auto& input0 = op_.getInput(0)->refCastFrom(input0Fallback, *op_.getOutput(0));
    const auto& input1 = op_.getInput(1)->refCastFrom(input1Fallback, *op_.getOutput(0));
    // Bias is optional: an empty Tensor stands in when input #2 is not set; it
    // is never dereferenced because the kernel receives nullptr in that case.
    const auto& input2 = (op_.getInput(2)) ? op_.getInput(2)->refCastFrom(input2Fallback, *op_.getOutput(0)) : Tensor();

    // Call kernel
    impl.forward(op_.strideDims(),
                 op_.dilationDims(),
                 op_.kernelDims(), // Conv attributes
                 op_.getInput(0)->template dims<3>(), // input dimensions
                 input0.getImpl()->rawPtr(), // input
                 input1.getImpl()->rawPtr(), // weight
                 (op_.getInput(2)) ? input2.getImpl()->rawPtr() : nullptr, // bias
                 getCPUPtr(op_.getRawOutput(0)) // output
                );
}

/**
 * @brief Backward pass of the 1D depthwise convolution for the CPU backend.
 *
 * Not implemented: every call reports a std::runtime_error through
 * AIDGE_THROW_OR_ABORT.
 */
template <>
void Aidge::ConvDepthWiseImpl1D_cpu::backward() {
    AIDGE_THROW_OR_ABORT(
        std::runtime_error,
        "Backward not yet implemented for ConvDepthWise_Op<1> on backend cpu");
}

/**
 * @brief Forward pass of the 2D depthwise convolution for the CPU backend.
 *
 * Requires inputs #0, #1 and #2 to be set and input #0 to have exactly
 * 4 dimensions, then looks up the registered kernel and calls it on the
 * raw buffers of the (possibly converted) inputs and of output #0.
 */
template <>
void Aidge::ConvDepthWiseImpl2D_cpu::forward() {
    const auto& convOp = dynamic_cast<const ConvDepthWise_Op<2>&>(mOp);

    const auto& dataIn = convOp.getInput(0);
    const auto& weightIn = convOp.getInput(1);
    const auto& biasIn = convOp.getInput(2);

    AIDGE_ASSERT(dataIn, "missing input #0 in ConvDepthWise Operator");
    AIDGE_ASSERT(weightIn, "missing input #1 in ConvDepthWise Operator");
    // NOTE(review): input #2 is mandatory here, yet the code below still
    // carries a fallback for a missing input #2 (and the 1D variant treats it
    // as optional). Confirm which behaviour is intended.
    AIDGE_ASSERT(biasIn, "missing input #2 in ConvDepthWise Operator");

    AIDGE_ASSERT((dataIn->nbDims() == 4), "support for 4-dimensions tensors only");

    // Pick the registered kernel that best matches the required specification.
    const auto kernel = Registrar<ConvDepthWiseImpl2D_cpu>::create(getBestMatch(getRequiredSpec()));

    // Convert input data (no overhead if not needed!)
    // TODO: right now, if needed, memory will be allocated/deallocated at each
    // call to forward(). We might put the following shared_ptr as members of
    // this class to avoid that.
    std::shared_ptr<Tensor> dataFallback, weightFallback, biasFallback;
    const auto& output = convOp.getOutput(0);
    const auto& data = dataIn->refCastFrom(dataFallback, *output);
    const auto& weight = weightIn->refCastFrom(weightFallback, *output);
    const auto& bias = biasIn ? biasIn->refCastFrom(biasFallback, *output) : Tensor();

    // Null when input #2 is not set; otherwise the raw bias buffer.
    const auto biasData = biasIn ? bias.getImpl()->rawPtr() : nullptr;

    // Call kernel
    kernel.forward(convOp.strideDims(),
                   convOp.dilationDims(),
                   convOp.kernelDims(),          // Conv attributes
                   dataIn->template dims<4>(),   // input dimensions
                   data.getImpl()->rawPtr(),     // input
                   weight.getImpl()->rawPtr(),   // weight
                   biasData,                     // bias
                   getCPUPtr(convOp.getRawOutput(0))); // output
}

/**
 * @brief Backward pass of the 2D depthwise convolution for the CPU backend.
 *
 * Not implemented: every call reports a std::runtime_error through
 * AIDGE_THROW_OR_ABORT.
 */
template <>
void Aidge::ConvDepthWiseImpl2D_cpu::backward() {
    AIDGE_THROW_OR_ABORT(
        std::runtime_error,
        "Backward not yet implemented for ConvDepthWise_Op<2> on backend cpu");
}