/********************************************************************************
 * Copyright (c) 2023 CEA-List
 *
 * This program and the accompanying materials are made available under the
 * terms of the Eclipse Public License 2.0 which is available at
 * http://www.eclipse.org/legal/epl-2.0.
 *
 * SPDX-License-Identifier: EPL-2.0
 *
 ********************************************************************************/

#include "aidge/data/Tensor.hpp"

#include <cstddef>
#include <vector>

#include "aidge/utils/ErrorHandling.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/operator/Add.hpp"
#include "aidge/operator/Div.hpp"
#include "aidge/operator/Mul.hpp"
#include "aidge/operator/Sub.hpp"
#include "aidge/operator/Transpose.hpp"
#include "aidge/utils/Types.h"

/**
 * @brief Transposition operation.
 *
 * @return Tensor
 */
Aidge::Tensor Aidge::Tensor::transpose(const std::vector<Aidge::DimSize_t>& outputDimsOrder) const {
    auto transpose_ = Aidge::Transpose_Op(outputDimsOrder);
    transpose_.associateInput(0, std::make_shared<Tensor>(*this));
    transpose_.setDataType(dataType());
    transpose_.setBackend(mImpl->backend());
    transpose_.forward();
    return transpose_.getOutput(0)->clone();
}

/**
 * @brief Element-wise addition operation for two ``Tensor``s.
 * @note ``Tensor``s should be stored on the same backend.
 * @todo If input ``Tensor``s have a different dataType, the output should
 * have the dataType of the ``Tensor`` with the highest precision.
 *
 * @param other
 * @return Tensor
 */
Aidge::Tensor Aidge::Tensor::operator+(const Aidge::Tensor& other) const {
    AIDGE_ASSERT(hasImpl() && other.hasImpl(),
                 "Cannot perform a binary operation: at least one Tensor has no implementation.");
    AIDGE_ASSERT(mImpl->backend() == other.mImpl->backend(), "Tensors must have the same backend");
    AIDGE_ASSERT(dataType() == other.dataType(), "Tensors must have the same data type");
    AIDGE_ASSERT(dataFormat() == other.dataFormat(), "Tensors must have the same data format");
    auto add_ = Add_Op(2);
    add_.associateInput(0, std::make_shared<Tensor>(*this));
    add_.associateInput(1, std::make_shared<Tensor>(other));
    add_.setDataType(dataType());
    add_.setDataFormat(dataFormat());
    add_.setBackend(mImpl->backend());
    add_.forward();
    return add_.getOutput(0)->clone();
}
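
// Illustrative usage of transpose() and the element-wise operators in this file,
// given as a minimal sketch: it assumes a backend named "cpu" has been registered
// (e.g. by loading a backend module such as aidge_backend_cpu) and that `a` and
// `b` are shared_ptr<Tensor> filled elsewhere with matching dims, data type and
// data format on that backend.
//
//     Aidge::Tensor sum = (*a) + (*b);                // runs an Add_Op internally
//     Aidge::Tensor permuted = a->transpose({1, 0});  // runs a Transpose_Op internally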

/**
 * @brief Element-wise subtraction operation for two ``Tensor``s.
 * @note ``Tensor``s should be stored on the same backend.
 * @todo If input ``Tensor``s have a different dataType, the output should
 * have the dataType of the ``Tensor`` with the highest precision.
 *
 * @param other
 * @return Tensor
 */
Aidge::Tensor Aidge::Tensor::operator-(const Aidge::Tensor& other) const {
    AIDGE_ASSERT(hasImpl() && other.hasImpl(),
                 "Cannot perform a binary operation: at least one Tensor has no implementation.");
    AIDGE_ASSERT(mImpl->backend() == other.mImpl->backend(), "Tensors must have the same backend");
    AIDGE_ASSERT(dataType() == other.dataType(), "Tensors must have the same data type");
    AIDGE_ASSERT(dataFormat() == other.dataFormat(), "Tensors must have the same data format");
    auto sub_ = Sub_Op();
    sub_.associateInput(0, std::make_shared<Tensor>(*this));
    sub_.associateInput(1, std::make_shared<Tensor>(other));
    sub_.setDataType(dataType());
    sub_.setDataFormat(dataFormat());
    sub_.setBackend(mImpl->backend());
    sub_.forward();
    return sub_.getOutput(0)->clone();
}

/**
 * @brief Element-wise multiplication operation for two ``Tensor``s.
 * @note ``Tensor``s should be stored on the same backend.
 * @todo If input ``Tensor``s have a different dataType, the output should
 * have the dataType of the ``Tensor`` with the highest precision.
 *
 * @param other
 * @return Tensor
 */
Aidge::Tensor Aidge::Tensor::operator*(const Aidge::Tensor& other) const {
    AIDGE_ASSERT(hasImpl() && other.hasImpl(),
                 "Cannot perform a binary operation: at least one Tensor has no implementation.");
    AIDGE_ASSERT(mImpl->backend() == other.mImpl->backend(), "Tensors must have the same backend");
    AIDGE_ASSERT(dataType() == other.dataType(), "Tensors must have the same data type");
    AIDGE_ASSERT(dataFormat() == other.dataFormat(), "Tensors must have the same data format");
    auto mul_ = Mul_Op();
    mul_.associateInput(0, std::make_shared<Tensor>(*this));
    mul_.associateInput(1, std::make_shared<Tensor>(other));
    mul_.setDataType(dataType());
    mul_.setDataFormat(dataFormat());
    mul_.setBackend(mImpl->backend());
    mul_.forward();
    return mul_.getOutput(0)->clone();
}

/**
 * @brief Element-wise division operation for two ``Tensor``s.
 * @note ``Tensor``s should be stored on the same backend.
 * @todo If input ``Tensor``s have a different dataType, the output should
 * have the dataType of the ``Tensor`` with the highest precision.
 *
 * @param other
 * @return Tensor
 */
Aidge::Tensor Aidge::Tensor::operator/(const Aidge::Tensor& other) const {
    AIDGE_ASSERT(hasImpl() && other.hasImpl(),
                 "Cannot perform a binary operation: at least one Tensor has no implementation.");
    AIDGE_ASSERT(mImpl->backend() == other.mImpl->backend(), "Tensors must have the same backend");
    AIDGE_ASSERT(dataType() == other.dataType(), "Tensors must have the same data type");
    AIDGE_ASSERT(dataFormat() == other.dataFormat(), "Tensors must have the same data format");
    auto div_ = Div_Op();
    div_.associateInput(0, std::make_shared<Tensor>(*this));
    div_.associateInput(1, std::make_shared<Tensor>(other));
    div_.setDataType(dataType());
    div_.setDataFormat(dataFormat());
    div_.setBackend(mImpl->backend());
    div_.forward();
    return div_.getOutput(0)->clone();
}

Aidge::Tensor& Aidge::Tensor::operator=(const Aidge::Tensor& other) {
    if (this == &other) {
        return *this;
    }
    resize(other.dims(), other.strides());
    setDataType(other.dataType(), false);  // do not convert existing data
    if (other.hasImpl()) {
        if (hasImpl()) {
            copyFrom(other);
        } else {
            // Perform a shallow copy only
            setImpl(other.mImpl, other.mImplOffset);
        }
    } else {
        setImpl(nullptr);
    }
    return *this;
}

Aidge::Tensor::~Tensor() noexcept = default;
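
// Minimal sketch of the copy-assignment semantics implemented above: when the
// destination already owns an implementation, data is deep-copied with copyFrom();
// otherwise only the implementation pointer is shared. `src` is assumed to be a
// contiguous Tensor holding data on a registered "cpu" backend.
//
//     Aidge::Tensor shallow(src.dataType());
//     shallow = src;                 // no impl on the left side: shares src's storage
//
//     Aidge::Tensor deep(src.dataType());
//     deep.setBackend("cpu", 0);     // gives `deep` its own (empty) storage
//     deep = src;                    // impl already present: deep copy via copyFrom()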

void Aidge::Tensor::resize(const std::vector<Aidge::DimSize_t>& dims,
                           std::vector<Aidge::DimSize_t> strides) {
    // TODO: scalar Tensor not handled
    if (dims.empty()) {  // scalar
        mDims = std::vector<DimSize_t>(0);
        mStrides = std::vector<DimSize_t>({1});
        mContiguous = true;

        computeSize();
        if (mImpl) {
            mImpl->resize(mDims);
        }
        return;
    }

    bool checkContiguous = true;
    if (strides.empty()) {
        strides.resize(dims.size());
        size_t expectedStride = 1;
        for (int dim = dims.size() - 1; dim >= 0; --dim) {
            strides[dim] = expectedStride;
            expectedStride *= dims[dim];
        }
        checkContiguous = false;
    } else {
        AIDGE_ASSERT(strides.size() == dims.size(),
                     "Number of strides must match number of dims");
    }

    if (mImpl && mImpl.use_count() > 1) {
        // Here we could also create a new storage for this tensor in this case.
        // But is it more likely that the user really wants this, or that they
        // made a mistake?
        AIDGE_ASSERT(dims == mDims && strides == mStrides,
                     "Cannot resize Tensor with shared storage");
    } else {
        mDims = dims;
        mStrides = strides;

        mContiguous = true;
        if (checkContiguous) {
            std::size_t expectedStride = 1;
            for (std::size_t i = dims.size() - 1; i > 0; --i) {
                if (strides[i] != expectedStride) {
                    mContiguous = false;
                    break;
                }
                expectedStride *= dims[i];
            }
            mContiguous &= (strides[0] == expectedStride);
        }

        computeSize();
        if (mImpl) {
            mImpl->resize(mDims);
        }
    }
}
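
// Minimal sketch of resize() with and without explicit strides (the names below
// are illustrative only): default strides describe a packed row-major layout,
// while custom strides describe a non-contiguous layout over the same storage.
//
//     Aidge::Tensor t(Aidge::DataType::Float32);
//     t.resize({2, 3});              // strides default to {3, 1}: contiguous
//     t.resize({2, 3}, {1, 2});      // explicit strides: isContiguous() == false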

std::string Aidge::Tensor::toString() const {
    AIDGE_ASSERT(mImpl && (dims().empty() || (dims() == std::vector<DimSize_t>({0})) ||
                           (mImpl->hostPtr() != nullptr)),
                 "tensor should have a valid host pointer");

    // TODO: move lambda elsewhere?
    auto ptrToString = [](DataType dt, void* ptr, std::size_t idx) {
        switch (dt) {
            case DataType::Float64:
                return std::to_string(static_cast<double*>(ptr)[idx]);
            case DataType::Float32:
                return std::to_string(static_cast<float*>(ptr)[idx]);
            case DataType::Float16:
                return std::to_string(static_cast<half_float::half*>(ptr)[idx]);
            case DataType::Int8:
                return std::to_string(static_cast<int8_t*>(ptr)[idx]);
            case DataType::Int16:
                return std::to_string(static_cast<int16_t*>(ptr)[idx]);
            case DataType::Int32:
                return std::to_string(static_cast<int32_t*>(ptr)[idx]);
            case DataType::Int64:
                return std::to_string(static_cast<int64_t*>(ptr)[idx]);
            case DataType::UInt8:
                return std::to_string(static_cast<uint8_t*>(ptr)[idx]);
            case DataType::UInt16:
                return std::to_string(static_cast<uint16_t*>(ptr)[idx]);
            case DataType::UInt32:
                return std::to_string(static_cast<uint32_t*>(ptr)[idx]);
            case DataType::UInt64:
                return std::to_string(static_cast<uint64_t*>(ptr)[idx]);
            default:
                AIDGE_ASSERT(false, "unsupported type to convert to string");
        }
        return std::string("?");  // To make Clang happy
    };

    if (dims().empty()) {
        return ptrToString(mDataType, mImpl->hostPtr(), 0);
    }

    std::string res;
    std::size_t dim = 0;
    std::size_t counter = 0;
    if (nbDims() >= 2) {
        std::vector<std::size_t> dimVals(nbDims(), 0);
        res += "{\n";
        while (counter < mSize) {
            std::string spaceString = std::string((dim + 1) << 1, ' ');
            if (dim < nbDims() - 2) {
                if (dimVals[dim] == 0) {
                    res += spaceString + "{\n";
                    ++dim;
                } else if (dimVals[dim] < static_cast<std::size_t>(dims()[dim])) {
                    res += spaceString + "},\n" + spaceString + "{\n";
                    ++dim;
                } else {
                    res += spaceString + "}\n";
                    dimVals[dim--] = 0;
                    dimVals[dim]++;
                }
            } else {
                for (; dimVals[dim] < static_cast<std::size_t>(dims()[dim]); ++dimVals[dim]) {
                    res += spaceString + "{";
                    for (DimSize_t j = 0; j < dims()[dim + 1] - 1; ++j) {
                        res += " " + ptrToString(mDataType, mImpl->hostPtr(mImplOffset), counter++) + ",";
                    }
                    res += " " + ptrToString(mDataType, mImpl->hostPtr(mImplOffset), counter++) + "}";
                    if (dimVals[dim] < static_cast<std::size_t>(dims()[dim] - 1)) {
                        res += ",";
                    }
                    res += "\n";
                }
                if (dim == 0) {
                    break;
                }
                dimVals[dim--] = 0;
                dimVals[dim]++;
            }
        }
        if (nbDims() != 2) {  // If nbDims == 2, the closing brace is already added
            for (int i = static_cast<int>(dim); i >= 0; --i) {
                res += std::string((i + 1) << 1, ' ') + "}\n";
            }
        }
    } else {
        res += "{";
        for (DimSize_t j = 0; j < dims()[0]; ++j) {
            res += " " + ptrToString(mDataType, mImpl->hostPtr(mImplOffset), j) +
                   ((j < dims()[0] - 1) ? "," : " ");
        }
    }
    res += "}";
    return res;
}

Aidge::Tensor Aidge::Tensor::extract(const std::vector<std::size_t>& fixedCoord) const {
    AIDGE_ASSERT(isContiguous(), "Tensor must be contiguous");
    AIDGE_ASSERT(fixedCoord.size() <= mDims.size(),
                 "Number of coordinates is higher than number of dimensions");

    Tensor subTensor(mDataType);
    subTensor.resize(
        std::vector<size_t>(mDims.cbegin() + fixedCoord.size(), mDims.cend()),
        std::vector<size_t>(mStrides.cbegin() + fixedCoord.size(), mStrides.cend()));
    subTensor.setBackend(mImpl->backend(), mImpl->device().second);
    subTensor.setImpl(mImpl, mImplOffset + getStorageIdx(fixedCoord));
    return subTensor;
}

Aidge::Tensor Aidge::Tensor::extract(const std::vector<std::size_t>& startCoord,
                                     const std::vector<std::size_t>& dims) const {
    AIDGE_ASSERT(isContiguous(), "Tensor must be contiguous");
    AIDGE_ASSERT(startCoord.size() == mDims.size(),
                 "Coordinates do not match number of dimensions");

    Tensor subTensor(mDataType);
    subTensor.resize(dims, mStrides);
    subTensor.setBackend(mImpl->backend(), mImpl->device().second);
    subTensor.setImpl(mImpl, mImplOffset + getStorageIdx(startCoord));
    return subTensor;
}

void Aidge::Tensor::makeContiguous() {
    if (!mImpl || isContiguous()) {
        return;
    }

    // Block so that mImpl ref count is 1 for resize()
    {
        // Create a new storage that will be contiguous
        std::shared_ptr<TensorImpl> newImpl = Registrar<Tensor>::create(
            {mImpl->backend(), mDataType})(mImpl->device().second, mDims);
        // Copy elements from old to new storage
        std::size_t idx = 0;
        while (idx < mSize) {
            const std::size_t storageIdx = getStorageIdx(getCoord(idx));

            // Determine the size of the contiguous chunk
            std::size_t copySize = 1;
            while (idx + copySize < mSize &&
                   getStorageIdx(getCoord(idx + copySize)) == storageIdx + copySize) {
                ++copySize;
            }

            // Perform a single copy for the contiguous chunk
            newImpl->copy(mImpl->rawPtr(mImplOffset + storageIdx), copySize, idx);

            // Move to the next index after the contiguous chunk
            idx += copySize;
        }
        // Replace the old storage by the new, contiguous, storage
        setImpl(newImpl);
    }

    // Resize tensor without strides => tensor is now contiguous
    resize(mDims);
}

void Aidge::Tensor::copyCast(const Tensor& src) {
    if (&src == this) {
        return;
    }

    AIDGE_ASSERT(src.isContiguous(), "cannot copy-cast non-contiguous tensor");

    // The current Tensor necessarily has a data type, but may not have a backend
    if (!hasImpl()) {
        // If no backend was set for the current tensor, use the same as src
        const auto deviceSrc = src.getImpl()->device();
        setBackend(deviceSrc.first, deviceSrc.second);
    }
    resize(src.dims());

    AIDGE_ASSERT(src.getImpl()->device() == getImpl()->device(),
                 "cannot copy-cast from a different backend/device");
    getImpl()->copyCast(src.getImpl()->rawPtr(src.mImplOffset), src.dataType(), src.size(),
                        mImplOffset);
}

void Aidge::Tensor::copyFrom(const Tensor& src) {
    if (&src == this) {
        return;
    }

    AIDGE_ASSERT(src.isContiguous(), "cannot copy from non-contiguous tensor");

    // The current Tensor necessarily has a data type, but may not have a backend
    if (!hasImpl()) {
        // If no backend was set for the current tensor, use the same as src
        const auto deviceSrc = src.getImpl()->device();
        setBackend(deviceSrc.first, deviceSrc.second);
    }
    resize(src.dims());

    AIDGE_ASSERT(src.dataType() == dataType(), "cannot copy from a different data type");
    getImpl()->copyFrom(*(src.getImpl()), src.size(), src.mImplOffset, mImplOffset);
}
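
// Minimal sketch of extract() and makeContiguous(): extract() returns a view that
// shares the original storage (and may therefore be non-contiguous), and
// makeContiguous() re-packs such a view into its own storage. `t` is assumed to
// be a contiguous {4, 5} Float32 tensor with data on a registered backend.
//
//     Aidge::Tensor row = t.extract({1});              // view of t[1, :], dims {5}
//     Aidge::Tensor block = t.extract({0, 1}, {2, 3}); // 2x3 view keeping t's strides
//     block.makeContiguous();                          // copies the view into packed storage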
from non-contiguous tensor"); // Current Tensor has necessarily a data type, but may not have backend if (!hasImpl()) { // If no backend was set for the current tensor, use the same as src const auto deviceSrc = src.getImpl()->device(); setBackend(deviceSrc.first, deviceSrc.second); } resize(src.dims()); AIDGE_ASSERT(src.dataType() == dataType(), "cannot copy from a different data type"); getImpl()->copyFrom(*(src.getImpl()), src.size(), src.mImplOffset, mImplOffset); } void Aidge::Tensor::copyTranspose(const Tensor& src, const std::vector<DimSize_t>& transpose) { std::vector<DimSize_t> newDims; for (std::size_t i = 0; i < src.dims().size(); ++i) { newDims.push_back(src.dims()[transpose[i]]); } std::vector<std::size_t> newStrides(newDims.size(), 1); for (size_t i = 0; i < newDims.size(); ++i) { for (size_t j = i + 1; j < newDims.size(); ++j) { newStrides[i] *= newDims[j]; } } std::shared_ptr<TensorImpl> newImpl = Registrar<Tensor>::create({mImpl->backend(), mDataType})(mImpl->device().second, newDims); std::vector<size_t> indices(newDims.size(), 0); for (size_t i = 0; i < src.size(); ++i) { size_t idx = 0; // Permute indices based on OutputDimsOrder attr for (int j = newDims.size() -1; j >=0; --j) { idx += indices[transpose[j]] * newStrides[j]; } // Copy the value in output newImpl->copy(src.getImpl()->rawPtr(i), 1, idx); // Update indices for the next iteration for (int j = newDims.size() - 1; j >= 0; --j) { if (indices[j] < src.dims()[j] - 1) { indices[j]++; break; } else { indices[j] = 0; } } } resize(newDims); setImpl(newImpl); } void Aidge::Tensor::copyTranspose(const Tensor& src, const DataFormatTranspose& transpose) { copyTranspose(src, std::vector<DimSize_t>(transpose.begin(), transpose.end())); } void Aidge::Tensor::copyCastFrom(const Tensor& src, std::shared_ptr<Tensor>& movedSrcPtr) { if (&src == this) { return; } AIDGE_ASSERT(src.isContiguous(), "cannot copy-cast from non-contiguous tensor"); // Current Tensor has necessarily a data type, but may not have backend if (!getImpl()) { // If no backend was set for the current tensor, use the same as src const auto deviceSrc = src.getImpl()->device(); setBackend(deviceSrc.first, deviceSrc.second); } resize(src.dims()); if (dataType() != src.dataType()) { // First move data to the target device (only if needed) const auto device = getImpl()->device(); const Tensor& movedSrc = src.refFrom(movedSrcPtr, device.first, device.second); // Second, copy-cast data (necessary) getImpl()->copyCast(movedSrc.getImpl()->rawPtr(movedSrc.mImplOffset), movedSrc.dataType(), movedSrc.size(), mImplOffset); } else { // Directly copy, no conversion necessary // Avoid making a double copy if both data type and device are the same getImpl()->copyFrom(*(src.getImpl()), src.size(), src.mImplOffset, mImplOffset); } } Aidge::Tensor& Aidge::Tensor::refContiguous(std::shared_ptr<Tensor>& fallback) { // Scott Meyers' solution to avoid code duplication return const_cast<Tensor&>( static_cast<const Tensor&>(*this).refContiguous(fallback)); } const Aidge::Tensor& Aidge::Tensor::refContiguous( std::shared_ptr<Tensor>& fallback) const { AIDGE_ASSERT(getImpl(), "no backend was set for tensor, cannot refCast() it"); if (isContiguous()) { return *this; } else { if (this != fallback.get()) { // Shallow copy to fallback *fallback = *this; } // Make fallback contiguous fallback->makeContiguous(); return *fallback; } } Aidge::Tensor& Aidge::Tensor::refCast(std::shared_ptr<Tensor>& fallback, const Aidge::DataType& dt) { // Scott Meyers' solution to avoid code duplication return 
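
// Minimal sketch of the fallback-based refContiguous() pattern defined above:
// the returned reference is either the tensor itself (already contiguous) or
// `*fallback`, which then owns the packed copy and must outlive its use. `view`
// is assumed to be a possibly non-contiguous Tensor with a backend set.
//
//     std::shared_ptr<Aidge::Tensor> fallback;
//     const Aidge::Tensor& packed = view.refContiguous(fallback);
//     // `packed` aliases `view` if it was contiguous, and `*fallback` otherwise.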

Aidge::Tensor& Aidge::Tensor::refCast(std::shared_ptr<Tensor>& fallback, const Aidge::DataType& dt) {
    // Scott Meyers' solution to avoid code duplication
    return const_cast<Tensor&>(static_cast<const Tensor&>(*this).refCast(fallback, dt));
}

const Aidge::Tensor& Aidge::Tensor::refCast(std::shared_ptr<Tensor>& fallback,
                                            const Aidge::DataType& dt) const {
    AIDGE_ASSERT(getImpl(), "no backend was set for tensor, cannot refCast() it");

    if (dt == dataType()) {
        return *this;
    } else {
        if (this == fallback.get()) {
            // if refFrom() was called before, just change the type
            fallback->setDataType(dt);
        } else {
            AIDGE_ASSERT(isContiguous(), "cannot refCast non-contiguous tensor");

            if (!fallback) {
                fallback = std::make_shared<Tensor>(dt);
            } else {
                fallback->setDataType(dt, false);  // don't keep previous data (no copy)
            }

            const auto device = getImpl()->device();
            fallback->setBackend(device.first, device.second,
                                 false);  // don't keep previous data (no copy)
            fallback->resize(dims());
            fallback->getImpl()->copyCast(getImpl()->rawPtr(mImplOffset), dataType(), size(),
                                          fallback->mImplOffset);
        }
        return *fallback;
    }
}

Aidge::Tensor& Aidge::Tensor::refFrom(std::shared_ptr<Tensor>& fallback, const std::string& backend,
                                      DeviceIdx_t device) {
    // Scott Meyers' solution to avoid code duplication
    return const_cast<Tensor&>(static_cast<const Tensor&>(*this).refFrom(fallback, backend, device));
}

const Aidge::Tensor& Aidge::Tensor::refFrom(std::shared_ptr<Tensor>& fallback,
                                            const std::string& backend,
                                            DeviceIdx_t device) const {
    AIDGE_ASSERT(getImpl(), "no backend was set for tensor, cannot refFrom() it");

    if (std::make_pair(backend, device) == getImpl()->device()) {
        return *this;
    } else {
        if (this == fallback.get()) {
            // if refCast() was called before, just change the backend
            fallback->setBackend(backend, device);
        } else {
            AIDGE_ASSERT(isContiguous(), "cannot refFrom non-contiguous tensor");

            if (!fallback) {
                fallback = std::make_shared<Tensor>(dataType());
            } else {
                fallback->setDataType(dataType(), false);  // don't keep previous data (no copy)
            }

            fallback->setBackend(backend, device, false);  // don't keep previous data (no copy)
            fallback->resize(dims());
            fallback->getImpl()->copyFrom(*getImpl(), size(), mImplOffset, fallback->mImplOffset);
        }
        return *fallback;
    }
}

Aidge::Tensor& Aidge::Tensor::ref(std::shared_ptr<Tensor>& fallback, const Aidge::DataType& dt,
                                  const std::string& backend, DeviceIdx_t device) {
    // Scott Meyers' solution to avoid code duplication
    return const_cast<Tensor&>(static_cast<const Tensor&>(*this).ref(fallback, dt, backend, device));
}

const Aidge::Tensor& Aidge::Tensor::ref(std::shared_ptr<Tensor>& fallback, const Aidge::DataType& dt,
                                        const std::string& backend, DeviceIdx_t device) const {
    AIDGE_ASSERT(getImpl(), "no backend was set for tensor, cannot ref() it");

    if (dt == dataType() && std::make_pair(backend, device) == getImpl()->device()) {
        return *this;
    } else {
        // Change fallback type, backend & device, without any data copy
        if (!fallback) {
            fallback = std::make_shared<Tensor>(dt);
        } else {
            fallback->setDataType(dt, false);  // don't keep previous data (no copy)
        }

        fallback->setBackend(backend, device, false);  // don't keep previous data (no copy)
        fallback->resize(dims());
        return *fallback;
    }
}

std::set<std::string> Aidge::Tensor::getAvailableBackends() {
    std::set<std::string> backendsList;
    for (const auto& tupleKey : Registrar<Tensor>::getKeys()) {
        backendsList.insert(std::get<0>(tupleKey));
    }
    return backendsList;
}
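
// Minimal sketch of getAvailableBackends() together with ref(): backend names
// depend on which backend modules were loaded, "cpu" being assumed here purely
// as an example, and `t` is assumed to be an existing Tensor with a backend set.
// Note that ref() only provides correctly typed/allocated storage; it copies no
// data (see copyCastFrom() above for a copying path).
//
//     const std::set<std::string> backends = Aidge::Tensor::getAvailableBackends();
//     if (backends.find("cpu") != backends.end()) {
//         std::shared_ptr<Aidge::Tensor> fallback;
//         const Aidge::Tensor& staged = t.ref(fallback, Aidge::DataType::Float32, "cpu", 0);
//     }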