/********************************************************************************
 * Copyright (c) 2023 CEA-List
 *
 * This program and the accompanying materials are made available under the
 * terms of the Eclipse Public License 2.0 which is available at
 * http://www.eclipse.org/legal/epl-2.0.
 *
 * SPDX-License-Identifier: EPL-2.0
 *
 ********************************************************************************/

#include <aidge/utils/Types.h>
#include <catch2/catch_test_macros.hpp>
#include <chrono>
#include <cmath>
#include <cstddef> // std::size_t
#include <cstdint> // std::uint16_t
#include <functional> // std::multiplies
#include <iostream>
#include <memory>
#include <numeric> // std::accumulate
#include <ostream>
#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution
#include <vector>

#include "aidge/data/Tensor.hpp"
#include "aidge/operator/GlobalAveragePooling.hpp"
#include "aidge/utils/TensorUtils.hpp"

// debug print function
void print_tensor(Aidge::Tensor &T) {
  // Header line: every dimension is followed by " , " (including the last
  // one) before the closing bracket.
  const auto &shape = T.dims();
  std::cout << "Tensor : size =  [";
  for (std::size_t axis = 0; axis < shape.size(); ++axis) {
    std::cout << shape[axis] << " , ";
  }
  std::cout << "]" << std::endl;

  // Then let the tensor dump its own content.
  T.print();
}

namespace Aidge {
/**
 * @brief Tests for the CPU forward pass of the GlobalAveragePooling operator.
 *
 * Covered scenarios:
 *  - inputs with only 1 or 2 dimensions must make forward() throw;
 *  - a constant input of random shape (3 to 7 dims) must give that constant
 *    for every (batch, channel) output;
 *  - random inputs of random shape are compared to a per-channel mean computed
 *    in the test;
 *  - two fixed inputs (4-D and 5-D) are compared to reference values obtained
 *    from PyTorch.
 *
 * The time spent in forward() and an estimate of the number of arithmetic
 * operations are accumulated and printed at the end of the "3+Dim" section.
 */
TEST_CASE("[cpu/operator] GlobalAveragePooling",
          "[GlobalAveragePooling][CPU]") {
  constexpr std::uint16_t NBTRIALS = 10;
  // Create a random number generator
  std::random_device rd;
  std::mt19937 gen(rd());
  std::uniform_real_distribution<float> valueDist(
      0.1f, 1.1f); // Random float distribution between 0.1 and 1.1
  std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2),
                                                         std::size_t(10));

  std::uniform_int_distribution<std::size_t> nbLowDimsDist(std::size_t(1),
                                                           std::size_t(2));
  std::uniform_int_distribution<std::size_t> nbHighDimsDist(std::size_t(3),
                                                            std::size_t(7));

  // Create GlobalAveragePooling Operator
  std::shared_ptr<Node> globAvgPool = GlobalAveragePooling();
  auto op =
      std::static_pointer_cast<OperatorTensor>(globAvgPool->getOperator());
  op->setDataType(DataType::Float32);
  op->setBackend("cpu");

  // Create the input Tensor
  std::shared_ptr<Tensor> T0 = std::make_shared<Tensor>();
  op->associateInput(0, T0);
  T0->setDataType(DataType::Float32);
  T0->setBackend("cpu");

  // Create results Tensor
  std::shared_ptr<Tensor> Tres = std::make_shared<Tensor>();
  Tres->setDataType(DataType::Float32);
  Tres->setBackend("cpu");

  // To measure execution time of the forward() call. A monotonic clock is used
  // so that wall-clock adjustments cannot corrupt the measured intervals.
  using Clock = std::chrono::steady_clock;
  std::chrono::duration<double, std::micro> duration{};
  std::size_t number_of_operation{0};

  // Draws a random shape: the number of dimensions comes from 'nbDimsDist',
  // the size of each dimension from 'dimSizeDist'.
  const auto randomDims =
      [&](std::uniform_int_distribution<std::size_t> &nbDimsDist) {
        std::vector<std::size_t> dims(nbDimsDist(gen));
        for (std::size_t &dim : dims) {
          dim = dimSizeDist(gen);
        }
        return dims;
      };

  // Number of elements of a tensor with the given shape.
  const auto nbElements = [](const std::vector<std::size_t> &dims) {
    return std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1),
                           std::multiplies<std::size_t>());
  };

  // Runs the operator on the data currently attached to T0, adds the time
  // spent in forward() to 'duration', then compares the operator output with
  // the expected tensor Tres (number of dims, each dim, then values).
  const auto forwardAndCheck = [&]() {
    op->computeOutputDims();
    const auto start = Clock::now();
    REQUIRE_NOTHROW(globAvgPool->forward());
    const auto end = Clock::now();
    duration +=
        std::chrono::duration_cast<std::chrono::microseconds>(end - start);

    REQUIRE(Tres->nbDims() == op->getOutput(0)->nbDims());
    for (DimSize_t i = 0; i < op->getOutput(0)->nbDims(); ++i) {
      REQUIRE(Tres->dims().at(i) == op->getOutput(0)->dims().at(i));
    }
    REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
  };

  // NOTE: every data buffer below is a std::vector whose data() pointer is
  // handed to the tensor with setRawPtr(). The vector lives until the end of
  // its enclosing scope, i.e. past the last use of the tensor data, and is
  // released automatically even when a REQUIRE fails. The vectors are
  // deliberately non-const because setRawPtr() needs a mutable pointer.

  SECTION("GlobalAveragePoolingImpl_cpu::forward()") {
    SECTION(
        "1-2Dim > not enough dimensions leads to function throwing an error") {
      // generate a random tensor
      const std::vector<std::size_t> dims = randomDims(nbLowDimsDist);
      const std::size_t nb_elements = nbElements(dims);

      std::vector<float> array0(nb_elements);
      for (float &value : array0) {
        value = valueDist(gen);
      }
      // input0
      T0->resize(dims);
      T0->getImpl()->setRawPtr(array0.data(), nb_elements);

      REQUIRE_THROWS(globAvgPool->forward());
    }

    SECTION("3+Dim") {
      SECTION("Fill a tensor with all values set as N will result with every "
              "output being N") {
        // generate the shapes
        const std::vector<std::size_t> dims_in = randomDims(nbHighDimsDist);
        const std::size_t in_nb_elems = nbElements(dims_in);
        const std::vector<std::size_t> dims_out{dims_in[0], dims_in[1]};
        const std::size_t out_nb_elems = nbElements(dims_out);

        // averaging per channel : 1 addition per input element + 1 division
        // per (batch, channel) pair
        number_of_operation += in_nb_elems + out_nb_elems;

        // every input element holds the same value, hence so does every
        // expected output
        const float val = valueDist(gen);
        std::vector<float> array0(in_nb_elems, val);
        std::vector<float> result(out_nb_elems, val);

        // input0
        T0->resize(dims_in);
        T0->getImpl()->setRawPtr(array0.data(), in_nb_elems);

        // results
        Tres->resize(dims_out);
        Tres->getImpl()->setRawPtr(result.data(), out_nb_elems);

        forwardAndCheck();
      }

      SECTION("random testing") {
        for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
          // generate the shapes
          const std::vector<std::size_t> dims_in = randomDims(nbHighDimsDist);
          const std::size_t in_nb_elems = nbElements(dims_in);
          const DimSize_t in_batch_nb_elems = in_nb_elems / dims_in[0];
          const DimSize_t in_channel_nb_elems = in_batch_nb_elems / dims_in[1];

          const std::vector<std::size_t> dims_out{dims_in[0], dims_in[1]};
          const std::size_t out_nb_elems = nbElements(dims_out);
          const DimSize_t out_batch_nb_elems = out_nb_elems / dims_out[0];

          // averaging per channel : 1 addition per input element + 1 division
          // per (batch, channel) pair
          number_of_operation += in_nb_elems + out_nb_elems;

          // fill the input with random values and compute the expected mean
          // of each (batch, channel) slice on the fly
          std::vector<float> array0(in_nb_elems);
          std::vector<float> result(out_nb_elems);
          for (std::size_t batch = 0; batch < dims_in[0]; ++batch) {
            for (std::size_t channel = 0; channel < dims_in[1]; ++channel) {
              const std::size_t channel_offset =
                  batch * in_batch_nb_elems + channel * in_channel_nb_elems;
              float channel_sum = 0;
              for (std::size_t i = 0; i < in_channel_nb_elems; ++i) {
                const float val = valueDist(gen);
                array0[channel_offset + i] = val;
                channel_sum += val;
              }
              result[batch * out_batch_nb_elems + channel] =
                  channel_sum / in_channel_nb_elems;
            }
          }

          // input0
          T0->resize(dims_in);
          T0->getImpl()->setRawPtr(array0.data(), in_nb_elems);

          // results
          Tres->resize(dims_out);
          Tres->getImpl()->setRawPtr(result.data(), out_nb_elems);

          forwardAndCheck();
        }
      }
      SECTION("Using result from a pytorch function as groundtruth") {
        const DimSize_t batch_size = 2;
        const DimSize_t channels = 3;
        const DimSize_t height = 4;
        const DimSize_t width = 3;
        const DimSize_t depth = 2;

        SECTION("2D_img") {
          const std::vector<DimSize_t> in_dims{batch_size, channels, height,
                                               width};
          const std::vector<DimSize_t> out_dims{batch_size, channels};
          const DimSize_t in_nb_elems = batch_size * channels * height * width;
          const DimSize_t out_nb_elems = batch_size * channels;
          // averaging per channel : 1 addition per input element + 1 division
          // per (batch, channel) pair
          number_of_operation += in_nb_elems + out_nb_elems;

          // Input values, flat, 12 values per (batch, channel) slice.
          std::vector<float> input{
              // batch 0, channel 0
              0.1807716f, -0.0699881f, -0.3596235f, -0.9152045f,
              0.6257653f, 0.0255099f, 0.9545137f, 0.0643485f,
              0.3611506f, 1.1678782f, -1.3498932f, -0.5101767f,
              // batch 0, channel 1
              0.2359577f, -0.2397784f, -0.9211147f, 1.5432971f,
              1.3488258f, -0.1396417f, 0.2857972f, 0.9651205f,
              -2.0371499f, 0.4931363f, 1.4869986f, 0.5910330f,
              // batch 0, channel 2
              0.1260297f, -1.5626874f, -1.1601028f, -0.3348408f,
              0.4477722f, -0.8016447f, 1.5236114f, 2.5085869f,
              -0.6630959f, -0.2512752f, 1.0101448f, 0.1215468f,
              // batch 1, channel 0
              0.1583993f, 1.1340188f, -1.1538976f, -0.2983968f,
              -0.5075365f, -0.9239212f, 0.5467061f, -1.4947776f,
              -1.2057148f, 0.5718198f, -0.5973545f, -0.6936757f,
              // batch 1, channel 1
              1.6455388f, -0.8029931f, 1.3514109f, -0.2759193f,
              -1.5108346f, 2.1047730f, 2.7629590f, -1.7465292f,
              0.8353187f, -1.9560477f, -0.8002653f, -0.5044988f,
              // batch 1, channel 2
              -0.0711742f, -0.5130699f, -1.0307810f, 0.9154347f,
              -0.2282317f, -0.6884708f, 0.1832259f, 0.6003584f,
              -1.5429375f, -0.3465560f, -0.1476223f, 0.6469797f};

          // Expected outputs, one per (batch, channel) pair.
          std::vector<float> result{
              // batch 0
              0.0145876f, 0.3010401f, 0.0803371f,
              // batch 1
              -0.3720275f, 0.0919094f, -0.1852371f};

          // guard against a mismatch between the tables and the declared
          // shapes before handing raw pointers to the tensors
          REQUIRE(input.size() == in_nb_elems);
          REQUIRE(result.size() == out_nb_elems);

          // input0
          T0->resize(in_dims);
          T0->getImpl()->setRawPtr(input.data(), in_nb_elems);

          // results
          Tres->resize(out_dims);
          Tres->getImpl()->setRawPtr(result.data(), out_nb_elems);

          forwardAndCheck();
        }
        SECTION("3D_img") {
          const std::vector<DimSize_t> in_dims{batch_size, channels, height,
                                               width, depth};
          const std::vector<DimSize_t> out_dims{batch_size, channels};
          const DimSize_t in_nb_elems =
              batch_size * channels * height * width * depth;
          const DimSize_t out_nb_elems = batch_size * channels;
          // averaging per channel : 1 addition per input element + 1 division
          // per (batch, channel) pair
          number_of_operation += in_nb_elems + out_nb_elems;

          // Input values, flat, 24 values per (batch, channel) slice.
          std::vector<float> input{
              // batch 0, channel 0
              0.0061403f, -0.9665052f, 0.3582928f, 0.1072854f,
              1.2463317f, 1.2460036f, 0.3534451f, 0.9425349f,
              -0.2103887f, -0.7959853f, 0.1297970f, -1.9445597f,
              0.0609514f, -0.2379328f, 1.9020044f, -1.1762751f,
              0.3404147f, 1.1685153f, -0.6526139f, 0.3767620f,
              0.1887376f, 0.5154487f, 0.6371427f, -0.3948864f,
              // batch 0, channel 1
              -1.1571540f, 0.2896117f, 0.6163548f, -0.4370409f,
              0.6589766f, 0.6587803f, -1.3702172f, -1.6210355f,
              0.5872851f, 0.2860694f, 0.0082870f, -0.2523253f,
              -1.3247224f, 0.1891782f, 0.0211001f, 0.9404197f,
              -0.5576900f, -0.6939272f, -0.3252473f, 1.2439330f,
              -1.1671864f, -0.4091243f, 1.2600617f, -1.5630058f,
              // batch 0, channel 2
              1.1346143f, -0.0823837f, 0.2893163f, 0.8357732f,
              -0.2449911f, 0.2712233f, 0.0936364f, -0.8834321f,
              -0.3274170f, 0.0783938f, -0.3807656f, 0.3775077f,
              0.1119123f, 2.3142793f, -0.7989057f, -0.5643027f,
              -1.1346605f, 0.1705271f, 0.9946650f, 1.2625724f,
              1.6218156f, 1.0774711f, 0.5947813f, -1.5290873f,
              // batch 1, channel 0
              2.0437069f, -0.1656267f, 0.0870704f, -0.5276564f,
              -0.1002882f, 1.0539219f, -0.6230739f, -1.5905718f,
              -0.9741858f, -0.1869211f, 0.5816050f, -2.6339815f,
              -1.0764544f, 2.5903966f, 0.4940658f, 0.4671729f,
              0.6588292f, -0.7257792f, 1.4280071f, -1.2187740f,
              0.7380729f, -1.1599953f, -1.4355115f, -1.5304037f,
              // batch 1, channel 1
              0.8474578f, 0.0774260f, 0.5433396f, -0.8438400f,
              -0.1089903f, -0.6354192f, 0.8772392f, 0.2844733f,
              0.0975270f, -0.9785872f, -0.4320499f, -1.4937501f,
              -2.0644901f, 0.0851217f, 0.6644159f, 0.4168026f,
              0.0958830f, -1.5699565f, 0.3739572f, -0.1420672f,
              -0.7864021f, 0.2443752f, -0.9811850f, -0.0698569f,
              // batch 1, channel 2
              0.1463890f, 0.2536245f, 0.2136150f, 0.3113698f,
              1.8353856f, 1.4473228f, -0.7373698f, 0.2485314f,
              -0.4789796f, -0.3396149f, 0.6438198f, 0.7287521f,
              -1.5119252f, -0.1006494f, 1.8955028f, 1.0871323f,
              0.3620502f, -0.8826663f, 1.2220223f, -1.2817260f,
              1.4153577f, 0.4148015f, 1.3458617f, 1.9718349f};

          // Expected outputs, one per (batch, channel) pair.
          std::vector<float> result{
              // batch 0
              0.1333608f, -0.1716091f, 0.2201060f,
              // batch 1
              -0.1585989f, -0.2291074f, 0.4254351f};

          // guard against a mismatch between the tables and the declared
          // shapes before handing raw pointers to the tensors
          REQUIRE(input.size() == in_nb_elems);
          REQUIRE(result.size() == out_nb_elems);

          // input0
          T0->resize(in_dims);
          T0->getImpl()->setRawPtr(input.data(), in_nb_elems);

          // results
          Tres->resize(out_dims);
          Tres->getImpl()->setRawPtr(result.data(), out_nb_elems);

          forwardAndCheck();
        }
      }
      std::cout << "GlobalAveragePooling total execution time : "
                << duration.count() << "µs" << std::endl;
      std::cout << "Number of operations : " << number_of_operation
                << std::endl;
      std::cout << "Operation / µs = " << number_of_operation / duration.count()
                << std::endl;
    }
  }
}
} // namespace Aidge