//+------------------------------------------------------------------+
//|                                                         gmdh.mqh |
//|                                  Copyright 2024, MetaQuotes Ltd. |
//|                                             https://www.mql5.com |
//+------------------------------------------------------------------+
#property copyright "Copyright 2024, MetaQuotes Ltd."
#property link      "https://www.mql5.com"
#include<GMDH\gmdh_internal.mqh>

/**
 *  Divide the input data into 2 parts
 *
 * param x Matrix of input data containing predictive variables
 * param y Vector of the taget values for the corresponding x data
 * param testSize Fraction of the input data that should be placed into the second part
 * param shuffle True if data should be shuffled before splitting into 2 parts, otherwise false
 * param randomSeed Seed number for the random generator to get the same division every time
 * return SplittedData object containing 4 elements of data: train x, train y, test x, test y
 */
SplittedData splitData(matrix& x,  vector& y, double testSize = 0.2, bool shuffle = false, int randomSeed = 0)
  {
   SplittedData data;

   if(validateInputData(testSize))
      return data;

   string errorMsg = "";
   if(x.Rows() != y.Size())
      errorMsg = " x rows number and y size must be equal";
   else
      if(round(x.Rows() * testSize) == 0 || round(x.Rows() * testSize) == x.Rows())
         errorMsg = "Result contains an empty array. Change the arrays size or the  value for correct splitting";
   if(errorMsg != "")
     {
      Print(__FUNCTION__," ",errorMsg);
      return data;
     }


   if(!shuffle)
      data = GmdhModel::internalSplitData(x, y, testSize);
   else
     {
      if(randomSeed == 0)
         randomSeed = int(GetTickCount64());
      MathSrand(uint(randomSeed));

      int shuffled_rows_indexes[],shuffled[];
      MathSequence(0,int(x.Rows()-1),1,shuffled_rows_indexes);
      MathSample(shuffled_rows_indexes,int(shuffled_rows_indexes.Size()),shuffled);

      int testItemsNumber = (int)round(x.Rows() * testSize);


      matrix Train,Test;
      vector train,test;

      Train.Resize(x.Rows()-ulong(testItemsNumber),x.Cols());
      Test.Resize(ulong(testItemsNumber),x.Cols());

      train.Resize(x.Rows()-ulong(testItemsNumber));
      test.Resize(ulong(testItemsNumber));

      for(ulong i = 0; i<Train.Rows(); i++)
        {
         Train.Row(x.Row(shuffled[i]),i);
         train[i] = y[shuffled[i]];
        }

      for(ulong i = 0; i<Test.Rows(); i++)
        {
         Test.Row(x.Row(shuffled[Train.Rows()+i]),i);
         test[i] = y[shuffled[Train.Rows()+i]];
        }

      data.xTrain = Train;
      data.xTest = Test;
      data.yTrain = train;
      data.yTest = test;
     }

   return data;
  }

//+------------------------------------------------------------------+
//| Class implementing the general logic of GMDH algorithms          |
//+------------------------------------------------------------------+

class  GmdhModel
  {
protected:

   string            modelName; // model name
   int               level; // Current number of the algorithm training level
   int               inputColsNumber; // The number of predictive variables in the original data
   double            lastLevelEvaluation; // The external criterion value of the previous training level
   double            currentLevelEvaluation; // The external criterion value of the current training level
   bool              training_complete; // flag indicator successful completion of model training
   CVector2d         bestCombinations; // Storage for the best models of previous levels

   /**
    *struct for generating vector sequence
    */
   struct unique
     {
   private:
      int            current;

      int            run(void)
        {
         return ++current;
        }

   public:
                     unique(void)
        {
         current = -1;
        }

      vector         generate(ulong t)
        {
         ulong s=0;
         vector ret(t);

         while(s<t)
            ret[s++] = run();

         return ret;
        }
     };

   /**
    *  Find all combinations of k elements from n
    *
    * param n Number of all elements
    * param k Number of required elements
    * return Vector of all combinations of k elements from n
    */
   void              nChooseK(int n, int k, vector &combos[])
     {
      if(n<=0 || k<=0 || n<k)
        {
         Print(__FUNCTION__," invalid parameters for n and or k", "n ",n , " k ", k);
         return;
        }

      unique q;

      vector comb = q.generate(ulong(k));

      ArrayResize(combos,combos.Size()+1,100);

      long first, last;

      first = 0;
      last = long(k);
      combos[combos.Size()-1]=comb;

      while(comb[first]!= double(n - k))
        {
         long mt = last;
         while(comb[--mt] == double(n - (last - mt)));
         comb[mt]++;
         while(++mt != last)
            comb[mt] = comb[mt-1]+double(1);
         ArrayResize(combos,combos.Size()+1,100);
         combos[combos.Size()-1]=comb;
        }

      for(uint i = 0; i<combos.Size(); i++)
        {
         combos[i].Resize(combos[i].Size()+1);
         combos[i][combos[i].Size()-1] = n;
        }

      return;
     }

   /**
    *  Get the mean value of extrnal criterion of the k best models
    *
    * param sortedCombinations Sorted vector of current level models
    * param k The numebr of the best models
    * return Calculated mean value of extrnal criterion of the k best models
    */
   double            getMeanCriterionValue(CVector &sortedCombinations, int k)
     {
      k = MathMin(k, sortedCombinations.size());

      double crreval=0;

      for(int i = 0; i<k; i++)
         crreval +=sortedCombinations[i].evaluation();
      if(k)
         return crreval/double(k);
      else
        {
         Print(__FUNCTION__, " Zero divide error ");
         return 0.0;
        }
     }

   /**
    *  Get the sign of the polynomial variable coefficient
    *
    * param coeff Selected coefficient
    * param isFirstCoeff True if the selected coefficient will be the first in the polynomial representation, otherwise false
    * return String containing the sign of the coefficient
    */
   string            getPolynomialCoeffSign(double coeff, bool isFirstCoeff)
     {
      return ((coeff >= 0) ? ((isFirstCoeff) ? " " : " + ") : " - ");
     }

   /**
    *  Get the rounded value of the polynomial variable coefficient without sign
    *
    * param coeff Selected coefficient
    * param isLastCoeff True if the selected coefficient will be the last one in the polynomial representation, otherwise false
    * return String containing the rounded value of the coefficient without sign
    */
   string            getPolynomialCoeffValue(double coeff, bool isLastCoeff)
     {
      string stringCoeff = StringFormat("%e",MathAbs(coeff));
      return ((stringCoeff != "1" || isLastCoeff) ? stringCoeff : "");
     }

   /**
    *  Train given subset of models and calculate external criterion for them
    *
    * param data Data used for training and evaulating models
    * param criterion Selected external criterion
    * param beginCoeffsVec Iterator indicating the beginning of a subset of models
    * param endCoeffsVec Iterator indicating the end of a subset of models
    * param leftTasks The number of remaining untrained models at the entire level
    * param verbose 1 if the printing detailed infomation about training process is needed, otherwise 0
    */
   bool              polynomialsEvaluation(SplittedData& data,  Criterion& criterion,  CVector &combos, uint beginCoeffsVec,
                                           uint endCoeffsVec)
     {
      vector cmb,ytrain,ytest;
      matrix x1,x2;
      for(uint i = beginCoeffsVec; i<endCoeffsVec; i++)
        {
         cmb = combos[i].combination();
         x1 = xDataForCombination(data.xTrain,cmb);
         x2 = xDataForCombination(data.xTest,cmb);
         ytrain = data.yTrain;
         ytest = data.yTest;
         PairDVXd pd = criterion.calculate(x1,x2,ytrain,ytest);

         if(pd.second.HasNan()>0)
            {
             Print(__FUNCTION__," No solution found for coefficient at ", i, "\n xTrain \n", x1, "\n xTest \n", x2, "\n yTrain \n", ytrain, "\n yTest \n", ytest);
             combos[i].setEvaluation(DBL_MAX);
             combos[i].setBestCoeffs(vector::Ones(3));
            }
         else
            {
             combos[i].setEvaluation(pd.first);
             combos[i].setBestCoeffs(pd.second);
            } 
        }

      return true;
     }

   /**
   *  Determine the need to continue training and prepare the algorithm for the next level
   *
   * param kBest The number of best models based of which new models of the next level will be constructed
   * param pAverage The number of best models based of which the external criterion for each level will be calculated
   * param combinations Trained models of the current level
   * param criterion Selected external criterion
   * param data Data used for training and evaulating models
   * param limit The minimum value by which the external criterion should be improved in order to continue training
   * return True if the algorithm needs to continue training, otherwise fasle
   */
   bool              nextLevelCondition(int kBest, int pAverage, CVector &combinations,
                                        Criterion& criterion, SplittedData& data, double limit)
     {
      MatFunc fun = NULL;
      CVector bestcombinations;
      criterion.getBestCombinations(combinations,bestcombinations,data, fun, kBest);
      currentLevelEvaluation = getMeanCriterionValue(bestcombinations, pAverage);

      if(lastLevelEvaluation - currentLevelEvaluation > limit)
        {
         lastLevelEvaluation = currentLevelEvaluation;
         if(preparations(data,bestcombinations))
           {
            ++level;
            return true;
           }
        }
      removeExtraCombinations();
      return false;

     }

   /**
    *  Fit the algorithm to find the best solution
    *
    * param x Matrix of input data containing predictive variables
    * param y Vector of the taget values for the corresponding x data
    * param criterion Selected external criterion
    * param kBest The number of best models based of which new models of the next level will be constructed
    * param testSize Fraction of the input data that should be used to evaluate models at each level
    * param pAverage The number of best models based of which the external criterion for each level will be calculated
    * param limit The minimum value by which the external criterion should be improved in order to continue training
    * return A pointer to the algorithm object for which the training was performed
    */
   bool              gmdhFit(matrix& x,  vector& y,  Criterion& criterion, int kBest,
                             double testSize, int pAverage, double limit)
     {
      if(x.Rows() != y.Size())
        {
         Print("X rows number and y size must be equal");
         return false;
        }

      level = 1; // reset last training
      inputColsNumber = int(x.Cols());
      lastLevelEvaluation = DBL_MAX;

      SplittedData data = internalSplitData(x, y, testSize, true) ;
      training_complete = false;
      bool goToTheNextLevel;
      CVector evaluationCoeffsVec;
      do
        {
         vector combinations[];
         generateCombinations(int(data.xTrain.Cols() - 1),combinations);
         
         if(combinations.Size()<1)
           {
            Print(__FUNCTION__," Training aborted");
            return training_complete;
           }  

         evaluationCoeffsVec.clear();

         int currLevelEvaluation = 0;
         for(int it = 0; it < int(combinations.Size()); ++it, ++currLevelEvaluation)
           {
            Combination ncomb(combinations[it]);
            evaluationCoeffsVec.push_back(ncomb);
           }

         if(!polynomialsEvaluation(data,criterion,evaluationCoeffsVec,0,uint(currLevelEvaluation)))
           {
            Print(__FUNCTION__," Training aborted");
            return training_complete;
           }

         goToTheNextLevel = nextLevelCondition(kBest, pAverage, evaluationCoeffsVec, criterion, data, limit); // checking the results of the current level for improvement
        }
      while(goToTheNextLevel);

      training_complete = true;

      return true;
     }

   /**
    *  Get new model structures for the new level of training
    *
    * param n_cols The number of existing predictive variables at the current training level
    * return Vector of new model structures
    */
   virtual void      generateCombinations(int n_cols,vector &out[])
     {
      return;
     }


   ///  Removed the saved models that are no longer needed
   virtual void      removeExtraCombinations(void)
     {
      return;
     }

   /**
    *  Prepare data for the next training level
    *
    * param data Data used for training and evaulating models at the current level
    * param _bestCombinations Vector of the k best models of the current level
    * return True if the training process can be continued, otherwise false
    */
   virtual bool      preparations(SplittedData& data, CVector &_bestCombinations)
     {
      return false;
     }

   /**
    *  Get the data constructed according to the model structure from the original data
    *
    * param x Training data at the current level
    * param comb Vector containing the indexes of the x matrix columns that should be used in the model
    * return Constructed data
    */
   virtual matrix    xDataForCombination(matrix& x,  vector& comb)
     {
      return matrix::Zeros(10,10);
     }

   /**
    *  Get the designation of polynomial equation
    *
    * param levelIndex The number of the level counting from 0
    * param combIndex The number of polynomial in the level counting from 0
    * return The designation of polynomial equation
    */
   virtual string    getPolynomialPrefix(int levelIndex, int combIndex)
     {
      return NULL;
     }

   /**
    *  Get the string representation of the polynomial variable
    *
    * param levelIndex The number of the level counting from 0
    * param coeffIndex The number of the coefficient related to the selected variable in the polynomial counting from 0
    * param coeffsNumber The number of coefficients in the polynomial
    * param bestColsIndexes Indexes of the data columns used to construct polynomial of the model
    * return The string representation of the polynomial variable
    */
   virtual string    getPolynomialVariable(int levelIndex, int coeffIndex, int coeffsNumber,
                                           vector& bestColsIndexes)
     {
      return NULL;
     }

   /*
    *  Transform model data to JSON format for further saving
    *
    * return JSON value of model data
    */
   virtual CJAVal    toJSON(void)
     {
      CJAVal json_obj_model;

      json_obj_model["modelName"] = getModelName();
      json_obj_model["inputColsNumber"] = inputColsNumber;
      json_obj_model["bestCombinations"] = CJAVal(jtARRAY,"");


      for(int i = 0; i<bestCombinations.size(); i++)
        {

         CJAVal Array(jtARRAY,"");

         for(int k = 0; k<bestCombinations[i].size(); k++)
           {
            CJAVal collection;
            collection["combination"] = CJAVal(jtARRAY,"");
            collection["bestCoeffs"] = CJAVal(jtARRAY,"");
            vector combination = bestCombinations[i][k].combination();
            vector bestcoeff = bestCombinations[i][k].bestCoeffs();
            for(ulong j=0; j<combination.Size(); j++)
               collection["combination"].Add(int(combination[j]));
            for(ulong j=0; j<bestcoeff.Size(); j++)
               collection["bestCoeffs"].Add(bestcoeff[j],-15);
            Array.Add(collection);
           }

         json_obj_model["bestCombinations"].Add(Array);

        }

      return json_obj_model;

     }

   /**
    *  Set up model from JSON format model data
    *
    * param jsonModel Model data in JSON format
    * return Method exit status
    */
   virtual bool      fromJSON(CJAVal &jsonModel)
     {
      modelName = jsonModel["modelName"].ToStr();
      bestCombinations.clear();
      inputColsNumber = int(jsonModel["inputColsNumber"].ToInt());

      for(int i = 0; i<jsonModel["bestCombinations"].Size(); i++)
        {
         CVector member;
         for(int j = 0; j<jsonModel["bestCombinations"][i].Size(); j++)
           {
            Combination cb;
            vector c(ulong(jsonModel["bestCombinations"][i][j]["combination"].Size()));
            vector cf(ulong(jsonModel["bestCombinations"][i][j]["bestCoeffs"].Size()));
            for(int k = 0; k<jsonModel["bestCombinations"][i][j]["combination"].Size(); k++)
               c[k] = jsonModel["bestCombinations"][i][j]["combination"][k].ToDbl();
            for(int k = 0; k<jsonModel["bestCombinations"][i][j]["bestCoeffs"].Size(); k++)
               cf[k] = jsonModel["bestCombinations"][i][j]["bestCoeffs"][k].ToDbl();
            cb.setBestCoeffs(cf);
            cb.setCombination(c);
            member.push_back(cb);
           }
         bestCombinations.push_back(member);
        }
      return true;
     }



   /**
    *  Compare the number of required and actual columns of the input matrix
    *
    * param x Given matrix of input data
    */
   bool              checkMatrixColsNumber(matrix& x)
     {
      if(ulong(inputColsNumber) != x.Cols())
        {
         Print("Matrix  must have " + string(inputColsNumber) + " columns because there were " + string(inputColsNumber) + " columns in the training  matrix");
         return false;
        }

      return true;
     }
     
     

public:
   ///  Construct a new Gmdh Model object
                     GmdhModel() : level(1), lastLevelEvaluation(0) {}

   /**
   *  Get full class name
   *
   * return String containing the name of the model class
   */
   string            getModelName(void)
     {
      return modelName;
     }
   /**
     *Get number of inputs required for model
     */
    int getNumInputs(void)
     {
      return inputColsNumber;
     }

   /**
    *  Save model data into regular file
    *
    * param path Path to regular file
    */
   bool              save(string file_name)
     {

      CFileTxt modelFile;

      if(modelFile.Open(file_name,FILE_WRITE|FILE_COMMON,0)==INVALID_HANDLE)
        {
         Print("failed to open file ",file_name," .Error - ",::GetLastError());
         return false;
        }
      else
        {
         CJAVal js=toJSON();
         if(modelFile.WriteString(js.Serialize())==0)
           {
            Print("failed write to ",file_name,". Error -",::GetLastError());
            return false;
           }
        }

      return true;
     }

   /**
    *  Load model data from regular file
    *
    * param path Path to regular file
    */
   bool               load(string file_name)
     {
      training_complete = false;
      CFileTxt modelFile;
      CJAVal js;

      if(modelFile.Open(file_name,FILE_READ|FILE_COMMON,0)==INVALID_HANDLE)
        {
         Print("failed to open file ",file_name," .Error - ",::GetLastError());
         return false;
        }
      else
        {
         if(!js.Deserialize(modelFile.ReadString()))
           {
            Print("failed to read from ",file_name,".Error -",::GetLastError());
            return false;
           }
         training_complete = fromJSON(js);
        }
      return training_complete;
     }
   /**
    *  Divide the input data into 2 parts without shuffling
    *
    * param x Matrix of input data containing predictive variables
    * param y Vector of the taget values for the corresponding x data
    * param testSize Fraction of the input data that should be placed into the second part
    * param addOnesCol True if it is needed to add a column of ones to the x data, otherwise false
    * return SplittedData object containing 4 elements of data: train x, train y, test x, test y
    */
   static SplittedData internalSplitData(matrix& x,  vector& y, double testSize, bool addOnesCol = false)
     {
      SplittedData data;
      ulong testItemsNumber = ulong(round(double(x.Rows()) * testSize));
      matrix Train,Test;
      vector train,test;

      if(addOnesCol)
        {
         Train.Resize(x.Rows() - testItemsNumber, x.Cols() + 1);
         Test.Resize(testItemsNumber, x.Cols() + 1);

         for(ulong i = 0; i<Train.Rows(); i++)
            Train.Row(x.Row(i),i);

         Train.Col(vector::Ones(Train.Rows()),x.Cols());

         for(ulong i = 0; i<Test.Rows(); i++)
            Test.Row(x.Row(Train.Rows()+i),i);

         Test.Col(vector::Ones(Test.Rows()),x.Cols());

        }
      else
        {
         Train.Resize(x.Rows() - testItemsNumber, x.Cols());
         Test.Resize(testItemsNumber, x.Cols());

         for(ulong i = 0; i<Train.Rows(); i++)
            Train.Row(x.Row(i),i);

         for(ulong i = 0; i<Test.Rows(); i++)
            Test.Row(x.Row(Train.Rows()+i),i);
        }

      train.Init(y.Size() - testItemsNumber,slice,y,0,y.Size() - testItemsNumber - 1);
      test.Init(testItemsNumber,slice,y,y.Size() - testItemsNumber);

      data.yTrain = train;
      data.yTest = test;

      data.xTrain = Train;
      data.xTest = Test;

      return data;
     }

   /**
    *  Get long-term forecast for the time series
    *
    * param x One row of the test time series data
    * param lags The number of lags (steps) to make a forecast for
    * return Vector containing long-term forecast
    */
   virtual vector    predict(vector& x, int lags)
     {
      return vector::Zeros(1);
     }

   /**
    *  Get the String representation of the best polynomial
    *
    * return String representation of the best polynomial
    */
   string            getBestPolynomial(void)
     {
      string polynomialStr = "";
      int ind = 0;
      for(int i = 0; i < bestCombinations.size(); ++i)
        {
         for(int j = 0; j < bestCombinations[i].size(); ++j)
           {
            vector bestColsIndexes = bestCombinations[i][j].combination();
            vector bestCoeffs = bestCombinations[i][j].bestCoeffs();
            polynomialStr += getPolynomialPrefix(i, j);
            bool isFirstCoeff = true;
            for(int k = 0; k < int(bestCoeffs.Size()); ++k)
              {
               if(bestCoeffs[k])
                 {
                  polynomialStr += getPolynomialCoeffSign(bestCoeffs[k], isFirstCoeff);
                  string coeffValuelStr = getPolynomialCoeffValue(bestCoeffs[k], (k == (bestCoeffs.Size() - 1)));
                  polynomialStr += coeffValuelStr;
                  if(coeffValuelStr != "" && k != bestCoeffs.Size() - 1)
                     polynomialStr += "*";
                  polynomialStr += getPolynomialVariable(i, k, int(bestCoeffs.Size()), bestColsIndexes);
                  isFirstCoeff = false;
                 }
              }
            if(i < bestCombinations.size() - 1 || j < (bestCombinations[i].size() - 1))
               polynomialStr += "\n";
           }//j
         if(i < bestCombinations.size() - 1 && bestCombinations[i].size() > 1)
            polynomialStr += "\n";
        }//i
      return polynomialStr;
     }

                    ~GmdhModel()
     {
      for(int i = 0; i<bestCombinations.size(); i++)
         bestCombinations[i].clear();

      bestCombinations.clear();
     }
  };


//+------------------------------------------------------------------+
