//+------------------------------------------------------------------+
//|                                                     norm2ord.mqh |
//|                                  Copyright 2024, MetaQuotes Ltd. |
//|                                             https://www.mql5.com |
//+------------------------------------------------------------------+
#property copyright "Copyright 2024, MetaQuotes Ltd."
#property link      "https://www.mql5.com"
#include<Math/Stat/Uniform.mqh>
#include<np.mqh>

//+------------------------------------------------------------------+
//| Nominal-to-ordinal conversion                                    |
//+------------------------------------------------------------------+
class CNomOrd
  {
private:
   int m_rows ;
   int               m_cols;
   int m_pred ;
   matrix            m_class_ids;
   double m_rankings[] ;
   int m_indices[] ;
   vector m_mean_rankings[] ;
   vector m_class_counts[] ;
   vector m_median ;
   vector            m_target;
   vector            m_mapping[];
   ulong             m_colindices[];

   matrix shuffle_target ;
   int count_max_class ;
   double            orig_max_class;
   uint              m_dim_reduce;
   bool              mapped;

   //+------------------------------------------------------------------+
   //|  sort data                                                       |
   //+------------------------------------------------------------------+

   void              qsortdsi(int first, int last, double &data[], int &slave[])
     {
      int lower, upper, itemp ;
      double ftemp, split ;

      split = data[(first+last)/2] ;
      lower = first ;
      upper = last ;

      do
        {
         while(split > data[lower])
            ++lower ;
         while(split < data[upper])
            --upper ;
         if(lower == upper)
           {
            ++lower ;
            --upper ;
           }
         else
            if(lower < upper)
              {
               itemp = slave[lower] ;
               slave[lower] = slave[upper] ;
               slave[upper] = itemp ;
               ftemp = data[lower] ;
               data[lower++] = data[upper] ;
               data[upper--] = ftemp ;
              }
        }
      while(lower <= upper) ;

      if(first < upper)
         qsortdsi(first, upper, data, slave) ;
      if(lower < last)
         qsortdsi(lower, last, data, slave) ;
     }
   //+------------------------------------------------------------------+
   //|  re-sort                                                         |
   //+------------------------------------------------------------------+

   void              qsortisd(int first, int last, int &data[], double &slave[])
     {
      int lower, upper ;
      int ftemp, split ;
      double dtemp ;

      split = data[(first+last)/2] ;
      lower = first ;
      upper = last ;

      do
        {
         while(split > data[lower])
            ++lower ;
         while(split < data[upper])
            --upper ;
         if(lower == upper)
           {
            ++lower ;
            --upper ;
           }
         else
            if(lower < upper)
              {
               dtemp = slave[lower] ;
               slave[lower] = slave[upper] ;
               slave[upper] = dtemp ;
               ftemp = data[lower] ;
               data[lower++] = data[upper] ;
               data[upper--] = ftemp ;
              }
        }
      while(lower <= upper) ;

      if(first < upper)
         qsortisd(first, upper, data, slave) ;
      if(lower < last)
         qsortisd(lower, last, data, slave) ;
     }
   //+------------------------------------------------------------------+
   //| train                                                            |
   //+------------------------------------------------------------------+

   vector            train(vector &classid, vector &classcounts,vector &target, double &halfpercentile)
     {
      int i, k ;
      double val, rank ;
      vector meanranks(target.Size());
      int nclasses = int(classcounts.Size());

      for(int i=0 ; i<m_rows ; i++)
        {
         m_rankings[i] = target[i] ;
         m_indices[i] = i ;
        }


      qsortdsi(0, m_rows-1, m_rankings, m_indices) ;

      for(i=0 ; i<m_rows ;)
        {
         val = m_rankings[i] ;
         for(k=i+1 ; k<m_rows ; k++)     // Find all ties
           {
            if(m_rankings[k] > val)
               break ;
           }
         rank = 0.5 * ((double) i + (double) k + 1.0) ;
         while(i < k)
            m_rankings[i++] = 100.0 * rank / m_rows ;
        } // For each case in sorted y array

      if(MathMod(m_rows,2)>0)
         halfpercentile = m_rankings[m_rows/2] ;
      else
         halfpercentile = 0.5 * (m_rankings[m_rows/2-1] + m_rankings[m_rows/2]) ;

      qsortisd(0, m_rows-1, m_indices, m_rankings) ;

      /*
         Pass through the data, cumulating the mean rank for each class bin.
      */

      meanranks.Fill(0.0);

      for(i=0 ; i<m_rows ; i++)
        {
         k = (int)classid[i] ;           // Class of this case
         if(k < 0  ||  k >= nclasses)
           {
            Print(__FUNCTION__ " k index out of bounds, ", k);
            return vector::Zeros(1);
           }
         meanranks[k] += m_rankings[i] ; // Cumulate target rank of this case
        }

      for(i=0 ; i<nclasses ; i++)
        {
         if(classcounts[i] > 0)
            meanranks[i] /= classcounts[i] ;
         else
            meanranks[i] = halfpercentile ;
        }

      return meanranks;
     }


public:
   //+------------------------------------------------------------------+
   //|  constructor                                                     |
   //+------------------------------------------------------------------+

                     CNomOrd(void)
     {
     }
   //+------------------------------------------------------------------+
   //|  destructor                                                      |
   //+------------------------------------------------------------------+

                    ~CNomOrd(void)
     {
     }
   //+------------------------------------------------------------------+
   //| fit mapping to training data                                     |
   //+------------------------------------------------------------------+

   bool              fit(matrix &preds_in, ulong &cols[], vector &target)
     {
      m_dim_reduce = 0;
      mapped = false;
      //---
      if(cols.Size()==0 && preds_in.Cols())
        {
         m_pred = int(preds_in.Cols());
         if(!np::arange(m_colindices,m_pred,ulong(0),ulong(1)))
           {
            Print(__FUNCTION__, " arange error ");
            return mapped;
           }
        }
      else
        {
         m_pred = int(cols.Size());
        }
      //---
      m_rows = int(preds_in.Rows()) ;
      m_cols = int(preds_in.Cols());
      //---
      if(ArrayResize(m_mean_rankings,m_pred)<0 ||
         ArrayResize(m_rankings,m_rows)<0 ||
         ArrayResize(m_indices,m_rows)<0 ||
         ArrayResize(m_mapping,m_pred)<0  ||
         ArrayResize(m_class_counts,m_pred)<0  ||
         !m_median.Resize(m_pred) ||
         !m_class_ids.Resize(m_rows,m_pred) ||
         !shuffle_target.Resize(m_rows,2) ||
         (cols.Size()>0 && ArrayCopy(m_colindices,cols)<0) ||
         !ArraySort(m_colindices))
        {
         Print(__FUNCTION__, " Memory allocation failure ", GetLastError());
         return mapped;
        }
      //---
      for(uint col = 0; col<m_colindices.Size(); col++)
        {
         vector var = preds_in.Col(m_colindices[col]);
         m_mapping[col] = np::unique(var);
         m_class_counts[col] = vector::Zeros(m_mapping[col].Size());
         for(ulong i = 0; i<var.Size(); i++)
           {
            for(ulong j = 0; j<m_mapping[col].Size(); j++)
              {
               if(MathAbs(var[i]-m_mapping[col][j])<=1.e-15)
                 {
                  m_class_ids[i][col]=double(j);
                  ++m_class_counts[col][j];
                  break;
                 }
              }
           }
        }

      m_target = target;

      for(uint i = 0; i<m_colindices.Size(); i++)
        {
         vector cid = m_class_ids.Col(i);
         vector ccounts = m_class_counts[i];
         m_mean_rankings[i] = train(cid,ccounts,m_target,m_median[i]);
        }

      mapped = true;
      return mapped;
     }

   //+------------------------------------------------------------------+
   //| categorical conversion with dimensionality reduction             |
   //+------------------------------------------------------------------+
   matrix            fitTransform(matrix &preds_in, ulong &cols[], vector &target)
     {
      //---
      if(preds_in.Cols()<2)
        {
         if(!fit(preds_in,cols,target))
           {
            Print(__FUNCTION__, " error at ", __LINE__);
            return matrix::Zeros(1,1);
           }
         //---
         return transform(preds_in);
        }
      //---
      m_dim_reduce = 1;
      mapped = false;
      //---
      if(cols.Size()==0 && preds_in.Cols())
        {
         m_pred = int(preds_in.Cols());
         if(!np::arange(m_colindices,m_pred,ulong(0),ulong(1)))
           {
            Print(__FUNCTION__, " arange error ");
            return matrix::Zeros(1,1);
           }
        }
      else
        {
         m_pred = int(cols.Size());
        }
      //---
      m_rows = int(preds_in.Rows()) ;
      m_cols = int(preds_in.Cols());
      //---
      if(ArrayResize(m_mean_rankings,1)<0 ||
         ArrayResize(m_rankings,m_rows)<0 ||
         ArrayResize(m_indices,m_rows)<0 ||
         ArrayResize(m_mapping,m_pred)<0  ||
         ArrayResize(m_class_counts,1)<0  ||
         !m_median.Resize(m_pred) ||
         !m_class_ids.Resize(m_rows,m_pred) ||
         !shuffle_target.Resize(m_rows,2) ||
         (cols.Size()>0 && ArrayCopy(m_colindices,cols)<0) ||
         !ArraySort(m_colindices))
        {
         Print(__FUNCTION__, " Memory allocation failure ", GetLastError());
         return matrix::Zeros(1,1);
        }
      //---
      for(uint col = 0; col<m_colindices.Size(); col++)
        {
         vector var = preds_in.Col(m_colindices[col]);
         m_mapping[col] = np::unique(var);
         for(ulong i = 0; i<var.Size(); i++)
           {
            for(ulong j = 0; j<m_mapping[col].Size(); j++)
              {
               if(MathAbs(var[i]-m_mapping[col][j])<=1.e-15)
                 {
                  m_class_ids[i][col]=double(j);
                  break;
                 }
              }
           }
        }

      m_class_counts[0] = vector::Zeros(ulong(m_colindices.Size()));

      if(!m_class_ids.Col(m_class_ids.ArgMax(1),0))
        {
         Print(__FUNCTION__, " failed to insert new class id values ", GetLastError());
         return matrix::Zeros(1,1);
        }

      for(ulong i = 0; i<m_class_ids.Rows(); i++)
         ++m_class_counts[0][ulong(m_class_ids[i][0])];

      m_target = target;

      vector cid = m_class_ids.Col(0);
      m_mean_rankings[0] = train(cid,m_class_counts[0],m_target,m_median[0]);

      mapped = true;

      ulong unchanged_feature_cols[];

      for(ulong i = 0; i<preds_in.Cols(); i++)
        {
         int found = ArrayBsearch(m_colindices,i);
         if(m_colindices[found]!=i)
           {
            if(!unchanged_feature_cols.Push(i))
              {
               Print(__FUNCTION__, " Failed array insertion ", GetLastError());
               return matrix::Zeros(1,1);
              }
           }
        }

      matrix out(preds_in.Rows(),unchanged_feature_cols.Size()+1);
      ulong nfeatureIndex = unchanged_feature_cols.Size();

      if(nfeatureIndex)
        {
         matrix input_copy = np::selectMatrixCols(preds_in,unchanged_feature_cols);
         if(!np::matrixCopyCols(out,input_copy,0,nfeatureIndex))
           {
            Print(__FUNCTION__, " failed to copy matrix columns ");
            return matrix::Zeros(1,1);
           }
        }

      for(ulong i = 0; i<out.Rows(); i++)
        {
         ulong r = ulong(m_class_ids[i][0]);
         if(r>=m_mean_rankings[0].Size())
           {
            Print(__FUNCTION__, " critical error , index out of bounds ");
            return matrix::Zeros(1,1);
           }
         out[i][nfeatureIndex] = m_mean_rankings[0][r];
        }

      return out;
     }


   //+------------------------------------------------------------------+
   //|  transform nominal to ordinal based on learned mapping           |
   //+------------------------------------------------------------------+

   matrix            transform(matrix &data_in)
     {
      if(m_dim_reduce)
        {
         Print(__FUNCTION__, " Invalid method call, Use fitTransform() or call fit() ");
         return matrix::Zeros(1,1);
        }

      if(!mapped)
        {
         Print(__FUNCTION__, " Invalid method call, training either failed or was not done. Call fit() first. ");
         return matrix::Zeros(1,1);
        }

      if(data_in.Cols()!=ulong(m_cols))
        {
         Print(__FUNCTION__, " Unexpected input data shape, doesnot match training data ");
         return matrix::Zeros(1,1);
        }
      //---
      matrix out = data_in;
      //---
      for(uint col = 0; col<m_colindices.Size(); col++)
        {
         vector var = data_in.Col(m_colindices[col]);
         for(ulong i = 0; i<var.Size(); i++)
           {
            for(ulong j = 0; j<m_mapping[col].Size(); j++)
              {
               if(MathAbs(var[i]-m_mapping[col][j])<=1.e-15)
                 {
                  out[i][m_colindices[col]]=m_mean_rankings[col][j];
                  break;
                 }
              }
           }
        }
      //---
      return out;
     }

   //+------------------------------------------------------------------+
   //|   test for a genuine relationship between predictor and target   |
   //+------------------------------------------------------------------+

   double            score(int reps, vector &test_target,ulong selectedVar=0)
     {
      if(!mapped)
        {
         Print(__FUNCTION__, " Invalid method call, training either failed or was not done. Call fit() first. ");
         return -1.0;
        }

      if(m_dim_reduce==0 && selectedVar>=ulong(m_colindices.Size()))
        {
         Print(__FUNCTION__, " invalid predictor selection ");
         return -1.0;
        }

      if(test_target.Size()!=m_rows)
        {
         Print(__FUNCTION__, " invalid targets parameter, Does not match shape of training data. ");
         return -1.0;
        }

      int i, j, irep, unif_error ;
      double dtemp, min_neg, max_neg, min_pos, max_pos, medn ;
      dtemp = 0.0;
      min_neg = 0.0;
      max_neg = -DBL_MIN;
      min_pos = DBL_MAX;
      max_pos = DBL_MIN ;

      vector id = (m_dim_reduce)?m_class_ids.Col(0):m_class_ids.Col(selectedVar);
      vector cc = (m_dim_reduce)?m_class_counts[0]:m_class_counts[selectedVar];
      int nclasses = int(cc.Size());

      if(reps < 1)
         reps = 1 ;

      for(irep=0 ; irep<reps ; irep++)
        {

         if(!shuffle_target.Col(test_target,0))
           {
            Print(__FUNCTION__, " error filling shuffle_target column ", GetLastError());
            return -1.0;
           }

         if(irep)
           {
            i = m_rows ;
            while(i > 1)
              {
               j = (int)(MathRandomUniform(0.0,1.0,unif_error) * i) ;
               if(unif_error)
                 {
                  Print(__FUNCTION__, " mathrandomuniform() error ", unif_error);
                  return -1.0;
                 }
               if(j >= i)
                  j = i - 1 ;
               dtemp = shuffle_target[--i][0] ;
               shuffle_target[i][0] = shuffle_target[j][0] ;
               shuffle_target[j][0] = dtemp ;
              }
           }

         vector totrain = shuffle_target.Col(0);
         vector m_ranks = train(id,cc,totrain,medn) ;

         if(irep == 0)
           {
            for(i=0 ; i<nclasses ; i++)
              {
               if(i == 0)
                  min_pos = max_pos = m_ranks[i] ;
               else
                 {
                  if(m_ranks[i] > max_pos)
                     max_pos = m_ranks[i] ;
                  if(m_ranks[i] < min_pos)
                     min_pos = m_ranks[i] ;
                 }
              } // For i<nclasses
            orig_max_class = max_pos - min_pos ;
            count_max_class = 1 ;

           }
         else
           {
            for(i=0 ; i<nclasses ; i++)
              {
               if(i == 0)
                  min_pos = max_pos = m_ranks[i];
               else
                 {
                  if(m_ranks[i] > max_pos)
                     max_pos = m_ranks[i] ;
                  if(m_ranks[i] < min_pos)
                     min_pos = m_ranks[i] ;
                 }
              } // For i<nclasses
            if(max_pos - min_pos >= orig_max_class)
               ++count_max_class ;

           }
        }



      if(reps <= 1)
         return -1.0;

      return double(count_max_class)/ double(reps);

     }

  };
//+------------------------------------------------------------------+
//| one hot encoder class                                            |
//+------------------------------------------------------------------+
class COneHotEncoder
  {
private:
   vector            m_mapping[];
   ulong             m_cat_cols[];
   ulong             m_vars,m_cols;
public:
   //+------------------------------------------------------------------+
   //|  Constructor                                                     |
   //+------------------------------------------------------------------+
                     COneHotEncoder(void)
     {
     }
   //+------------------------------------------------------------------+
   //|  Destructor                                                      |
   //+------------------------------------------------------------------+
                    ~COneHotEncoder(void)
     {
      ArrayFree(m_mapping);
     }
   //+------------------------------------------------------------------+
   //| map categorical features of a training dataset                   |
   //+------------------------------------------------------------------+
   bool              fit(matrix &in_data,ulong &cols[])
     {
      m_cols = in_data.Cols();
      matrix data = np::selectMatrixCols(in_data,cols);

      if(data.Cols()!=ulong(cols.Size()))
        {
         Print(__FUNCTION__, " invalid input data ");
         return false;
        }

      m_vars = ulong(cols.Size());

      if(ArrayCopy(m_cat_cols,cols)<=0 || !ArraySort(m_cat_cols))
        {
         Print(__FUNCTION__, " ArrayCopy or ArraySort failure ", GetLastError());
         return false;
        }

      if(ArrayResize(m_mapping,int(m_vars))<0)
        {
         Print(__FUNCTION__, " Vector array resize failure ", GetLastError());
         return false;
        }

      for(ulong i = 0; i<m_vars; i++)
        {
         vector unique = data.Col(i);
         m_mapping[i] = np::unique(unique);
        }

      return true;
     }
   //+------------------------------------------------------------------+
   //| Transform abitrary feature matrix to learned category m_mapping  |
   //+------------------------------------------------------------------+

   matrix            transform(matrix &in_data)
     {
      if(in_data.Cols()!=m_cols)
        {
         Print(__FUNCTION__," Column dimension of input not equal to ", m_cols);
         return matrix::Zeros(1,1);
        }

      matrix out,input_copy;
      matrix data = np::selectMatrixCols(in_data,m_cat_cols);

      if(data.Cols()!=ulong(m_cat_cols.Size()))
        {
         Print(__FUNCTION__, " invalid input data ");
         return matrix::Zeros(1,1);
        }

      ulong unchanged_feature_cols[];

      for(ulong i = 0; i<in_data.Cols(); i++)
        {
         int found = ArrayBsearch(m_cat_cols,i);
         if(m_cat_cols[found]!=i)
           {
            if(!unchanged_feature_cols.Push(i))
              {
               Print(__FUNCTION__, " Failed array insertion ", GetLastError());
               return matrix::Zeros(1,1);
              }
           }
        }

      input_copy = unchanged_feature_cols.Size()?np::selectMatrixCols(in_data,unchanged_feature_cols):input_copy;
      ulong numcols = 0;
      vector cumsum = vector::Zeros(ulong(MathMin(m_vars,data.Cols())));

      for(ulong i = 0; i<cumsum.Size(); i++)
        {
         cumsum[i] = double(numcols);
         numcols+=m_mapping[i].Size();
        }

      out = matrix::Zeros(data.Rows(),numcols);

      for(ulong i = 0;i<data.Rows(); i++)
        {
         vector row = data.Row(i);
         for(ulong col = 0; col<row.Size(); col++)
           {
            for(ulong k = 0; k<m_mapping[col].Size(); k++)
              {
               if(MathAbs(row[col]-m_mapping[col][k])<=1.e-15)
                 {
                  out[i][ulong(cumsum[col])+k]=1.0;
                  break;
                 }
              }
           }
        }

      matrix newfeaturematrix(out.Rows(),input_copy.Cols()+out.Cols());

      if((input_copy.Cols()>0 && !np::matrixCopyCols(newfeaturematrix,input_copy,0,input_copy.Cols())) || !np::matrixCopyCols(newfeaturematrix,out,input_copy.Cols()))
        {
         Print(__FUNCTION__, " Failed matrix copy ");
         return matrix::Zeros(1,1);
        }

      return newfeaturematrix;

     }

  };
//+------------------------------------------------------------------+
