//+------------------------------------------------------------------+
//|                                                          Cql.mqh |
//|                                  Copyright 2024, MetaQuotes Ltd. |
//|                                             https://www.mql5.com |
//+------------------------------------------------------------------+
#property copyright "Copyright 2024, MetaQuotes Ltd."
#property link      "https://www.mql5.com"
#property version   "1.00"
//+------------------------------------------------------------------+
//| PPO                                                              |
//+------------------------------------------------------------------+
struct Sppo
{
   // One table per action; Cql's constructor sizes each to
   // environments x environments and seeds it with small values.
   matrix            policy[];
   // One table per action, same shape as `policy`; overwritten on every
   // Cql::SetPolicy() call and then added to `policy` scaled by alpha.
   matrix            gradient[];
};
//+------------------------------------------------------------------+
//| Constructor Parameters                                           |
//+------------------------------------------------------------------+
struct Sql
{  int               actions;       // number of per-action tables to allocate
   int               environments;  // side length of the square state grid
   double            alpha;         // step size applied in the table updates
   double            gamma;         // discount applied to the bootstrapped value
   double            epsilon;       // exploration threshold in Action(); also the clip range in GetClipping()
   int               episodes;      // not read anywhere in this file
   bool              use_markov;    // when true, bootstrapped values are weighted by the markov matrix

   // Defaults describe an unconfigured learner: Cql's constructor allocates
   // nothing until both `actions` and `environments` are positive.
                     Sql() : actions(0),
                             environments(0),
                             alpha(0.1),
                             gamma(0.5),
                             epsilon(0.0),
                             episodes(250),
                             use_markov(false)
   {
   }
                    ~Sql() {}
};
//+------------------------------------------------------------------+
//| Q-Learning Class Interface.                                      |
//+------------------------------------------------------------------+
class Cql
{
protected:
   // Transition probabilities between flattened state indices; rebuilt by LetMarkov().
   matrix            markov;
   void              LetMarkov(int OldRow, int OldCol, vector &E);

   // Grid of 1-based state ids, filled row by row in the constructor.
   matrix            environment;
   // One table per action, indexed as Q_SA[action][row][col].
   matrix            Q_SA[];
   // State-value table, indexed as Q_V[row][col].
   matrix            Q_V;
   // Per-action policy and gradient tables maintained by SetPolicy().
   Sppo              Q_PPO;

public:
   void              Action(vector &E);
   void              Environment(vector &E_Row, vector &E_Col, vector &E);

   void              SetOffPolicy(double Reward, vector &E);
   void              SetOnPolicy(double Reward, vector &E);

   double            GetReward(double MaxProfit, double MaxLoss, double Float, int Sign=1);
   vector            SetTarget(vector &Rewards, vector &TargetOutput);

   void              SetMarkov(int Index, int &Row, int &Col);
   int               GetMarkov(int Row, int Col);

   double            GetClipping(double OldProbability, double NewProbability, double Advantage);
   void              SetPolicy();

   // Configuration copied from the constructor argument.
   Sql               THIS;

   // History pairs: index 0 is the latest value, index 1 the one before it.
   int               states[2];     // flattened state indices
   int               acts[2];       // actions picked from the policy tables
   int               act[2];        // actions picked by the epsilon-greedy step

   int               e_row[2];      // state-grid row
   int               e_col[2];      // state-grid column

   int               q_sa_act;      // action with the largest Q_SA entry at the latest state
   int               q_ppo_act;     // action with the largest policy entry at the latest state
   //
   double            td_value;      // Q_V entry at the latest state
   matrix            td_policies;   // 2 x 3: (row, value, col) for the latest [0] and previous [1] state

   // Returns the Q_SA entry of every action at the latest state
   // (e_row[0], e_col[0]); element i belongs to action i.
   vector            Q_Loss()
   {  vector _loss;
      _loss.Init(THIS.actions);
      _loss.Fill(0.0);
      for(int i = 0; i < THIS.actions; i++)
      {  // The action is the array index; row and column address the matrix.
         _loss[i] = Q_SA[i][e_row[0]][e_col[0]];
      }
      return(_loss);
   }


   // Allocates and initialises every table from RL. Nothing is allocated and
   // THIS keeps its defaults unless both RL.actions and RL.environments are
   // positive.
   void              Cql(Sql &RL)
   {  td_value = 0.0;
      if(RL.actions > 0 && RL.environments > 0)
      {  THIS = RL;
         // Index 1 is the starting action whenever it exists; with a single
         // action only index 0 is valid.
         const int _default_act = (RL.actions > 1) ? 1 : 0;
         td_policies.Init(2, 2 + 1);
         td_policies.Fill(0.0);
         ArrayInitialize(states, 0);
         ArrayInitialize(acts, _default_act);
         ArrayResize(Q_SA, RL.actions);
         ArrayResize(Q_PPO.policy, RL.actions);
         ArrayResize(Q_PPO.gradient, RL.actions);
         for(int i = 0; i < RL.actions; i++)
         {  Q_SA[i].Init(RL.environments, RL.environments);
            Q_PPO.policy[i].Init(RL.environments, RL.environments);
            Q_PPO.gradient[i].Init(RL.environments, RL.environments);
            //
            Q_SA[i].Fill(0.0);
            Q_PPO.policy[i].Fill(0.0);
            Q_PPO.gradient[i].Fill(0.0);
            // Seed the policy with 0.000125 * |(row + col) - mid| so that the
            // entries are not all identical at start-up.
            const int _mid = int(round(0.5 * (Q_PPO.policy[i].Rows() + Q_PPO.policy[i].Cols() - 2)));
            for(int ii = 0; ii < int(Q_PPO.policy[i].Rows()); ii++)
            {  for(int iii = 0; iii < int(Q_PPO.policy[i].Cols()); iii++)
               {  Q_PPO.policy[i][ii][iii] += 0.000125 * fabs((ii + iii) - _mid);
               }
            }
         }
         Q_V.Init(RL.environments, RL.environments);
         environment.Init(RL.environments, RL.environments);
         for(int i = 0; i < RL.environments; i++)
         {  for(int ii = 0; ii < RL.environments; ii++)
            {  environment[i][ii] = ii + (i * RL.environments) + 1;
            }
         }
         markov.Init(RL.environments * RL.environments, RL.environments * RL.environments);
         markov.Fill(0.0);
         ArrayFill(e_row, 0, 2, 0);
         ArrayFill(e_col, 0, 2, 0);
         ArrayFill(act, 0, 2, _default_act);
         q_sa_act = _default_act;
         q_ppo_act = _default_act;
      }
   };
   void              ~Cql(void) {};
};
//+------------------------------------------------------------------+
// Setting environment row & col from markov index
//+------------------------------------------------------------------+
void Cql::SetMarkov(int Index, int &Row, int &Col)
{  // Inverse of GetMarkov(): Index == Row + environments * Col.
   const int _side = THIS.environments;
   Col = Index / _side;
   Row = Index % _side;
}
//+------------------------------------------------------------------+
// Getting markov index from environment row & col
//+------------------------------------------------------------------+
int Cql::GetMarkov(int Row, int Col)
{  // Flattens a (Row, Col) pair; SetMarkov() performs the reverse mapping.
   const int _col_offset = THIS.environments * Col;
   return(Row + _col_offset);
}
//+------------------------------------------------------------------+
// Function to update markov matrix
//+------------------------------------------------------------------+
// Rebuilds `markov` from the sequence of state ids in E: entry [a][b] becomes
// the share of steps leaving state a that went to state b. Every consecutive
// pair (E[i], E[i + 1]) is counted as one transition E[i] -> E[i + 1].
//   OldRow, OldCol - not used by this implementation.
//   E              - state ids stored as doubles; each is truncated to int.
// Pairs containing an id outside the matrix are skipped rather than indexed.
void Cql::LetMarkov(int OldRow, int OldCol, vector &E)  //
{  const int _rows = int(markov.Rows());
   const int _cols = int(markov.Cols());
   matrix _transitions;  // transition counts per (from, to) pair
   _transitions.Init(markov.Rows(), markov.Cols());
   _transitions.Fill(0.0);
   vector _states;  // number of counted departures per state
   _states.Init(markov.Rows());
   _states.Fill(0.0);
// Count transitions between consecutive entries of E
   for(int i = 0; i < int(E.Size()) - 1; i++)
   {  const int _old_state = int(E[i]);
      const int _new_state = int(E[i + 1]);
      if(_old_state < 0 || _old_state >= _rows || _new_state < 0 || _new_state >= _cols)
      {  continue;  // id does not address a cell of the transition matrix
      }
      _transitions[_old_state][_new_state]++;
      _states[_old_state]++;
   }
// Normalise each row of counts; every cell is overwritten, and rows for
// states that were never left become all zeros.
   for(int i = 0; i < _rows; i++)
   {  for(int ii = 0; ii < _cols; ii++)
      {  markov[i][ii] = (_states[i] > 0) ? double(_transitions[i][ii] / _states[i]) : 0.0;
      }
   }
}
//+------------------------------------------------------------------+
// Choose an action using epsilon-greedy approach
//+------------------------------------------------------------------+
// Advances the learner by one step using the state-id history in E.
//  1. Picks an epsilon-greedy action from Q_SA at the current state and
//     records it in act[].
//  2. Rebuilds the markov matrix from E and moves to the state with the
//     highest transition probability out of state E[0].
//  3. Refreshes states[], e_row[], e_col[], td_value and td_policies, then
//     stores the best Q_SA / policy action at the new state in q_sa_act /
//     q_ppo_act and pushes q_ppo_act into acts[].
// The call does nothing when the learner has no actions, E is empty, or E[0]
// does not address a row of the markov matrix.
void Cql::Action(vector &E)
{  if(THIS.actions <= 0 || E.Size() == 0)
   {  return;
   }
   const int _observed_state = int(E[0]);
   if(_observed_state < 0 || _observed_state >= int(markov.Rows()))
   {  return;
   }
   int _best_act = 0;
   // Convert before dividing: integer division here would always yield 0 and
   // make every call explore as soon as epsilon is positive.
   const double _draw = double(rand() % SHORT_MAX) / double(SHORT_MAX);
   if(_draw < THIS.epsilon)
   {  // Explore: Choose random action
      _best_act = (rand() % THIS.actions);
   }
   else
   {  // Exploit: Choose best action
      double _best_value = Q_SA[0][e_row[0]][e_col[0]];
      for(int i = 1; i < THIS.actions; i++)
      {  if(Q_SA[i][e_row[0]][e_col[0]] > _best_value)
         {  _best_value = Q_SA[i][e_row[0]][e_col[0]];
            _best_act = i;
         }
      }
   }
//update last action
   act[1] = act[0];
   act[0] = _best_act;
//markov decision process
   e_row[1] = e_row[0];
   e_col[1] = e_col[0];
   LetMarkov(e_row[1], e_col[1], E);
   int _next_state = 0;
   for(int i = 0; i < int(markov.Cols()); i++)
   {  if(markov[_observed_state][i] > markov[_observed_state][_next_state])
      {  _next_state = i;
      }
   }
   int _next_row = 0, _next_col = 0;
   SetMarkov(_next_state, _next_row, _next_col);
   e_row[0] = _next_row;
   e_col[0] = _next_col;
   states[1] = states[0];
   states[0] = GetMarkov(_next_row, _next_col);
   td_value = Q_V[_next_row][_next_col];
   td_policies[1][0] = td_policies[0][0];
   td_policies[1][1] = td_policies[0][1];
   td_policies[1][2] = td_policies[0][2];
   td_policies[0][0] = _next_row;
   td_policies[0][1] = td_value;
   td_policies[0][2] = _next_col;
   // Ties keep action 1 when it exists; with a single action only index 0
   // is valid.
   const int _default_act = (THIS.actions > 1) ? 1 : 0;
   q_sa_act = _default_act;
   q_ppo_act = _default_act;
   for(int i = 0; i < THIS.actions; i++)
   {  if(Q_SA[i][_next_row][_next_col] > Q_SA[q_sa_act][_next_row][_next_col])
      {  q_sa_act = i;
      }
      if(Q_PPO.policy[i][_next_row][_next_col] > Q_PPO.policy[q_ppo_act][_next_row][_next_col])
      {  q_ppo_act = i;
      }
   }
//update last acts
   acts[1] = acts[0];
   acts[0] = q_ppo_act;
}
//+------------------------------------------------------------------+
// Update using Off-policy
//+------------------------------------------------------------------+
void Cql::SetOffPolicy(double Reward, vector &E)
{  // Step the state and refresh the policy tables before touching Q_SA / Q_V.
   Action(E);
   SetPolicy();
   // All reads and writes below address the latest state (history index 0).
   const int _row = e_row[0];
   const int _col = e_col[0];
   // q_sa_act is the highest-valued Q_SA action at that state, as set by Action().
   double _q_best = Q_SA[q_sa_act][_row][_col];
   double _value = Q_V[_row][_col];
   if(THIS.use_markov)
   {  // Weight both bootstrapped values by the previous -> latest transition probability.
      const int _from = GetMarkov(e_row[1], e_col[1]);
      const int _to = GetMarkov(_row, _col);
      _q_best *= markov[_from][_to];
      _value *= markov[_from][_to];
   }
   const double _q_target = Reward + (THIS.gamma * _q_best);
   const double _v_target = Reward + (THIS.gamma * _value);
   // Every action except q_sa_act is moved towards the target.
   for(int _a = 0; _a < THIS.actions; _a++)
   {  if(_a != q_sa_act)
      {  Q_SA[_a][_row][_col] += THIS.alpha * (_q_target - Q_SA[_a][_row][_col]);
      }
   }
   Q_V[_row][_col] += THIS.alpha * (_v_target - Q_V[_row][_col]);
}
//+------------------------------------------------------------------+
// Update using On-policy
//+------------------------------------------------------------------+
void Cql::SetOnPolicy(double Reward, vector &E)
{  // Step the state and refresh the policy tables before touching Q_SA / Q_V.
   Action(E);
   SetPolicy();
   // All reads and writes below address the previous state (history index 1).
   const int _row = e_row[1];
   const int _col = e_col[1];
   // q_sa_act was selected by Action() at the latest state but is read here
   // at the previous one.
   double _q_ref = Q_SA[q_sa_act][_row][_col];
   double _value = Q_V[_row][_col];
   if(THIS.use_markov)
   {  // Weight both bootstrapped values by the previous -> latest transition probability.
      const int _from = GetMarkov(_row, _col);
      const int _to = GetMarkov(e_row[0], e_col[0]);
      _q_ref *= markov[_from][_to];
      _value *= markov[_from][_to];
   }
   const double _q_target = Reward + (THIS.gamma * _q_ref);
   const double _v_target = Reward + (THIS.gamma * _value);
   // Every action except q_sa_act is moved towards the target.
   for(int _a = 0; _a < THIS.actions; _a++)
   {  if(_a != q_sa_act)
      {  Q_SA[_a][_row][_col] += THIS.alpha * (_q_target - Q_SA[_a][_row][_col]);
      }
   }
   Q_V[_row][_col] += THIS.alpha * (_v_target - Q_V[_row][_col]);
}
//+------------------------------------------------------------------+
// PPO policy update function
//+------------------------------------------------------------------+
// Recomputes Q_PPO.gradient for every action and state, then applies it to
// Q_PPO.policy scaled by THIS.alpha.
// For each action the policy table is flattened and passed through a softmax.
// The probabilities at the previous (states[1]) and latest (states[0]) state
// give the old/new pair passed to GetClipping(), together with the per-state
// advantage Q_SA - Q_V. The softmax depends only on the action, so it is
// evaluated once per action rather than once per state.
void Cql::SetPolicy()
{  if(THIS.actions <= 0 || THIS.environments <= 0)
   {  return;  // tables were never allocated
   }
   // Every policy table is environments x environments (see the constructor).
   const int _side = THIS.environments;
   // Snapshot of all policy tables, one flattened row per action, taken
   // before any entry is modified.
   matrix _policies;
   _policies.Init(THIS.actions, _side * _side);
   _policies.Fill(0.0);
   for(int i = 0; i < THIS.actions; i++)
   {  for(int ii = 0; ii < _side; ii++)
      {  for(int iii = 0; iii < _side; iii++)
         {  _policies[i][GetMarkov(ii, iii)] = Q_PPO.policy[i][ii][iii];
         }
      }
   }
   vector _probabilities;
   _probabilities.Init(_side * _side);
   _probabilities.Fill(0.0);
   for(int i = 0; i < THIS.actions; i++)
   {  _policies.Row(i).Activation(_probabilities, AF_SOFTMAX);
      const double _old = _probabilities[states[1]];
      const double _new = _probabilities[states[0]];
      for(int ii = 0; ii < _side; ii++)
      {  for(int iii = 0; iii < _side; iii++)
         {  const double _advantage = Q_SA[i][ii][iii] - Q_V[ii][iii];
            const double _clip = GetClipping(_old, _new, _advantage);
            Q_PPO.gradient[i][ii][iii] = (_new - _old) * _clip;
            // Safe to apply immediately: the softmax reads the snapshot,
            // not the live policy table.
            Q_PPO.policy[i][ii][iii] += THIS.alpha * Q_PPO.gradient[i][ii][iii];
         }
      }
   }
}
//+------------------------------------------------------------------+
// Helper function to compute the clipped PPO objective
//+------------------------------------------------------------------+
// Returns the smaller of (ratio * Advantage) and (clipped ratio * Advantage),
// where ratio = NewProbability / OldProbability and the clipped ratio is
// limited to [1 - THIS.epsilon, 1 + THIS.epsilon].
// Returns 0.0 when OldProbability is exactly zero, because the ratio is then
// undefined and the division would otherwise be a division by zero.
double Cql::GetClipping(double OldProbability, double NewProbability, double Advantage)
{  if(OldProbability == 0.0)
   {  return(0.0);
   }
   const double _ratio = NewProbability / OldProbability;
   const double _clipped_ratio = fmin(fmax(_ratio, 1 - THIS.epsilon), 1 + THIS.epsilon);
   return(fmin(_ratio * Advantage, _clipped_ratio * Advantage));
}
//+------------------------------------------------------------------+
// Indexing new Environment data to conform with states
//+------------------------------------------------------------------+
// Encodes the signs of paired E_Row / E_Col readings as state ids 0..8 in E.
// E[i] is taken from the i-th reading counted from the END of E_Row / E_Col.
// Positive, zero and negative map to bands 0, 1 and 2, and the id is
// 3 * row band + column band. E is left untouched when either input is
// shorter than E.
void Cql::Environment(vector &E_Row, vector &E_Col, vector &E)
{  if(E_Row.Size() < E.Size() || E_Col.Size() < E.Size())
   {  return;
   }
   E.Fill(0.0);
   for(int i = 0; i < int(E.Size()); i++)
   {  const double _row_value = E_Row[E_Row.Size() - 1 - i];
      const double _col_value = E_Col[E_Col.Size() - 1 - i];
      // A band of -1 means the value matched none of the comparisons (NaN).
      int _row_band = -1;
      if(_row_value > 0.0)
      {  _row_band = 0;
      }
      else if(_row_value == 0.0)
      {  _row_band = 1;
      }
      else if(_row_value < 0.0)
      {  _row_band = 2;
      }
      int _col_band = -1;
      if(_col_value > 0.0)
      {  _col_band = 0;
      }
      else if(_col_value == 0.0)
      {  _col_band = 1;
      }
      else if(_col_value < 0.0)
      {  _col_band = 2;
      }
      // Unclassifiable pairs keep the 0.0 written by Fill() above.
      if(_row_band >= 0 && _col_band >= 0)
      {  E[i] = double((3 * _row_band) + _col_band);
      }
   }
}
//+------------------------------------------------------------------+
// Normalize reward
//+------------------------------------------------------------------+
// Scales Float to a position within the [MaxLoss, MaxProfit] range.
//   Sign ==  1: requires MaxLoss < MaxProfit and MaxLoss <= Float <= MaxProfit;
//               returns (Float - MaxLoss) / (MaxProfit - MaxLoss).
//   Sign == -1: requires MaxProfit < MaxLoss and MaxProfit <= Float <= MaxLoss;
//               returns (MaxLoss - Float) / (MaxLoss - MaxProfit).
// Any other Sign, or inputs outside those ranges, yield 0.0.
double Cql::GetReward(double MaxProfit, double MaxLoss, double Float, int Sign=1)
{  if(Sign == 1)
   {  const bool _in_range = (MaxLoss < MaxProfit) && (Float <= MaxProfit) && (Float >= MaxLoss);
      if(_in_range)
      {  return((Float - MaxLoss) / (MaxProfit - MaxLoss));
      }
   }
   else if(Sign == -1)
   {  const bool _in_range = (MaxLoss > MaxProfit) && (Float >= MaxProfit) && (Float <= MaxLoss);
      if(_in_range)
      {  return((MaxLoss - Float) / (MaxLoss - MaxProfit));
      }
   }
   return(0.0);
}
//+------------------------------------------------------------------+
// Set Target for DQN
//+------------------------------------------------------------------+
// Returns Rewards + THIS.gamma * TargetOutput, evaluated element-wise.
vector Cql::SetTarget(vector &Rewards, vector &TargetOutput)
{  vector _discounted = THIS.gamma * TargetOutput;
   return(Rewards + _discounted);
}
//+------------------------------------------------------------------+
