//+------------------------------------------------------------------+
//|                                                          Cql.mqh |
//|                                  Copyright 2024, MetaQuotes Ltd. |
//|                                             https://www.mql5.com |
//+------------------------------------------------------------------+
#property copyright "Copyright 2024, MetaQuotes Ltd."
#property link      "https://www.mql5.com"
#property version   "1.00"
//+------------------------------------------------------------------+
//| Constructor Parameters                                           |
//+------------------------------------------------------------------+
// Parameter bundle handed to the Cql constructor.
struct Sql
{  int               actions;        // number of discrete actions (one Q-map per action)
   int               environments;   // side length of the square state grid
   double            alpha;          // step size applied to each value update
   double            gamma;          // discount applied to the follow-up value
   double            epsilon;        // threshold compared against the random draw when exploring
   int               episodes;       // episode count (not read anywhere in this file)
   bool              use_markov;     // when true, follow-up values are weighted by markov probabilities

   // Defaults: no actions/states configured, alpha 0.1, gamma 0.5,
   // no exploration, 250 episodes, markov weighting off.
                     Sql() : actions(0),
                             environments(0),
                             alpha(0.1),
                             gamma(0.5),
                             epsilon(0.0),
                             episodes(250),
                             use_markov(false)
   {
   };
                    ~Sql() {};

};
//+------------------------------------------------------------------+
//| Q-Learning Class Interface.                                      |
//+------------------------------------------------------------------+
// Tabular Q-learning helper. Keeps one state-action value map per action
// (Q_SA), a state value map (Q_V) and a state-transition probability
// matrix (markov) that is rebuilt from the observed state series.
class Cql
{
protected:
   // Transition probabilities; sized (environments^2) x (environments^2).
   matrix            markov;
   // Rebuilds 'markov' from consecutive state pairs found in E.
   void              LetMarkov(int OldRow, int OldCol, vector &E);

   // Action labels 1..actions.
   vector            acts;
   // Grid of cell labels 1..environments^2, laid out row by row.
   matrix            environment;
   // One environments x environments value map per action: Q_SA[action][row][col].
   matrix            Q_SA[];
   // State value map: Q_V[row][col].
   matrix            Q_V;

public:
   // Picks the current action, shifts the state history and refreshes the
   // projected transition (transition_value / transition_act).
   void              Action(vector &E);
   // Encodes paired row/column readings into state indices 0..8 written to E.
   void              Environment(vector &E_Row, vector &E_Col, vector &E);

   // Value updates anchored on the newest cell (off-policy) or the
   // previous cell (on-policy).
   void              SetOffPolicy(double Reward, vector &E);
   void              SetOnPolicy(double Reward, vector &E);

   // Min-max normalisation of Float between MaxLoss and MaxProfit.
   double            GetReward(double MaxProfit, double MaxLoss, double Float);
   // Rewards + gamma * TargetOutput.
   vector            SetTarget(vector &Rewards, vector &TargetOutput);

   // Conversions between a flat markov index and a (row, col) cell.
   void              SetMarkov(int Index, int &Row, int &Col);
   int               GetMarkov(int Row, int Col);


   // Copy of the parameters accepted by the constructor.
   Sql               THIS;

   // Action history: [0] newest, [1] previous.
   int               act[2];

   // Cell history: [0] newest, [1] previous.
   int               e_row[2];
   int               e_col[2];

   // Best-valued action at the projected next cell, and that cell's Q_V value.
   int               transition_act;
   double            transition_value;

   // Two most recent (next row, transition value, next col) records; row 0 is newest.
   matrix            policy_history;

   // Returns a vector of length THIS.actions holding, for every action,
   // its Q_SA value at the newest cell (e_row[0], e_col[0]).
   vector            Q_Loss()
   {  vector _loss;
      _loss.Init(THIS.actions);
      _loss.Fill(0.0);
      for(int i = 0; i < THIS.actions; i++)
      {  // Q_SA is laid out as [action][row][col]; the action index comes first.
         _loss[i] = Q_SA[i][e_row[0]][e_col[0]];
      }
      return(_loss);
   }


   // Builds all maps from RL. Containers are only allocated when both
   // RL.actions and RL.environments are positive; otherwise THIS keeps the
   // Sql defaults. Scalar and history state is always initialised.
   void              Cql(Sql &RL)
   {  // Always start from a defined state, even when RL is rejected below.
      ArrayFill(e_row, 0, 2, 0);
      ArrayFill(e_col, 0, 2, 0);
      ArrayFill(act, 0, 2, 1);
      transition_act = 1;
      transition_value = 0.0;
      if(RL.actions > 0 && RL.environments > 0)
      {  policy_history.Init(2,2+1);
         policy_history.Fill(0.0);
         acts.Init(RL.actions);
         ArrayResize(Q_SA, RL.actions);
         for(int i = 0; i < RL.actions; i++)
         {  acts[i] = i + 1;
            Q_SA[i].Init(RL.environments, RL.environments);
            // Init() alone does not define the contents; start every Q-value at zero.
            Q_SA[i].Fill(0.0);
         }
         Q_V.Init(RL.environments, RL.environments);
         Q_V.Fill(0.0);
         environment.Init(RL.environments, RL.environments);
         for(int i = 0; i < RL.environments; i++)
         {  for(int ii = 0; ii < RL.environments; ii++)
            {  environment[i][ii] = ii + (i * RL.environments) + 1;
            }
         }
         markov.Init(RL.environments * RL.environments, RL.environments * RL.environments);
         markov.Fill(0.0);
         THIS = RL;
      }
   };
   void              ~Cql(void) {};
};
//+------------------------------------------------------------------+
// Setting environment row & col from markov index
//+------------------------------------------------------------------+
void Cql::SetMarkov(int Index, int &Row, int &Col)
{  // Inverse of GetMarkov(): Index = Row + environments * Col.
   const int _side = THIS.environments;
   Col = Index / _side;   // integer quotient
   Row = Index % _side;   // remainder within one grid side
}
//+------------------------------------------------------------------+
// Getting markov index from environment row & col
//+------------------------------------------------------------------+
int Cql::GetMarkov(int Row, int Col)
{  // Flat index: columns advance in strides of one grid side.
   const int _stride = THIS.environments;
   return((Col * _stride) + Row);
}
//+------------------------------------------------------------------+
// Function to update markov matrix
//+------------------------------------------------------------------+
void Cql::LetMarkov(int OldRow, int OldCol, vector &E)
{  // OldRow / OldCol are accepted but not used; the matrix is rebuilt from E alone.
   const int _row_total = int(markov.Rows());
   const int _col_total = int(markov.Cols());
// Tally of E[k] -> E[k + 1] pairs, plus how often each state was a source.
   matrix _pair_counts;
   _pair_counts.Init(markov.Rows(), markov.Cols());
   _pair_counts.Fill(0.0);
   vector _source_counts;
   _source_counts.Init(markov.Rows());
   _source_counts.Fill(0.0);
   const int _pair_total = int(E.Size()) - 1;
   for(int k = 0; k < _pair_total; k++)
   {  int _from = int(E[k]);
      int _to = int(E[k + 1]);
      _pair_counts[_from][_to] += 1.0;
      _source_counts[_from] += 1.0;
   }
// Start from an all-zero matrix; rows for states never seen as a source stay zero.
   markov.Fill(0.0);
   for(int r = 0; r < _row_total; r++)
   {  if(_source_counts[r] > 0)
      {  // Normalise the row so each entry is a relative frequency.
         for(int c = 0; c < _col_total; c++)
         {  markov[r][c] = double(_pair_counts[r][c] / _source_counts[r]);
         }
      }
   }
}
//+------------------------------------------------------------------+
// Choose an action using epsilon-greedy approach
//+------------------------------------------------------------------+
// Selects the current action (epsilon-greedy), shifts the action and cell
// histories, rebuilds the markov matrix from E and projects the most
// probable next cell.
//   E - series of state indices; its last element is decoded as the newest
//       cell, and E[0] selects the markov row used for the projection.
// Writes: act, e_row, e_col, markov, transition_value, transition_act,
// policy_history. Returns without changes when there are no actions or E is empty.
void Cql::Action(vector &E)
{  // Without actions or observations there is nothing to index safely.
   if(THIS.actions <= 0 || E.Size() == 0)
   {  return;
   }
   int _best_act = 0;
// Uniform draw in [0, 1). The conversion to double has to happen before the
// division; dividing the two integers first always yields 0, which made any
// positive epsilon explore on every call.
   double _draw = double(rand() % SHORT_MAX) / double(SHORT_MAX);
   if(_draw < THIS.epsilon)
   {  // Explore: choose a random action
      _best_act = (rand() % THIS.actions);
   }
   else
   {  // Exploit: choose the highest valued action at the newest known cell
      double _best_value = Q_SA[0][e_row[0]][e_col[0]];
      for(int i = 1; i < THIS.actions; i++)
      {  if(Q_SA[i][e_row[0]][e_col[0]] > _best_value)
         {  _best_value = Q_SA[i][e_row[0]][e_col[0]];
            _best_act = i;
         }
      }
   }
// Shift the action history
   act[1] = act[0];
   act[0] = _best_act;
// Decode the latest observation into a cell and shift the cell history
   int _e_row_new = 0, _e_col_new = 0;
   SetMarkov(int(E[E.Size() - 1]), _e_row_new, _e_col_new);
   e_row[1] = e_row[0];
   e_col[1] = e_col[0];
   e_row[0] = _e_row_new;
   e_col[0] = _e_col_new;
   LetMarkov(e_row[1], e_col[1], E);
// Most probable successor of state E[0]; ties keep the lowest index.
// NOTE(review): the newest cell above is taken from the last element of E
// while this lookup uses E[0] - confirm which end of E is the latest state.
   const int _from_state = int(E[0]);
   int _next_state = 0;
   for(int i = 0; i < int(markov.Cols()); i++)
   {  if(markov[_from_state][i] > markov[_from_state][_next_state])
      {  _next_state = i;
      }
   }
   int _next_row = 0, _next_col = 0;
   SetMarkov(_next_state, _next_row, _next_col);
   transition_value = Q_V[_next_row][_next_col];
// Keep the previous projection in row 1 and store the new one in row 0
   policy_history[1][0] = policy_history[0][0];
   policy_history[1][1] = policy_history[0][1];
   policy_history[1][2] = policy_history[0][2];
   policy_history[0][0] = _next_row;
   policy_history[0][1] = transition_value;
   policy_history[0][2] = _next_col;
// Best action at the projected cell. Action 1 wins ties as before; fall back
// to action 0 when it is the only one so Q_SA[1] is never read out of range.
   transition_act = (THIS.actions > 1) ? 1 : 0;
   for(int i = 0; i < THIS.actions; i++)
   {  if(Q_SA[i][_next_row][_next_col] > Q_SA[transition_act][_next_row][_next_col])
      {  transition_act = i;
      }
   }
}
//+------------------------------------------------------------------+
// Update Q-values (Q_SA and Q_V) using Off-policy
//+------------------------------------------------------------------+
void Cql::SetOffPolicy(double Reward, vector &E)
{  Action(E);
// All reads and writes below are anchored on the newest cell, which
// Action() has just stored at history index 0.
   const int _r = e_row[0];
   const int _c = e_col[0];
   double _follow_sa = Q_SA[transition_act][_r][_c];
   double _follow_v = Q_V[_r][_c];
   if(THIS.use_markov)
   {  // Weight both follow-up values by the previous -> newest transition probability.
      const int _from = GetMarkov(e_row[1], e_col[1]);
      const int _to = GetMarkov(_r, _c);
      _follow_sa *= markov[_from][_to];
      _follow_v *= markov[_from][_to];
   }
   const double _target_sa = Reward + (THIS.gamma * _follow_sa);
   const double _target_v = Reward + (THIS.gamma * _follow_v);
// NOTE(review): the action just selected (act[0]) is the one left untouched,
// and the error term uses the transition_act entry rather than entry 'a' -
// confirm this is intended.
   for(int a = 0; a < THIS.actions; a++)
   {  if(a != act[0])
      {  // The subtracted entry is re-read every pass because it may itself be updated.
         Q_SA[a][_r][_c] += THIS.alpha * (_target_sa - Q_SA[transition_act][_r][_c]);
      }
   }
   Q_V[_r][_c] += THIS.alpha * (_target_v - Q_V[_r][_c]);
}
//+------------------------------------------------------------------+
// Update Q-values (Q_SA and Q_V) using On-policy
//+------------------------------------------------------------------+
void Cql::SetOnPolicy(double Reward, vector &E)
{  Action(E);
// All reads and writes below are anchored on the previous cell, which
// Action() has just shifted to history index 1.
   const int _r = e_row[1];
   const int _c = e_col[1];
   double _follow_sa = Q_SA[transition_act][_r][_c];
   double _follow_v = Q_V[_r][_c];
   if(THIS.use_markov)
   {  // Weight both follow-up values by the previous -> newest transition probability.
      const int _from = GetMarkov(_r, _c);
      const int _to = GetMarkov(e_row[0], e_col[0]);
      _follow_sa *= markov[_from][_to];
      _follow_v *= markov[_from][_to];
   }
   const double _target_sa = Reward + (THIS.gamma * _follow_sa);
   const double _target_v = Reward + (THIS.gamma * _follow_v);
// NOTE(review): the action just selected (act[0]) is the one left untouched,
// and the error term uses the transition_act entry rather than entry 'a' -
// confirm this is intended.
   for(int a = 0; a < THIS.actions; a++)
   {  if(a != act[0])
      {  // The subtracted entry is re-read every pass because it may itself be updated.
         Q_SA[a][_r][_c] += THIS.alpha * (_target_sa - Q_SA[transition_act][_r][_c]);
      }
   }
   Q_V[_r][_c] += THIS.alpha * (_target_v - Q_V[_r][_c]);
}
//+------------------------------------------------------------------+
// Indexing new Environment data to conform with states
//+------------------------------------------------------------------+
void Cql::Environment(vector &E_Row, vector &E_Col, vector &E)
{  // Both inputs must supply at least as many readings as E has slots;
// otherwise E is left untouched.
   if(E_Row.Size() < E.Size() || E_Col.Size() < E.Size())
   {  return;
   }
   E.Fill(0.0);
   for(int k = 0; k < int(E.Size()); k++)
   {  // Slot k pairs the k-th readings counted back from the end of each input.
      const double _row_reading = E_Row[E_Row.Size() - 1 - k];
      const double _col_reading = E_Col[E_Col.Size() - 1 - k];
      // Sign band: 0 = positive, 1 = zero, 2 = negative, -1 = none of these.
      const int _row_band = (_row_reading > 0.0) ? 0 : ((_row_reading == 0.0) ? 1 : ((_row_reading < 0.0) ? 2 : -1));
      const int _col_band = (_col_reading > 0.0) ? 0 : ((_col_reading == 0.0) ? 1 : ((_col_reading < 0.0) ? 2 : -1));
      if(_row_band < 0 || _col_band < 0)
      {  continue;   // unclassifiable reading: the slot keeps the 0.0 fill value
      }
      // State index 0..8: three column bands inside each row band.
      E[k] = double((3 * _row_band) + _col_band);
   }
}
//+------------------------------------------------------------------+
// Normalize reward
//+------------------------------------------------------------------+
double Cql::GetReward(double MaxProfit, double MaxLoss, double Float)
{  // Only a non-empty [MaxLoss, MaxProfit] range containing Float is scored;
// anything else yields a zero reward.
   const bool _range_ok = (MaxLoss < MaxProfit);
   const bool _inside = (Float >= MaxLoss && Float <= MaxProfit);
   if(!_range_ok || !_inside)
   {  return(0.0);
   }
// Position of Float within the range, scaled to 0..1.
   return((Float - MaxLoss) / (MaxProfit - MaxLoss));
}
//+------------------------------------------------------------------+
// Set Target for DQN
//+------------------------------------------------------------------+
vector Cql::SetTarget(vector &Rewards, vector &TargetOutput)
{  // Element-wise: reward plus the gamma-discounted target output.
   vector _discounted = THIS.gamma * TargetOutput;
   return(Rewards + _discounted);
}
//+------------------------------------------------------------------+
