//+------------------------------------------------------------------+
//|                                                    SignalPPO.mqh |
//|                   Copyright 2009-2017, MetaQuotes Software Corp. |
//|                                              http://www.mql5.com |
//+------------------------------------------------------------------+
#include <Expert\ExpertSignal.mqh>
#include <My\Cql.mqh>
//+------------------------------------------------------------------+
// wizard description start
//+------------------------------------------------------------------+
//| Description of the class                                         |
//| Title=Signals based on Reinforcement-Learning with Proximal Policy Optimization.|
//| Type=SignalAdvanced                                              |
//| Name=Reinforcement-Learning with Proximal Policy Optimization    |
//| ShortName=PPO                                                    |
//| Class=CSignalPPO                                                 |
//| Page=signal_proximal_policy_optimization                         |
//| Parameter=RL_Scale,int,5, Matrix Row-to-Col Scale Ratio          |
//| Parameter=RL_Markov,bool,true, Use Markov                        |
//| Parameter=RL_Epsilon,double,0.15, Exploration/ Exploitation Ratio|
//| Parameter=RL_Alpha,double,0.25, Learning Rate                    |
//| Parameter=RL_Policy,bool,true, Use Policy                        |
//+------------------------------------------------------------------+
// wizard description end
//+------------------------------------------------------------------+
//| Class CSignalPPO.                                                |
//| Purpose: Proximal Policy Optimization for Reinforcement-Learning.|
//|            Derives from class CExpertSignal.                     |
//+------------------------------------------------------------------+
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
class CSignalPPO   : public CExpertSignal
{
protected:

   int                           m_actions;           // Number of possible actions (buy, sell, do nothing)
   int                           m_environments;      // Environments, per matrix axis
   int                           m_scale;             // Number of bars copied per input vector (wizard: row-to-col scale)
   bool                          m_use_markov;        // Use Markov
   double                        m_alpha;             // Alpha (wizard: learning rate)
   double                        m_epsilon;           // Epsilon (wizard: exploration/exploitation ratio)
   bool                          m_policy;            // true = on-policy update, false = off-policy update
   
public:
   void                          CSignalPPO(void);
   void                          ~CSignalPPO(void);

   //--- methods of setting adjustable parameters
   //--- NOTE: values set here reach the learners when ValidationSettings() runs,
   //---       because that is where RL_BUY / RL_SELL are rebuilt from the settings.
   void                          RL_Scale(int value)
   {  m_scale = value;
   }
   void                          RL_Markov(bool value)
   {  m_use_markov = value;
   }
   //--- takes a double: a bool parameter would collapse any non-zero rate to 1.0
   void                          RL_Alpha(double value)
   {  m_alpha = value;
   }
   //--- takes a double: a bool parameter would collapse any non-zero ratio to 1.0
   void                          RL_Epsilon(double value)
   {  m_epsilon = value;
   }
   void                          RL_Policy(bool value)
   {  m_policy = value;
   }

   //--- method of verification of settings
   virtual bool      ValidationSettings(void);
   //--- method of creating the indicator and timeseries
   virtual bool      InitIndicators(CIndicators *indicators);
   //--- methods of checking if the market models are formed
   virtual int       LongCondition(void);
   virtual int       ShortCondition(void);

protected:
   void              GetOutput(Cql *QL, int RewardSign);
   Sql               RL;                  // settings handed to each Cql learner on construction
   Cql               *RL_BUY, *RL_SELL;   // separate learners for the long and the short vote
};
//+------------------------------------------------------------------+
//| Constructor                                                      |
//| Sets defaults for every member and builds both learners from     |
//| those defaults, so the object is usable even if                  |
//| ValidationSettings() is never called.                            |
//+------------------------------------------------------------------+
void CSignalPPO::CSignalPPO(void) :    m_actions(3),
   m_environments(3),
   m_scale(5),
   m_use_markov(true),
   m_alpha(0.25),
   m_epsilon(0.15),
   m_policy(true)

{
//--- initialization of protected data
   m_used_series = USE_SERIES_OPEN + USE_SERIES_HIGH + USE_SERIES_LOW + USE_SERIES_CLOSE + USE_SERIES_SPREAD + USE_SERIES_TIME;
   //
   RL.actions  = m_actions;//buy, sell, do nothing
   RL.environments = m_environments;//bullish, bearish, flat
   RL.use_markov = m_use_markov;
   RL.alpha = m_alpha;
   RL.epsilon = m_epsilon;
   RL_BUY = new Cql(RL);
   RL_SELL = new Cql(RL);
   //
}
//+------------------------------------------------------------------+
//| Destructor                                                       |
//| Releases both learners; the pointer check avoids deleting a      |
//| pointer that was never successfully allocated.                   |
//+------------------------------------------------------------------+
void CSignalPPO::~CSignalPPO(void)
{  if(CheckPointer(RL_BUY) == POINTER_DYNAMIC)
      delete RL_BUY;
   if(CheckPointer(RL_SELL) == POINTER_DYNAMIC)
      delete RL_SELL;
}
//+------------------------------------------------------------------+
//| Validation settings protected data.                              |
//| Checks the adjustable parameters and rebuilds both learners so   |
//| that values supplied through the RL_* setters take effect.       |
//| Returns false if the base validation fails, RL_Scale is below 1, |
//| or a learner cannot be created.                                  |
//| NOTE: rebuilding discards whatever the previous learners held,   |
//|       so this is meant to run once, before the first signal.     |
//+------------------------------------------------------------------+
bool CSignalPPO::ValidationSettings(void)
{  if(!CExpertSignal::ValidationSettings())
      return(false);
//--- initial data checks
//--- GetOutput() sizes its vectors with m_scale and reads index m_scale-1
   if(m_scale < 1)
   {  printf(__FUNCTION__ + ": RL_Scale must be at least 1");
      return(false);
   }
//--- the constructor captured only the defaults; refresh the settings
   RL.actions  = m_actions;
   RL.environments = m_environments;
   RL.use_markov = m_use_markov;
   RL.alpha = m_alpha;
   RL.epsilon = m_epsilon;
//--- recreate the learners from the refreshed settings
   if(CheckPointer(RL_BUY) == POINTER_DYNAMIC)
      delete RL_BUY;
   if(CheckPointer(RL_SELL) == POINTER_DYNAMIC)
      delete RL_SELL;
   RL_BUY = new Cql(RL);
   RL_SELL = new Cql(RL);
   if(CheckPointer(RL_BUY) == POINTER_INVALID || CheckPointer(RL_SELL) == POINTER_INVALID)
   {  printf(__FUNCTION__ + ": failed to create reinforcement-learning objects");
      return(false);
   }
//--- ok
   return(true);
}
//+------------------------------------------------------------------+
//| Create indicators.                                               |
//+------------------------------------------------------------------+
bool CSignalPPO::InitIndicators(CIndicators *indicators)
{
//--- succeed only when a collection was supplied and the base class
//--- initializes its own indicators and timeseries on it; the NULL
//--- test comes first so the base call is skipped for a missing pointer
   const bool ready = (indicators != NULL) && CExpertSignal::InitIndicators(indicators);
   return(ready);
}
//+------------------------------------------------------------------+
//| "Voting" that price will grow.                                   |
//+------------------------------------------------------------------+
int CSignalPPO::LongCondition(void)
{
//--- run one learning step on the buy learner with a positive reward sign
   GetOutput(RL_BUY, 1);
//--- full-strength vote (100) when the learner's chosen action is 0, otherwise no vote
   return((RL_BUY.q_ppo_act == 0) ? 100 : 0);
}
//+------------------------------------------------------------------+
//| "Voting" that price will fall.                                   |
//+------------------------------------------------------------------+
int CSignalPPO::ShortCondition(void)
{
//--- run one learning step on the sell learner with a negative reward sign
   GetOutput(RL_SELL, -1);
//--- full-strength vote (100) when the learner's chosen action is 2, otherwise no vote
   return((RL_SELL.q_ppo_act == 2) ? 100 : 0);
}
//+------------------------------------------------------------------+
//| This function calculates the next actions to be selected from    |
//| the Reinforcement Learning Cycle where:                          |
//| INPUT PARAMETERS:                                                |
//|   QL          -  Reinforcement Learning Class for the            | 
//|                  trade actions.                                  |
//|   RewardSign  -  Signed integer -1 or +1 to help properly        | 
//|                  reward outputs of the learning cycles           |
//+------------------------------------------------------------------+
void CSignalPPO::GetOutput(Cql *QL, int RewardSign)
{
//--- rates mask handed to vector::CopyRates; presumably COPY_RATES_CLOSE - confirm
   const int price_mask = 8;
   vector row_new, row_old, col_new, col_old;
//--- load the four price windows; leave without touching QL if any of them
//--- cannot be sized, filled, or comes back with an unexpected length
   if(!(row_new.Init(m_scale) &&
         row_new.CopyRates(m_symbol.Name(), m_period, price_mask, 0, m_scale) &&
         row_new.Size() == m_scale))
      return;
   if(!(row_old.Init(m_scale) &&
         row_old.CopyRates(m_symbol.Name(), m_period, price_mask, 1, m_scale) &&
         row_old.Size() == m_scale))
      return;
   if(!(col_new.Init(m_scale) &&
         col_new.CopyRates(m_symbol.Name(), m_period, price_mask, 0, m_scale) &&
         col_new.Size() == m_scale))
      return;
   if(!(col_old.Init(m_scale) &&
         col_old.CopyRates(m_symbol.Name(), m_period, price_mask, m_scale, m_scale) &&
         col_old.Size() == m_scale))
      return;
//--- turn prices into changes: rows against a 1-bar shift,
//--- columns against an m_scale-bar shift
   row_new -= row_old;
   col_new -= col_old;
//--- let the learner map both change series onto environment states
   vector env;
   env.Init(m_scale);
   QL.Environment(row_new, col_new, env);
//--- register the last environment state with the learner
   int markov_row = 0, markov_col = 0;
   QL.SetMarkov(int(env[m_scale - 1]), markov_row, markov_col);
//--- signed reward inputs: last change plus the extremes of the window
   double reward_last = RewardSign * row_new[m_scale - 1];
   double reward_high = RewardSign * row_new.Max();
   double reward_low = RewardSign * row_new.Min();
   double reward = QL.GetReward(reward_high, reward_low, reward_last, RewardSign);
//--- apply the update rule selected by the RL_Policy parameter
   if(m_policy)
      QL.SetOnPolicy(reward, env);
   else
      QL.SetOffPolicy(reward, env);
}
//+------------------------------------------------------------------+
