//+------------------------------------------------------------------+
//|                                                     PipeLine.mq5 |
//|                                  Copyright 2025, MetaQuotes Ltd. |
//|                                             https://www.mql5.com |
//+------------------------------------------------------------------+
#property copyright "Copyright 2025, MetaQuotes Ltd."
#property link      "https://www.mql5.com"
#property version   "1.00"

//+------------------------------------------------------------------+
//| Preprocessing Pipeline for MQL5                                  |
//+------------------------------------------------------------------+

// Constants
// Small positive fallback the scalers use when a column's spread
// (standard deviation, IQR or max-min range) is exactly zero, so that
// scaling never divides by zero.
const double EPSILON = 1e-8;

// Enum for preprocessing step types.
// One value per preprocessor class defined in this file; the pipeline
// records it in SPreprocessorStep::type when a step is added.
enum ENUM_PREPROCESSOR_TYPE
{  PREPROCESSOR_IMPUTE_MEDIAN,    // CImputeMedian: fill invalid cells of one column with its median
   PREPROCESSOR_IMPUTE_MODE,      // CImputeMode: fill invalid cells of one column with its quantile-based mode
   PREPROCESSOR_STANDARD_SCALER,  // CStandardScaler: (x - mean) / std, per column
   PREPROCESSOR_ROBUST_SCALER,    // CRobustScaler: (x - median) / IQR, per column
   PREPROCESSOR_MINMAX_SCALER,    // CMinMaxScaler: rescale each column to [new_min, new_max]
   PREPROCESSOR_ONEHOT_ENCODER    // COneHotEncoder: replace one column with indicator columns
};

// Structure to store preprocessing step.
// Descriptive record kept by the pipeline alongside each preprocessor object.
struct SPreprocessorStep
{  ENUM_PREPROCESSOR_TYPE type;   // Which kind of preprocessor this step is
   int                    column; // For column-specific preprocessors; the pipeline stores -1 for whole-matrix scalers
};

// Base interface for preprocessors.
// Every method returns true on success and false on failure.
interface IPreprocessor
{  bool Fit(matrix &data);                        // Learn parameters from data
   bool Transform(matrix &data, matrix &out);     // Apply learned parameters to data, writing the result to out
   bool FitTransform(matrix &data, matrix &out);  // Fit on data, then transform the same data into out
};

// Static helpers for searching, sorting and summarising vectors
class CVectorUtils
{
public:
   //+------------------------------------------------------------------+
   //| Binary search for Key in V.                                      |
   //| The halving logic assumes V is sorted in ascending order and     |
   //| matches with exact floating-point equality.                      |
   //| Returns the index of a matching element, or -1 if none is found. |
   //+------------------------------------------------------------------+
   static int BinarySearch(vector &V, double Key)
   {
      int lo = 0;
      int hi = int(V.Size()) - 1;
      while(lo <= hi)
      {
         int probe = (lo + hi) / 2;
         double current = V[probe];
         if(current == Key)
            return probe;
         if(current < Key)
            lo = probe + 1;
         else
            hi = probe - 1;
      }
      return -1; // Not found
   }

   //+------------------------------------------------------------------+
   //| Quantile-binned mode estimate.                                   |
   //| Splits the data into (int)(1/quantile) bins whose edges are the  |
   //| data's own quantiles, counts samples per bin and returns the     |
   //| midpoint of the first bin holding the highest count.             |
   //| Returns EMPTY_VALUE when data is empty, contains NaN, or         |
   //| quantile is outside (0, 1].                                      |
   //| Side effect: data is sorted in place.                            |
   //+------------------------------------------------------------------+
   static double QuantileMode(vector &data, double quantile)
   {
      // Reject unusable input up front
      if(data.Size() == 0)
         return EMPTY_VALUE;
      if(quantile <= 0.0 || quantile > 1.0)
         return EMPTY_VALUE;
      if(data.HasNan())
         return EMPTY_VALUE;

      // Order the samples before computing quantile edges
      Sort(data);

      // Bin count is the truncated reciprocal of the quantile width, at least 1
      int bin_total = (int)(1.0 / quantile);
      if(bin_total < 1)
         bin_total = 1;

      // edges[e] is the e/bin_total quantile of the data
      vector edges(bin_total + 1);
      for(int e = 0; e <= bin_total; e++)
         edges[e] = data.Quantile((double)e / bin_total);

      // Tally each sample into the first bin [edges[b], edges[b+1]) that
      // contains it; the last bin is closed on the right
      vector counts(bin_total);
      counts.Fill(0.0);
      int last_bin = bin_total - 1;
      for(ulong n = 0; n < data.Size(); n++)
      {
         double sample = data[n];
         for(int b = 0; b < bin_total; b++)
         {
            if(!(sample >= edges[b]))
               continue;
            if(b != last_bin && !(sample < edges[b + 1]))
               continue;
            counts[b]++;
            break;
         }
      }

      // Pick the earliest bin with the largest tally
      int winner = -1;
      double winner_count = -1.0;
      for(int b = 0; b < bin_total; b++)
      {
         if(counts[b] > winner_count)
         {
            winner = b;
            winner_count = counts[b];
         }
      }
      if(winner < 0)
         return EMPTY_VALUE;

      // Midpoint of the winning bin
      return (edges[winner] + edges[winner + 1]) / 2.0;
   }

   //+------------------------------------------------------------------+
   //| Sorts data in place, in ascending order.                         |
   //| Does nothing when data is empty or contains NaN.                 |
   //+------------------------------------------------------------------+
   static void Sort(vector &data)
   {
      if(data.Size() == 0 || data.HasNan())
         return;
      // Move the contents into a plain array, sort it with ArraySort,
      // then move the sorted contents back into the vector
      double buffer[];
      data.Swap(buffer);
      ArraySort(buffer);
      data.Swap(buffer);
   }
};

// Standard Scaler
// Rescales every column to (x - mean) / std using statistics learned in Fit.
class CStandardScaler : public IPreprocessor
{
private:
   double m_means[];     // Per-column mean learned by Fit
   double m_stds[];      // Per-column standard deviation learned by Fit (EPSILON when zero)
   bool   m_is_fitted;   // True once Fit has completed successfully

public:
   CStandardScaler() : m_is_fitted(false) {}

   // Learns the mean and standard deviation of each column of data.
   // Returns false (and leaves the scaler unfitted) when data has no rows
   // or columns, or when the parameter arrays cannot be allocated.
   // NOTE(review): invalid cells are not filtered here, so they flow into
   // Mean()/Std(); presumably callers impute before scaling - confirm.
   bool Fit(matrix &data)
   {
      m_is_fitted = false;
      int rows = int(data.Rows());
      int cols = int(data.Cols());
      if(rows == 0 || cols == 0)
         return false;
      if(ArrayResize(m_means, cols) != cols || ArrayResize(m_stds, cols) != cols)
         return false;
      for(int j = 0; j < cols; j++)
      {
         vector column(rows);
         for(int i = 0; i < rows; i++)
            column[i] = data[i][j];
         m_means[j] = column.Mean();
         m_stds[j] = column.Std();
         // A constant column has zero spread; substitute EPSILON so
         // Transform never divides by zero
         if(m_stds[j] == 0.0)
            m_stds[j] = EPSILON;
      }
      m_is_fitted = true;
      return true;
   }

   // Writes the standardized copy of data into out.
   // Cells that are not valid numbers are written as DBL_MIN.
   // Returns false when the scaler is not fitted or when data does not
   // have the same number of columns as the matrix used in Fit.
   bool Transform(matrix &data, matrix &out)
   {
      if(!m_is_fitted)
         return false;
      int rows = int(data.Rows());
      int cols = int(data.Cols());
      // Guard against indexing m_means/m_stds out of range
      if(cols != ArraySize(m_means))
         return false;
      out.Init(rows, cols);
      for(int j = 0; j < cols; j++)
      {
         double mean = m_means[j];
         double std_dev = m_stds[j];
         for(int i = 0; i < rows; i++)
         {
            double value = data[i][j];
            out[i][j] = (!MathIsValidNumber(value) ? DBL_MIN : (value - mean) / std_dev);
         }
      }
      return true;
   }

   // Fits on data and then transforms the same data into out.
   bool FitTransform(matrix &data, matrix &out)
   {
      if(!Fit(data))
         return false;
      return Transform(data, out);
   }
};

// Robust Scaler
// Rescales every column to (x - median) / IQR, where IQR is the difference
// between the 0.75 and 0.25 quantiles learned in Fit.
class CRobustScaler : public IPreprocessor
{
private:
   double m_medians[];   // Per-column median learned by Fit
   double m_iqrs[];      // Per-column interquartile range learned by Fit (EPSILON when zero)
   bool   m_is_fitted;   // True once Fit has completed successfully

public:
   CRobustScaler() : m_is_fitted(false) {}

   // Learns the median and interquartile range of each column of data.
   // Returns false (and leaves the scaler unfitted) when data has no rows
   // or columns, or when the parameter arrays cannot be allocated.
   // NOTE(review): invalid cells are not filtered here, so they flow into
   // Median()/Quantile(); presumably callers impute before scaling - confirm.
   bool Fit(matrix &data)
   {
      m_is_fitted = false;
      int rows = int(data.Rows());
      int cols = int(data.Cols());
      if(rows == 0 || cols == 0)
         return false;
      if(ArrayResize(m_medians, cols) != cols || ArrayResize(m_iqrs, cols) != cols)
         return false;
      for(int j = 0; j < cols; j++)
      {
         vector column(rows);
         for(int i = 0; i < rows; i++)
            column[i] = data[i][j];
         m_medians[j] = column.Median();
         double q25 = column.Quantile(0.25);
         double q75 = column.Quantile(0.75);
         m_iqrs[j] = q75 - q25;
         // Zero IQR would make Transform divide by zero; fall back to EPSILON
         if(m_iqrs[j] == 0.0)
            m_iqrs[j] = EPSILON;
      }
      m_is_fitted = true;
      return true;
   }

   // Writes the robust-scaled copy of data into out.
   // Cells that are not valid numbers are written as DBL_MIN.
   // Returns false when the scaler is not fitted or when data does not
   // have the same number of columns as the matrix used in Fit.
   bool Transform(matrix &data, matrix &out)
   {
      if(!m_is_fitted)
         return false;
      int rows = int(data.Rows());
      int cols = int(data.Cols());
      // Guard against indexing m_medians/m_iqrs out of range
      if(cols != ArraySize(m_medians))
         return false;
      out.Init(rows, cols);
      for(int j = 0; j < cols; j++)
      {
         double median = m_medians[j];
         double iqr = m_iqrs[j];
         for(int i = 0; i < rows; i++)
         {
            double value = data[i][j];
            out[i][j] = (!MathIsValidNumber(value) ? DBL_MIN : (value - median) / iqr);
         }
      }
      return true;
   }

   // Fits on data and then transforms the same data into out.
   bool FitTransform(matrix &data, matrix &out)
   {
      if(!Fit(data))
         return false;
      return Transform(data, out);
   }
};

// MinMax Scaler
// Linearly maps each column from its fitted [min, max] to [new_min, new_max].
class CMinMaxScaler : public IPreprocessor
{
private:
   double m_mins[];      // Per-column minimum learned by Fit
   double m_maxs[];      // Per-column maximum learned by Fit
   double m_ranges[];    // Per-column max - min, replaced by EPSILON when zero
   double m_new_min;     // Lower bound of the target range
   double m_new_max;     // Upper bound of the target range
   bool   m_is_fitted;   // True once Fit has completed successfully

public:
   CMinMaxScaler(double new_min = 0.0, double new_max = 1.0) : m_new_min(new_min), m_new_max(new_max), m_is_fitted(false) {}

   // Learns the minimum, maximum and range of each column of data.
   // Returns false (and leaves the scaler unfitted) when data has no rows
   // or columns, or when the parameter arrays cannot be allocated.
   // NOTE(review): invalid cells are not filtered here, so they flow into
   // Min()/Max(); presumably callers impute before scaling - confirm.
   bool Fit(matrix &data)
   {
      m_is_fitted = false;
      int rows = int(data.Rows());
      int cols = int(data.Cols());
      if(rows == 0 || cols == 0)
         return false;
      if(ArrayResize(m_mins, cols) != cols ||
         ArrayResize(m_maxs, cols) != cols ||
         ArrayResize(m_ranges, cols) != cols)
         return false;
      for(int j = 0; j < cols; j++)
      {
         vector column(rows);
         for(int i = 0; i < rows; i++)
            column[i] = data[i][j];
         m_mins[j] = column.Min();
         m_maxs[j] = column.Max();
         m_ranges[j] = m_maxs[j] - m_mins[j];
         // Store the fallback as the range itself: adding EPSILON to a
         // large-magnitude maximum can be lost to rounding, which would
         // leave the range at zero
         if(m_ranges[j] == 0.0)
            m_ranges[j] = EPSILON;
      }
      m_is_fitted = true;
      return true;
   }

   // Writes the rescaled copy of data into out.
   // Cells that are not valid numbers are written as DBL_MIN.
   // Returns false when the scaler is not fitted or when data does not
   // have the same number of columns as the matrix used in Fit.
   bool Transform(matrix &data, matrix &out)
   {
      if(!m_is_fitted)
         return false;
      int rows = int(data.Rows());
      int cols = int(data.Cols());
      // Guard against indexing the fitted arrays out of range
      if(cols != ArraySize(m_mins))
         return false;
      out.Init(rows, cols);
      double target_span = m_new_max - m_new_min;
      for(int j = 0; j < cols; j++)
      {
         // The scale factor depends only on the column, so compute it once
         double scale = target_span / m_ranges[j];
         double col_min = m_mins[j];
         for(int i = 0; i < rows; i++)
         {
            double value = data[i][j];
            if(!MathIsValidNumber(value))
               out[i][j] = DBL_MIN;
            else
               out[i][j] = (value - col_min) * scale + m_new_min;
         }
      }
      return true;
   }

   // Fits on data and then transforms the same data into out.
   bool FitTransform(matrix &data, matrix &out)
   {
      if(!Fit(data))
         return false;
      return Transform(data, out);
   }
};

// One-Hot Encoder
// Replaces one column with a block of 0/1 indicator columns, one per
// distinct valid value seen in that column during Fit.
class COneHotEncoder : public IPreprocessor
{
private:
   int    m_column;        // Index of the column to encode
   double m_categories[];  // Distinct valid values of that column, in order of first appearance
   bool   m_is_fitted;     // True once Fit has completed successfully

public:
   COneHotEncoder(int column) : m_column(column), m_is_fitted(false) {}

   // Collects the distinct valid values of the encoded column.
   // Returns false (and leaves the encoder unfitted) when the column index
   // is outside data or when the category array cannot be grown.
   bool Fit(matrix &data)
   {
      m_is_fitted = false;
      ArrayResize(m_categories, 0);
      int rows = int(data.Rows());
      int cols = int(data.Cols());
      if(m_column < 0 || m_column >= cols)
         return false;
      int unique = 0;
      for(int i = 0; i < rows; i++)
      {
         double value = data[i][m_column];
         if(!MathIsValidNumber(value))
            continue;
         // Categories are stored in first-seen (unsorted) order, so a
         // linear scan is required; a binary search here would miss
         // already-seen values and register duplicates
         bool known = false;
         for(int k = 0; k < unique; k++)
         {
            if(m_categories[k] == value)
            {
               known = true;
               break;
            }
         }
         if(known)
            continue;
         if(ArrayResize(m_categories, unique + 1, 16) != unique + 1)
            return false;
         m_categories[unique] = value;
         unique++;
      }
      m_is_fitted = true;
      return true;
   }

   // Builds out from data: every column except the encoded one is copied
   // in its original order, followed by one indicator column per category.
   // A row whose encoded value is invalid or was not seen in Fit gets all
   // zeros in the indicator block.
   // Returns false when the encoder is not fitted or the encoded column
   // index is outside data.
   bool Transform(matrix &data, matrix &out)
   {
      if(!m_is_fitted)
         return false;
      int rows = int(data.Rows());
      int cols = int(data.Cols());
      if(m_column < 0 || m_column >= cols)
         return false;
      int cat_count = ArraySize(m_categories);
      out.Resize(rows, cols - 1 + cat_count);
      out.Fill(0.0);
      for(int i = 0; i < rows; i++)
      {
         // Pass-through columns
         int out_col = 0;
         for(int j = 0; j < cols; j++)
         {
            if(j == m_column)
               continue;
            out[i][out_col] = data[i][j];
            out_col++;
         }
         // Indicator block: out_col now points at its first column
         double value = data[i][m_column];
         for(int k = 0; k < cat_count; k++)
         {
            if(value == m_categories[k])
            {
               out[i][out_col + k] = 1.0;
               break;
            }
         }
      }
      return true;
   }

   // Fits on data and then transforms the same data into out.
   bool FitTransform(matrix &data, matrix &out)
   {
      if(!Fit(data))
         return false;
      return Transform(data, out);
   }
};

// Impute Median
// Replaces invalid cells of one column with the median of that column's
// valid values.
class CImputeMedian : public IPreprocessor
{
private:
   int    m_column;      // Index of the column to impute
   double m_median;      // Median of the valid values seen in Fit
   bool   m_is_fitted;   // True once Fit has completed successfully

public:
   CImputeMedian(int column) : m_column(column), m_median(0.0), m_is_fitted(false) {}

   // Learns the median of the target column from its valid values only;
   // the invalid cells are exactly the ones to be replaced later, so they
   // must not take part in the statistic.
   // Returns false (and leaves the imputer unfitted) when the column index
   // is outside data or the column has no valid value.
   bool Fit(matrix &data)
   {
      m_is_fitted = false;
      int rows = int(data.Rows());
      int cols = int(data.Cols());
      if(rows == 0 || m_column < 0 || m_column >= cols)
         return false;
      // Compact the valid values to the front of the vector
      vector valid(rows);
      int valid_count = 0;
      for(int i = 0; i < rows; i++)
      {
         double value = data[i][m_column];
         if(!MathIsValidNumber(value))
            continue;
         valid[valid_count] = value;
         valid_count++;
      }
      if(valid_count == 0)
         return false;
      valid.Resize(valid_count);
      m_median = valid.Median();
      m_is_fitted = true;
      return true;
   }

   // Copies data into out, substituting the fitted median for every
   // invalid cell of the target column. Other columns are copied as is.
   // Returns false when the imputer is not fitted.
   bool Transform(matrix &data, matrix &out)
   {
      if(!m_is_fitted)
         return false;
      int rows = int(data.Rows());
      int cols = int(data.Cols());
      out.Init(rows, cols);
      for(int i = 0; i < rows; i++)
         for(int j = 0; j < cols; j++)
            out[i][j] = (j == m_column && !MathIsValidNumber(data[i][j])) ? m_median : data[i][j];
      return true;
   }

   // Fits on data and then transforms the same data into out.
   bool FitTransform(matrix &data, matrix &out)
   {
      if(!Fit(data))
         return false;
      return Transform(data, out);
   }
};

// Impute Mode
// Replaces invalid cells of one column with a quantile-binned mode
// estimate of that column's valid values.
class CImputeMode : public IPreprocessor
{
private:
   int    m_column;      // Index of the column to impute
   double m_mode;        // Mode estimate of the valid values seen in Fit
   bool   m_is_fitted;   // True once Fit has completed successfully

public:
   CImputeMode(int column) : m_column(column), m_mode(0.0), m_is_fitted(false) {}

   // Learns the mode estimate of the target column from its valid values
   // only. CVectorUtils::QuantileMode returns EMPTY_VALUE for any vector
   // containing NaN, so the invalid cells must be removed before the call.
   // Returns false (and leaves the imputer unfitted) when the column index
   // is outside data, the column has no valid value, or no mode estimate
   // could be computed.
   bool Fit(matrix &data)
   {
      m_is_fitted = false;
      int rows = int(data.Rows());
      int cols = int(data.Cols());
      if(rows == 0 || m_column < 0 || m_column >= cols)
         return false;
      // Compact the valid values to the front of the vector
      vector valid(rows);
      int valid_count = 0;
      for(int i = 0; i < rows; i++)
      {
         double value = data[i][m_column];
         if(!MathIsValidNumber(value))
            continue;
         valid[valid_count] = value;
         valid_count++;
      }
      if(valid_count == 0)
         return false;
      valid.Resize(valid_count);
      // 0.02 is the bin width passed to QuantileMode (as in the original call)
      double estimate = CVectorUtils::QuantileMode(valid, 0.02);
      if(estimate == EMPTY_VALUE)
         return false;
      m_mode = estimate;
      m_is_fitted = true;
      return true;
   }

   // Copies data into out, substituting the fitted mode estimate for every
   // invalid cell of the target column. Other columns are copied as is.
   // Returns false when the imputer is not fitted.
   bool Transform(matrix &data, matrix &out)
   {
      if(!m_is_fitted)
         return false;
      int rows = int(data.Rows());
      int cols = int(data.Cols());
      out.Init(rows, cols);
      for(int i = 0; i < rows; i++)
         for(int j = 0; j < cols; j++)
            out[i][j] = (j == m_column && !MathIsValidNumber(data[i][j])) ? m_mode : data[i][j];
      return true;
   }

   // Fits on data and then transforms the same data into out.
   bool FitTransform(matrix &data, matrix &out)
   {
      if(!Fit(data))
         return false;
      return Transform(data, out);
   }
};

// Preprocessing Pipeline
// Owns an ordered list of preprocessors and applies them one after another,
// feeding each step the output of the previous one.
class CPreprocessingPipeline
{
private:
   SPreprocessorStep m_steps[];          // Type/column record for each step
   IPreprocessor    *m_preprocessors[];  // Preprocessor objects, owned by the pipeline
   int               m_step_count;       // Number of registered steps

   // Appends one step, taking ownership of preprocessor.
   // If the pointer is invalid or the arrays cannot grow, the step is not
   // added, the preprocessor is deleted and false is returned.
   bool AppendStep(ENUM_PREPROCESSOR_TYPE type, int column, IPreprocessor *preprocessor)
   {
      if(CheckPointer(preprocessor) == POINTER_INVALID)
         return false;
      int new_count = m_step_count + 1;
      if(ArrayResize(m_steps, new_count) != new_count ||
         ArrayResize(m_preprocessors, new_count) != new_count)
      {
         // Keep both arrays at the same length and release the orphaned object
         ArrayResize(m_steps, m_step_count);
         ArrayResize(m_preprocessors, m_step_count);
         delete preprocessor;
         return false;
      }
      m_steps[m_step_count].type = type;
      m_steps[m_step_count].column = column;
      m_preprocessors[m_step_count] = preprocessor;
      m_step_count = new_count;
      return true;
   }

   // Fits every step in order on the running output of the previous steps
   // and writes the fully transformed matrix to out. out is only written
   // when every step succeeds.
   bool FitAndTransform(matrix &data, matrix &out)
   {
      matrix current;
      current.Copy(data);
      for(int i = 0; i < m_step_count; i++)
      {
         matrix next;
         if(!m_preprocessors[i].Fit(current))
            return false;
         if(!m_preprocessors[i].Transform(current, next))
            return false;
         current.Copy(next);
      }
      out.Copy(current);
      return true;
   }

public:
   CPreprocessingPipeline() : m_step_count(0) {}

   // Releases every preprocessor owned by the pipeline.
   ~CPreprocessingPipeline()
   {
      for(int i = 0; i < m_step_count; i++)
      {
         if(CheckPointer(m_preprocessors[i]) == POINTER_DYNAMIC)
            delete m_preprocessors[i];
      }
   }

   // Adds a step that fills invalid cells of column with the column median.
   void AddImputeMedian(int column)
   {
      AppendStep(PREPROCESSOR_IMPUTE_MEDIAN, column, new CImputeMedian(column));
   }

   // Adds a step that fills invalid cells of column with the column mode estimate.
   void AddImputeMode(int column)
   {
      AppendStep(PREPROCESSOR_IMPUTE_MODE, column, new CImputeMode(column));
   }

   // Adds a standard (mean/std) scaling step over all columns.
   void AddStandardScaler()
   {
      AppendStep(PREPROCESSOR_STANDARD_SCALER, -1, new CStandardScaler());
   }

   // Adds a robust (median/IQR) scaling step over all columns.
   void AddRobustScaler()
   {
      AppendStep(PREPROCESSOR_ROBUST_SCALER, -1, new CRobustScaler());
   }

   // Adds a min-max scaling step over all columns, targeting [new_min, new_max].
   void AddMinMaxScaler(double new_min = 0.0, double new_max = 1.0)
   {
      AppendStep(PREPROCESSOR_MINMAX_SCALER, -1, new CMinMaxScaler(new_min, new_max));
   }

   // Adds a step that one-hot encodes column.
   void AddOneHotEncoder(int column)
   {
      AppendStep(PREPROCESSOR_ONEHOT_ENCODER, column, new COneHotEncoder(column));
   }

   // Fits every step in order; each step is fitted on the output of the
   // steps before it. data itself is not modified.
   // Returns false as soon as any step fails to fit or transform.
   bool FitPipeline(matrix &data)
   {
      matrix discarded;
      return FitAndTransform(data, discarded);
   }

   // Applies every (already fitted) step in order and writes the final
   // result to out. Returns false as soon as any step fails.
   bool TransformPipeline(matrix &data, matrix &out)
   {
      out.Copy(data);
      for(int i = 0; i < m_step_count; i++)
      {
         matrix temp;
         if(!m_preprocessors[i].Transform(out, temp))
            return false;
         out.Copy(temp);
      }
      return true;
   }

   // Fits the pipeline on data and writes the transformed data to out.
   // Fitting already produces the transformed matrix, so it is returned
   // directly instead of running every transform a second time.
   bool FitTransformPipeline(matrix &data, matrix &out)
   {
      return FitAndTransform(data, out);
   }
};
//+------------------------------------------------------------------+
