# 数据科学和机器学习（第 17 部分）：摇钱树？外汇交易中随机森林的艺术与科学

MetaTrader 5交易 | 9 八月 2024, 17:03
241 0

### 01：融合学习

```for (uint i=0; i<m_ntrees; i++) //Build m_ntrees trees, one per iteration
{
temp_data = data; //Start every tree from a fresh copy of the full dataset
matrix_utils.Randomize(temp_data, m_random_state, replace); //Randomize the copy in place to obtain this tree's subset

//NOTE: x_subset and y_subset are not assigned anywhere in this excerpt; they must be derived from temp_data before the tree is trained

forest[i] = new CDecisionTreeClassifier(this.m_minsplit, this.m_maxdepth); //Create the tree and store it in the forest

forest[i].fit(x_subset, y_subset); //Train the stored tree on its own subset
preds = forest[i].predict(x_subset); //Predict on the same subset the tree was trained on
}```

### 02：自举聚合（Bagging）

```matrix_utils.Randomize(temp_data, m_random_state, replace); //Randomize temp_data in place; replace=true lets the same index be drawn more than once
```

```template<typename T>
//Randomizes matrix_ in place: the matrix is taken by reference and nothing is returned.
//random_state defaults to -1 (presumably "no fixed seed" - confirm against the implementation).
//replace defaults to false; when true the same index may be selected more than once.
void CMatrixutils::Randomize(matrix<T> &matrix_,int random_state=-1, bool replace=false)```

replace = true 参数允许多次选择同一索引，从而模拟自举过程。

### 04： 投票（或平均）机制

```double CRandomForestClassifier::predict(vector &x)
{
//--- Majority vote: every tree predicts a label for the sample x and the label chosen most often is returned
vector votes(m_ntrees); //one vote per tree

uint tree = 0;
while (tree < this.m_ntrees) //ask each tree in the forest for its prediction
{
votes[tree] = forest[tree].predict(x);
tree++;
}

vector classes = matrix_utils.Unique(votes); //distinct labels that received at least one vote

//--- Index of the largest count; assumes Unique_count() reports counts in the same order as Unique()
ulong winner = matrix_utils.Unique_count(votes).ArgMax();

return classes[winner]; //the label with the most votes
}```

### 扩展决策树类

```class CDecisionTreeRegressor: public CDecisionTreeClassifier
{
//Regression tree derived from CDecisionTreeClassifier; it declares its own leaf-value,
//split-search, split-scoring, tree-building and fit routines.
private:
double            calculate_leaf_value(vector &Y); //Value stored in a leaf: the mean of Y
split_info        get_best_split(matrix &data, uint num_features); //Definition not shown here; build_tree() uses the returned split_info to decide whether and how to split
double            variance_reduction(vector &parent, vector &l_child, vector &r_child); //Parent variance minus the size-weighted variances of the two children

Node              *build_tree(matrix &data, uint curr_depth = 0); //Recursively builds the tree for data and returns the node created for it
public:
CDecisionTreeRegressor(uint min_samples_split = 2, uint max_depth = 2); //Both limits default to 2
~CDecisionTreeRegressor(void);

void              fit(matrix &x, vector &y); //Joins x and y into one matrix, builds the tree from it and stores the root
};```

### 回归器决策树中的叶值

```double CDecisionTreeRegressor::calculate_leaf_value(vector &Y)
{
//--- The value predicted by a regression leaf is the mean of the target values Y that reached it
return Y.Mean();
}
```

### 计算信息增益

```double CDecisionTreeRegressor::variance_reduction(vector &parent, vector &l_child, vector &r_child)
{
//--- Score of a split: variance of the parent minus the size-weighted variances of its two children
double parent_size = (double)parent.Size();

double left_share  = l_child.Size() / parent_size; //left child's size relative to the parent
double right_share = r_child.Size() / parent_size; //right child's size relative to the parent

double children_variance = (left_share * l_child.Var()) + (right_share * r_child.Var());

return parent.Var() - children_variance;
}```

### 构建树 & Fit 函数

```Node *CDecisionTreeRegressor::build_tree(matrix &data, uint curr_depth=0)
{
//--- Recursively builds the (sub)tree for data.
//--- data       : matrix that XandYSplitMatrices() separates into the features X and the target Y
//--- curr_depth : depth of the node being built (0 for the first call)
//--- Returns the node created for data, or NULL when data cannot be separated into X and Y.
matrix X;
vector Y;

if (!matrix_utils.XandYSplitMatrices(data,X,Y)) //Split the input matrix into feature matrix X and target vector Y.
{
#ifdef DEBUG_MODE
printf("%s Line %d Failed to build a tree Data Empty",__FUNCTION__,__LINE__);
#endif

return NULL; //Return a NULL pointer
}

ulong samples = X.Rows(), features = X.Cols(); //Get the number of samples and features in the dataset.

//--- Reserve this node's slot in nodes[] BEFORE recursing and remember its index.
//--- Each recursive call appends its own slot, so after the children are built the last
//--- element of nodes[] belongs to a descendant; writing to "the last element" would
//--- overwrite that descendant's entry and leave this node's own slot empty.
int node_index = ArraySize(nodes);
ArrayResize(nodes, node_index+1);

if (samples >= m_min_samples_split && curr_depth<=m_max_depth)
{
split_info best_split = this.get_best_split(data, (uint)features);

#ifdef DEBUG_MODE
Print(__FUNCTION__," | ",__LINE__,"\nbest_split left: [",best_split.dataset_left.Rows(),"x",best_split.dataset_left.Cols(),"]\nbest_split right: [",best_split.dataset_right.Rows(),"x",best_split.dataset_right.Cols(),"]\nfeature_index: ",best_split.feature_index,"\nInfo gain: ",best_split.info_gain,"\nThreshold: ",best_split.threshold);
#endif

if (best_split.info_gain > 0) //Only split when the best split actually improves on the parent
{
Node *left_child  = this.build_tree(best_split.dataset_left, curr_depth+1);
Node *right_child = this.build_tree(best_split.dataset_right, curr_depth+1);

//--- Decision node: stores the split and links to both subtrees
nodes[node_index] = new Node(best_split.feature_index,best_split.threshold,left_child,right_child,best_split.info_gain);
return nodes[node_index];
}
}

//--- Leaf node: reached when the sample/depth limits stop the splitting or no split has a positive gain
nodes[node_index] = new Node();
nodes[node_index].leaf_value = this.calculate_leaf_value(Y);

return nodes[node_index];
}```

Fit 函数

```void CDecisionTreeRegressor::fit(matrix &x, vector &y)
{
//--- Trains the tree: x and y are joined into one matrix (third argument 1 - presumably y is appended as a column)
//--- and the tree built from that matrix becomes the root.
matrix data = matrix_utils.concatenate(x, y, 1);

this.root = this.build_tree(data);

//--- build_tree() returns NULL when the data cannot be split into X and Y,
//--- so the tree is reported as fitted only when a root node really exists
is_fitted = (CheckPointer(this.root) != POINTER_INVALID);
}```

Regressor 类和 Classifier 类之间 build_tree 函数的唯一区别是 variance_reduction 函数。

```   matrix data = matrix_utils.ReadCsv("airfoil_noise_data.csv"); //Load the dataset from the CSV file

matrix x; vector y;

//Separate the loaded matrix into x and y; stop with INIT_FAILED when that is not possible
if (!matrix_utils.XandYSplitMatrices(data, x, y))
return INIT_FAILED;

//regressor_tree is declared outside this snippet
regressor_tree = new CDecisionTreeRegressor(3,3); //min_samples_split = 3, max_depth = 3
regressor_tree.fit(x, y);
regressor_tree.print_tree(regressor_tree.root); //Print the fitted tree starting from its root

vector preds = regressor_tree.predict(x); //Predictions for the same x the tree was fitted on

//The score is therefore measured on the data used for fitting, not on unseen data
Print("r-squared: ",metrics.r_squared(y, preds));```

```KS      0       00:04:11.402    RandomForest Test (EURUSD,H1)     : X_0<=3150.0?7.6482714516406745
FI      0       00:04:11.402    RandomForest Test (EURUSD,H1)   --->  left: X_4<=0.0150478?4.070223732531591
ME      0       00:04:11.402    RandomForest Test (EURUSD,H1)   --->--->   left: X_2<=0.1016?2.453283788183441
RR      0       00:04:11.402    RandomForest Test (EURUSD,H1)   --->--->--->    left: X_0<=630.0?2.3366165961173238
JR      0       00:04:11.402    RandomForest Test (EURUSD,H1)   --->--->--->--->    left: 126.94465000000002
MF      0       00:04:11.402    RandomForest Test (EURUSD,H1)   --->--->--->--->    right: 130.51523904382472
II      0       00:04:11.402    RandomForest Test (EURUSD,H1)   --->--->--->    right: X_0<=1600.0?4.999630155449349
HF      0       00:04:11.402    RandomForest Test (EURUSD,H1)   --->--->--->--->    left: 127.90983653846149
JM      0       00:04:11.402    RandomForest Test (EURUSD,H1)   --->--->--->--->    right: 122.97036507936505
JR      0       00:04:11.402    RandomForest Test (EURUSD,H1)   --->--->   right: X_4<=0.0483159?6.040280153408631
FI      0       00:04:11.402    RandomForest Test (EURUSD,H1)   --->--->--->    left: X_0<=1250.0?5.315257051142112
IG      0       00:04:11.402    RandomForest Test (EURUSD,H1)   --->--->--->--->    left: 125.68045918367342
GM      0       00:04:11.402    RandomForest Test (EURUSD,H1)   --->--->--->--->    right: 120.69493181818189
NQ      0       00:04:11.402    RandomForest Test (EURUSD,H1)   --->--->--->    right: X_0<=1250.0?13.291165881821172
GK      0       00:04:11.402    RandomForest Test (EURUSD,H1)   --->--->--->--->    left: 117.69977777777775
GH      0       00:04:11.402    RandomForest Test (EURUSD,H1)   --->--->--->--->    right: 109.80075000000001
EL      0       00:04:11.402    RandomForest Test (EURUSD,H1)   --->  right: X_4<=0.00152689?28.997059993530435
OL      0       00:04:11.402    RandomForest Test (EURUSD,H1)   --->--->   left: X_0<=6300.0?11.053304033466667
HK      0       00:04:11.402    RandomForest Test (EURUSD,H1)   --->--->--->    left: X_4<=0.000930789?9.067095683299033
FG      0       00:04:11.402    RandomForest Test (EURUSD,H1)   --->--->--->--->    left: 134.9866388888889
NO      0       00:04:11.402    RandomForest Test (EURUSD,H1)   --->--->--->--->    right: 128.59900000000002
QS      0       00:04:11.402    RandomForest Test (EURUSD,H1)   --->--->--->    right: X_4<=0.000930789?9.783359845444707
NI      0       00:04:11.402    RandomForest Test (EURUSD,H1)   --->--->--->--->    left: 128.05125581395347
GJ      0       00:04:11.402    RandomForest Test (EURUSD,H1)   --->--->--->--->    right: 120.90806666666667
RM      0       00:04:11.402    RandomForest Test (EURUSD,H1)   --->--->   right: X_4<=0.0341183?5.715854852017056
LN      0       00:04:11.402    RandomForest Test (EURUSD,H1)   --->--->--->    left: X_0<=5000.0?5.190320913085316
GN      0       00:04:11.402    RandomForest Test (EURUSD,H1)   --->--->--->--->    left: 120.08625170068028
NE      0       00:04:11.402    RandomForest Test (EURUSD,H1)   --->--->--->--->    right: 115.52968965517242
MI      0       00:04:11.402    RandomForest Test (EURUSD,H1)   --->--->--->    right: X_4<=0.0483159?4.450134400476193
IS      0       00:04:11.402    RandomForest Test (EURUSD,H1)   --->--->--->--->    left: 109.44371428571428
GQ      0       00:04:11.402    RandomForest Test (EURUSD,H1)   --->--->--->--->    right: 104.84033333333332
PH      0       00:04:11.403    RandomForest Test (EURUSD,H1)   r-squared: 0.5937442611327515
```

### 构建随机森林分类器

```class CRandomForestClassifier
{
//Ensemble of CDecisionTreeClassifier trees; fit() builds the trees and predict() combines their outputs.
CMetrics metrics; //Used by fit() to report each tree's accuracy score

protected:
uint  m_ntrees; //Number of trees in the forest
uint  m_maxdepth; //Passed to every tree's constructor as its maximum depth
uint  m_minsplit; //Passed to every tree's constructor as its minimum split size
int   m_random_state; //Passed to Randomize(); fit() logs it as "None" when it equals -1

CMatrixutils matrix_utils;
CDecisionTreeClassifier *forest[]; //One pointer per tree
string ConvertTime(double seconds); //Turns a duration in seconds into text; fit() uses it for the per-tree log line

public:
//NOTE(review): minsplit and max_depth default to NULL for uint parameters (presumably 0) - confirm these defaults are intended
CRandomForestClassifier(uint n_trees=100, uint minsplit=NULL, uint max_depth=NULL, int random_state=-1);
~CRandomForestClassifier(void);

void fit(matrix &x, vector &y, bool replace=true); //Builds the trees; replace is forwarded to Randomize()
double predict(vector &x); //Majority vote of all trees for a single sample
vector predict(matrix &x); //Definition not shown here; presumably one prediction per row of x
};
```

n_trees = 100（默认），这意味着随机森林分类器森林中有 100 棵树。

min_split 和 max_depth 是我们在上一篇文章中讨论的每棵树的参数。min_split 是节点继续分裂所需的最小样本数，而 max_depth 则是树允许达到的最大深度。

### 将树拟合到随机森林之中

```void CRandomForestClassifier::fit(matrix &x, vector &y, bool replace=true)
{
//--- Builds the forest. For every requested tree the full dataset is copied and randomized
//--- (replace is forwarded to Randomize()), the copy is split into x/y subsets, and a new
//--- decision tree is trained on them.
//--- x, y    : training features and targets
//--- replace : forwarded to Randomize(); true lets the same index be drawn more than once
//--- The forest array must already hold at least m_ntrees slots when this function is called.
//--- On return, forest[] and m_ntrees contain only the trees that were built successfully.
//--- NOTE(review): the same m_random_state is passed to Randomize() for every tree - confirm a
//--- fixed seed does not hand every tree the identical subset.
matrix x_subset;
vector y_subset;
matrix data = this.matrix_utils.concatenate(x, y, 1);
matrix temp_data;
vector preds;

uint time_start, current_time; //GetTickCount() returns milliseconds as uint, not a datetime
uint built = 0; //Number of trees stored so far; also the next free slot in forest[]

Print("[ Classifier Random Forest Building ]");

for (uint i=0; i<m_ntrees; i++) //Build a given x number of trees
{
time_start = GetTickCount();

temp_data = data;
matrix_utils.Randomize(temp_data, m_random_state, replace); //Get randomized subsets

if (!this.matrix_utils.XandYSplitMatrices(temp_data, x_subset, y_subset)) //split the random subset into x and y subsets
{
//--- Skip this tree without touching forest[]: removing an element here would shrink the
//--- array while the loop still runs up to m_ntrees and would leave a gap among the built trees
printf("%s %d Failed to split data for a tree ",__FUNCTION__,__LINE__);
continue;
}

forest[built] = new CDecisionTreeClassifier(this.m_minsplit, this.m_maxdepth); //Add the tree to the forest

forest[built].fit(x_subset, y_subset); //Train the tree on its own subset
preds = forest[built].predict(x_subset);

current_time = GetTickCount();

printf("   ==> Tree <%d> Rand Seed <%s> Accuracy Score: %.3f Time taken: %s",i+1,m_random_state==-1?"None":string(m_random_state),metrics.accuracy_score(y_subset, preds), ConvertTime((current_time - time_start) / 1000.0));

built++;
}

ArrayResize(forest, built); //Drop the slots of the trees that could not be built
m_ntrees = built; //The successfully build trees

}```

### 决策树分类器对比随机森林分类器

 决策树 73.8% 40% 随机森林 83% 45%

 决策树 73.8% 40% 随机森林 80% 45%

 决策树 73.8% 40% 随机森林 78.8% 45%

 决策树 73.8% 40% 随机森林 78.8% 45%

### 导致随机森林无法提供比单棵决策树准确性更好的因素：

fit() 函数的最后一个参数是 error 参数，它允许您选择适当的度量来衡量森林中每棵树的准确性。

```enum errors_classifier
{
ERR_ACCURACY //The only metric listed for classifiers (presumably the accuracy score)
};

enum errors_regressor
{
ERR_R2_SCORE, //The only metric listed for regressors (presumably the r-squared score)
};  ```

• 延迟：随机延迟
• 建模：仅限开盘价
• 本金： 1000\$
• 杠杆：1:100

### 后记

forest.mqh（可在 include 文件夹下找到）  包含随机森林类，包括 CRandomForestClassifier 和 CRandomForestRegressor
matrix_utils.mqh（Include 文件）  包含用于矩阵操作的附加函数。
metrics.mqh（Include 文件）  包含用于衡量 ML 模型性能的函数和代码。
preprocessing.mqh（Include 文件）  预处理原始输入数据的函数库，令其适合机器学习模型的用法。
tree.mqh（Include 文件）  包含决策树类。
RandomForest Test.mq5（智能系统）  运行和测试随机森林模型的最终智能系统。

Code.zip (22.86 KB)