RandomForestClassifier
class sklearn.ensemble.RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None)
n_estimators
This is the number of trees in the forest, i.e., the number of base estimators. The effect of this parameter on the accuracy of a random forest is monotonic: the larger n_estimators, the better the model tends to perform. But every model has a performance ceiling; once n_estimators reaches a certain level, the accuracy of the forest stops rising or begins to fluctuate, and the larger n_estimators, the more computation and memory are required and the longer training takes. For this parameter, we want to strike a balance between training cost and model performance.

The default value of n_estimators is 10 in the current version of sklearn, but in the upcoming 0.22 release the default will be raised to 100. This change reflects how users actually tune the parameter: toward a larger n_estimators.
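For example (a minimal sketch; 100 simply mirrors the upcoming default, not a tuned value), setting n_estimators explicitly keeps results stable across library versions:

from sklearn.ensemble import RandomForestClassifier

# Pin n_estimators explicitly so behavior does not change when the
# library default moves from 10 to 100.
rfc = RandomForestClassifier(n_estimators=100, random_state=0)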
%matplotlib inline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_wine
wine = load_wine()
wine.data.shape
>>> (178, 13)
wine.target
>>> array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
           2, 2])
Modeling
1. Instantiate the model.
2. Feed the training set into the instantiated model and train it, using the fit interface.
3. Use other interfaces to feed the test set into the trained model and obtain the desired results (score, Y_test).
from sklearn.model_selection import train_test_split

Xtrain, Xtest, Ytrain, Ytest = train_test_split(wine.data, wine.target, test_size=0.3)  # split the data
clf = DecisionTreeClassifier(random_state=0)
rfc = RandomForestClassifier(random_state=0)  # instantiate
clf = clf.fit(Xtrain, Ytrain)
rfc = rfc.fit(Xtrain, Ytrain)  # fit the models
score_c = clf.score(Xtest, Ytest)
score_r = rfc.score(Xtest, Ytest)  # score on the held-out test set
print("single tree:{}".format(score_c), "random forest:{}".format(score_r))
# format() substitutes each value into the preceding braces as a string
>>> single tree:0.9074074074074074 random forest:0.9814814814814815
The random forest scores considerably higher than the single decision tree.
Cross-validation
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt  # import the required modules
rfc = RandomForestClassifier(n_estimators=25)  # instantiate a forest of 25 trees
rfc_s = cross_val_score(rfc, wine.data, wine.target, cv=10)  # 10-fold cross-validation
clf = DecisionTreeClassifier()
clf_s = cross_val_score(clf, wine.data, wine.target, cv=10)
plt.plot(range(1, 11), rfc_s, label="RandomForest")
plt.plot(range(1, 11), clf_s, label="DecisionTree")
plt.legend()
plt.show()
In each round of the cross-validation, the random forest performs better than the decision tree.
A learning curve for n_estimators
Run the model under 200 different settings of n_estimators: 1 tree in the first run, 2 trees in the second, and so on up to 200. Then print the highest cross-validated score, about 0.99, together with its position in the list, 29, which corresponds to n_estimators = 30 (list indices start at 0).
superpa = []
for i in range(200):
    rfc = RandomForestClassifier(n_estimators=i+1, n_jobs=-1)
    rfc_s = cross_val_score(rfc, wine.data, wine.target, cv=10).mean()
    superpa.append(rfc_s)
print(max(superpa), superpa.index(max(superpa)))  # best score and its index (n_estimators = index + 1)
plt.figure(figsize=[20, 5])
plt.plot(range(1, 201), superpa)
plt.show()
>>> 0.9888888888888889 29
Plot a comparison of the random forest and the decision tree over ten rounds of cross-validation.
rfc_1 = []
clf_1 = []
for i in range(10):
    rfc = RandomForestClassifier(n_estimators=25)
    rfc_s = cross_val_score(rfc, wine.data, wine.target, cv=10).mean()
    rfc_1.append(rfc_s)
    clf = DecisionTreeClassifier()
    clf_s = cross_val_score(clf, wine.data, wine.target, cv=10).mean()
    clf_1.append(clf_s)
plt.plot(range(1, 11), rfc_1, label="Random Forest")
plt.plot(range(1, 11), clf_1, label="Decision Tree")
plt.legend()
plt.show()
Across these ten repetitions (100 model fits in total), the gap between the random forest and the decision tree is clear and consistent.
A random forest decides by aggregating the votes of its trees, so it misclassifies a sample only when more than half of its trees misclassify it at the same time. For the 25-tree forest used above, that means at least 13 trees must be wrong simultaneously. If a single tree errs with probability 0.2 and the trees err independently, the forest's error probability is the binomial tail: the sum over i = 13 to 25 of C(25, i) * 0.2^i * 0.8^(25-i), roughly 0.000369, far below the 0.2 error rate of a single tree.
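This binomial tail is easy to verify numerically; a minimal sketch (assuming independent tree errors, which a real forest only approximates):

from scipy.special import comb

# Probability that at least 13 of 25 trees err at once, given that each
# tree errs independently with probability 0.2 (majority-vote failure).
epsilon = 0.2
forest_error = sum(comb(25, i) * epsilon**i * (1 - epsilon)**(25 - i)
                   for i in range(13, 26))
print(forest_error)  # roughly 0.000369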
rfc = RandomForestClassifier(n_estimators=25, random_state=2)
rfc = rfc.fit(Xtrain, Ytrain)
# estimators_ is an important attribute: it exposes the individual trees in the forest
rfc.estimators_[0]
>>> DecisionTreeClassifier(max_features='auto', random_state=1872583848)
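Each tree receives its own seed derived from the forest's random_state, which is one source of diversity among the trees. A quick sketch to see this, reusing the fitted rfc above:

# Print the seed of every tree in the forest; all 25 values differ,
# even though the forest itself was built with random_state=2.
for tree in rfc.estimators_:
    print(tree.random_state)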
With the parameter oob_score=True, there is no need to split the data into training and test sets: because of bootstrap sampling, each tree never sees roughly a third of the samples, and these "out-of-bag" samples act as a built-in validation set.
rfc = RandomForestClassifier(n_estimators=25, oob_score=True)
rfc = rfc.fit(wine.data, wine.target)  # fit on the complete dataset
rfc.oob_score_  # accuracy measured on the out-of-bag samples
>>> 0.9887640449438202
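The "roughly a third" figure follows from the bootstrap arithmetic: each of the n draws misses a given sample with probability 1 - 1/n, so about (1 - 1/n)^n of the data is out of bag for each tree. A small sketch of the computation:

import numpy as np

# Fraction of samples a single tree never sees during bootstrap sampling.
n = wine.data.shape[0]          # 178 samples
print((1 - 1 / n) ** n)         # about 0.367
print(np.exp(-1))               # the limit as n grows, about 0.368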
Important attributes and interfaces
rfc = RandomForestClassifier(n_estimators=25)
rfc = rfc.fit(Xtrain, Ytrain)
rfc.score(Xtest, Ytest)
>>> 0.9814814814814815

rfc.feature_importances_
>>> array([0.09836407, 0.03709119, 0.00633773, 0.01690141, 0.05652153,
           0.05270029, 0.14390297, 0.01017389, 0.00975081, 0.17916517,
           0.11107285, 0.08174645, 0.19627163])

rfc.apply(Xtest)  # index of the leaf each test sample lands in, per tree
>>> array([[13,  6,  3, ...,  7,  1,  4],
           [ 6,  2, 10, ...,  1,  7,  6],
           [ 6,  2, 10, ...,  1,  7,  6],
           ...,
           [ 6,  2, 10, ...,  1,  7, 16],
           [16,  7, 16, ..., 21, 14, 20],
           [ 4,  6,  3, ...,  7,  1,  4]], dtype=int64)

rfc.predict(Xtest)
>>> array([2, 1, 1, 1, 0, 1, 1, 0, 0, 2, 2, 2, 1, 1, 2, 1, 2, 1, 1, 2, 2, 0,
           0, 2, 1, 0, 1, 2, 0, 2, 2, 0, 1, 0, 2, 0, 0, 1, 0, 0, 1, 1, 2, 2,
           1, 0, 1, 2, 2, 0, 2, 1, 0, 2])

rfc.predict_proba(Xtest)  # predicted probability of each class label for each sample
>>> array([[0.  , 0.04, 0.96],
           [0.  , 0.96, 0.04],
           [0.  , 1.  , 0.  ],
           [0.08, 0.92, 0.  ],
           [1.  , 0.  , 0.  ],
           [0.08, 0.92, 0.  ],
           ...
           [0.  , 1.  , 0.  ],
           [0.92, 0.08, 0.  ],
           [0.  , 0.04, 0.96]])
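Two quick consistency checks on these interfaces (a sketch, reusing the fitted rfc and Xtest above): every row of predict_proba sums to 1, and predict returns the class with the highest averaged probability:

proba = rfc.predict_proba(Xtest)
print(proba.sum(axis=1))  # each row sums to 1.0
# predict() returns the class whose averaged probability is largest
print((rfc.classes_[proba.argmax(axis=1)] == rfc.predict(Xtest)).all())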