信用评分预测模型(三)--随机森林算法

前言

下面将利用随机森林算法对数据进行建模,并得到预测结果。

代码

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# 导入随机森林分类器
from sklearn.ensemble import RandomForestClassifier
# 自动进行训练集的划分
from sklearn.model_selection import train_test_split

from sklearn.decomposition import PCA


'''
version:1.0
author:mrx
Method:Random_Forest
'''

# Number of trees per random forest.
trees = 200
# Path of the input dataset. Raw string so the backslashes are taken
# literally — the plain literal triggered an invalid-escape
# SyntaxWarning/DeprecationWarning on modern Python (value is unchanged).
readFileName = r"python\python Data analysis and mining\class\dataset\german.xls"


def list_add(a, b):
    """Element-wise add the values of *b* into *a*, mutating *a* in place.

    Returns None (in-place mutator, following stdlib convention).

    Raises:
        ValueError: if the two sequences differ in length.
    """
    # Explicit raise instead of ``assert`` so the length check still runs
    # under ``python -O`` (asserts are stripped by the optimizer).
    if len(a) != len(b):
        raise ValueError("list_add: length mismatch (%d vs %d)" % (len(a), len(b)))
    for i, inc in enumerate(b):
        a[i] += inc

def list_div(a, num):
    """Divide every element of *a* by *num* in place and return *a*."""
    for idx, val in enumerate(a):
        a[idx] = val / num
    return a

# Load the Excel dataset: the last column is the label, the rest are features.
df = pd.read_excel(readFileName)
list_columns = list(df.columns[:-1])
# ``DataFrame.ix`` was deprecated in pandas 0.20 and removed in 1.0;
# use position-based ``.iloc`` for the same column slicing.
x = df.iloc[:, :-1]
# print(x)
y = df.iloc[:, -1]
names = x.columns
# print(y)
# print(y)

# Accumulators for the search over train/test splits.
acc_mean = 0
feature_mean = [0] * len(df.columns[:-1])
# n = 1000  # number of splits to try when searching for the best accuracy
n = 896
max_acc = 0
index_of_max = 0
min_acc = 1
index_of_min = 0
# for i in range(895, 897):
# for i in range(n - 1, n + 1):
for i in range(n):
    print('*' * 150)
    print('第 %d 次 test' % (i + 1))
    seed = i + 1
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=seed)
    # Empirically the best split was found at random_state=896.

    # n_estimators is the number of trees in the forest.
    forest = RandomForestClassifier(n_estimators=trees)
    forest.fit(x_train, y_train)

    test_score = forest.score(x_test, y_test)
    print("random forest with %d trees:" % trees)
    print("accuracy on the training subset:{:.3f}".format(forest.score(x_train, y_train)))
    print("accuracy on the test subset:{:.3f}".format(test_score))
    print('Feature importances:{}'.format(forest.feature_importances_))
    # Track the best and worst split seeds seen so far.
    if test_score >= max_acc:
        max_acc = test_score
        index_of_max = seed
    if test_score <= min_acc:
        min_acc = test_score
        index_of_min = seed
    acc_mean += test_score
    list_add(feature_mean, forest.feature_importances_)


# Report the summary statistics of the split search.
print('Final :')
print("avg accuracy on the test subset:{:.3f}".format(acc_mean / n))
# print('avg Feature importances:{}'.format(list_div(feature_mean, n)))
for fmt, value in (('max_acc: %f', max_acc),
                   ('index : %d', index_of_max),
                   ('min_acc: %f', min_acc),
                   ('index : %d', index_of_min)):
    print(fmt % value)

# Re-train on the best split found above and plot its feature importances.
# NOTE(review): the forest itself is not seeded, so the retrained model may
# not exactly reproduce max_acc — confirm whether that matters here.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=index_of_max)
forest = RandomForestClassifier(n_estimators=trees)
forest.fit(x_train, y_train)

n_features = x.shape[1]
positions = np.arange(n_features)
plt.barh(positions, forest.feature_importances_, align='center')
plt.yticks(positions, names)
plt.title("random forest with %d trees:" % trees)
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.show()

猜你喜欢

转载自www.cnblogs.com/LieDra/p/12018556.html