原始链接
学习目标
- 学习在金融风控领域常用的机器学习模型
- 学习机器学习模型的建模过程与调参流程
import pandas as pd
import numpy as np
import warnings
import os
import seaborn as sns
import matplotlib.pyplot as plt
# Configure seaborn in ONE call. Every sns.set() call re-applies seaborn's
# defaults for any argument that is not passed, so a later bare
# sns.set(font=...) would silently undo an earlier set_style()/set_context().
#   style:   darkgrid (default), whitegrid, dark, white, ticks
#   context: paper, notebook (default), talk, poster (smallest to largest)
#   font:    a family with CJK glyphs so Chinese labels render in seaborn plots
sns.set(style="whitegrid", context="talk", font="SimHei")
# Applied after sns.set() so seaborn's style defaults cannot overwrite them.
# Use a CJK-capable font for matplotlib's sans-serif family.
plt.rcParams['font.sans-serif'] = ['SimHei']
# Render the minus sign "-" as ASCII so it does not show up as an empty box.
plt.rcParams['axes.unicode_minus'] = False
memory_usage使用方法
# Shrink a DataFrame's memory footprint by downcasting its column dtypes.
def reduce_mem_usage(df, use_float16=True):
    """Downcast the columns of ``df`` in place and print the memory saving.

    * ``object`` columns are converted to ``category``.
    * Integer columns (signed or unsigned) are cast to the smallest signed
      integer type whose range contains the column's min and max.
    * Float columns are cast to the smallest allowed float type whose range
      contains the column's min and max, falling back to ``float64``.
    * Every other dtype (bool, datetime, category, extension dtypes, ...)
      is left untouched.

    Args:
        df: DataFrame to optimise. It is modified in place.
        use_float16: When True (the default, matching the historical
            behaviour) float columns may be stored as ``float16``. Note that
            ``float16`` keeps only about 3 significant decimal digits, so
            values such as 35000.0 are stored as 35008.0. Pass False to use
            ``float32`` as the smallest float type.

    Returns:
        The same ``df`` object, for call chaining.
    """
    bytes_per_mb = 1024 ** 2
    # memory_usage() reports bytes; convert so the "MB" label is truthful.
    start_mem = df.memory_usage().sum() / bytes_per_mb
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))

    int_types = (np.int8, np.int16, np.int32, np.int64)
    float_types = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float columns are downcast; anything else
        # (bool, datetime, category, nullable extension dtypes) is skipped.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in 'iu':
            # An empty integer column has no min/max to compare against.
            if pd.isnull(c_min) or pd.isnull(c_max):
                continue
            lowest, highest = int(c_min), int(c_max)
            for candidate in int_types:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the type's limit fits.
                if limits.min <= lowest and highest <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # No candidate fits (e.g. huge uint64 values): keep the dtype.
        else:
            for candidate in float_types:
                limits = np.finfo(candidate)
                if limits.min < c_min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of range for the smaller types, or min/max is NaN/inf.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / bytes_per_mb
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard against a zero starting size to avoid a division by zero.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))
    return df
# Load the training CSV and immediately downcast its columns to save memory.
data = reduce_mem_usage(pd.read_csv("./dataset/train.csv"))
Memory usage of dataframe is 300800080.00 MB
Memory usage after optimization is: 72834896.00 MB
Decreased by 75.8%
# Load the test CSV and immediately downcast its columns to save memory.
test_A = reduce_mem_usage(pd.read_csv("./dataset/testA.csv"))
Memory usage of dataframe is 76800080.00 MB
Memory usage after optimization is: 18834472.00 MB
Decreased by 75.5%
# `train` is bound to the same DataFrame object as `data`; no copy is made.
train = data
# Preview the first rows of the training frame.
train.head()
id | loanAmnt | term | interestRate | installment | grade | subGrade | employmentTitle | employmentLength | homeOwnership | ... | n5 | n6 | n7 | n8 | n9 | n10 | n11 | n12 | n13 | n14 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 35008.0 | 5 | 19.515625 | 918.0000 | E | E2 | 320.0 | 2 years | 2 | ... | 9.0 | 8.0 | 4.0 | 12.0 | 2.0 | 7.0 | 0.0 | 0.0 | 0.0 | 2.0 |
1 | 1 | 18000.0 | 5 | 18.484375 | 462.0000 | D | D2 | 219843.0 | 5 years | 0 | ... | NaN | NaN | NaN | NaN | NaN | 13.0 | NaN | NaN | NaN | NaN |
2 | 2 | 12000.0 | 5 | 16.984375 | 298.2500 | D | D3 | 31698.0 | 8 years | 0 | ... | 0.0 | 21.0 | 4.0 | 5.0 | 3.0 | 11.0 | 0.0 | 0.0 | 0.0 | 4.0 |
3 | 3 | 11000.0 | 3 | 7.261719 | 341.0000 | A | A4 | 46854.0 | 10+ years | 1 | ... | 16.0 | 4.0 | 7.0 | 21.0 | 6.0 | 9.0 | 0.0 | 0.0 | 0.0 | 1.0 |
4 | 4 | 3000.0 | 3 | 12.992188 | 101.0625 | C | C2 | 54.0 | NaN | 1 | ... | 4.0 | 9.0 | 10.0 | 15.0 | 7.0 | 12.0 | 0.0 | 0.0 | 0.0 | 4.0 |
5 rows × 47 columns
数据预处理
# Stack train and test rows so the preprocessing below is applied to both.
# The two frames do not share the same columns, and the implicit column
# sorting pandas performed in that case is deprecated; sort=True pins the
# alphabetical column order explicitly so it does not depend on the pandas
# version in use.
data = pd.concat([train, test_A], axis=0, ignore_index=True, sort=True)
D:\Users\Administrator\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.
To accept the future behavior, pass 'sort=False'.
To retain the current behavior and silence the warning, pass 'sort=True'.
"""Entry point for launching an IPython kernel.
# List the distinct values of the two grade columns in sorted order.
for grade_column in ('grade', 'subGrade'):
    distinct_values = data[grade_column].unique()
    print(sorted(distinct_values))
['A', 'B', 'C', 'D', 'E', 'F', 'G']
['A1', 'A2', 'A3', 'A4', 'A5', 'B1', 'B2', 'B3', 'B4', 'B5', 'C1', 'C2', 'C3', 'C4', 'C5', 'D1', 'D2', 'D3', 'D4', 'D5', 'E1', 'E2', 'E3', 'E4', 'E5', 'F1', 'F2', 'F3', 'F4', 'F5', 'G1', 'G2', 'G3', 'G4', 'G5']
# Frequency of every raw employmentLength value, with missing values counted
# too (dropna=False), ordered by the value itself rather than by frequency.
data['employmentLength'].value_counts(dropna=False).sort_index()
NaN 58541
1 year 65671
10+ years 328525
2 years 90565
3 years 80163
4 years 59818
5 years 62645
6 years 46582
7 years 44230
8 years 45168
9 years 37866
< 1 year 80226
Name: employmentLength, dtype: int64
# First step of converting employmentLength to a number: rewrite the two
# special buckets so every non-null value has the form "<n> years".
# The result is assigned back to the column instead of calling
# replace(inplace=True) on data['employmentLength']: that is chained
# assignment on a Series selected from the frame, which is not guaranteed to
# write through to `data` (it never does under pandas Copy-on-Write).
data['employmentLength'] = (
    data['employmentLength']
    .replace(to_replace='10+ years', value='10 years')
    .replace(to_replace='< 1 year', value='0 years')
)
def employmentLength_to_int(s):
    """Convert an employment-length label such as ``"3 years"`` to ``np.int8``.

    Values that ``pd.isnull`` reports as missing are returned unchanged;
    otherwise the first whitespace-separated token of ``s`` is parsed.
    """
    if not pd.isnull(s):
        leading_token = s.split()[0]
        return np.int8(leading_token)
    return s
# Parse every label with employmentLength_to_int, then re-check the
# distribution (missing values included) ordered by value.
employment_years = data['employmentLength'].apply(employmentLength_to_int)
data['employmentLength'] = employment_years
data['employmentLength'].value_counts(dropna=False).sort_index()
0.0 80226
1.0 65671
2.0 90565
3.0 80163
4.0 59818
5.0 62645
6.0 46582
7.0 44230
8.0 45168
9.0 37866
10.0 328525
NaN 58541
Name: employmentLength, dtype: int64
# Preprocess earliesCreditLine: start by looking at five random raw values.
data['earliesCreditLine'].sample(5)
930740 Dec-2005
314803 Jul-1994
155303 May-2008
480893 Aug-2000
712505 Dec-2000
Name: earliesCreditLine, dtype: object
def _last_four_chars_to_int(raw):
    """Return the final four characters of ``raw`` parsed as an int."""
    return int(raw[-4:])


# Keep only the trailing four characters of each value (the year part of
# strings like "Dec-2005") as an integer, then summarise the result.
data['earliesCreditLine'] = data['earliesCreditLine'].apply(_last_four_chars_to_int)
data['earliesCreditLine'].describe()
count 1000000.000000
mean 1998.688632
std 7.606231
min 1944.000000
25% 1995.000000
50% 2000.000000
75% 2004.000000
max 2015.000000
Name: earliesCreditLine, dtype: float64
# Preview the first rows of the combined frame after the conversions above.
data.head()
annualIncome | applicationType | delinquency_2years | dti | earliesCreditLine | employmentLength | employmentTitle | ficoRangeHigh | ficoRangeLow | grade | ... | pubRecBankruptcies | purpose | regionCode | revolBal | revolUtil | subGrade | term | title | totalAcc | verificationStatus | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 110000.0 | 0 | 0.0 | 17.046875 | 2001 | 2.0 | 320.0 | 734.0 | 730.0 | E | ... | 0.0 | 1 | 32 | 24178.0 | 48.90625 | E2 | 5 | 1.0 | 27.0 | 2 |
1 | 46000.0 | 0 | 0.0 | 27.828125 | 2002 | 5.0 | 219843.0 | 704.0 | 700.0 | D | ... | 0.0 | 0 | 18 | 15096.0 | 38.90625 | D2 | 5 | 1723.0 | 18.0 | 2 |
2 | 74000.0 | 0 | 0.0 | 22.765625 | 2006 | 8.0 | 31698.0 | 679.0 | 675.0 | D | ... | 0.0 | 0 | 14 | 4606.0 | 51.81250 | D3 | 5 | 0.0 | 27.0 | 2 |
3 | 118000.0 | 0 | 0.0 | 17.203125 | 1999 | 10.0 | 46854.0 | 689.0 | 685.0 | A | ... | 0.0 | 4 | 11 | 9948.0 | 52.59375 | A4 | 3 | 4.0 | 28.0 | 1 |
4 | 29000.0 | 0 | 0.0 | 32.156250 | 1977 | NaN | 54.0 | 694.0 | 690.0 | C | ... | 0.0 | 10 | 21 | 2942.0 | 32.00000 | C2 | 3 | 11.0 | 27.0 | 2 |
5 rows × 49 columns
# Categorical feature handling.
# A subset of the categorical features; print how many distinct values each
# one takes so an encoding can be chosen per column.
cate_features = [
    'grade',
    'subGrade',
    'employmentTitle',
    'homeOwnership',
    'verificationStatus',
    'purpose',
    'postCode',
    'regionCode',
    'applicationType',
    'initialListStatus',
    'title',
    'policyCode',
]
for f in cate_features:
    distinct_count = data[f].nunique()
    print(f, '类型数:', distinct_count)
grade 类型数: 7
subGrade 类型数: 35
employmentTitle 类型数: 298101
homeOwnership 类型数: 6
verificationStatus 类型数: 3
purpose 类型数: 14
postCode 类型数: 935
regionCode 类型数: 51
applicationType 类型数: 2
initialListStatus 类型数: 2
title 类型数: 6712
policyCode 类型数: 1
# One-hot encode the features that have more than two levels but are not
# high-dimensional/sparse. drop_first=True drops the first level of each.
one_hot_columns = [
    'grade',
    'subGrade',
    'homeOwnership',
    'verificationStatus',
    'purpose',
    'regionCode',
]
data = pd.get_dummies(data, columns=one_hot_columns, drop_first=True)
# High-cardinality categorical features are replaced by two derived columns:
#   <name>_cnts: number of 'id' values in the row's group
#   <name>_rank: descending rank of the row's 'id' inside its group
# The original column is then removed.
# NOTE(review): presumably none of these columns contains missing keys here;
# rows with a NaN key would get NaN from groupby and break astype(int) - confirm.
for f in ['employmentTitle', 'postCode', 'title']:
    ids_by_value = data.groupby([f])['id']
    group_sizes = ids_by_value.transform('count')
    id_ranks = ids_by_value.rank(ascending=False).astype(int)
    data[f+'_cnts'] = group_sizes
    data[f+'_rank'] = id_ranks
    del data[f]
训练数据/测试数据准备
# Model inputs are all columns except the identifier, the issue date and
# the target.
non_feature_columns = ['id','issueDate','isDefault']
features = [column for column in data.columns if column not in non_feature_columns]

# Rows with a known isDefault value form the training set; rows where it is
# missing form the test set.
has_label = data.isDefault.notnull()
train = data[has_label].reset_index(drop=True)
test = data[~has_label].reset_index(drop=True)

x_train, x_test = train[features], test[features]
y_train = train['isDefault']
# 5-fold cross-validation splitter; shuffling with a fixed seed makes the
# fold assignment repeatable.
from sklearn.model_selection import KFold
folds, seed = 5, 2020
kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
使用Lightgbm进行建模
"""对训练集数据进行划分,分成训练集和验证集,并进行相应的操作"""
from sklearn.model_selection import train_test_split
import lightgbm as lgb

# Hold out 20% of the training rows for validation. random_state is fixed
# (reusing `seed` from the cross-validation setup) so the split, and hence
# the reported validation AUC, is reproducible from run to run.
X_train_split, X_val, y_train_split, y_val = train_test_split(
    x_train, y_train, test_size=0.2, random_state=seed
)
train_matrix = lgb.Dataset(X_train_split, label=y_train_split)
valid_matrix = lgb.Dataset(X_val, label=y_val)

# Baseline (untuned) parameters. The former 'silent': True entry was removed:
# LightGBM ignores it inside `params` and only emits "Unknown parameter"
# warnings; 'verbose': -1 already silences the library's logging.
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'learning_rate': 0.1,
    'metric': 'auc',
    'min_child_weight': 1e-3,
    'num_leaves': 31,
    'max_depth': -1,
    'reg_lambda': 0,
    'reg_alpha': 0,
    'feature_fraction': 1,
    'bagging_fraction': 1,
    'bagging_freq': 0,
    'seed': 2020,
    'nthread': 8,
    'verbose': -1,
}

# Train with early stopping on the validation AUC.
# NOTE: verbose_eval / early_stopping_rounds are the keyword arguments of the
# LightGBM version this notebook was written for; LightGBM >= 4.0 expects the
# lgb.log_evaluation / lgb.early_stopping callbacks instead.
model = lgb.train(
    params,
    train_set=train_matrix,
    valid_sets=valid_matrix,
    num_boost_round=20000,
    verbose_eval=1000,
    early_stopping_rounds=200,
)
D:\Users\Administrator\Anaconda3\lib\site-packages\lightgbm\basic.py:1077: UserWarning: silent keyword has been found in `params` and will be ignored.
Please use silent argument of the Dataset constructor to pass this parameter.
.format(key))
[LightGBM] [Warning] Unknown parameter: silent
D:\Users\Administrator\Anaconda3\lib\site-packages\lightgbm\basic.py:1077: UserWarning: silent keyword has been found in `params` and will be ignored.
Please use silent argument of the Dataset constructor to pass this parameter.
.format(key))
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[702] valid_0's auc: 0.728411
from sklearn import metrics
from sklearn.metrics import roc_auc_score

# Predict on the validation split with the best iteration found by early
# stopping, then compute the ROC curve and its AUC.
val_pre_lgb = model.predict(X_val, num_iteration=model.best_iteration)
fpr, tpr, threshold = metrics.roc_curve(y_val, val_pre_lgb)
roc_auc = metrics.auc(fpr, tpr)
print('未调参前lightgbm单模型在验证集上的AUC:{}'.format(roc_auc))

# Plot the ROC curve. The title is set once; an earlier second plt.title()
# call was dead code because it was overwritten before the figure was shown.
plt.figure(figsize=(8, 8))
plt.plot(fpr, tpr, 'b', label = 'Val AUC = %0.4f' % roc_auc)
plt.ylim(0,1)
plt.xlim(0,1)
plt.legend(loc='best')
plt.title('ROC')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
# Diagonal reference line (performance of a random classifier).
plt.plot([0,1],[0,1],'r--')
plt.show()
未调参前lightgbm单模型在验证集上的AUC:0.7284105986326812
# Go one step further: estimate model performance with 5-fold cross-validation
# and build a fold-averaged prediction for the test set.
import lightgbm as lgb

# The parameters are identical for every fold, so they are built once.
# The former 'silent': True entry was removed: LightGBM ignores it inside
# `params` and only emits "Unknown parameter" warnings; 'verbose': -1 already
# silences the library's logging.
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'learning_rate': 0.1,
    'metric': 'auc',
    'min_child_weight': 1e-3,
    'num_leaves': 31,
    'max_depth': -1,
    'reg_lambda': 0,
    'reg_alpha': 0,
    'feature_fraction': 1,
    'bagging_fraction': 1,
    'bagging_freq': 0,
    'seed': 2020,
    'nthread': 8,
    'verbose': -1,
}

cv_scores = []
# Running average of the per-fold test predictions. Each fold ADDS its share
# (prediction / n_splits); assigning instead of adding would keep only the
# last fold's scaled-down prediction.
test_pred = np.zeros(x_test.shape[0])
for i, (train_index, valid_index) in enumerate(kf.split(x_train, y_train)):
    print('************************************ {} ************************************'.format(str(i+1)))
    # kf.split yields positional indices, so select positionally with .iloc
    # for both the features and the labels.
    X_train_split, X_val = x_train.iloc[train_index], x_train.iloc[valid_index]
    y_train_split, y_val = y_train.iloc[train_index], y_train.iloc[valid_index]
    train_matrix = lgb.Dataset(X_train_split, label=y_train_split)
    valid_matrix = lgb.Dataset(X_val, label=y_val)
    # NOTE: verbose_eval / early_stopping_rounds are the keyword arguments of
    # the LightGBM version this notebook was written for; LightGBM >= 4.0
    # expects the lgb.log_evaluation / lgb.early_stopping callbacks instead.
    model = lgb.train(
        params,
        train_set=train_matrix,
        num_boost_round=20000,
        valid_sets=valid_matrix,
        verbose_eval=1000,
        early_stopping_rounds=200,
    )
    val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    fold_test_pred = model.predict(x_test, num_iteration=model.best_iteration)
    test_pred += fold_test_pred / kf.n_splits
    cv_scores.append(roc_auc_score(y_val, val_pred))
    print(cv_scores)

# `test` was historically rebound to the prediction array by this cell; keep
# that name available for any later code that reads it.
test = test_pred
print("lgb_score_list:{}".format(cv_scores))
print("lgb_score_mean:{}".format(np.mean(cv_scores)))
print("lgb_score_std:{}".format(np.std(cv_scores)))
************************************ 1 ************************************
D:\Users\Administrator\Anaconda3\lib\site-packages\lightgbm\basic.py:1077: UserWarning: silent keyword has been found in `params` and will be ignored.
Please use silent argument of the Dataset constructor to pass this parameter.
.format(key))
[LightGBM] [Warning] Unknown parameter: silent
D:\Users\Administrator\Anaconda3\lib\site-packages\lightgbm\basic.py:1077: UserWarning: silent keyword has been found in `params` and will be ignored.
Please use silent argument of the Dataset constructor to pass this parameter.
.format(key))
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[567] valid_0's auc: 0.729987
[0.7299873542198716]
************************************ 2 ************************************
D:\Users\Administrator\Anaconda3\lib\site-packages\lightgbm\basic.py:1077: UserWarning: silent keyword has been found in `params` and will be ignored.
Please use silent argument of the Dataset constructor to pass this parameter.
.format(key))
[LightGBM] [Warning] Unknown parameter: silent
D:\Users\Administrator\Anaconda3\lib\site-packages\lightgbm\basic.py:1077: UserWarning: silent keyword has been found in `params` and will be ignored.
Please use silent argument of the Dataset constructor to pass this parameter.
.format(key))
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[650] valid_0's auc: 0.726013
[0.7299873542198716, 0.726012918953538]
************************************ 3 ************************************
D:\Users\Administrator\Anaconda3\lib\site-packages\lightgbm\basic.py:1077: UserWarning: silent keyword has been found in `params` and will be ignored.
Please use silent argument of the Dataset constructor to pass this parameter.
.format(key))
[LightGBM] [Warning] Unknown parameter: silent
D:\Users\Administrator\Anaconda3\lib\site-packages\lightgbm\basic.py:1077: UserWarning: silent keyword has been found in `params` and will be ignored.
Please use silent argument of the Dataset constructor to pass this parameter.
.format(key))
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[468] valid_0's auc: 0.731096
[0.7299873542198716, 0.726012918953538, 0.7310964920946825]
************************************ 4 ************************************
D:\Users\Administrator\Anaconda3\lib\site-packages\lightgbm\basic.py:1077: UserWarning: silent keyword has been found in `params` and will be ignored.
Please use silent argument of the Dataset constructor to pass this parameter.
.format(key))
[LightGBM] [Warning] Unknown parameter: silent
D:\Users\Administrator\Anaconda3\lib\site-packages\lightgbm\basic.py:1077: UserWarning: silent keyword has been found in `params` and will be ignored.
Please use silent argument of the Dataset constructor to pass this parameter.
.format(key))
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[692] valid_0's auc: 0.729743
[0.7299873542198716, 0.726012918953538, 0.7310964920946825, 0.7297434016732525]
************************************ 5 ************************************
D:\Users\Administrator\Anaconda3\lib\site-packages\lightgbm\basic.py:1077: UserWarning: silent keyword has been found in `params` and will be ignored.
Please use silent argument of the Dataset constructor to pass this parameter.
.format(key))
[LightGBM] [Warning] Unknown parameter: silent
D:\Users\Administrator\Anaconda3\lib\site-packages\lightgbm\basic.py:1077: UserWarning: silent keyword has been found in `params` and will be ignored.
Please use silent argument of the Dataset constructor to pass this parameter.
.format(key))
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[743] valid_0's auc: 0.728744
[0.7299873542198716, 0.726012918953538, 0.7310964920946825, 0.7297434016732525, 0.7287440305086693]
lgb_scotrainre_list:[0.7299873542198716, 0.726012918953538, 0.7310964920946825, 0.7297434016732525, 0.7287440305086693]
lgb_score_mean:0.7291168394900027
lgb_score_std:0.0017229457178253177
# Attach the test-set predictions held in `test_pred` to the test frame and
# write the two submission columns to CSV without the index.
submission_columns = ['id','isDefault']
test_A['isDefault'] = test_pred
submission = test_A[submission_columns]
submission.to_csv('./dataset/test_sub.csv', index=False)