DataWhale17期-task4

原始链接

学习目标

学习在金融分控领域常用的机器学习模型
学习机器学习模型的建模过程与调参流程

import pandas as pd
import numpy as np
import warnings 
import os
import seaborn as sns
import matplotlib.pyplot as plt

#申明使用seaborn样式
sns.set()

#设置seaborn绘图风格，分别是darkgrid（默认）、whitegrid、dark、white、ticks。

sns.set_style("whitegrid")
#四个预置环境，从小到达分别是paper、notebook（默认）、dtalk、poster
sns.set_context("talk")

#设置中文字体
plt.rcParams['font.sans-serif'] = ['SimHei']
#解决保存图像是负号“-”显示为方块的问题
plt.rcParams['axes.unicode_minus'] = False
#解决Seaborn中文显示问题，并调整字体大小
sns.set(font = "SimHei")

memory_usage使用方法

#数据读取，调整数据类型，减少数据在内存中占用的空间
def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum()
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min  > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')
            
    end_mem = df.memory_usage().sum() 
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

#数据读取
data = pd.read_csv("./dataset/train.csv")
data = reduce_mem_usage(data)

Memory usage of dataframe is 300800080.00 MB
Memory usage after optimization is: 72834896.00 MB
Decreased by 75.8%

#数据读取
test_A = pd.read_csv("./dataset/testA.csv")
test_A = reduce_mem_usage(test_A)

Memory usage of dataframe is 76800080.00 MB
Memory usage after optimization is: 18834472.00 MB
Decreased by 75.5%

train = data

train.head()

	id	loanAmnt	term	interestRate	installment	grade	subGrade	employmentTitle	employmentLength	homeOwnership	...	n5	n6	n7	n8	n9	n10	n11	n12	n13	n14
0	0	35008.0	5	19.515625	918.0000	E	E2	320.0	2 years	2	...	9.0	8.0	4.0	12.0	2.0	7.0	0.0	0.0	0.0	2.0
1	1	18000.0	5	18.484375	462.0000	D	D2	219843.0	5 years	0	...	NaN	NaN	NaN	NaN	NaN	13.0	NaN	NaN	NaN	NaN
2	2	12000.0	5	16.984375	298.2500	D	D3	31698.0	8 years	0	...	0.0	21.0	4.0	5.0	3.0	11.0	0.0	0.0	0.0	4.0
3	3	11000.0	3	7.261719	341.0000	A	A4	46854.0	10+ years	1	...	16.0	4.0	7.0	21.0	6.0	9.0	0.0	0.0	0.0	1.0
4	4	3000.0	3	12.992188	101.0625	C	C2	54.0	NaN	1	...	4.0	9.0	10.0	15.0	7.0	12.0	0.0	0.0	0.0	4.0

5 rows × 47 columns

数据预处理

data = pd.concat([train, test_A], axis=0, ignore_index=True)

D:\Users\Administrator\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  """Entry point for launching an IPython kernel.

print(sorted(data['grade'].unique()))
print(sorted(data['subGrade'].unique()))

['A', 'B', 'C', 'D', 'E', 'F', 'G']
['A1', 'A2', 'A3', 'A4', 'A5', 'B1', 'B2', 'B3', 'B4', 'B5', 'C1', 'C2', 'C3', 'C4', 'C5', 'D1', 'D2', 'D3', 'D4', 'D5', 'E1', 'E2', 'E3', 'E4', 'E5', 'F1', 'F2', 'F3', 'F4', 'F5', 'G1', 'G2', 'G3', 'G4', 'G5']

data['employmentLength'].value_counts(dropna=False).sort_index()

NaN           58541
1 year        65671
10+ years    328525
2 years       90565
3 years       80163
4 years       59818
5 years       62645
6 years       46582
7 years       44230
8 years       45168
9 years       37866
< 1 year      80226
Name: employmentLength, dtype: int64

#首先对employmentLength进行转换到数值
data['employmentLength'].replace(to_replace='10+ years', value='10 years', inplace=True)
data['employmentLength'].replace('< 1 year', '0 years', inplace=True)

def employmentLength_to_int(s):
    if pd.isnull(s):
        return s
    else:
        return np.int8(s.split()[0])
    
data['employmentLength'] = data['employmentLength'].apply(employmentLength_to_int)

data['employmentLength'].value_counts(dropna=False).sort_index()

 0.0      80226
 1.0      65671
 2.0      90565
 3.0      80163
 4.0      59818
 5.0      62645
 6.0      46582
 7.0      44230
 8.0      45168
 9.0      37866
 10.0    328525
NaN       58541
Name: employmentLength, dtype: int64

#对earliesCreditLine进行预处理
data['earliesCreditLine'].sample(5)

930740    Dec-2005
314803    Jul-1994
155303    May-2008
480893    Aug-2000
712505    Dec-2000
Name: earliesCreditLine, dtype: object

data['earliesCreditLine'] = data['earliesCreditLine'].apply(lambda s: int(s[-4:]))
data['earliesCreditLine'].describe()

count    1000000.000000
mean        1998.688632
std            7.606231
min         1944.000000
25%         1995.000000
50%         2000.000000
75%         2004.000000
max         2015.000000
Name: earliesCreditLine, dtype: float64

data.head()

	annualIncome	dti	earliesCreditLine	employmentLength	employmentTitle	ficoRangeHigh	ficoRangeLow	grade	...	purpose	regionCode	revolBal	revolUtil	subGrade	term	title	totalAcc	verificationStatus
0	110000.0	17.046875	2001	2.0	320.0	734.0	730.0	E	...	1	32	24178.0	48.90625	E2	5	1.0	27.0	2
1	46000.0	27.828125	2002	5.0	219843.0	704.0	700.0	D	...	0	18	15096.0	38.90625	D2	5	1723.0	18.0	2
2	74000.0	22.765625	2006	8.0	31698.0	679.0	675.0	D	...	0	14	4606.0	51.81250	D3	5	0.0	27.0	2
3	118000.0	17.203125	1999	10.0	46854.0	689.0	685.0	A	...	4	11	9948.0	52.59375	A4	3	4.0	28.0	1
4	29000.0	32.156250	1977	NaN	54.0	694.0	690.0	C	...	10	21	2942.0	32.00000	C2	3	11.0	27.0	2

5 rows × 49 columns

#类别特征处理
# 部分类别特征
cate_features = ['grade', 'subGrade', 'employmentTitle', 'homeOwnership', 'verificationStatus', 'purpose', 'postCode', 'regionCode', \
                 'applicationType', 'initialListStatus', 'title', 'policyCode']
for f in cate_features:
    print(f, '类型数：', data[f].nunique())

grade 类型数： 7
subGrade 类型数： 35
employmentTitle 类型数： 298101
homeOwnership 类型数： 6
verificationStatus 类型数： 3
purpose 类型数： 14
postCode 类型数： 935
regionCode 类型数： 51
applicationType 类型数： 2
initialListStatus 类型数： 2
title 类型数： 6712
policyCode 类型数： 1

# 类型数在2之上，又不是高维稀疏的
data = pd.get_dummies(data, columns=['grade', 'subGrade', 'homeOwnership', 'verificationStatus', 'purpose', 'regionCode'], drop_first=True)

# 高维类别特征需要进行转换
for f in ['employmentTitle', 'postCode', 'title']:
    data[f+'_cnts'] = data.groupby([f])['id'].transform('count')
    data[f+'_rank'] = data.groupby([f])['id'].rank(ascending=False).astype(int)
    del data[f]

训练数据/测试数据准备

features = [f for f in data.columns if f not in ['id','issueDate','isDefault']]

train = data[data.isDefault.notnull()].reset_index(drop=True)
test = data[data.isDefault.isnull()].reset_index(drop=True)

x_train = train[features]
x_test = test[features]

y_train = train['isDefault']

# 5折交叉验证
from sklearn.model_selection import KFold
folds = 5
seed = 2020
kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

使用Lightgbm进行建模

"""对训练集数据进行划分，分成训练集和验证集，并进行相应的操作"""
from sklearn.model_selection import train_test_split
import lightgbm as lgb
# 数据集划分
X_train_split, X_val, y_train_split, y_val = train_test_split(x_train, y_train, test_size=0.2)
train_matrix = lgb.Dataset(X_train_split, label=y_train_split)
valid_matrix = lgb.Dataset(X_val, label=y_val)

params = {
    
    
            'boosting_type': 'gbdt',
            'objective': 'binary',
            'learning_rate': 0.1,
            'metric': 'auc',
            'min_child_weight': 1e-3,
            'num_leaves': 31,
            'max_depth': -1,
            'reg_lambda': 0,
            'reg_alpha': 0,
            'feature_fraction': 1,
            'bagging_fraction': 1,
            'bagging_freq': 0,
            'seed': 2020,
            'nthread': 8,
            'silent': True,
            'verbose': -1,
}
"""使用训练集数据进行模型训练"""
model = lgb.train(params, train_set=train_matrix, valid_sets=valid_matrix, num_boost_round=20000, verbose_eval=1000, early_stopping_rounds=200)

D:\Users\Administrator\Anaconda3\lib\site-packages\lightgbm\basic.py:1077: UserWarning: silent keyword has been found in `params` and will be ignored.
Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


[LightGBM] [Warning] Unknown parameter: silent


D:\Users\Administrator\Anaconda3\lib\site-packages\lightgbm\basic.py:1077: UserWarning: silent keyword has been found in `params` and will be ignored.
Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[702]	valid_0's auc: 0.728411

from sklearn import metrics
from sklearn.metrics import roc_auc_score

"""预测并计算roc的相关指标"""
val_pre_lgb = model.predict(X_val, num_iteration=model.best_iteration)
fpr, tpr, threshold = metrics.roc_curve(y_val, val_pre_lgb)
roc_auc = metrics.auc(fpr, tpr)
print('未调参前lightgbm单模型在验证集上的AUC：{}'.format(roc_auc))
"""画出roc曲线图"""
plt.figure(figsize=(8, 8))
plt.title('Validation ROC')
plt.plot(fpr, tpr, 'b', label = 'Val AUC = %0.4f' % roc_auc)
plt.ylim(0,1)
plt.xlim(0,1)
plt.legend(loc='best')
plt.title('ROC')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
# 画出对角线
plt.plot([0,1],[0,1],'r--')
plt.show()

未调参前lightgbm单模型在验证集上的AUC：0.7284105986326812

在这里插入图片描述

#更进一步的，使用5折交叉验证进行模型性能评估
import lightgbm as lgb
"""使用lightgbm 5折交叉验证进行建模预测"""
cv_scores = []
test = np.zeros(x_test.shape[0])
for i, (train_index, valid_index) in enumerate(kf.split(x_train, y_train)):
    print('************************************ {} ************************************'.format(str(i+1)))
    X_train_split, y_train_split, X_val, y_val = x_train.iloc[train_index], y_train[train_index], x_train.iloc[valid_index], y_train[valid_index]

    train_matrix = lgb.Dataset(X_train_split, label=y_train_split)
    valid_matrix = lgb.Dataset(X_val, label=y_val)

    params = {
    
    
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'learning_rate': 0.1,
                'metric': 'auc',

                'min_child_weight': 1e-3,
                'num_leaves': 31,
                'max_depth': -1,
                'reg_lambda': 0,
                'reg_alpha': 0,
                'feature_fraction': 1,
                'bagging_fraction': 1,
                'bagging_freq': 0,
                'seed': 2020,
                'nthread': 8,
                'silent': True,
                'verbose': -1,
    }

    model = lgb.train(params, train_set=train_matrix, num_boost_round=20000, valid_sets=valid_matrix, verbose_eval=1000, early_stopping_rounds=200)
    val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    test_pred = model.predict(x_test, num_iteration=model.best_iteration)
    test = test_pred / kf.n_splits

    cv_scores.append(roc_auc_score(y_val, val_pred))
    print(cv_scores)

print("lgb_scotrainre_list:{}".format(cv_scores))
print("lgb_score_mean:{}".format(np.mean(cv_scores)))
print("lgb_score_std:{}".format(np.std(cv_scores)))

************************************ 1 ************************************


D:\Users\Administrator\Anaconda3\lib\site-packages\lightgbm\basic.py:1077: UserWarning: silent keyword has been found in `params` and will be ignored.
Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


[LightGBM] [Warning] Unknown parameter: silent


D:\Users\Administrator\Anaconda3\lib\site-packages\lightgbm\basic.py:1077: UserWarning: silent keyword has been found in `params` and will be ignored.
Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[567]	valid_0's auc: 0.729987
[0.7299873542198716]
************************************ 2 ************************************


D:\Users\Administrator\Anaconda3\lib\site-packages\lightgbm\basic.py:1077: UserWarning: silent keyword has been found in `params` and will be ignored.
Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


[LightGBM] [Warning] Unknown parameter: silent


D:\Users\Administrator\Anaconda3\lib\site-packages\lightgbm\basic.py:1077: UserWarning: silent keyword has been found in `params` and will be ignored.
Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[650]	valid_0's auc: 0.726013
[0.7299873542198716, 0.726012918953538]
************************************ 3 ************************************


D:\Users\Administrator\Anaconda3\lib\site-packages\lightgbm\basic.py:1077: UserWarning: silent keyword has been found in `params` and will be ignored.
Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


[LightGBM] [Warning] Unknown parameter: silent


D:\Users\Administrator\Anaconda3\lib\site-packages\lightgbm\basic.py:1077: UserWarning: silent keyword has been found in `params` and will be ignored.
Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[468]	valid_0's auc: 0.731096
[0.7299873542198716, 0.726012918953538, 0.7310964920946825]
************************************ 4 ************************************


D:\Users\Administrator\Anaconda3\lib\site-packages\lightgbm\basic.py:1077: UserWarning: silent keyword has been found in `params` and will be ignored.
Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


[LightGBM] [Warning] Unknown parameter: silent


D:\Users\Administrator\Anaconda3\lib\site-packages\lightgbm\basic.py:1077: UserWarning: silent keyword has been found in `params` and will be ignored.
Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[692]	valid_0's auc: 0.729743
[0.7299873542198716, 0.726012918953538, 0.7310964920946825, 0.7297434016732525]
************************************ 5 ************************************


D:\Users\Administrator\Anaconda3\lib\site-packages\lightgbm\basic.py:1077: UserWarning: silent keyword has been found in `params` and will be ignored.
Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


[LightGBM] [Warning] Unknown parameter: silent


D:\Users\Administrator\Anaconda3\lib\site-packages\lightgbm\basic.py:1077: UserWarning: silent keyword has been found in `params` and will be ignored.
Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[743]	valid_0's auc: 0.728744
[0.7299873542198716, 0.726012918953538, 0.7310964920946825, 0.7297434016732525, 0.7287440305086693]
lgb_scotrainre_list:[0.7299873542198716, 0.726012918953538, 0.7310964920946825, 0.7297434016732525, 0.7287440305086693]
lgb_score_mean:0.7291168394900027
lgb_score_std:0.0017229457178253177

test_A['isDefault'] = test_pred

test_A[['id','isDefault']].to_csv('./dataset/test_sub.csv', index=False)

原始链接

学习目标

memory_usage使用方法

数据预处理

训练数据/测试数据准备

使用Lightgbm进行建模

猜你喜欢