DataWhale Cohort 17 - Task 4

Learning Objectives

  1. Learn the machine learning models commonly used in financial risk control
  2. Learn the modeling process and hyperparameter-tuning workflow for these models
import pandas as pd
import numpy as np
import warnings 
import os
import seaborn as sns
import matplotlib.pyplot as plt
# Apply the seaborn theme
sns.set()

# Set the seaborn plot style: darkgrid (default), whitegrid, dark, white, or ticks
sns.set_style("whitegrid")
# Four preset contexts, from smallest to largest: paper, notebook (default), talk, poster
sns.set_context("talk")

# Use a Chinese font
plt.rcParams['font.sans-serif'] = ['SimHei']
# Keep the minus sign "-" from rendering as a box in saved figures
plt.rcParams['axes.unicode_minus'] = False
# Make seaborn render Chinese text correctly and adjust the font size
sns.set(font = "SimHei")
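One caveat: sns.set(font=...) re-applies seaborn's default theme, silently overwriting the whitegrid style and talk context chosen above. A minimal reordering that keeps all three settings (a sketch with the same intent as the cell above):

sns.set(font="SimHei")          # set the font first (this resets theme defaults)
sns.set_style("whitegrid")      # then re-apply the style...
sns.set_context("talk")         # ...and the context
plt.rcParams['axes.unicode_minus'] = False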

How to use memory_usage

# Load data and downcast column dtypes to reduce the DataFrame's memory footprint
def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min  > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                # note: float16 keeps only ~3 significant decimal digits (see the precision note below)
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')
            
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df
# Load the training data
data = pd.read_csv("./dataset/train.csv")
data = reduce_mem_usage(data)
Memory usage of dataframe is 286.87 MB
Memory usage after optimization is: 69.46 MB
Decreased by 75.8%
# Load the test data
test_A = pd.read_csv("./dataset/testA.csv")
test_A = reduce_mem_usage(test_A)
Memory usage of dataframe is 73.24 MB
Memory usage after optimization is: 17.96 MB
Decreased by 75.5%
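One caveat with this aggressive downcasting: float16 keeps only about three significant decimal digits, so downcast columns get visibly rounded. The interestRate values in the preview below (e.g. 19.515625, 12.992188) show exactly this. A quick check of the rounding (a sketch):

import numpy as np
print(float(np.float16(19.52)))   # -> 19.515625 (float16 spacing near 19.52 is 2**-6)
print(float(np.float16(12.99)))   # -> 12.9921875

If full precision matters for a column, it can simply be excluded from the downcast.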
train = data  # alias for the training DataFrame (not a copy)
train.head()
id loanAmnt term interestRate installment grade subGrade employmentTitle employmentLength homeOwnership ... n5 n6 n7 n8 n9 n10 n11 n12 n13 n14
0 0 35008.0 5 19.515625 918.0000 E E2 320.0 2 years 2 ... 9.0 8.0 4.0 12.0 2.0 7.0 0.0 0.0 0.0 2.0
1 1 18000.0 5 18.484375 462.0000 D D2 219843.0 5 years 0 ... NaN NaN NaN NaN NaN 13.0 NaN NaN NaN NaN
2 2 12000.0 5 16.984375 298.2500 D D3 31698.0 8 years 0 ... 0.0 21.0 4.0 5.0 3.0 11.0 0.0 0.0 0.0 4.0
3 3 11000.0 3 7.261719 341.0000 A A4 46854.0 10+ years 1 ... 16.0 4.0 7.0 21.0 6.0 9.0 0.0 0.0 0.0 1.0
4 4 3000.0 3 12.992188 101.0625 C C2 54.0 NaN 1 ... 4.0 9.0 10.0 15.0 7.0 12.0 0.0 0.0 0.0 4.0

5 rows × 47 columns

Data Preprocessing

# Concatenate train and test so preprocessing is applied to both at once;
# sort=True keeps the current (alphabetically sorted) column order and silences the pandas FutureWarning
data = pd.concat([train, test_A], axis=0, ignore_index=True, sort=True)
print(sorted(data['grade'].unique()))
print(sorted(data['subGrade'].unique()))
['A', 'B', 'C', 'D', 'E', 'F', 'G']
['A1', 'A2', 'A3', 'A4', 'A5', 'B1', 'B2', 'B3', 'B4', 'B5', 'C1', 'C2', 'C3', 'C4', 'C5', 'D1', 'D2', 'D3', 'D4', 'D5', 'E1', 'E2', 'E3', 'E4', 'E5', 'F1', 'F2', 'F3', 'F4', 'F5', 'G1', 'G2', 'G3', 'G4', 'G5']
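grade and subGrade are ordinal (A1 is the best rating, G5 the worst), so besides the one-hot encoding applied later, a rank-preserving numeric encoding is a common alternative (a sketch on toy values; grade_map is a hypothetical helper, not used in this notebook):

import pandas as pd
grade_map = {g: i for i, g in enumerate('ABCDEFG')}   # A=0 ... G=6
print(pd.Series(['A', 'C', 'G']).map(grade_map).tolist())   # [0, 2, 6]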
data['employmentLength'].value_counts(dropna=False).sort_index()
NaN           58541
1 year        65671
10+ years    328525
2 years       90565
3 years       80163
4 years       59818
5 years       62645
6 years       46582
7 years       44230
8 years       45168
9 years       37866
< 1 year      80226
Name: employmentLength, dtype: int64
# First, convert employmentLength to a numeric value
data['employmentLength'].replace(to_replace='10+ years', value='10 years', inplace=True)
data['employmentLength'].replace('< 1 year', '0 years', inplace=True)

def employmentLength_to_int(s):
    if pd.isnull(s):
        return s
    else:
        return np.int8(s.split()[0])
    
data['employmentLength'] = data['employmentLength'].apply(employmentLength_to_int)
data['employmentLength'].value_counts(dropna=False).sort_index()
 0.0      80226
 1.0      65671
 2.0      90565
 3.0      80163
 4.0      59818
 5.0      62645
 6.0      46582
 7.0      44230
 8.0      45168
 9.0      37866
 10.0    328525
NaN       58541
Name: employmentLength, dtype: int64
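For reference, the same conversion can be done without a row-wise apply: after the two replace() calls every non-null value has the form "<n> years", so the leading token can be extracted vectorized (a sketch on toy values; a float dtype preserves NaN):

import pandas as pd
s = pd.Series(['10 years', '0 years', None, '5 years'])
print(s.str.split(' ').str[0].astype('float').tolist())   # [10.0, 0.0, nan, 5.0]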
# Preprocess earliesCreditLine
data['earliesCreditLine'].sample(5)
930740    Dec-2005
314803    Jul-1994
155303    May-2008
480893    Aug-2000
712505    Dec-2000
Name: earliesCreditLine, dtype: object
data['earliesCreditLine'] = data['earliesCreditLine'].apply(lambda s: int(s[-4:]))
data['earliesCreditLine'].describe()
count    1000000.000000
mean        1998.688632
std            7.606231
min         1944.000000
25%         1995.000000
50%         2000.000000
75%         2004.000000
max         2015.000000
Name: earliesCreditLine, dtype: float64
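The string slice above works because every value ends in a four-digit year. A stricter alternative (a sketch on toy values) parses the whole 'Mon-YYYY' string and fails loudly on anything malformed:

import pandas as pd
raw = pd.Series(['Dec-2005', 'Jul-1994', 'May-2008'])
print(pd.to_datetime(raw, format='%b-%Y').dt.year.tolist())   # [2005, 1994, 2008]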
data.head()
annualIncome applicationType delinquency_2years dti earliesCreditLine employmentLength employmentTitle ficoRangeHigh ficoRangeLow grade ... pubRecBankruptcies purpose regionCode revolBal revolUtil subGrade term title totalAcc verificationStatus
0 110000.0 0 0.0 17.046875 2001 2.0 320.0 734.0 730.0 E ... 0.0 1 32 24178.0 48.90625 E2 5 1.0 27.0 2
1 46000.0 0 0.0 27.828125 2002 5.0 219843.0 704.0 700.0 D ... 0.0 0 18 15096.0 38.90625 D2 5 1723.0 18.0 2
2 74000.0 0 0.0 22.765625 2006 8.0 31698.0 679.0 675.0 D ... 0.0 0 14 4606.0 51.81250 D3 5 0.0 27.0 2
3 118000.0 0 0.0 17.203125 1999 10.0 46854.0 689.0 685.0 A ... 0.0 4 11 9948.0 52.59375 A4 3 4.0 28.0 1
4 29000.0 0 0.0 32.156250 1977 NaN 54.0 694.0 690.0 C ... 0.0 10 21 2942.0 32.00000 C2 3 11.0 27.0 2

5 rows × 49 columns

# Categorical feature handling
# A subset of the categorical features
cate_features = ['grade', 'subGrade', 'employmentTitle', 'homeOwnership', 'verificationStatus', 'purpose', 'postCode', 'regionCode', \
                 'applicationType', 'initialListStatus', 'title', 'policyCode']
for f in cate_features:
    print(f, 'unique values:', data[f].nunique())
grade unique values: 7
subGrade unique values: 35
employmentTitle unique values: 298101
homeOwnership unique values: 6
verificationStatus unique values: 3
purpose unique values: 14
postCode unique values: 935
regionCode unique values: 51
applicationType unique values: 2
initialListStatus unique values: 2
title unique values: 6712
policyCode unique values: 1
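policyCode has a single unique value, so it carries no signal; the notebook keeps it (a constant column is harmless to a tree model), but a quick audit for such columns is cheap (a sketch; on this data it should find only policyCode):

const_cols = [c for c in data.columns if data[c].nunique() <= 1]
print(const_cols)   # dropping these via data.drop(columns=const_cols) is optional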
# More than two categories, but not high-cardinality: one-hot encode
data = pd.get_dummies(data, columns=['grade', 'subGrade', 'homeOwnership', 'verificationStatus', 'purpose', 'regionCode'], drop_first=True)
# High-cardinality categorical features need a different encoding (here: frequency count and within-group rank), as illustrated after this cell
for f in ['employmentTitle', 'postCode', 'title']:
    data[f+'_cnts'] = data.groupby([f])['id'].transform('count')
    data[f+'_rank'] = data.groupby([f])['id'].rank(ascending=False).astype(int)
    del data[f]
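To see what this count/rank encoding produces, here is a toy illustration (hypothetical data; the same two groupby calls as above):

import pandas as pd
toy = pd.DataFrame({'id': [0, 1, 2, 3], 'title': ['a', 'a', 'b', 'a']})
toy['title_cnts'] = toy.groupby('title')['id'].transform('count')
toy['title_rank'] = toy.groupby('title')['id'].rank(ascending=False).astype(int)
print(toy)
#    id title  title_cnts  title_rank
# 0   0     a           3           3
# 1   1     a           3           2
# 2   2     b           1           1
# 3   3     a           3           1

title_cnts records how common each category is, and title_rank orders the rows within a category, so the original high-cardinality column can be dropped without losing all of its information.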

Preparing the Training and Test Data

features = [f for f in data.columns if f not in ['id','issueDate','isDefault']]

train = data[data.isDefault.notnull()].reset_index(drop=True)
test = data[data.isDefault.isnull()].reset_index(drop=True)

x_train = train[features]
x_test = test[features]

y_train = train['isDefault']
# 5-fold cross-validation
from sklearn.model_selection import KFold
folds = 5
seed = 2020
kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
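Note that KFold ignores the labels passed to split(). Since isDefault is an imbalanced binary target, a stratified splitter is a common alternative that keeps the default rate consistent across folds (a sketch; not what the original run used):

from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
# usage: for train_index, valid_index in skf.split(x_train, y_train): ...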

Modeling with LightGBM

"""对训练集数据进行划分,分成训练集和验证集,并进行相应的操作"""
from sklearn.model_selection import train_test_split
import lightgbm as lgb
# 数据集划分
X_train_split, X_val, y_train_split, y_val = train_test_split(x_train, y_train, test_size=0.2)
train_matrix = lgb.Dataset(X_train_split, label=y_train_split)
valid_matrix = lgb.Dataset(X_val, label=y_val)

params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'learning_rate': 0.1,
    'metric': 'auc',
    'min_child_weight': 1e-3,
    'num_leaves': 31,
    'max_depth': -1,
    'reg_lambda': 0,
    'reg_alpha': 0,
    'feature_fraction': 1,
    'bagging_fraction': 1,
    'bagging_freq': 0,
    'seed': 2020,
    'nthread': 8,
    'verbose': -1,
}
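These values are essentially LightGBM's defaults. Before tuning by hand, LightGBM's built-in cross-validation helper gives a quick read on how many boosting rounds the current settings support (a sketch, not run in the original notebook; it assumes the same older LightGBM API as the lgb.train call below, where early_stopping_rounds and verbose_eval are keyword arguments):

cv_results = lgb.cv(params, train_matrix, num_boost_round=20000,
                    nfold=5, stratified=True, shuffle=True, seed=2020,
                    early_stopping_rounds=200, verbose_eval=1000)
print('best rounds:', len(cv_results['auc-mean']), 'CV AUC:', cv_results['auc-mean'][-1])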
"""使用训练集数据进行模型训练"""
model = lgb.train(params, train_set=train_matrix, valid_sets=valid_matrix, num_boost_round=20000, verbose_eval=1000, early_stopping_rounds=200)
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[702]	valid_0's auc: 0.728411
from sklearn import metrics
from sklearn.metrics import roc_auc_score

"""预测并计算roc的相关指标"""
val_pre_lgb = model.predict(X_val, num_iteration=model.best_iteration)
fpr, tpr, threshold = metrics.roc_curve(y_val, val_pre_lgb)
roc_auc = metrics.auc(fpr, tpr)
print('AUC of the untuned LightGBM model on the validation set: {}'.format(roc_auc))
"""Plot the ROC curve"""
plt.figure(figsize=(8, 8))
plt.title('Validation ROC')
plt.plot(fpr, tpr, 'b', label = 'Val AUC = %0.4f' % roc_auc)
plt.ylim(0,1)
plt.xlim(0,1)
plt.legend(loc='best')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
# Plot the diagonal (random-classifier baseline)
plt.plot([0,1],[0,1],'r--')
plt.show()
AUC of the untuned LightGBM model on the validation set: 0.7284105986326812

(Figure: ROC curve on the validation set, Val AUC = 0.7284)

# Going further: evaluate model performance with 5-fold cross-validation
import lightgbm as lgb
"""Build and evaluate a LightGBM model with 5-fold cross-validation"""
cv_scores = []
test_preds = np.zeros(x_test.shape[0])   # fold-averaged test predictions; named so it does not shadow the test DataFrame
for i, (train_index, valid_index) in enumerate(kf.split(x_train, y_train)):
    print('************************************ {} ************************************'.format(str(i+1)))
    X_train_split, y_train_split, X_val, y_val = x_train.iloc[train_index], y_train[train_index], x_train.iloc[valid_index], y_train[valid_index]

    train_matrix = lgb.Dataset(X_train_split, label=y_train_split)
    valid_matrix = lgb.Dataset(X_val, label=y_val)

    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'learning_rate': 0.1,
        'metric': 'auc',
        'min_child_weight': 1e-3,
        'num_leaves': 31,
        'max_depth': -1,
        'reg_lambda': 0,
        'reg_alpha': 0,
        'feature_fraction': 1,
        'bagging_fraction': 1,
        'bagging_freq': 0,
        'seed': 2020,
        'nthread': 8,
        'verbose': -1,
    }

    model = lgb.train(params, train_set=train_matrix, num_boost_round=20000, valid_sets=valid_matrix, verbose_eval=1000, early_stopping_rounds=200)
    val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    test_pred = model.predict(x_test, num_iteration=model.best_iteration)
    test_preds += test_pred / kf.n_splits   # accumulate; after the loop this holds the average over folds

    cv_scores.append(roc_auc_score(y_val, val_pred))
    print(cv_scores)

print("lgb_scotrainre_list:{}".format(cv_scores))
print("lgb_score_mean:{}".format(np.mean(cv_scores)))
print("lgb_score_std:{}".format(np.std(cv_scores)))
************************************ 1 ************************************
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[567]	valid_0's auc: 0.729987
[0.7299873542198716]
************************************ 2 ************************************
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[650]	valid_0's auc: 0.726013
[0.7299873542198716, 0.726012918953538]
************************************ 3 ************************************
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[468]	valid_0's auc: 0.731096
[0.7299873542198716, 0.726012918953538, 0.7310964920946825]
************************************ 4 ************************************
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[692]	valid_0's auc: 0.729743
[0.7299873542198716, 0.726012918953538, 0.7310964920946825, 0.7297434016732525]
************************************ 5 ************************************
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[743]	valid_0's auc: 0.728744
[0.7299873542198716, 0.726012918953538, 0.7310964920946825, 0.7297434016732525, 0.7287440305086693]
lgb_score_list:[0.7299873542198716, 0.726012918953538, 0.7310964920946825, 0.7297434016732525, 0.7287440305086693]
lgb_score_mean:0.7291168394900027
lgb_score_std:0.0017229457178253177
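The five fold AUCs are summarized above by their arithmetic mean. Another common summary is the AUC over the pooled out-of-fold predictions. A minimal sketch (not computed in the original run; for brevity it retrains each fold with a fixed 500 rounds instead of early stopping):

oof = np.zeros(x_train.shape[0])
for train_index, valid_index in kf.split(x_train, y_train):
    fold_model = lgb.train(params, lgb.Dataset(x_train.iloc[train_index], label=y_train[train_index]),
                           num_boost_round=500)
    oof[valid_index] = fold_model.predict(x_train.iloc[valid_index])
print('OOF AUC:', roc_auc_score(y_train, oof))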
test_A['isDefault'] = test_preds   # use the fold-averaged predictions from the CV loop
test_A[['id','isDefault']].to_csv('./dataset/test_sub.csv', index=False)


Reposted from blog.csdn.net/qq_35268841/article/details/108785442