数据清洗+特征构造:application_{train|test}.csv

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/u014281392/article/details/81190197

主表:客户详细信息

application_trian/test.csv


import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
import matplotlib.pyplot as plt

Preprocessing application_train.csv

application_train = pd.read_csv('data/application_train.csv')
application_train.head()
SK_ID_CURR TARGET NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT AMT_ANNUITY FLAG_DOCUMENT_18 FLAG_DOCUMENT_19 FLAG_DOCUMENT_20 FLAG_DOCUMENT_21 AMT_REQ_CREDIT_BUREAU_HOUR AMT_REQ_CREDIT_BUREAU_DAY AMT_REQ_CREDIT_BUREAU_WEEK AMT_REQ_CREDIT_BUREAU_MON AMT_REQ_CREDIT_BUREAU_QRT AMT_REQ_CREDIT_BUREAU_YEAR
0 100002 1 Cash loans M N Y 0 202500.0 406597.5 24700.5 0 0 0 0 0.0 0.0 0.0 0.0 0.0 1.0
1 100003 0 Cash loans F N N 0 270000.0 1293502.5 35698.5 0 0 0 0 0.0 0.0 0.0 0.0 0.0 0.0
2 100004 0 Revolving loans M Y Y 0 67500.0 135000.0 6750.0 0 0 0 0 0.0 0.0 0.0 0.0 0.0 0.0
3 100006 0 Cash loans F N Y 0 135000.0 312682.5 29686.5 0 0 0 0 NaN NaN NaN NaN NaN NaN
4 100007 0 Cash loans M N Y 0 121500.0 513000.0 21865.5 0 0 0 0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 122 columns

feature_names = application_train.columns
feature_names
Index(['SK_ID_CURR', 'TARGET', 'NAME_CONTRACT_TYPE', 'CODE_GENDER',
       'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL',
       'AMT_CREDIT', 'AMT_ANNUITY',
       ...
       'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20',
       'FLAG_DOCUMENT_21', 'AMT_REQ_CREDIT_BUREAU_HOUR',
       'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK',
       'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT',
       'AMT_REQ_CREDIT_BUREAU_YEAR'],
      dtype='object', length=122)

CODE_GENDER

application_train['CODE_GENDER'].unique()
array(['M', 'F', 'XNA'], dtype=object)

DAYS_EMPLOYED 365243是不可能的,异常值

application_train.loc[application_train['DAYS_EMPLOYED'] >0]['DAYS_EMPLOYED'].value_counts()
365243    55374
Name: DAYS_EMPLOYED, dtype: int64

用np.nan来取代缺失值和异常值

application_train['CODE_GENDER'].replace('XNA', np.nan, inplace=True)
application_train['DAYS_EMPLOYED'].replace(365243, np.nan, inplace = True)

申请贷款前多少天更换了电话

DAYS_LAST_PHONE_CHANGE

application_train['DAYS_LAST_PHONE_CHANGE'].value_counts().head(5)
 0.0    37672
-1.0     2812
-2.0     2318
-3.0     1763
-4.0     1285
Name: DAYS_LAST_PHONE_CHANGE, dtype: int64
application_train['DAYS_LAST_PHONE_CHANGE'].replace(0, np.nan, inplace = True)

NAME_FAMILY_STATUS

application_train['NAME_FAMILY_STATUS'].unique()
array(['Single / not married', 'Married', 'Civil marriage', 'Widow',
       'Separated', 'Unknown'], dtype=object)
application_train['NAME_FAMILY_STATUS'].replace('Unknow', np.nan, inplace = True)

ORGANIZATION_TYPE

application_train['ORGANIZATION_TYPE'].unique()
array(['Business Entity Type 3', 'School', 'Government', 'Religion',
       'Other', 'XNA', 'Electricity', 'Medicine', 'Business Entity Type 2',
       'Self-employed', 'Transport: type 2', 'Construction', 'Housing',
       'Kindergarten', 'Trade: type 7', 'Industry: type 11', 'Military',
       'Services', 'Security Ministries', 'Transport: type 4',
       'Industry: type 1', 'Emergency', 'Security', 'Trade: type 2',
       'University', 'Transport: type 3', 'Police',
       'Business Entity Type 1', 'Postal', 'Industry: type 4',
       'Agriculture', 'Restaurant', 'Culture', 'Hotel', 'Industry: type 7',
       'Trade: type 3', 'Industry: type 3', 'Bank', 'Industry: type 9',
       'Insurance', 'Trade: type 6', 'Industry: type 2',
       'Transport: type 1', 'Industry: type 12', 'Mobile', 'Trade: type 1',
       'Industry: type 5', 'Industry: type 10', 'Legal Services',
       'Advertising', 'Trade: type 5', 'Cleaning', 'Industry: type 13',
       'Trade: type 4', 'Telecom', 'Industry: type 8', 'Realtor',
       'Industry: type 6'], dtype=object)
application_train['ORGANIZATION_TYPE'].replace('XNA', np.nan, inplace = True)
application_train.fillna(np.nan).tail(5)
SK_ID_CURR TARGET NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT AMT_ANNUITY FLAG_DOCUMENT_18 FLAG_DOCUMENT_19 FLAG_DOCUMENT_20 FLAG_DOCUMENT_21 AMT_REQ_CREDIT_BUREAU_HOUR AMT_REQ_CREDIT_BUREAU_DAY AMT_REQ_CREDIT_BUREAU_WEEK AMT_REQ_CREDIT_BUREAU_MON AMT_REQ_CREDIT_BUREAU_QRT AMT_REQ_CREDIT_BUREAU_YEAR
307506 456251 0 Cash loans M N N 0 157500.0 254700.0 27558.0 0 0 0 0 NaN NaN NaN NaN NaN NaN
307507 456252 0 Cash loans F N Y 0 72000.0 269550.0 12001.5 0 0 0 0 NaN NaN NaN NaN NaN NaN
307508 456253 0 Cash loans F N Y 0 153000.0 677664.0 29979.0 0 0 0 0 1.0 0.0 0.0 1.0 0.0 1.0
307509 456254 1 Cash loans F N Y 0 171000.0 370107.0 20205.0 0 0 0 0 0.0 0.0 0.0 0.0 0.0 0.0
307510 456255 0 Cash loans F N N 0 157500.0 675000.0 49117.5 0 0 0 0 0.0 0.0 0.0 2.0 0.0 1.0

5 rows × 122 columns

Feature Engineering

application_train.csv构造新特征

application_train['annuity_income_percentage'] = application_train['AMT_ANNUITY'] / application_train['AMT_INCOME_TOTAL']
application_train['car_to_birth_ratio'] = application_train['OWN_CAR_AGE'] / application_train['DAYS_BIRTH']
application_train['car_to_employ_ratio'] = application_train['OWN_CAR_AGE'] / application_train['DAYS_EMPLOYED']
application_train['children_ratio'] = application_train['CNT_CHILDREN'] / application_train['CNT_FAM_MEMBERS']
application_train['credit_to_annuity_ratio'] = application_train['AMT_CREDIT'] / application_train['AMT_ANNUITY']
application_train['credit_to_goods_ratio'] = application_train['AMT_CREDIT'] / application_train['AMT_GOODS_PRICE']
application_train['credit_to_income_ratio'] = application_train['AMT_CREDIT'] / application_train['AMT_INCOME_TOTAL']
application_train['days_employed_percentage'] = application_train['DAYS_EMPLOYED'] / application_train['DAYS_BIRTH']
application_train['income_credit_percentage'] = application_train['AMT_INCOME_TOTAL'] / application_train['AMT_CREDIT']
application_train['income_per_child'] = application_train['AMT_INCOME_TOTAL'] / (1 + application_train['CNT_CHILDREN'])
application_train['income_per_person'] = application_train['AMT_INCOME_TOTAL'] / application_train['CNT_FAM_MEMBERS']
application_train['payment_rate'] = application_train['AMT_ANNUITY'] / application_train['AMT_CREDIT']
application_train['phone_to_birth_ratio'] = application_train['DAYS_LAST_PHONE_CHANGE'] / application_train['DAYS_BIRTH']
application_train['phone_to_employ_ratio'] = application_train['DAYS_LAST_PHONE_CHANGE'] / application_train['DAYS_EMPLOYED']
application_train['cnt_non_child'] = application_train['CNT_FAM_MEMBERS'] - application_train['CNT_CHILDREN']
application_train['child_to_non_child_ratio'] = application_train['CNT_CHILDREN'] / application_train['cnt_non_child']
application_train['income_per_non_child'] = application_train['AMT_INCOME_TOTAL'] / application_train['cnt_non_child']
application_train['credit_per_person'] = application_train['AMT_CREDIT'] / application_train['CNT_FAM_MEMBERS']
application_train['credit_per_child'] = application_train['AMT_CREDIT'] / (1 + application_train['CNT_CHILDREN'])
application_train['credit_per_non_child'] = application_train['AMT_CREDIT'] / application_train['cnt_non_child']
application_train['external_sources_weighted'] = application_train.EXT_SOURCE_1 * 2 + application_train.EXT_SOURCE_2 * 3 + application_train.EXT_SOURCE_3 * 4
application_train['retirement'] = (application_train.DAYS_BIRTH < -18000).astype(int)
application_train['long_employment'] = (application_train.DAYS_EMPLOYED < -4000).astype(int)

EXT_SOURCE 外部数据源

for feature_name in ['min', 'max', 'sum', 'mean', 'nanmedian']:
    application_train['external_sources_{}'.format(feature_name)] = eval('np.{}'.format(feature_name))(
        application_train[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']], axis=1)
application_train.shape
(307511, 150)

Preprocessing application_test.csv

test = pd.read_csv('data/application_test.csv')

test['CODE_GENDER'].replace('XNA', np.nan, inplace=True)
test['DAYS_EMPLOYED'].replace(365243, np.nan, inplace = True)
test['DAYS_LAST_PHONE_CHANGE'].replace(0, np.nan, inplace = True)
test['NAME_FAMILY_STATUS'].replace('Unknow', np.nan, inplace = True)
test['ORGANIZATION_TYPE'].replace('XNA', np.nan, inplace = True)
test = test.fillna(np.nan)

Feature Engineering

test['annuity_income_percentage'] = test['AMT_ANNUITY'] / test['AMT_INCOME_TOTAL']
test['car_to_birth_ratio'] = test['OWN_CAR_AGE'] / test['DAYS_BIRTH']
test['car_to_employ_ratio'] = test['OWN_CAR_AGE'] / test['DAYS_EMPLOYED']
test['children_ratio'] = test['CNT_CHILDREN'] / test['CNT_FAM_MEMBERS']
test['credit_to_annuity_ratio'] = test['AMT_CREDIT'] / test['AMT_ANNUITY']
test['credit_to_goods_ratio'] = test['AMT_CREDIT'] / test['AMT_GOODS_PRICE']
test['credit_to_income_ratio'] = test['AMT_CREDIT'] / test['AMT_INCOME_TOTAL']
test['days_employed_percentage'] = test['DAYS_EMPLOYED'] / test['DAYS_BIRTH']
test['income_credit_percentage'] = test['AMT_INCOME_TOTAL'] / test['AMT_CREDIT']
test['income_per_child'] = test['AMT_INCOME_TOTAL'] / (1 + test['CNT_CHILDREN'])
test['income_per_person'] = test['AMT_INCOME_TOTAL'] / test['CNT_FAM_MEMBERS']
test['payment_rate'] = test['AMT_ANNUITY'] / test['AMT_CREDIT']
test['phone_to_birth_ratio'] = test['DAYS_LAST_PHONE_CHANGE'] / test['DAYS_BIRTH']
test['phone_to_employ_ratio'] = test['DAYS_LAST_PHONE_CHANGE'] / test['DAYS_EMPLOYED']
test['cnt_non_child'] = test['CNT_FAM_MEMBERS'] - test['CNT_CHILDREN']
test['child_to_non_child_ratio'] = test['CNT_CHILDREN'] / test['cnt_non_child']
test['income_per_non_child'] = test['AMT_INCOME_TOTAL'] / test['cnt_non_child']
test['credit_per_person'] = test['AMT_CREDIT'] / test['CNT_FAM_MEMBERS']
test['credit_per_child'] = test['AMT_CREDIT'] / (1 + test['CNT_CHILDREN'])
test['credit_per_non_child'] = test['AMT_CREDIT'] / test['cnt_non_child']
test['external_sources_weighted'] = test.EXT_SOURCE_1 * 2 + test.EXT_SOURCE_2 * 3 + application_train.EXT_SOURCE_3 * 4
test['retirement'] = (test.DAYS_BIRTH < -18000).astype(int)
test['long_employment'] = (test.DAYS_EMPLOYED < -4000).astype(int)
for feature_name in ['min', 'max', 'sum', 'mean', 'nanmedian']:
    test['external_sources_{}'.format(feature_name)] = eval('np.{}'.format(feature_name))(
        test[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']], axis=1)
test.shape
(48744, 149)

相关性

correlations = application_train.corr()['TARGET'].sort_values()
correlations.iloc[:-1][:].plot(kind = 'barh', figsize = (10, 60))

这里写图片描述

Save data

application_train.to_csv('data/no_select_train.csv',index = False)
test.to_csv('data/no_select_test.csv', index = False)

猜你喜欢

转载自blog.csdn.net/u014281392/article/details/81190197