一、对象特征数据预处理
# Split the columns of data_df by dtype: object-typed columns go into
# category_feat, every other column into numerical_feat.
label = 'isDefault'
category_feat = data_df.select_dtypes(include=['object']).columns.tolist()
numerical_feat = data_df.select_dtypes(exclude=['object']).columns.tolist()
# Take the label column out of the numerical feature list
# (raises ValueError if it is not present there).
numerical_feat.remove(label)
数据类型为Object类型的特征如下:
[ 'grade' , 'subGrade' , 'employmentLength' , 'issueDate' , 'earliesCreditLine' ]
就业年限(年):employmentLength
# Inspect the distribution of the raw employment-length labels, NaN included.
# The expression result is only displayed in an interactive session.
data_df['employmentLength'].value_counts(dropna=False).sort_index()

# Rewrite the two non-numeric spellings so every label has the form
# "<n> years". The result is assigned back to the column on purpose:
# `data_df[col].replace(..., inplace=True)` operates on a column selection,
# which is a chained in-place update and does not reach data_df when pandas
# Copy-on-Write is active.
data_df['employmentLength'] = data_df['employmentLength'].replace(
    {'10+ years': '10 years', '< 1 year': '0 years'}
)
def employmentLength_to_int(s):
    """Convert an employment-length label such as ``'3 years'`` to ``np.int8``.

    Args:
        s: A label whose first whitespace-separated token is the number of
            years, or a missing value.

    Returns:
        ``s`` itself when ``pd.isnull(s)`` is true, otherwise the number of
        years as ``np.int8``.

    The raw spellings ``'10+ years'`` and ``'< 1 year'`` are accepted as well
    and yield 10 and 0, so the function does not depend on those labels
    having been rewritten beforehand.
    """
    if pd.isnull(s):
        # Leave missing values untouched so they can be imputed later.
        return s
    text = s.strip()
    if text.startswith('<'):
        # "< 1 year" counts as zero full years.
        return np.int8(0)
    # The first token carries the number; drop a trailing '+' ("10+").
    return np.int8(int(text.split()[0].rstrip('+')))
# Run the converter over every entry of the column, element by element.
data_df['employmentLength'] = data_df['employmentLength'].map(employmentLength_to_int)
借款人最早报告的信用额度开立的月份:earliesCreditLine
# Look at five randomly drawn raw values (only displayed interactively).
data_df['earliesCreditLine'].sample(5)

# Keep only the last four characters of each value, parsed as an integer.
data_df['earliesCreditLine'] = data_df['earliesCreditLine'].map(
    lambda raw_value: int(raw_value[-4:])
)
贷款发放的月份:issueDate
# Overwrite each issue date with its first four characters parsed as an
# integer; the original string is not kept.
data_df['issueDate'] = data_df['issueDate'].map(
    lambda date_text: int(date_text[:4])
)
二、类别特征数据预处理
# Columns treated as categorical features.
cate_features = [
    'grade',
    'subGrade',
    'employmentTitle',
    'homeOwnership',
    'verificationStatus',
    'purpose',
    'postCode',
    'regionCode',
    'applicationType',
    'initialListStatus',
    'title',
    'policyCode',
]

# Print the number of distinct values of every categorical column.
for feature in cate_features:
    distinct_count = data_df[feature].nunique()
    print(feature, '类型个数:', distinct_count)
one-hot编码:适用于类别数大于2且并非高维稀疏的类别特征
# One-hot encode the listed columns; drop_first=True omits the indicator for
# the first level of each column.
one_hot_columns = [
    'grade',
    'subGrade',
    'homeOwnership',
    'verificationStatus',
    'purpose',
    'regionCode',
]
data_df = pd.get_dummies(data_df, columns=one_hot_columns, drop_first=True)
# Replace each listed column by two derived columns:
#   <col>_cnts - number of non-null ids sharing the row's value of <col>
#   <col>_rank - descending rank of the row's id among rows with that value
for f in ['employmentTitle', 'postCode', 'title']:
    # dropna=False keeps rows whose value of f is NaN as a group of their
    # own. With the default, those rows are left out of every group, their
    # rank is NaN, and the astype(int) below raises.
    ids_by_value = data_df.groupby([f], dropna=False)['id']
    data_df[f + '_cnts'] = ids_by_value.transform('count')
    data_df[f + '_rank'] = ids_by_value.rank(ascending=False).astype(int)
    # The raw column is removed once the derived columns exist.
    del data_df[f]
三、缺失值和异常值处理
# Numerical columns: fill missing entries with the column median.
# median() returns a Series indexed by column name, so fillna applies one
# value per column.
data_df[numerical_feat] = data_df[numerical_feat].fillna(
    data_df[numerical_feat].median()
)

# Categorical columns: fill missing entries with the column mode.
# DataFrame.mode() returns a DataFrame (one row per tied mode), and passing a
# DataFrame to fillna aligns on the row index, which would only fill the row
# labelled 0. Taking the first row gives a per-column Series instead.
category_modes = data_df[category_feat].mode()
if not category_modes.empty:
    data_df[category_feat] = data_df[category_feat].fillna(category_modes.iloc[0])
四、时间格式处理
# Parse the issue date strings (format YYYY-MM-DD) into datetimes.
data_df['issueDate'] = pd.to_datetime(data_df['issueDate'], format='%Y-%m-%d')

# Reference date against which the day offset is measured.
startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')

# Whole days between each issue date and the reference date. The subtraction
# is done on the whole column at once rather than row by row, which also
# keeps the timedelta dtype (and therefore `.dt`) valid for an empty column.
data_df['issueDateDT'] = (data_df['issueDate'] - startdate).dt.days
五、特征构造
# For each listed column, add <col>_target_mean: the mean of isDefault over
# all rows that share the row's value of <col>.
for col in ['grade', 'subGrade']:
    encoded_name = col + '_target_mean'
    # Series indexed by the distinct values of col, holding the group mean.
    mean_by_level = data_df.groupby([col])['isDefault'].mean()
    # Look up every row's value in that table.
    data_df[encoded_name] = data_df[col].map(mean_by_level)
# Columns for which per-grade statistics are added.
grade_stat_columns = [
    'n0', 'n1', 'n2', 'n2.1', 'n4', 'n5', 'n6', 'n7',
    'n8', 'n9', 'n10', 'n11', 'n12', 'n13', 'n14',
]

# For every listed column, broadcast the mean and the standard deviation of
# that column within the row's grade back onto each row.
for item in grade_stat_columns:
    values_by_grade = data_df.groupby(['grade'])[item]
    data_df['grade_to_mean_' + item] = values_by_grade.transform('mean')
    data_df['grade_to_std_' + item] = values_by_grade.transform('std')