特征工程——天池学习赛之贷款违约预测

一、对象特征数据预处理

  • 区分对象特征和数值特征
label = 'isDefault'
# Columns whose dtype is ``object`` are handled as "object" features.
category_feat = data_df.select_dtypes(include=['object']).columns.tolist()
# Every non-object column is handled as a numerical feature.
numerical_feat = data_df.select_dtypes(exclude=['object']).columns.tolist()
# The label column is the prediction target, so drop it from the feature list.
numerical_feat.remove(label)
数据类型为Object类型的特征如下:
['grade', 'subGrade', 'employmentLength', 'issueDate', 'earliesCreditLine']
  • 处理对象特征的详细步骤
  1. 就业年限(年):employmentLength
# Count each employmentLength value (NaN included) and order the counts by value.
data_df['employmentLength'].value_counts(dropna=False).sort_index()

# Rewrite the two irregular labels so that their leading token is a plain integer.
# The result is assigned back to the column instead of calling
# ``replace(..., inplace=True)`` on the selected Series: that chained in-place
# call emits a FutureWarning in pandas 2.x and no longer updates ``data_df``
# once Copy-on-Write is active.
data_df['employmentLength'] = data_df['employmentLength'].replace(
    {'10+ years': '10 years', '< 1 year': '0 years'}
)

# Convert the employment-length strings to integers.
def employmentLength_to_int(s):
    """Return the leading whitespace-separated token of *s* as ``np.int8``.

    Null values (as detected by ``pd.isnull``) are returned unchanged so that
    missing entries stay missing.
    """
    if pd.isnull(s):
        return s
    return np.int8(s.split()[0])


# This statement must live at module level. In the original it was indented
# into the function body after the ``return`` statements, which made it
# unreachable, so the column was never actually converted.
data_df['employmentLength'] = data_df['employmentLength'].apply(employmentLength_to_int)
  2. 借款人最早报告的信用额度开立的月份:earliesCreditLine
data_df['earliesCreditLine'].sample(5)  # look at five randomly sampled values
# Keep only the last four characters of each value (the year, per the original
# author's note) and store them as an integer.
credit_line_text = data_df['earliesCreditLine']
data_df['earliesCreditLine'] = credit_line_text.map(lambda text: int(text[-4:]))
  3. 贷款发放的月份:issueDate
# Keep only the first four characters of each value (the year, per the original
# author's note) and store them as an integer.
# NOTE(review): the later time-format section parses issueDate with
# '%Y-%m-%d', which needs the original string -- confirm whether these two
# steps are meant as alternatives rather than being run in sequence.
issue_date_text = data_df['issueDate']
data_df['issueDate'] = issue_date_text.map(lambda text: int(text[:4]))

二、类别特征数据预处理

  • 统计类别特征的类别个数
# A subset of the categorical features.
cate_features = [
    'grade',
    'subGrade',
    'employmentTitle',
    'homeOwnership',
    'verificationStatus',
    'purpose',
    'postCode',
    'regionCode',
    'applicationType',
    'initialListStatus',
    'title',
    'policyCode',
]
# Print the number of distinct non-null values of each listed feature.
for feat_name in cate_features:
    n_unique = data_df[feat_name].nunique()
    print(feat_name, '类型个数:', n_unique)
  • one-hot编码:类型数在2之上,又不是高维稀疏的类别特征
# One-hot encode the listed columns. ``drop_first=True`` makes pd.get_dummies
# omit the indicator column of each feature's first category.
one_hot_columns = [
    'grade',
    'subGrade',
    'homeOwnership',
    'verificationStatus',
    'purpose',
    'regionCode',
]
data_df = pd.get_dummies(data_df, columns=one_hot_columns, drop_first=True)
  • 将高维类别特征进行转换
for feat_name in ['employmentTitle', 'postCode', 'title']:
    ids_by_value = data_df.groupby([feat_name])['id']
    # How many (non-null) ids share this row's value of the feature.
    data_df[feat_name + '_cnts'] = ids_by_value.transform('count')
    # Descending rank of the row's id among the rows with the same value.
    data_df[feat_name + '_rank'] = ids_by_value.rank(ascending=False).astype(int)
    # Drop the raw column once the two derived features exist.
    del data_df[feat_name]

三、缺失值和异常值处理

# Fill missing numerical values with each column's median (the original
# comment said "mean", but the code has always used the median).
data_df[numerical_feat] = data_df[numerical_feat].fillna(data_df[numerical_feat].median())
# Fill missing categorical values with each column's mode.
# ``DataFrame.mode()`` returns a DataFrame with one row per tied mode; passing
# that frame straight to ``fillna`` aligns on the row index and therefore only
# touches the first row(s). Taking the first row yields a Series indexed by
# column name, which ``fillna`` applies to every row of the matching column.
category_modes = data_df[category_feat].mode()
if not category_modes.empty:  # no mode exists when there is no non-null data
    data_df[category_feat] = data_df[category_feat].fillna(category_modes.iloc[0])

四、时间格式处理

# Parse issueDate into datetimes using the 'YYYY-MM-DD' layout.
data_df['issueDate'] = pd.to_datetime(data_df['issueDate'], format='%Y-%m-%d')
# Reference date that the elapsed-days feature is measured from.
startdate = datetime.datetime(2007, 6, 1)
# Time feature: whole days between the reference date and each issue date.
elapsed_since_start = data_df['issueDate'].apply(lambda issued: issued - startdate)
data_df['issueDateDT'] = elapsed_since_start.dt.days

五、特征构造

# For each listed column, add "<column>_target_mean": the mean of isDefault
# over all rows that share the row's value of that column.
for cat_col in ['grade', 'subGrade']:
    encoded_name = cat_col + '_target_mean'
    # {category value: mean isDefault within that category}
    mean_by_value = data_df.groupby([cat_col])['isDefault'].mean().to_dict()
    data_df[encoded_name] = data_df[cat_col].map(mean_by_value)
# For every listed n* column, broadcast the per-grade mean and standard
# deviation of that column back onto each row.
n_columns = ['n0', 'n1', 'n2', 'n2.1', 'n4', 'n5', 'n6', 'n7', 'n8', 'n9',
             'n10', 'n11', 'n12', 'n13', 'n14']
grade_stats = (('grade_to_mean_', 'mean'), ('grade_to_std_', 'std'))
for n_col in n_columns:
    for prefix, stat in grade_stats:
        data_df[prefix + n_col] = data_df.groupby(['grade'])[n_col].transform(stat)

猜你喜欢

转载自blog.csdn.net/xylbill97/article/details/108714968