The oil_data_for_tree.xlsx file records fuel loans that DiDi extends to its drivers.

Field | Description
---|---
uid | user ID
bad_ind | target variable (bad-loan indicator)
oil_actv_dt | loan disbursement date
create_dt | account creation date
total_oil_cnt | total number of refuelings
pay_amount_total | total payment amount
class_new | user grade
oil_amount | volume of fuel consumed
discount_amount | amount after discount
sale_amount | sale amount
amount | remaining balance
pay_amount | amount paid
coupon_amount | coupon amount
payment_coupon_amount | coupon amount used at payment
channel_code | acquisition channel
oil_code | fuel type
scene | usage scene
source_app | source app
call_source | call source
import pandas as pd
import numpy as np
data = pd.read_excel('oil_data_for_tree.xlsx')
data.head()
#These are B-card users (loans already disbursed): the A-card covers pre-disbursement (application), the B-card the life of the loan (behavior), and the C-card collections.
uid | oil_actv_dt | create_dt | total_oil_cnt | pay_amount_total | class_new | bad_ind | oil_amount | discount_amount | sale_amount | amount | pay_amount | coupon_amount | payment_coupon_amount | channel_code | oil_code | scene | source_app | call_source | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | A8217710 | 2018-08-19 | 2018-08-17 | 275.0 | 48295495.4 | B | 0 | 3308.56 | 1760081.0 | 1796001.0 | 1731081.0 | 8655401.0 | 1.0 | 1.0 | 1 | 3 | 2 | 0 | 3 |
1 | A8217710 | 2018-08-19 | 2018-08-16 | 275.0 | 48295495.4 | B | 0 | 4674.68 | 2487045.0 | 2537801.0 | 2437845.0 | 12189221.0 | 1.0 | 1.0 | 1 | 3 | 2 | 0 | 3 |
2 | A8217710 | 2018-08-19 | 2018-08-15 | 275.0 | 48295495.4 | B | 0 | 1873.06 | 977845.0 | 997801.0 | 961845.0 | 4809221.0 | 1.0 | 1.0 | 1 | 2 | 2 | 0 | 3 |
3 | A8217710 | 2018-08-19 | 2018-08-14 | 275.0 | 48295495.4 | B | 0 | 4837.78 | 2526441.0 | 2578001.0 | 2484441.0 | 12422201.0 | 1.0 | 1.0 | 1 | 2 | 2 | 0 | 3 |
4 | A8217710 | 2018-08-19 | 2018-08-13 | 275.0 | 48295495.4 | B | 0 | 2586.38 | 1350441.0 | 1378001.0 | 1328441.0 | 6642201.0 | 1.0 | 1.0 | 1 | 2 | 2 | 0 | 3 |
set(data.class_new) #a set keeps only distinct values, so this lists every user grade that occurs
#{'A', 'B', 'C', 'D', 'E', 'F'}
#Data reorganization
#Split the columns into groups: raw identifiers/labels, numeric columns for feature derivation, and categorical columns
org_lst = ['uid','create_dt','oil_actv_dt','class_new','bad_ind'] #identifiers and labels; no transformation, not fed to the model directly
agg_lst = ['oil_amount','discount_amount','sale_amount','amount','pay_amount','coupon_amount','payment_coupon_amount']
#agg_lst: numeric columns to aggregate per user — count, sum, mean, variance, standard deviation, range, ...
dstc_lst = ['channel_code','oil_code','scene','source_app','call_source']
#dstc_lst: categorical columns; for each, count how many distinct values a user has
df = data[org_lst].copy()
df[agg_lst] = data[agg_lst].copy()
df[dstc_lst] = data[dstc_lst].copy()
#.copy() returns a copy of the DataFrame (deep by default in pandas), so later changes to df do not write back into data
df.isnull().sum().head() #df.isna().sum() gives the same result
'''
uid 0
create_dt 4944
oil_actv_dt 0
class_new 0
bad_ind 0
dtype: int64
'''
df.info()
'''
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50609 entries, 0 to 50608
Data columns (total 17 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 uid 50609 non-null object
1 create_dt 45665 non-null datetime64[ns]
2 oil_actv_dt 50609 non-null datetime64[ns]
3 class_new 50609 non-null object
4 bad_ind 50609 non-null int64
5 oil_amount 45665 non-null float64
6 discount_amount 45665 non-null float64
7 sale_amount 45665 non-null float64
8 amount 45665 non-null float64
9 pay_amount 45665 non-null float64
10 coupon_amount 45665 non-null float64
11 payment_coupon_amount 45663 non-null float64
12 channel_code 50609 non-null int64
13 oil_code 50609 non-null int64
14 scene 50609 non-null int64
15 source_app 50609 non-null int64
16 call_source 50609 non-null int64
dtypes: datetime64[ns](2), float64(7), int64(6), object(2)
memory usage: 6.6+ MB
'''
df.describe()
bad_ind | oil_amount | discount_amount | sale_amount | amount | pay_amount | coupon_amount | payment_coupon_amount | channel_code | oil_code | scene | source_app | call_source | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 50609.000000 | 45665.000000 | 4.566500e+04 | 4.566500e+04 | 4.566500e+04 | 4.566500e+04 | 45665.000000 | 45663.000000 | 50609.000000 | 50609.000000 | 50609.000000 | 50609.000000 | 50609.000000 |
mean | 0.017764 | 425.376107 | 1.832017e+05 | 1.881283e+05 | 1.808673e+05 | 9.043344e+05 | 0.576853 | 149.395397 | 1.476378 | 1.617894 | 1.906519 | 0.306072 | 2.900729 |
std | 0.132093 | 400.596244 | 2.007574e+05 | 2.048742e+05 | 1.977035e+05 | 9.885168e+05 | 0.494064 | 605.138823 | 1.511470 | 3.074166 | 0.367280 | 0.893682 | 0.726231 |
min | 0.000000 | 1.000000 | 0.000000e+00 | 0.000000e+00 | 1.000000e+00 | 5.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 0.000000 | 175.440000 | 6.039100e+04 | 6.200100e+04 | 5.976100e+04 | 2.988010e+05 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 2.000000 | 0.000000 | 3.000000 |
50% | 0.000000 | 336.160000 | 1.229310e+05 | 1.279240e+05 | 1.209610e+05 | 6.048010e+05 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 2.000000 | 0.000000 | 3.000000 |
75% | 0.000000 | 557.600000 | 2.399050e+05 | 2.454010e+05 | 2.360790e+05 | 1.180391e+06 | 1.000000 | 100.000000 | 1.000000 | 0.000000 | 2.000000 | 0.000000 | 3.000000 |
max | 1.000000 | 7952.820000 | 3.916081e+06 | 3.996001e+06 | 3.851081e+06 | 1.925540e+07 | 1.000000 | 50000.000000 | 6.000000 | 9.000000 | 2.000000 | 3.000000 | 4.000000 |
df.count()#50,609 rows in total; the numeric columns have roughly 4,900 missing values each
'''
uid 50609
create_dt 45665
oil_actv_dt 50609
class_new 50609
bad_ind 50609
oil_amount 45665
discount_amount 45665
sale_amount 45665
amount 45665
pay_amount 45665
coupon_amount 45665
payment_coupon_amount 45663
channel_code 50609
oil_code 50609
scene 50609
source_app 50609
call_source 50609
dtype: int64
'''
Fill in the missing create_dt values using oil_actv_dt, and keep only a 6-month window of data. When constructing variables you should not simply accumulate over all historical data; otherwise the variable distributions drift heavily as time passes.
# oil_actv_dt: disbursement date; create_dt: account creation date
# fill missing creation dates with the disbursement date
def time_isna(x,y):
    #NaT ("Not a Time") is the missing value stored in datetime arrays to mark an unknown or missing datetime
    if pd.isnull(x): #create_dt is missing
        x = y #fall back to the disbursement date
    return x
df2 = df.sort_values(['uid','create_dt'],ascending = False) #sort by uid, then create_dt, in descending order
df2['create_dt'] = df2.apply(lambda x: time_isna(x.create_dt,x.oil_actv_dt),axis = 1)
#row by row, replace a missing account-creation date with the disbursement date
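The same fill works without a row-wise apply — a minimal vectorized sketch on the same df2:
df2['create_dt'] = df2['create_dt'].fillna(df2['oil_actv_dt']) #fillna with a Series aligns on the index, so each missing create_dt takes that row's oil_actv_dt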
df2['dtn'] = (df2.oil_actv_dt - df2.create_dt).apply(lambda x :x.days) #gap in days between account creation and disbursement
df = df2[df2['dtn']<180]
#keep only the records whose creation-to-disbursement gap is under 180 days
'''
B-card / post-loan management context: user registration and scoring data are bought from
third-party channels and are typically valid for half a year. When a user first arrives,
their personal data is often incomplete and has to be supplemented from partners such as
Tongdun (同盾).
'''
df.head()
uid | create_dt | oil_actv_dt | class_new | bad_ind | oil_amount | discount_amount | sale_amount | amount | pay_amount | coupon_amount | payment_coupon_amount | channel_code | oil_code | scene | source_app | call_source | dtn | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
50608 | B96436391985035703 | 2018-10-08 | 2018-10-08 | B | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 6 | 9 | 2 | 3 | 4 | 0 |
50607 | B96436391984693397 | 2018-10-11 | 2018-10-11 | E | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 6 | 9 | 2 | 3 | 4 | 0 |
50606 | B96436391977217468 | 2018-10-17 | 2018-10-17 | B | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 6 | 9 | 2 | 3 | 4 | 0 |
50605 | B96436391976480892 | 2018-09-28 | 2018-09-28 | B | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 6 | 9 | 2 | 3 | 4 | 0 |
50604 | B96436391972106043 | 2018-10-19 | 2018-10-19 | A | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 6 | 9 | 2 | 3 | 4 | 0 |
Feature selection
For the org_lst variables, take the maximum historical loan-day gap per user and deduplicate.
#compute the maximum loan-day gap for the org_lst variables and deduplicate by user
base = df[org_lst] #base is the skeleton holding the org_lst columns
base['dtn'] = df['dtn'] #attach the creation-to-disbursement gap (already filtered to < 180 days) as an extra column
base = base.sort_values(['uid','create_dt'],ascending = False)
#sort by uid first, then by create_dt, both descending
base = base.drop_duplicates(['uid'],keep = 'first') #deduplicate by user
#keep takes one of {'first', 'last', False}; 'first' keeps the first occurrence of each duplicate group and drops the rest
base.shape #over 11,000 unique users out of 40,000+ rows — the same user generates multiple loan records
#(11099, 6)
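A quick sanity check that the deduplication left exactly one row per user — a minimal sketch:
assert base['uid'].is_unique #drop_duplicates(['uid']) must leave no repeated uid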
Feature derivation
gn = pd.DataFrame() #an empty DataFrame to collect the derived features
#These aggregations are meaningful because each user has several records; with one record per user they would add nothing.
for i in agg_lst:
    #group by user id and count how many records the user has in this column
    #(the lambda's df parameter shadows the outer df: it receives each user's sub-DataFrame)
    tp = pd.DataFrame(df.groupby('uid').apply(lambda df:len(df[i])).reset_index())
    tp.columns = ['uid',i + '_cnt']
    if gn.empty == True: #first pass: gn is empty, so assign; afterwards merge on uid
        gn = tp
    else:
        gn = pd.merge(gn,tp,on = 'uid',how = 'left')
    #group by user id and count the records whose value is greater than 0
    tp = pd.DataFrame(df.groupby('uid').apply(lambda df:np.where(df[i]>0,1,0).sum()).reset_index())
    tp.columns = ['uid',i + '_num']
    if gn.empty == True:
        gn = tp
    else:
        gn = pd.merge(gn,tp,on = 'uid',how = 'left')
    #group by user id and sum the column, ignoring NaN
    tp = pd.DataFrame(df.groupby('uid').apply(lambda df:np.nansum(df[i])).reset_index())
    tp.columns = ['uid',i + '_tot']
    if gn.empty == True:
        gn = tp
    else:
        gn = pd.merge(gn,tp,on = 'uid',how = 'left')
    #group by user id and take the mean, ignoring NaN
    tp = pd.DataFrame(df.groupby('uid').apply(lambda df:np.nanmean(df[i])).reset_index())
    tp.columns = ['uid',i + '_avg']
    if gn.empty == True:
        gn = tp
    else:
        gn = pd.merge(gn,tp,on = 'uid',how = 'left')
    #group by user id and take the maximum, ignoring NaN
    tp = pd.DataFrame(df.groupby('uid').apply(lambda df:np.nanmax(df[i])).reset_index())
    tp.columns = ['uid',i + '_max']
    if gn.empty == True:
        gn = tp
    else:
        gn = pd.merge(gn,tp,on = 'uid',how = 'left')
    #group by user id and take the minimum, ignoring NaN
    tp = pd.DataFrame(df.groupby('uid').apply(lambda df:np.nanmin(df[i])).reset_index())
    tp.columns = ['uid',i + '_min']
    if gn.empty == True:
        gn = tp
    else:
        gn = pd.merge(gn,tp,on = 'uid',how = 'left')
    #group by user id and take the variance, ignoring NaN
    tp = pd.DataFrame(df.groupby('uid').apply(lambda df:np.nanvar(df[i])).reset_index())
    tp.columns = ['uid',i + '_var']
    if gn.empty == True:
        gn = tp
    else:
        gn = pd.merge(gn,tp,on = 'uid',how = 'left')
    #group by user id and take the range (max - min), ignoring NaN
    tp = pd.DataFrame(df.groupby('uid').apply(lambda df:np.nanmax(df[i]) - np.nanmin(df[i])).reset_index())
    #note: this reuses the '_var' suffix, so after the merge pandas renames the pair to
    #'_var_x' (variance) and '_var_y' (range), as gn.columns below shows; a distinct
    #suffix such as '_ran' would be cleaner
    tp.columns = ['uid',i + '_var']
    if gn.empty == True:
        gn = tp
    else:
        gn = pd.merge(gn,tp,on = 'uid',how = 'left')
#statistical feature derivation for the numeric variables
gn.columns
'''
Index(['uid', 'oil_amount_cnt', 'oil_amount_num', 'oil_amount_tot',
'oil_amount_avg', 'oil_amount_max', 'oil_amount_min',
'oil_amount_var_x', 'oil_amount_var_y', 'discount_amount_cnt',
'discount_amount_num', 'discount_amount_tot', 'discount_amount_avg',
'discount_amount_max', 'discount_amount_min', 'discount_amount_var_x',
'discount_amount_var_y', 'sale_amount_cnt', 'sale_amount_num',
'sale_amount_tot', 'sale_amount_avg', 'sale_amount_max',
'sale_amount_min', 'sale_amount_var_x', 'sale_amount_var_y',
'amount_cnt', 'amount_num', 'amount_tot', 'amount_avg', 'amount_max',
'amount_min', 'amount_var_x', 'amount_var_y', 'pay_amount_cnt',
'pay_amount_num', 'pay_amount_tot', 'pay_amount_avg', 'pay_amount_max',
'pay_amount_min', 'pay_amount_var_x', 'pay_amount_var_y',
'coupon_amount_cnt', 'coupon_amount_num', 'coupon_amount_tot',
'coupon_amount_avg', 'coupon_amount_max', 'coupon_amount_min',
'coupon_amount_var_x', 'coupon_amount_var_y',
'payment_coupon_amount_cnt', 'payment_coupon_amount_num',
'payment_coupon_amount_tot', 'payment_coupon_amount_avg',
'payment_coupon_amount_max', 'payment_coupon_amount_min',
'payment_coupon_amount_var_x', 'payment_coupon_amount_var_y'],
dtype='object')
'''
gn.shape[1] #total number of columns: 7 numeric variables × 8 statistics each = 56, plus uid = 57
#57
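The whole loop can be collapsed into one pass with pandas aggregation — a sketch over the same df and agg_lst, not an exact drop-in (pandas 'var' uses ddof=1 where np.nanvar uses ddof=0, and 'size' counts NaN rows like len() does):
def pos_num(s):
    return (s > 0).sum() #records with a positive value, like the '_num' features
def rng(s):
    return s.max() - s.min() #range, like the '_var_y' features
gn_alt = df.groupby('uid')[agg_lst].agg(['size', pos_num, 'sum', 'mean', 'max', 'min', 'var', rng])
gn_alt.columns = ['_'.join(col) for col in gn_alt.columns] #flatten the (column, statistic) MultiIndex
gn_alt = gn_alt.reset_index()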
#Categorical variables: count how many distinct values each user takes
# compute the number of distinct values for each dstc_lst variable
gc = pd.DataFrame()
#for every categorical feature, record how many different categories appear across a user's records
for i in dstc_lst:
    tp = pd.DataFrame(df.groupby('uid').apply(lambda df: len(set(df[i]))).reset_index())
    #set() drops duplicates and len() counts what is left — set plays the role of DISTINCT here
    tp.columns = ['uid',i + '_dstc']
    if gc.empty == True: #first pass: gc is empty, so assign; afterwards merge on uid
        gc = tp
    else:
        gc = pd.merge(gc,tp,on = 'uid',how = 'left')
gc.columns
'''
Index(['uid', 'channel_code_dstc', 'oil_code_dstc', 'scene_dstc',
'source_app_dstc', 'call_source_dstc'],
dtype='object')
'''
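pandas' nunique() yields the same table in one line — a sketch on the same df and dstc_lst:
gc_alt = df.groupby('uid')[dstc_lst].nunique().add_suffix('_dstc').reset_index() #nunique() counts distinct values per user and column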
# Combine the feature blocks
fn = pd.merge(base,gn,on= 'uid')
fn = pd.merge(fn,gc,on= 'uid')
fn.shape
#(11099, 67)
fn = fn.fillna(0) #the merges leave many missing values; fill them with 0
fn.head(100)
uid | create_dt | oil_actv_dt | class_new | bad_ind | dtn | oil_amount_cnt | oil_amount_num | oil_amount_tot | oil_amount_avg | ... | payment_coupon_amount_avg | payment_coupon_amount_max | payment_coupon_amount_min | payment_coupon_amount_var_x | payment_coupon_amount_var_y | channel_code_dstc | oil_code_dstc | scene_dstc | source_app_dstc | call_source_dstc | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | B96436391985035703 | 2018-10-08 | 2018-10-08 | B | 0 | 0 | 1 | 0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1 | 1 | 1 | 1 | 1 |
1 | B96436391984693397 | 2018-10-11 | 2018-10-11 | E | 0 | 0 | 1 | 0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1 | 1 | 1 | 1 | 1 |
2 | B96436391977217468 | 2018-10-17 | 2018-10-17 | B | 0 | 0 | 1 | 0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1 | 1 | 1 | 1 | 1 |
3 | B96436391976480892 | 2018-09-28 | 2018-09-28 | B | 0 | 0 | 1 | 0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1 | 1 | 1 | 1 | 1 |
4 | B96436391972106043 | 2018-10-19 | 2018-10-19 | A | 0 | 0 | 1 | 0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1 | 1 | 1 | 1 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
95 | B96117370332355190 | 2018-10-19 | 2018-10-19 | B | 0 | 0 | 1 | 0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1 | 1 | 1 | 1 | 1 |
96 | B96117370330101658 | 2018-10-12 | 2018-10-12 | B | 0 | 0 | 1 | 0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1 | 1 | 1 | 1 | 1 |
97 | B96117370330066347 | 2018-10-01 | 2018-10-01 | D | 0 | 0 | 1 | 0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1 | 1 | 1 | 1 | 1 |
98 | B96117370328724350 | 2018-09-20 | 2018-09-20 | C | 0 | 0 | 1 | 0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1 | 1 | 1 | 1 | 1 |
99 | B96117370321159033 | 2018-10-08 | 2018-10-08 | D | 0 | 0 | 1 | 0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1 | 1 | 1 | 1 | 1 |
100 rows × 67 columns
#drop ['uid','oil_actv_dt','create_dt','bad_ind','class_new']; everything left is a feature
x = fn.drop(['uid','oil_actv_dt','create_dt','bad_ind','class_new'],axis = 1)
#org_lst was carried along earlier only so that the 40,000+ raw rows could be deduplicated
#down to 11,099 users and merged with the derived features; the identifier and label
#columns are dropped here because they are not model inputs
#(11099 rows, 62 columns)
#bad_ind is the target
y = fn.bad_ind.copy()
from sklearn import tree
#use a regression tree
'''
A classification tree outputs a class label at each leaf (0, 1, 2, ...). A regression tree
outputs a continuous value, like fitting a function; here each leaf's value is the mean of
bad_ind among the samples in it, i.e. that segment's bad rate.
'''
dtree = tree.DecisionTreeRegressor(max_depth = 2,min_samples_leaf = 500,min_samples_split = 5000)
#max_depth: maximum depth of the tree
#min_samples_leaf: minimum number of samples a leaf must contain
#min_samples_split: minimum number of samples a node needs before it may be split
dtree = dtree.fit(x,y) #fit on the features to model how likely a user is to be bad
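To see which derived features drive the splits — a sketch using the fitted dtree (only features actually used in a split get nonzero importance):
imp = pd.Series(dtree.feature_importances_, index = x.columns)
print(imp.sort_values(ascending = False).head())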
import pydotplus
#pydotplus converts DOT graph descriptions into images for display
from IPython.display import Image
from six import StringIO
import os
os.environ["PATH"] += os.pathsep + 'C:/Program Files/Graphviz/bin/'
#https://graphviz.org/download/,需要下载,安装到指定目录
with open("dt.dot", "w") as f:
tree.export_graphviz(dtree, out_file=f)
dot_data = StringIO()
tree.export_graphviz(dtree, out_file=dot_data,
feature_names=x.columns,
class_names=['bad_ind'],
filled=True, rounded=True,
special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())
'''
feature_names=x.columns: label the splits with the real feature names
rounded=True: rounded node corners
filled=True: color-fill the nodes
decision_tree: the fitted tree object
out_file: pass None (or a StringIO) to get the DOT text back instead of writing a file
special_characters=True: nicer formatting of node text (minor)
'''
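If Graphviz is not available, sklearn can print the same tree as plain text — a minimal sketch with the fitted dtree:
from sklearn.tree import export_text
print(export_text(dtree, feature_names = list(x.columns))) #each leaf's value is the mean bad rate of the samples routed to it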
sum(fn.bad_ind)/len(fn.bad_ind) #overall bad rate
#0.04658077304261645 — about 4.7% of users are bad
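To read the tree as segmentation rules, group users by the leaf each sample lands in — a sketch with the fitted dtree (the 'leaf' column name is ours):
fn['leaf'] = dtree.apply(x) #apply() returns the leaf-node index for every sample
print(fn.groupby('leaf')['bad_ind'].agg(['count','mean'])) #segment size and bad rate per leaf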