信用评分流程

1、数据获取

我使用的信贷数据共有3000条数据，每条数据11个特征。

rm(list=ls())
setwd("D:\\case")
library(xlsx)
dat<-read.xlsx2("credit.xlsx",1,colClasses = NA)
summary(dat)
str(dat)

'data.frame':	3000 obs. of  11 variables:
 $ 年龄        : num  46 34 31 39 32 23 42 35 26 24 ...
 $ 收入        : num  0 3200 3300 1500 0 0 1900 0 1700 3400 ...
 $ 孩子数量    : num  0 4 3 0 3 0 0 2 1 1 ...
 $ 家庭人口数  : num  2 6 5 1 5 1 2 4 3 2 ...
 $ 在现住址时间: num  15 144 108 192 48 192 144 54 288 18 ...
 $ 在现工作时间: num  33 54 120 6 108 60 30 168 33 30 ...
 $ 住房种类    : Factor w/ 3 levels "缺失","自有",..: 3 2 3 3 3 3 3 3 3 3 ...
 $ 国籍        : Factor w/ 8 levels "德国","南斯拉夫",..: 1 5 5 1 1 1 1 1 1 1 ...
 $ 信用卡类型  : Factor w/ 7 levels "欧洲Master卡",..: 7 5 5 5 7 7 5 7 7 5 ...
 $ 是否违约    : num  0 1 1 1 0 1 0 0 1 1 ...
 $ 权重        : num  30 1 1 1 30 1 30 30 1 1 ...

2、数据预处理

主要工作包括数据清洗、缺失值处理、异常值处理，主要是为了将获取的原始数据转化为可用作模型开发的格式化数据。

dat[,1:6]<-sapply(dat[,1:6],function(x) {x[x==999]<-NA;return(x)} )
dat<-dat[,-11]

library(smbinning)
library(prettyR)

dat1<-dat
dat1[,4]<-dat1[,4]-dat1[,3]
table(dat1[,4])
dat1[,4]<-factor(dat1[,4],levels=c(1,2),labels=c("其他","已婚"))
colnames(dat1)<-c("age","income","child","marital","dur_live",
                  "dur_work","housetype","nation","cardtype","loan")
summary(dat1)

##盖帽法函数
block<-function(x,lower=T,upper=T){
    if(lower){
        q1<-quantile(x,0.01)
        x[x<=q1]<-q1
    }
    if(upper){
        q99<-quantile(x,0.99)
        x[x>q99]<-q99
    }
   return(x)
}

dat1$loan<-as.numeric(!as.logical(dat1$loan))

3、探索性数据分析

该步骤主要是获取样本总体的大概情况，描述样本总体情况的指标主要有直方图、箱形图等。

4、变量分箱

首先，需要在R中安装smbinning包。我们将使用最优分段对于数据集中的income、child、婚姻状态和现在工作时间等进行分类。

par(mfrow=c(2,2))
?smbinning.plot()
smbinning.plot(age,option="dist",sub="年龄")
smbinning.plot(age,option="WoE",sub="年龄")
smbinning.plot(age,option="goodrate",sub="年龄")
smbinning.plot(age,option="badrate",sub="年龄")
par(mfrow=c(1,1))
age$iv
cred_iv<-c("年龄"=age$iv)

##income
boxplot(income~loan,data=dat1,horizontal=T, frame=F, 
        col="lightgray",main="Distribution")
dat1$income<-block(dat1$income)
boxplot(income~loan,data=dat1,horizontal=T, frame=F, 
        col="lightgray",main="Distribution")
income<-smbinning(dat1,"loan","income")
income$ivtable
smbinning.plot(income,option="WoE",sub="收入")
income$iv
cred_iv<-c(cred_iv,"收入"=income$iv)

##child
boxplot(child~loan,data=dat1,horizontal=T, frame=F, 
        col="lightgray",main="Distribution")
dat1$child<-block(dat1$child)
child<-smbinning(dat1,"loan","child")
child$ivtable
smbinning.plot(child,option="WoE",sub="孩子数量")
child$iv
cred_iv<-c(cred_iv,"孩子数量"=child$iv)

##marital
xtab(~marital+loan,data=dat1,chisq=T)
marital<-smbinning.factor(dat1,"loan","marital")
marital$ivtable
smbinning.plot(marital,option="WoE",sub="婚姻状态")
marital$iv
cred_iv<-c(cred_iv,"婚姻状态"=marital$iv)

##dur_live
boxplot(dur_live~loan,data=dat1,horizontal=T, 
        frame=F, col="lightgray",main="Distribution")
t.test(dur_live~loan,data=dat1)
dur_live<-smbinning(dat1,"loan","dur_live")
dur_live

##dur_work
boxplot(dur_work~loan,data=dat1,horizontal=T, 
        frame=F, col="lightgray",main="Distribution")
t.test(dur_work~loan,data=dat1)
dur_work<-smbinning(dat1,"loan","dur_work")
dur_work$ivtable
smbinning.plot(dur_work,option="WoE",sub="在现工作时间")
dur_work$iv
cred_iv<-c(cred_iv,"在现工作时间"=dur_work$iv)

##housetype
xtab(~housetype+loan,data=dat1,chisq=T)
housetype<-smbinning.factor(dat1,"loan","housetype")
housetype$ivtable
smbinning.plot(housetype,option="WoE",sub="住房类型")
housetype$iv
cred_iv<-c(cred_iv,"住房种类"=housetype$iv)

变量的分段都对应差异较大WoE值，说明分段区分效果较好，且无违背Business Sense的现象出现，可以接受最优分段提供的分箱结果。

通过IV值判断变量预测能力：

可以看出，孩子数量、住房种类和国籍的IV值明显较低，年龄的IV值明显较高。

5、模型建立

首先将筛选后的变量转换为WoE值并建立Logistic模型，然后计算变量对应的WoE值，对变量对应的取值进行WoE替换。

将经过WoE转换的数据放入Logistic模型中建模，并使用向后逐步回归方法（backward stepwise）筛选变量，再输出结果。

dat2<-dat1
dat2<-smbinning.gen(dat2,age,"glage")
dat2<-smbinning.gen(dat2,income,"glincome")
dat2<-smbinning.gen(dat2,child,"glchild")
dat2<-smbinning.factor.gen(dat2,marital,"glmarital")
dat2<-smbinning.gen(dat2,dur_work,"gldur_work")
dat2<-smbinning.factor.gen(dat2,housetype,"glhousetype")
dat2<-smbinning.factor.gen(dat2,nation,"glnation")
dat2<-smbinning.factor.gen(dat2,cardtype,"glcardtype")

dat3<-dat2[,c(11:18,10)]

cred_mod<-glm(loan~. ,data=dat3,family=binomial())
summary(cred_mod)
Call:
glm(formula = loan ~ ., family = binomial(), data = dat3)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-2.33337  -1.02705  -0.07231   1.03589   2.19744  

Coefficients:
                              Estimate Std. Error z value             Pr(>|z|)    
(Intercept)                   -0.04911    0.51056  -0.096             0.923372    
glage02 <= 27                  0.36516    0.15699   2.326             0.020018 *  
glage03 <= 35                  0.75621    0.16166   4.678          0.000002899 ***
glage04 <= 45                  1.00575    0.17200   5.847          0.000000005 ***
glage05 > 45                   1.51719    0.18241   8.317 < 0.0000000000000002 ***
glincome02 <= 2300            -0.02803    0.21727  -0.129             0.897365    
glincome03 > 2300              0.17386    0.21368   0.814             0.415859    
glchild02 > 0                 -0.08882    0.10088  -0.880             0.378633    
glmarital02 = '已婚'           0.48576    0.09982   4.866          0.000001138 ***
gldur_work01 <= 15            -0.30166    0.40322  -0.748             0.454380    
gldur_work02 <= 84             0.05581    0.39685   0.141             0.888162    
gldur_work03 <= 144            0.19316    0.40889   0.472             0.636647    
gldur_work04 > 144             0.48729    0.40522   1.203             0.229159    
glhousetype02 = '自有'         0.07610    0.21606   0.352             0.724691    
glhousetype03 = '租住'        -0.04330    0.10722  -0.404             0.686324    
glnation02 = '南斯拉夫'        0.50457    0.52195   0.967             0.333694    
glnation03 = '其它非欧洲国家' -0.54416    0.25341  -2.147             0.031766 *  
glnation04 = '其它欧洲国家'   -0.99992    0.53370  -1.874             0.060991 .  
glnation05 = '土耳其'          0.06674    0.13657   0.489             0.625031    
glnation06 = '西班牙'         -0.13392    0.77746  -0.172             0.863237    
glnation07 = '希腊'            0.19612    0.32623   0.601             0.547727    
glnation08 = '意大利'          0.89119    0.55358   1.610             0.107426    
glcardtype02 = '其它信用卡'   -0.75915    0.75365  -1.007             0.313797    
glcardtype03 = '它行Visa卡'    0.29944    1.26889   0.236             0.813441    
glcardtype04 = '我行Visa卡'   -1.31870    1.26879  -1.039             0.298650    
glcardtype05 = '无信用卡'     -1.26384    0.32937  -3.837             0.000124 ***
glcardtype06 = '运通卡'       -0.74910    1.46779  -0.510             0.609798    
glcardtype07 = '支票账户'     -0.33311    0.29358  -1.135             0.256517    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 4158.9  on 2999  degrees of freedom
Residual deviance: 3687.3  on 2972  degrees of freedom
AIC: 3743.3

Number of Fisher Scoring iterations: 4

6、模型评估

到这里，我们的建模部分基本结束了。我们需要验证一下模型的预测能力如何。使用在建模开始阶段预留的250条数据进行检验:

prediction <- predict(cred_mod,newdata=test2)
for (i in 1:250) {
  if(prediction[i]>0.99){
    prediction[i]=1}
  else
  {prediction[i]=0}
}
confusionMatrix(prediction, test2$loan)

7、信用评分

在建立标准评分卡之前，我们需要选取几个评分卡参数：基础分值、 PDO（比率翻倍的分值）和好坏比。这里，我们取800分为基础分值，PDO为45 （每高45分好坏比翻一倍），好坏比取50。；可得下式：

845= q - p * log(50)

800= q - p * log(50/2)

p = 45/log(2)

q =800-20*log(50)/log(2)

其中总评分为基础分+部分得分。基础分可通过：

base <- q + p*as.numeric(coe[1])

cre_scal<-smbinning.scaling(cred_mod,pdo=45,score=800,odds=50)
cre_scal$logitscaled
cre_scal$minmaxscore

8、信用评分

dat4<-smbinning.scoring.gen(smbscaled=cre_scal, dataset=dat3)
boxplot(Score~loan,data=dat4,horizontal=T, frame=F, 
        col="lightgray",main="Distribution")

scaledcard<-cre_scal$logitscaled[[1]][-1,c(1,2,6)]
scaledcard[,1]<-c(rep("年龄",5),rep("收入",3),
                  rep("孩子数量",2),rep("婚否",2),rep("在现工作时间",5),
                  rep("住房类型",3),rep("国籍",8),rep("信用卡类型",7))
scaledcard
write.csv(scaledcard,"card.csv",row.names = F)

最终得出的打分卡结果为：

生成信用评分卡

原文笔误，修改于2019/4/15 00：00

基于R的信用评分卡模型解析

信用评分流程

猜你喜欢