import numpy as np
import matplotlib.pyplot as plt import pandas as pd from sklearn.model_selection import train_test_split import statsmodels.api as sm from statsmodels.graphics.mosaicplot import mosaic

%pylab inline
mpl.rcParams['font.sans-serif'] = ['SimHei'] # 指定默认字体
mpl.rcParams['axes.unicode_minus'] = False # 解决保存图像是负号 '-' 显示为方块的问题

Populating the interactive namespace from numpy and matplotlib

载入和分析数据

In [4]:

def readData(path):
    """  使用pandas读取数据  """ data = pd.read_csv(path) cols = ["age", "education_num", "capital_gain", "capital_loss", "hours_per_week", "label"] return data[cols]

data = readData('dataset/adult.data')

In [5]:

data.hist(rwidth=0.9, grid=False, figsize=(8, 8), alpha=0.6) # 各个变量的直方图 plt.show()

In [6]:

data.head()

Out[6]:

	age	education_num	capital_gain	hours_per_week	label
0	39	13	2174	40	<=50K
1	50	13	0	13	<=50K
2	38	9	0	40	<=50K
3	53	7	0	40	<=50K
4	28	13	0	40	<=50K

In [8]:

data['label_code'] = pd.Categorical(data.label).codes # 将文字变量转化为数字变量

In [10]:

data[['label', 'label_code']].head(8)

Out[10]:

	label	label_code
0	<=50K	0
1	<=50K	0
2	<=50K	0
3	<=50K	0
4	<=50K	0
5	<=50K	0
6	<=50K	0
7	>50K	1

In [18]:

def visualData(df):
    """  画直方图，直观了解数据  """ df.hist( rwidth=0.9, grid=False, figsize=(8, 8), alpha=0.6, color="grey") plt.show(block=False)

In [19]:

df = data[["age", "education_num", "capital_gain", "capital_loss", "hours_per_week", "label_code"]] visualData(df)

In [20]:

data.describe()  # 仅仅显示数值型变量的基本统计信息，若要显示全部变量的统计信息只需传入参数 `include='all'`

Out[20]:

	age	education_num	capital_gain	capital_loss	hours_per_week	label_code
count	32561.000000	32561.000000	32561.000000	32561.000000	32561.000000	32561.000000
mean	38.581647	10.080679	1077.648844	87.303830	40.437456	0.240810
std	13.640433	2.572720	7385.292085	402.960219	12.347429	0.427581
min	17.000000	1.000000	0.000000	0.000000	1.000000	0.000000
25%	28.000000	9.000000	0.000000	0.000000	40.000000	0.000000
50%	37.000000	10.000000	0.000000	0.000000	40.000000	0.000000
75%	48.000000	12.000000	0.000000	0.000000	45.000000	0.000000
max	90.000000	16.000000	99999.000000	4356.000000	99.000000	1.000000

交叉报表

交叉报表用来描述两个变量之间的关系。

按四分位数划分为

In [23]:

qeducation_num = pd.qcut(data['education_num'], [.25, .5, .75, 1]) qeducation_num

Out[23]:

0         (12.0, 16.0]
1         (12.0, 16.0]
2        (8.999, 10.0]
3                  NaN
4         (12.0, 16.0]
5         (12.0, 16.0]
6                  NaN
7        (8.999, 10.0]
8         (12.0, 16.0]
9         (12.0, 16.0]
10       (8.999, 10.0]
11        (12.0, 16.0]
12        (12.0, 16.0]
13        (10.0, 12.0]
14        (10.0, 12.0]
15                 NaN
16       (8.999, 10.0]
17       (8.999, 10.0]
18                 NaN
19        (12.0, 16.0]
20        (12.0, 16.0]
21       (8.999, 10.0]
22                 NaN
23                 NaN
24       (8.999, 10.0]
25        (12.0, 16.0]
26       (8.999, 10.0]
27       (8.999, 10.0]
28       (8.999, 10.0]
29       (8.999, 10.0]
             ...      
32531     (12.0, 16.0]
32532     (12.0, 16.0]
32533     (12.0, 16.0]
32534    (8.999, 10.0]
32535              NaN
32536     (12.0, 16.0]
32537    (8.999, 10.0]
32538     (12.0, 16.0]
32539     (12.0, 16.0]
32540    (8.999, 10.0]
32541    (8.999, 10.0]
32542    (8.999, 10.0]
32543     (10.0, 12.0]
32544     (12.0, 16.0]
32545     (10.0, 12.0]
32546     (10.0, 12.0]
32547    (8.999, 10.0]
32548     (12.0, 16.0]
32549    (8.999, 10.0]
32550    (8.999, 10.0]
32551              NaN
32552     (10.0, 12.0]
32553     (12.0, 16.0]
32554     (12.0, 16.0]
32555    (8.999, 10.0]
32556     (10.0, 12.0]
32557    (8.999, 10.0]
32558    (8.999, 10.0]
32559    (8.999, 10.0]
32560    (8.999, 10.0]
Name: education_num, Length: 32561, dtype: category
Categories (3, interval[float64]): [(8.999, 10.0] < (10.0, 12.0] < (12.0, 16.0]]

计算 education_num 和 label 的交叉报表

In [48]:

cross1 = pd.crosstab(data['label'], qeducation_num) cross1

Out[48]:

education_num	(8.999, 10.0]	(10.0, 12.0]	(12.0, 16.0]
label
<=50K	14730	1823	4158
>50K	3062	626	3909

将 cross1 图表化

In [65]:

props = lambda key: {"color": "0.45"} if ' >50K' in key else {"color": "#C6E2FF"} mosaic(cross1.stack(), gap=0.05,properties=props, title='label 和 education_num 的交叉报表') plt.show()

搭建模型，并训练模型

statsmodels 建模时可以使用类似文字的表达式:

formula 定义了模型的形式, ~ 相当于等号.

In [67]:

def trainModel(data):
    """  搭建逻辑回归模型，并训练模型  """ formula = "label_code ~ age + education_num + capital_gain + capital_loss + hours_per_week" model = sm.Logit.from_formula(formula, data=data) re = model.fit() return re

In [70]:

def modelSummary(re):
    """  分析逻辑回归模型的统计性质  """ # 整体统计分析结果 print(re.summary()) # 用f test检验education_num的系数是否显著 print("检验假设education_num的系数等于0：") print(re.f_test("education_num=0")) # 用f test检验两个假设是否同时成立 print("检验假设education_num的系数等于0.32和hours_per_week的系数等于0.04同时成立：") print(re.f_test("education_num=0.32, hours_per_week=0.04"))

def interpretModel(re):
"""
理解模型结果

扫描二维码关注公众号，回复： 10204081 查看本文章

参数
----
re ：BinaryResults，训练好的逻辑回归模型
"""
conf = re.conf_int()
conf['OR'] = re.params
# 计算各个变量对事件发生比的影响
# conf里面的三列，分别对应着估计值的下界、上界和估计值本身
conf.columns = ['2.5%', '97.5%', 'OR']
print("各个变量对事件发生比的影响：")
print(np.exp(conf))
# 计算各个变量的边际效应
print("各个变量的边际效应：")
print(re.get_margeff(at="overall").summary())

def makePrediction(re, testSet, alpha=0.5):
"""
使用训练好的模型对测试数据做预测
"""
# 关闭pandas有关chain_assignment的警告
pd.options.mode.chained_assignment = None
# 计算事件发生的概率
testSet["prob"] = re.predict(testSet)
print("事件发生概率（预测概率）大于0.6的数据个数：")
print(testSet[testSet["prob"] > 0.6].shape[0]) # 输出值为576
print("事件发生概率（预测概率）大于0.5的数据个数：")
print(testSet[testSet["prob"] > 0.5].shape[0]) # 输出值为834
# 根据预测的概率，得出最终的预测
testSet["pred"] = testSet.apply(lambda x: 1 if x["prob"] > alpha else 0, axis=1)
return testSet

def evaluation(re):
"""
计算预测结果的查准查全率以及f1

参数
----
re ：DataFrame，预测结果，里面包含两列：真实值‘lable_code’、预测值‘pred’
"""
bins = np.array([0, 0.5, 1])
label = re["label_code"]
pred = re["pred"]
tp, fp, fn, tn = np.histogram2d(label, pred, bins=bins)[0].flatten()
precision = tp / (tp + fp) # 0.951
recall = tp / (tp + fn) # 0.826
f1 = 2 precision recall / (precision + recall) # 0.884
print("查准率: %.3f, 查全率: %.3f, f1: %.3f" % (precision, recall, f1))

In [72]:

# 将数据分为训练集和测试集
trainSet, testSet = train_test_split(data, test_size=0.2, random_state=2310)

# 训练模型并分析模型效果
re = trainModel(trainSet)
modelSummary(re)
interpretModel(re)

Optimization terminated successfully.
         Current function value: 0.406094
         Iterations 8
                           Logit Regression Results                           
==============================================================================
Dep. Variable:             label_code   No. Observations:                26048
Model:                          Logit   Df Residuals:                    26042
Method:                           MLE   Df Model:                            5
Date:                Wed, 22 Aug 2018   Pseudo R-squ.:                  0.2639
Time:                        19:29:31   Log-Likelihood:                -10578.
converged:                       True   LL-Null:                       -14370.
                                        LLR p-value:                     0.000
==================================================================================
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept         -8.2970      0.128    -64.623      0.000      -8.549      -8.045
age                0.0435      0.001     31.726      0.000       0.041       0.046
education_num      0.3215      0.008     42.231      0.000       0.307       0.336
capital_gain       0.0003   1.07e-05     29.650      0.000       0.000       0.000
capital_loss       0.0007   3.64e-05     20.055      0.000       0.001       0.001
hours_per_week     0.0399      0.001     26.995      0.000       0.037       0.043
==================================================================================
检验假设education_num的系数等于0：
<F test: F=array([[1783.4276255]]), p=0.0, df_denom=26042, df_num=1>
检验假设education_num的系数等于0.32和hours_per_week的系数等于0.04同时成立：
<F test: F=array([[0.01940236]]), p=0.9807846677772952, df_denom=26042, df_num=2>
各个变量对事件发生比的影响：
                    2.5%     97.5%        OR
Intercept       0.000194  0.000321  0.000249
age             1.041611  1.047218  1.044411
education_num   1.358725  1.399879  1.379149
capital_gain    1.000298  1.000340  1.000319
capital_loss    1.000659  1.000802  1.000731
hours_per_week  1.037733  1.043769  1.040746
各个变量的边际效应：
        Logit Marginal Effects       
=====================================
Dep. Variable:             label_code
Method:                          dydx
At:                           overall
==================================================================================
                    dy/dx    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
age                0.0056      0.000     33.563      0.000       0.005       0.006
education_num      0.0413      0.001     47.313      0.000       0.040       0.043
capital_gain     4.09e-05    1.3e-06     31.500      0.000    3.84e-05    4.34e-05
capital_loss    9.372e-05   4.54e-06     20.648      0.000    8.48e-05       0.000
hours_per_week     0.0051      0.000     28.167      0.000       0.005       0.005
==================================================================================

In [73]:

re = makePrediction(re, testSet) evaluation(re)

事件发生概率（预测概率）大于0.6的数据个数：
576
事件发生概率（预测概率）大于0.5的数据个数：
834
查准率: 0.951, 查全率: 0.826, f1: 0.884

python模型的统计信息

载入和分析数据

交叉报表

搭建模型，并训练模型

猜你喜欢