在当下我们生活的环境中,经常会接触各种购物平台,如淘宝、京东等等,我们都是其中的用户之一。如果咱们长时间不用某一平台,可能会收到该平台的促销信息。那么平台为什么给我们发这个消息呢?显然平台是经过了数据分析:它会分析我们流失(不再使用这个平台)的可能性有多大。现在拿到某平台的一组数据,进行建模分析。
from __future__ import division
import pandas as pd
import numpy as np

# Load the churn data set and take a first look at its structure.
churn_df = pd.read_csv('churn.csv')
col_names = churn_df.columns.tolist()  # list of every column name
print ("Column names:")
print (col_names)

# NOTE: this selects the first six and last six *columns* (not rows);
# head(6) then shows the first 6 rows of that column subset.
to_show = col_names[:6] + col_names[-6:]
print ("\nSample data:")
churn_df[to_show].head(6)  # bare expression -> only displays in a notebook cell
打印结果:
Out[2]:
# Extract the target column and encode the 'True.'/'False.' strings as 1/0.
churn_result = churn_df['Churn?']
y = np.where(churn_result == 'True.', 1, 0)

# Identifier-like columns carry no predictive signal; drop them together
# with the target itself so it cannot leak into the features.
to_drop = ['State', 'Area Code', 'Phone', 'Churn?']
churn_feat_space = churn_df.drop(to_drop, axis=1)

# 'yes'/'no' columns become booleans; NumPy later coerces them to 1./0.
yes_no_cols = ["Int'l Plan", "VMail Plan"]
churn_feat_space[yes_no_cols] = churn_feat_space[yes_no_cols] == 'yes'

# Keep the feature names around for later inspection.
features = churn_feat_space.columns

# DataFrame.as_matrix() was removed in pandas 1.0 and np.float was removed
# in NumPy 1.24 -- use to_numpy() and the builtin float instead.
X = churn_feat_space.to_numpy().astype(float)

# Standardize every feature to zero mean / unit variance; important for
# distance- and margin-based models (KNN, SVM) used below.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)

print("Feature space holds %d observations and %d features" % X.shape)
print ("Unique target labels:", np.unique(y))
print (X[0])
print (len(y[y == 0]))
打印结果:
Feature space holds 3333 observations and 17 features Unique target labels: [0 1] [ 0.67648946 -0.32758048 1.6170861 1.23488274 1.56676695 0.47664315 1.56703625 -0.07060962 -0.05594035 -0.07042665 0.86674322 -0.46549436 0.86602851 -0.08500823 -0.60119509 -0.0856905 -0.42793202] 2850
交叉验证
# sklearn.cross_validation was removed in scikit-learn 0.20;
# KFold now lives in sklearn.model_selection with a new API.
from sklearn.model_selection import KFold


def run_cv(X, y, clf_class, **kwargs):
    """Return out-of-fold predictions for every sample in X.

    A fresh ``clf_class(**kwargs)`` model is fitted on each of 5 training
    folds and used to predict the corresponding held-out fold, so each
    sample's prediction comes from a model that never saw it.

    Returns an array shaped like ``y`` holding the predicted labels.
    """
    # Modern KFold takes n_splits and yields indices via split(X),
    # instead of the removed KFold(len(y), n_folds=...) iteration.
    kf = KFold(n_splits=5, shuffle=True)
    y_pred = y.copy()
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        # One freshly initialized classifier per fold.
        clf = clf_class(**kwargs)
        clf.fit(X_train, y_train)
        y_pred[test_index] = clf.predict(X_test)
    return y_pred
用几种机器学习模型去跑精度
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.neighbors import KNeighborsClassifier as KNN


def accuracy(y_true, y_pred):
    """Fraction of predictions matching the true labels.

    NumPy treats the elementwise boolean matches as 1.0/0.0, so the
    mean of the comparison is exactly the accuracy.
    """
    return np.mean(y_true == y_pred)


# Compare cross-validated accuracy across three classifier families.
for label, model in (("Support vector machines", SVC),
                     ("Random forest", RF),
                     ("K-nearest-neighbors", KNN)):
    print ("%s:" % label)
    print ("%.3f" % accuracy(y, run_cv(X, y, model)))
打印结果:
Support vector machines: 0.913 Random forest: 0.938 K-nearest-neighbors: 0.894

下面定义的这个函数在项目实践中非常实用:
# KFold moved to sklearn.model_selection in scikit-learn 0.20.
from sklearn.model_selection import KFold


def run_prob_cv(X, y, clf_class, **kwargs):
    """Like run_cv, but collect out-of-fold class *probabilities*.

    Handy for comparing several models on the same data: pass any
    classifier class plus its keyword arguments.

    Returns an (n_samples, 2) array -- column 0 is P(class 0) and
    column 1 is P(class 1), each produced by a model that never saw
    that sample during training.
    """
    # KFold(len(y), n_folds=...) is the removed pre-0.20 API;
    # use n_splits together with split() instead.
    kf = KFold(n_splits=5, shuffle=True)
    y_prob = np.zeros((len(y), 2))
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        clf = clf_class(**kwargs)
        clf.fit(X_train, y_train)
        # Probabilities, not hard class labels.
        y_prob[test_index] = clf.predict_proba(X_test)
    return y_prob
基于业务角度去分析时,平台一般会设定一个阈值:当用户的流失概率达到多大时,才触发预警通知。现在我们按概率值分档,统计每一档的真实流失率,以便设计用户流失预警的概率阈值。
import warnings
warnings.filterwarnings('ignore')

# With 10 trees, every predicted probability is a multiple of 0.1,
# which makes the probabilities easy to bucket and compare.
pred_prob = run_prob_cv(X, y, RF, n_estimators=10)
pred_churn = pred_prob[:, 1]  # P(churn) for every observation
is_churn = y == 1

# How many observations received each predicted probability.
# The top-level pd.value_counts() function is deprecated (removed in
# pandas 3.0); call the Series method instead.
counts = pd.Series(pred_churn).value_counts()

# Empirical churn rate among observations sharing each predicted
# probability -- a calibration check: a well-calibrated model has
# true_prob close to pred_prob in every bucket.
true_prob = {}
for prob in counts.index:
    true_prob[prob] = np.mean(is_churn[pred_churn == prob])
true_prob = pd.Series(true_prob)

# Assemble a pred_prob / count / true_prob summary table.
counts = pd.concat([counts, true_prob], axis=1).reset_index()
counts.columns = ['pred_prob', 'count', 'true_prob']
counts
打印结果:
pred_prob | count | true_prob | |
---|---|---|---|
0 | 0.0 | 1757 | 0.028458 |
1 | 0.1 | 716 | 0.032123 |
2 | 0.2 | 255 | 0.035294 |
3 | 0.3 | 122 | 0.131148 |
4 | 0.4 | 84 | 0.428571 |
5 | 0.7 | 82 | 0.902439 |
6 | 0.9 | 79 | 0.974684 |
7 | 0.8 | 65 | 0.938462 |
8 | 1.0 | 62 | 1.000000 |
9 | 0.5 | 61 | 0.622951 |
10 | 0.6 | 50 | 0.740000 |