(本节内容的数据见电脑 F:/python数据/customer 或腾讯微云文件“python数据\customer”)
包含27个相关的特征(姓名、地址、教育情况);还有一个会员卡的类型(金卡、银卡、铜卡、普通卡)
1.决策树
特征的选择:特征列太多,我们先选择三个数字型特征的列(年收入,小孩数,家庭汽车拥有量)。年收入是一个范围,我们要替换一下才能用;
import pandas as pd

frame = pd.read_csv('F:/python数据/customer.csv')
print(frame['yearly_income'].head(2))
# Method 1: drop every character that is not a digit 0-9, so a range keeps
# both bounds glued together (per the original note, 3050 stands for 30-50).
# regex=True is explicit because pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave the column unchanged.
frame['yearly_income'] = frame['yearly_income'].str.replace('[^0-9]', '', regex=True)
print(frame['yearly_income'].head(2))
方法二:
import pandas as pd

frame = pd.read_csv('F:/python数据/customer.csv')
print(frame['yearly_income'].head(2))
# Method 2: keep only the text before the first space (the lower bound of the
# range) and strip its non-digit characters.
# regex=True is explicit because pandas >= 2.0 treats the pattern as a literal
# string by default.
frame['yearly_income'] = (
    frame['yearly_income']
    .str.split(' ').str[0]
    .str.replace('[^0-9]', '', regex=True)
)
print(frame['yearly_income'].head(2))
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

frame = pd.read_csv('F:/python数据/customer.csv')
# Use the lower bound of the income range as the yearly income.
# regex=True: pandas >= 2.0 treats the pattern literally by default.
# astype(int): hand the model real numbers instead of digit strings.
frame['yearly_income'] = (
    frame['yearly_income']
    .str.split(' ').str[0]
    .str.replace('[^0-9]', '', regex=True)
    .astype(int)
)
y = frame['member_card']  # membership card type is the prediction target
X = frame[['yearly_income', 'total_children', 'num_cars_owned']]  # three numeric features
clf = DecisionTreeClassifier()
# Mean cross-validated accuracy (fold count is the scikit-learn default).
scores = cross_val_score(clf, X, y, scoring='accuracy')
print(np.mean(scores))
如果能够引入更多的分类特征,决策树的效果会更好一些,比如受教育程度和职业与会员等级也有很大的联系
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Demo: LabelEncoder maps each distinct education string to an integer code.
frame = pd.read_csv('F:/python数据/customer.csv')
education = frame['education']
encoding = LabelEncoder()
encoding.fit(education)
education_new = encoding.transform(education)
# Show the raw strings next to their integer codes.
print(education.values)
print(education_new)
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
import numpy as np

frame = pd.read_csv('F:/python数据/customer.csv')
# Use the lower bound of the income range as the yearly income.
# regex=True: pandas >= 2.0 treats the pattern literally by default.
frame['yearly_income'] = (
    frame['yearly_income']
    .str.split(' ').str[0]
    .str.replace('[^0-9]', '', regex=True)
    .astype(int)
)
# Map each education string to an integer code.
encoding = LabelEncoder()
encoding.fit(frame['education'])
frame['education_new'] = encoding.transform(frame['education'])
y = frame['member_card']  # membership card type is the prediction target
# The encoded education column is part of the feature set; the original built
# it but left it out of X, so the score was identical to the three-feature run.
X = frame[['yearly_income', 'total_children', 'num_cars_owned', 'education_new']]
clf = DecisionTreeClassifier()
scores = cross_val_score(clf, X, y, scoring='accuracy')
print(np.mean(scores))
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Demo: one-hot encode the education column and print the dense result.
frame = pd.read_csv('F:/python数据/customer.csv')
education_values = frame['education'].values
print(education_values)
# OneHotEncoder expects a 2-D input, so stack the values into a single column.
education_column = np.vstack(education_values)
encoding = OneHotEncoder()
encoded_sparse = encoding.fit_transform(education_column)
newData = encoded_sparse.todense()
print(newData)
import pandas as pd
import numpy as np

# Demo: compare the flat education array with its column-shaped form.
frame = pd.read_csv('F:/python数据/customer.csv')
education_values = frame['education'].values
print(education_values)
# vstack stands the sequence up as one column (one row per record), which is
# the 2-D layout the one-hot encoder needs.
education_column = np.vstack(education_values)
print(education_column)
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Demo: attach the one-hot education columns to the three numeric columns.
frame = pd.read_csv('F:/python数据/customer.csv')
encoding = OneHotEncoder()
education_column = np.vstack(frame['education'].values)
newData = encoding.fit_transform(education_column).todense()
frame_new = pd.DataFrame(newData)
numeric_part = frame[['yearly_income', 'total_children', 'num_cars_owned']]
# Both frames share the same row index, so an index-on-index merge lines the
# one-hot columns up with their source rows.
frame_full = numeric_part.merge(frame_new, left_index=True, right_index=True)
print(frame_full)
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
import numpy as np

frame = pd.read_csv('F:/python数据/customer.csv')
y = frame['member_card']  # membership card type is the prediction target
# Lower bound of the income range, as an integer.
# regex=True: pandas >= 2.0 treats the pattern literally by default.
frame['yearly_income'] = (
    frame['yearly_income']
    .str.split(' ').str[0]
    .str.replace('[^0-9]', '', regex=True)
    .astype(int)
)
encoding = OneHotEncoder()
# toarray() yields a plain ndarray rather than the legacy np.matrix.
newData = encoding.fit_transform(np.vstack(frame['education'].values)).toarray()
# Give the one-hot columns string names: scikit-learn >= 1.2 raises a
# TypeError when a DataFrame mixes string and integer column names.
frame_new = pd.DataFrame(
    newData,
    columns=['education_' + str(c) for c in encoding.categories_[0]],
    index=frame.index,
)
frame_full = pd.merge(
    frame[['yearly_income', 'total_children', 'num_cars_owned']],
    frame_new,
    left_index=True,
    right_index=True,
)
X = frame_full
clf = DecisionTreeClassifier()
scores = cross_val_score(clf, X, y, scoring='accuracy')
print(np.mean(scores))
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
import numpy as np

frame = pd.read_csv('F:/python数据/customer.csv')
y = frame['member_card']  # membership card type is the prediction target
# Lower bound of the income range, as an integer.
# regex=True: pandas >= 2.0 treats the pattern literally by default.
frame['yearly_income'] = (
    frame['yearly_income']
    .str.split(' ').str[0]
    .str.replace('[^0-9]', '', regex=True)
    .astype(int)
)
encoding = OneHotEncoder()
# Same pipeline as the education variant, but one-hot encoding
# 'marital_status' instead. The original notes report a much higher accuracy
# after this swap.
newData = encoding.fit_transform(np.vstack(frame['marital_status'].values)).toarray()
# String column names: scikit-learn >= 1.2 rejects mixed str/int column names.
frame_new = pd.DataFrame(
    newData,
    columns=['marital_status_' + str(c) for c in encoding.categories_[0]],
    index=frame.index,
)
frame_full = pd.merge(
    frame[['yearly_income', 'total_children', 'num_cars_owned']],
    frame_new,
    left_index=True,
    right_index=True,
)
X = frame_full
clf = DecisionTreeClassifier()
scores = cross_val_score(clf, X, y, scoring='accuracy')
print(np.mean(scores))
2.随机森林
单棵决策树只是在挑选出的特征列上建立一棵树;如果我们根据不同的列(特征)组合和行(样本)组合建立多棵不同的决策树,分别进行预测,再综合它们的结果,这就是随机森林
用起来很简单,我们只需要将DecisionTreeClassifier()替换为RandomForestClassifier()就可以了
import pandas as pd
from sklearn.ensemble import RandomForestClassifier  # swapped in for the decision tree
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
import numpy as np

frame = pd.read_csv('F:/python数据/customer.csv')
y = frame['member_card']  # membership card type is the prediction target
# Lower bound of the income range, as an integer.
# regex=True: pandas >= 2.0 treats the pattern literally by default.
frame['yearly_income'] = (
    frame['yearly_income']
    .str.split(' ').str[0]
    .str.replace('[^0-9]', '', regex=True)
    .astype(int)
)
encoding = OneHotEncoder()
# toarray() yields a plain ndarray rather than the legacy np.matrix.
newData = encoding.fit_transform(np.vstack(frame['marital_status'].values)).toarray()
# String column names: scikit-learn >= 1.2 rejects mixed str/int column names.
frame_new = pd.DataFrame(
    newData,
    columns=['marital_status_' + str(c) for c in encoding.categories_[0]],
    index=frame.index,
)
frame_full = pd.merge(
    frame[['yearly_income', 'total_children', 'num_cars_owned']],
    frame_new,
    left_index=True,
    right_index=True,
)
X = frame_full
clf = RandomForestClassifier()  # the only change from the decision-tree run
scores = cross_val_score(clf, X, y, scoring='accuracy')
print(np.mean(scores))
随机森林的参数很多,我们可以用网格搜索(GridSearchCV)来寻找最优的参数组合
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from sklearn.model_selection import GridSearchCV

frame = pd.read_csv('F:/python数据/customer.csv')
y = frame['member_card']  # membership card type is the prediction target
# Lower bound of the income range, as an integer.
# regex=True: pandas >= 2.0 treats the pattern literally by default.
frame['yearly_income'] = (
    frame['yearly_income']
    .str.split(' ').str[0]
    .str.replace('[^0-9]', '', regex=True)
    .astype(int)
)
encoding = OneHotEncoder()
# toarray() yields a plain ndarray rather than the legacy np.matrix.
newData = encoding.fit_transform(np.vstack(frame['marital_status'].values)).toarray()
# String column names: scikit-learn >= 1.2 rejects mixed str/int column names.
frame_new = pd.DataFrame(
    newData,
    columns=['marital_status_' + str(c) for c in encoding.categories_[0]],
    index=frame.index,
)
frame_full = pd.merge(
    frame[['yearly_income', 'total_children', 'num_cars_owned']],
    frame_new,
    left_index=True,
    right_index=True,
)
X = frame_full
# Candidate hyper-parameters; GridSearchCV tries every combination.
parameter_space = {
    # 'sqrt' is what 'auto' meant for classifiers; 'auto' itself was removed
    # in scikit-learn 1.3.
    'max_features': [2, 4, 'sqrt'],
    'n_estimators': [100],
    'criterion': ['gini', 'entropy'],
    'min_samples_leaf': [2, 4, 6],
}
clf = RandomForestClassifier()
grid = GridSearchCV(clf, parameter_space)
grid.fit(X, y)
print(grid.best_estimator_)  # estimator refit with the best combination
print(grid.best_score_)      # its mean cross-validated score
把最优参数填进去
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
import numpy as np

frame = pd.read_csv('F:/python数据/customer.csv')
y = frame['member_card']  # membership card type is the prediction target
# Lower bound of the income range, as an integer.
# regex=True: pandas >= 2.0 treats the pattern literally by default.
frame['yearly_income'] = (
    frame['yearly_income']
    .str.split(' ').str[0]
    .str.replace('[^0-9]', '', regex=True)
    .astype(int)
)
encoding = OneHotEncoder()
# toarray() yields a plain ndarray rather than the legacy np.matrix.
newData = encoding.fit_transform(np.vstack(frame['marital_status'].values)).toarray()
# String column names: scikit-learn >= 1.2 rejects mixed str/int column names.
frame_new = pd.DataFrame(
    newData,
    columns=['marital_status_' + str(c) for c in encoding.categories_[0]],
    index=frame.index,
)
frame_full = pd.merge(
    frame[['yearly_income', 'total_children', 'num_cars_owned']],
    frame_new,
    left_index=True,
    right_index=True,
)
X = frame_full
# Settings copied from the best estimator printed by the grid search.
# min_impurity_split is dropped (removed in scikit-learn 1.0, and it was None
# anyway); max_features='auto' becomes its classifier equivalent 'sqrt'
# ('auto' was removed in scikit-learn 1.3).
clf = RandomForestClassifier(
    bootstrap=True, class_weight=None, criterion='gini',
    max_depth=None, max_features='sqrt', max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_samples_leaf=6, min_samples_split=2,
    min_weight_fraction_leaf=0.0, n_estimators=100,
    n_jobs=None, oob_score=False, random_state=None,
    verbose=0, warm_start=False,
)
scores = cross_val_score(clf, X, y, scoring='accuracy')
print(np.mean(scores))
4.会员卡预测改进
(本节内容的数据见电脑 F:/python数据/customer 或腾讯微云文件“python数据\customer”)
包含27个相关的特征(姓名、地址、教育情况);还有一个会员卡的类型(金卡、银卡、铜卡、普通卡)
1.数据的预处理
import pandas as pd

frame = pd.read_csv('F:/python数据/customer.csv')
income = frame['yearly_income']
# For a text column, describe() reports the row count, the number of distinct
# values, the most frequent value and how often it occurs.
print(income.describe())
print('------------------------------------------------')
# List the distinct income ranges (eight of them, according to the original note).
print(income.unique())
import pandas as pd
from sklearn.ensemble import RandomForestClassifier  # swapped in for the decision tree
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from sklearn import preprocessing

frame = pd.read_csv('F:/python数据/customer.csv')
y = frame['member_card']  # membership card type is the prediction target
# Lower bound of the income range, as a float ready for scaling.
# regex=True: pandas >= 2.0 treats the pattern literally by default.
frame['yearly_income'] = (
    frame['yearly_income']
    .str.split(' ').str[0]
    .str.replace('[^0-9]', '', regex=True)
    .astype(float)
)
# Standardise the income to zero mean and unit standard deviation.
frame['yearly_income_new'] = preprocessing.scale(frame['yearly_income'])
encoding = OneHotEncoder()
# toarray() yields a plain ndarray rather than the legacy np.matrix.
newData = encoding.fit_transform(np.vstack(frame['marital_status'].values)).toarray()
# String column names: scikit-learn >= 1.2 rejects mixed str/int column names.
frame_new = pd.DataFrame(
    newData,
    columns=['marital_status_' + str(c) for c in encoding.categories_[0]],
    index=frame.index,
)
frame_full = pd.merge(
    frame[['yearly_income_new', 'total_children', 'num_cars_owned']],
    frame_new,
    left_index=True,
    right_index=True,
)
X = frame_full
clf = RandomForestClassifier()
scores = cross_val_score(clf, X, y, scoring='accuracy')
print(np.mean(scores))
import pandas as pd

frame = pd.read_csv('F:/python数据/customer.csv')
# Lower bound of the income range with the non-digit characters stripped.
# regex=True: pandas >= 2.0 treats the pattern literally by default.
frame['yearly_income'] = (
    frame['yearly_income']
    .str.split(' ').str[0]
    .str.replace('[^0-9]', '', regex=True)
)
frame['yearly_income_new'] = frame['yearly_income'].astype(int)
print(frame['yearly_income_new'].describe())
# The original notes report std=35.973839 here. That is the standard deviation
# (not the variance); the notes judge the values too spread out and rescale
# them in the next step.
import pandas as pd

frame = pd.read_csv('F:/python数据/customer.csv')
# Lower bound of the income range with the non-digit characters stripped.
# regex=True: pandas >= 2.0 treats the pattern literally by default.
frame['yearly_income'] = (
    frame['yearly_income']
    .str.split(' ').str[0]
    .str.replace('[^0-9]', '', regex=True)
)
frame['yearly_income_new'] = frame['yearly_income'].astype(int)
# Integer-divide by 30 to shrink the values into a handful of small buckets.
frame['yearly_income_new'] = frame['yearly_income_new'] // 30
print(frame['yearly_income_new'].describe())
import pandas as pd
from sklearn.ensemble import RandomForestClassifier  # swapped in for the decision tree
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
import numpy as np

frame = pd.read_csv('F:/python数据/customer.csv')
y = frame['member_card']  # membership card type is the prediction target
# Lower bound of the income range with the non-digit characters stripped.
# regex=True: pandas >= 2.0 treats the pattern literally by default.
frame['yearly_income'] = (
    frame['yearly_income']
    .str.split(' ').str[0]
    .str.replace('[^0-9]', '', regex=True)
)
frame['yearly_income_new'] = frame['yearly_income'].astype(int)
# Integer-divide by 30 to shrink the values into a handful of small buckets.
frame['yearly_income_new'] = frame['yearly_income_new'] // 30
encoding = OneHotEncoder()
# toarray() yields a plain ndarray rather than the legacy np.matrix.
newData = encoding.fit_transform(np.vstack(frame['marital_status'].values)).toarray()
# String column names: scikit-learn >= 1.2 rejects mixed str/int column names.
frame_new = pd.DataFrame(
    newData,
    columns=['marital_status_' + str(c) for c in encoding.categories_[0]],
    index=frame.index,
)
frame_full = pd.merge(
    frame[['yearly_income_new', 'total_children', 'num_cars_owned']],
    frame_new,
    left_index=True,
    right_index=True,
)
X = frame_full
clf = RandomForestClassifier()
scores = cross_val_score(clf, X, y, scoring='accuracy')
print(np.mean(scores))
import pandas as pd

frame = pd.read_csv('F:/python数据/customer.csv')
# Age when the account was opened: opening year minus birth year.
opened_year = pd.to_datetime(frame['date_accnt_opened']).dt.year
birth_year = pd.to_datetime(frame['birthdate']).dt.year
frame['age'] = opened_year - birth_year
# Integer-divide by 20 to collapse ages into 20-year buckets.
frame['age'] = frame['age'] // 20
print(frame['age'].describe())
import pandas as pd

frame = pd.read_csv('F:/python数据/customer.csv')
# Age when the account was opened: opening year minus birth year.
opened_year = pd.to_datetime(frame['date_accnt_opened']).dt.year
birth_year = pd.to_datetime(frame['birthdate']).dt.year
frame['age'] = opened_year - birth_year
# Integer-divide by 20 to collapse ages into 20-year buckets.
frame['age'] = frame['age'] // 20
# Fold buckets >= 3 and buckets <= 1 into the single category 1. The original
# note's reasoning: the under-20 and over-60 groups both have limited spending
# power.
# NOTE(review): bucket 1 (ages 20-39) lands in the same category as well,
# which the stated intent does not mention -- confirm this is wanted.
merged_bucket = (frame['age'] >= 3) | (frame['age'] <= 1)
frame.loc[merged_bucket, 'age'] = 1
import pandas as pd
from sklearn.ensemble import RandomForestClassifier  # swapped in for the decision tree
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
import numpy as np

frame = pd.read_csv('F:/python数据/customer.csv')
y = frame['member_card']  # membership card type is the prediction target
# Age when the account was opened: opening year minus birth year.
frame['age'] = (
    pd.to_datetime(frame['date_accnt_opened']).dt.year
    - pd.to_datetime(frame['birthdate']).dt.year
)
frame['age'] = frame['age'] // 20  # 20-year buckets
# Fold buckets >= 3 and <= 1 into category 1 (the original note's reasoning:
# under-20 and over-60 customers both have limited spending power).
# NOTE(review): bucket 1 (ages 20-39) is merged as well -- confirm intent.
frame.loc[frame['age'] >= 3, 'age'] = 1
frame.loc[frame['age'] <= 1, 'age'] = 1
# Lower bound of the income range, as an integer.
# regex=True: pandas >= 2.0 treats the pattern literally by default.
frame['yearly_income'] = (
    frame['yearly_income']
    .str.split(' ').str[0]
    .str.replace('[^0-9]', '', regex=True)
    .astype(int)
)
encoding = OneHotEncoder()
# toarray() yields a plain ndarray rather than the legacy np.matrix.
newData = encoding.fit_transform(np.vstack(frame['marital_status'].values)).toarray()
# String column names: scikit-learn >= 1.2 rejects mixed str/int column names.
frame_new = pd.DataFrame(
    newData,
    columns=['marital_status_' + str(c) for c in encoding.categories_[0]],
    index=frame.index,
)
frame_full = pd.merge(
    frame[['age', 'yearly_income', 'total_children', 'num_cars_owned']],
    frame_new,
    left_index=True,
    right_index=True,
)
X = frame_full
clf = RandomForestClassifier()
scores = cross_val_score(clf, X, y, scoring='accuracy')
print(np.mean(scores))
2.数据的选择
①使用SelectKBest
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from sklearn.feature_selection import SelectKBest
# SelectKBest needs a score function called as score_func(X, y) that returns
# (scores, p-values). sklearn's chi2 has that shape; scipy's chi2_contingency
# (imported here originally) takes a contingency table instead and cannot be
# used as a score function.
from sklearn.feature_selection import chi2

frame = pd.read_csv('F:/python数据/customer.csv')
# Age when the account was opened: opening year minus birth year.
frame['age'] = (
    pd.to_datetime(frame['date_accnt_opened']).dt.year
    - pd.to_datetime(frame['birthdate']).dt.year
)
frame['age'] = frame['age'] // 20  # 20-year buckets
# Fold buckets >= 3 and <= 1 into category 1 (the original note's reasoning:
# under-20 and over-60 customers both have limited spending power).
# NOTE(review): bucket 1 (ages 20-39) is merged as well -- confirm intent.
frame.loc[frame['age'] >= 3, 'age'] = 1
frame.loc[frame['age'] <= 1, 'age'] = 1
# Lower bound of the income range, as an integer.
# regex=True: pandas >= 2.0 treats the pattern literally by default.
frame['yearly_income'] = (
    frame['yearly_income']
    .str.split(' ').str[0]
    .str.replace('[^0-9]', '', regex=True)
    .astype(int)
)
encoding = OneHotEncoder()
# toarray() yields a plain ndarray rather than the legacy np.matrix.
newData = encoding.fit_transform(np.vstack(frame['marital_status'].values)).toarray()
# String column names: scikit-learn >= 1.2 rejects mixed str/int column names.
frame_new = pd.DataFrame(
    newData,
    columns=['marital_status_' + str(c) for c in encoding.categories_[0]],
    index=frame.index,
)
frame_full = pd.merge(
    frame[['age', 'yearly_income', 'total_children', 'num_cars_owned']],
    frame_new,
    left_index=True,
    right_index=True,
)
X = frame_full
y = frame['member_card']
# score_func=chi2 scores each feature with a chi-squared test against y;
# k='all' keeps every feature, so this only ranks them.
transformer = SelectKBest(score_func=chi2, k='all')
Xt_chi2 = transformer.fit_transform(X, y)
print(transformer.scores_)
# Scores are printed in column order: age, yearly_income, total_children,
# num_cars_owned, then one column per marital_status category.
# The original notes report that the first feature (age) scores very low,
# i.e. shows little relation to the card type, while the others score high.
②使用PCA
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from sklearn.decomposition import PCA

frame = pd.read_csv('F:/python数据/customer.csv')
y = frame['member_card']
# Age when the account was opened: opening year minus birth year.
frame['age'] = (
    pd.to_datetime(frame['date_accnt_opened']).dt.year
    - pd.to_datetime(frame['birthdate']).dt.year
)
frame['age'] = frame['age'] // 20  # 20-year buckets
# Fold buckets >= 3 and <= 1 into category 1 (the original note's reasoning:
# under-20 and over-60 customers both have limited spending power).
# NOTE(review): bucket 1 (ages 20-39) is merged as well -- confirm intent.
frame.loc[frame['age'] >= 3, 'age'] = 1
frame.loc[frame['age'] <= 1, 'age'] = 1
# Lower bound of the income range, as an integer.
# regex=True: pandas >= 2.0 treats the pattern literally by default.
frame['yearly_income'] = (
    frame['yearly_income']
    .str.split(' ').str[0]
    .str.replace('[^0-9]', '', regex=True)
    .astype(int)
)
encoding = OneHotEncoder()
# toarray() yields a plain ndarray rather than the legacy np.matrix.
newData = encoding.fit_transform(np.vstack(frame['marital_status'].values)).toarray()
# String column names: scikit-learn >= 1.2 rejects mixed str/int column names.
frame_new = pd.DataFrame(
    newData,
    columns=['marital_status_' + str(c) for c in encoding.categories_[0]],
    index=frame.index,
)
frame_full = pd.merge(
    frame[['age', 'yearly_income', 'total_children', 'num_cars_owned']],
    frame_new,
    left_index=True,
    right_index=True,
)
X = frame_full
# Project the features onto their first two principal components.
pca = PCA(n_components=2)
Xd = pca.fit_transform(X)
np.set_printoptions(precision=3, suppress=True)  # print 3 decimals, no scientific notation
# Fraction of the total variance captured by each component. The original
# notes report that the first component dominates.
print(pca.explained_variance_ratio_)
print(Xd)
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

frame = pd.read_csv('F:/python数据/customer.csv')
y = frame['member_card']
# Age when the account was opened: opening year minus birth year.
frame['age'] = (
    pd.to_datetime(frame['date_accnt_opened']).dt.year
    - pd.to_datetime(frame['birthdate']).dt.year
)
frame['age'] = frame['age'] // 20  # 20-year buckets
# Fold buckets >= 3 and <= 1 into category 1 (the original note's reasoning:
# under-20 and over-60 customers both have limited spending power).
# NOTE(review): bucket 1 (ages 20-39) is merged as well -- confirm intent.
frame.loc[frame['age'] >= 3, 'age'] = 1
frame.loc[frame['age'] <= 1, 'age'] = 1
# Lower bound of the income range, as an integer.
# regex=True: pandas >= 2.0 treats the pattern literally by default.
frame['yearly_income'] = (
    frame['yearly_income']
    .str.split(' ').str[0]
    .str.replace('[^0-9]', '', regex=True)
    .astype(int)
)
encoding = OneHotEncoder()
# toarray() yields a plain ndarray rather than the legacy np.matrix.
newData = encoding.fit_transform(np.vstack(frame['marital_status'].values)).toarray()
# String column names: scikit-learn >= 1.2 rejects mixed str/int column names.
frame_new = pd.DataFrame(
    newData,
    columns=['marital_status_' + str(c) for c in encoding.categories_[0]],
    index=frame.index,
)
frame_full = pd.merge(
    frame[['age', 'yearly_income', 'total_children', 'num_cars_owned']],
    frame_new,
    left_index=True,
    right_index=True,
)
X = frame_full
# Project the features onto their first two principal components.
pca = PCA(n_components=2)
Xd = pca.fit_transform(X)
clf = RandomForestClassifier()
# Train and score on the two PCA components instead of the raw features.
scores = cross_val_score(clf, Xd, y, scoring='accuracy')
print(np.mean(scores))