会员卡预测

(本节内容的数据见电脑F:/python数据/customer 或腾讯微云文件“python数据\customer”)
在这里插入图片描述
包含27个相关的特征(姓名、地址、教育情况);还有一个会员卡的类型(金卡、银卡、铜卡、普通卡)
1.决策树
特征的选择:特征列太多,我们先选择三个数字型特征的列(年收入,小孩数,家庭汽车拥有量)。年收入是一个范围,我们要替换一下才能用;
在这里插入图片描述

import pandas as pd

# Method 1: load the customer table and reduce the yearly_income text to
# its digits only.
frame = pd.read_csv('F:/python数据/customer.csv')
print(frame['yearly_income'].head(2))
# Remove every character that is not 0-9. regex=True is passed explicitly:
# pandas 2.0 changed Series.str.replace to treat the pattern literally by
# default, which would silently leave the column untouched.
frame['yearly_income'] = frame['yearly_income'].str.replace(r'[^0-9]', '', regex=True)
print(frame['yearly_income'].head(2))

# With this method both bounds of the range are fused together,
# e.g. a 30-50 range is represented as "3050".

在这里插入图片描述
方法二:

import pandas as pd

# Method 2: keep only the lower bound of the income range.
frame = pd.read_csv('F:/python数据/customer.csv')
print(frame['yearly_income'].head(2))
# Take the text before the first space, then strip non-digits.
# regex=True is explicit because pandas 2.0 made the pattern literal by default.
lower_bound = frame['yearly_income'].str.split(' ').str[0]
frame['yearly_income'] = lower_bound.str.replace(r'[^0-9]', '', regex=True)
print(frame['yearly_income'].head(2))

在这里插入图片描述

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

# Baseline: predict member_card from three numeric columns with a decision tree.
frame = pd.read_csv('F:/python数据/customer.csv')
# Lower bound of the income range as an integer. regex=True is explicit
# because pandas 2.0 treats the pattern literally by default; astype(int)
# hands the model a real numeric column instead of digit strings.
frame['yearly_income'] = (
    frame['yearly_income'].str.split(' ').str[0]
    .str.replace(r'[^0-9]', '', regex=True)
    .astype(int)
)
y = frame['member_card']  # target column: membership card type
X = frame[['yearly_income', 'total_children', 'num_cars_owned']]  # numeric features

clf = DecisionTreeClassifier()
scores = cross_val_score(clf, X, y, scoring='accuracy')
print(np.mean(scores))  # mean cross-validated accuracy

如果能够引入更多的分类特征,决策树的效果会更好一些,比如受教育程度和职业与会员等级也有很大的联系
在这里插入图片描述

import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Demonstrate LabelEncoder: every distinct education string is mapped to
# an integer code.
frame = pd.read_csv('F:/python数据/customer.csv')
education = frame['education']
encoding = LabelEncoder()
education_new = encoding.fit_transform(education)  # fit + transform in one call
print(education.values)
print(education_new)

在这里插入图片描述

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
import numpy as np


# Decision tree using the three numeric columns plus label-encoded education.
frame = pd.read_csv('F:/python数据/customer.csv')
# Lower bound of the income range as an integer. regex=True is explicit
# because pandas 2.0 treats the pattern literally by default.
frame['yearly_income'] = (
    frame['yearly_income'].str.split(' ').str[0]
    .str.replace(r'[^0-9]', '', regex=True)
    .astype(int)
)
encoding = LabelEncoder()  # maps each education string to an integer code
frame['education_new'] = encoding.fit_transform(frame['education'])
y = frame['member_card']  # target column: membership card type
# The original computed education_new but never added it to X, so the
# score was identical to the three-column baseline; include it here.
X = frame[['yearly_income', 'total_children', 'num_cars_owned', 'education_new']]

clf = DecisionTreeClassifier()
scores = cross_val_score(clf, X, y, scoring='accuracy')
print(np.mean(scores))  # mean cross-validated accuracy

在这里插入图片描述
在这里插入图片描述

import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np


# Demonstrate OneHotEncoder on the education column.
frame = pd.read_csv('F:/python数据/customer.csv')
print(frame['education'].values)
# The encoder is fed a single-column 2-D array, built by stacking the values.
education_column = np.vstack(frame['education'].values)
encoding = OneHotEncoder()
sparse_codes = encoding.fit_transform(education_column)
newData = sparse_codes.todense()  # dense 0/1 matrix, one column per category
print(newData)

在这里插入图片描述

import pandas as pd
import numpy as np


# Show what np.vstack does to the education values.
frame = pd.read_csv('F:/python数据/customer.csv')
education_values = frame['education'].values
print(education_values)
# vstack turns the flat sequence into a single column (one value per row),
# which is the layout the one-hot encoder is given in the other snippets.
print(np.vstack(education_values))

在这里插入图片描述

import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np


# Attach the one-hot education columns to the three numeric columns.
frame = pd.read_csv('F:/python数据/customer.csv')
encoding = OneHotEncoder()
education_column = np.vstack(frame['education'].values)
newData = encoding.fit_transform(education_column).todense()
frame_new = pd.DataFrame(newData)
numeric_part = frame[['yearly_income', 'total_children', 'num_cars_owned']]
# Both frames share the same row index, so an index-on-index merge lines
# the rows up one to one.
frame_full = numeric_part.merge(frame_new, left_index=True, right_index=True)
print(frame_full)

在这里插入图片描述

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
import numpy as np


# Decision tree on the numeric columns plus one-hot encoded education.
frame = pd.read_csv('F:/python数据/customer.csv')
y = frame['member_card']  # target column: membership card type

# Lower bound of the income range as an integer. regex=True is explicit
# because pandas 2.0 treats the pattern literally by default.
frame['yearly_income'] = (
    frame['yearly_income'].str.split(' ').str[0]
    .str.replace(r'[^0-9]', '', regex=True)
    .astype(int)
)

encoding = OneHotEncoder()
newData = encoding.fit_transform(np.vstack(frame['education'].values)).toarray()
# Give the dummy columns string names: a frame whose column labels mix
# str and int is rejected by recent scikit-learn releases.
frame_new = pd.DataFrame(
    newData,
    columns=['education_' + str(c) for c in encoding.categories_[0]],
    index=frame.index,
)
frame_full = pd.merge(frame[['yearly_income', 'total_children', 'num_cars_owned']],
                      frame_new, left_index=True, right_index=True)

X = frame_full

clf = DecisionTreeClassifier()
scores = cross_val_score(clf, X, y, scoring='accuracy')
print(np.mean(scores))  # mean cross-validated accuracy

在这里插入图片描述

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
import numpy as np


# Decision tree on the numeric columns plus one-hot encoded marital_status.
# The only change from the education variant is the encoded column; the
# original author reports a clearly higher accuracy with it.
frame = pd.read_csv('F:/python数据/customer.csv')
y = frame['member_card']  # target column: membership card type

# Lower bound of the income range as an integer. regex=True is explicit
# because pandas 2.0 treats the pattern literally by default.
frame['yearly_income'] = (
    frame['yearly_income'].str.split(' ').str[0]
    .str.replace(r'[^0-9]', '', regex=True)
    .astype(int)
)

encoding = OneHotEncoder()
newData = encoding.fit_transform(np.vstack(frame['marital_status'].values)).toarray()
# String names for the dummy columns: recent scikit-learn rejects frames
# whose column labels mix str and int.
frame_new = pd.DataFrame(
    newData,
    columns=['marital_status_' + str(c) for c in encoding.categories_[0]],
    index=frame.index,
)
frame_full = pd.merge(frame[['yearly_income', 'total_children', 'num_cars_owned']],
                      frame_new, left_index=True, right_index=True)

X = frame_full

clf = DecisionTreeClassifier()
scores = cross_val_score(clf, X, y, scoring='accuracy')
print(np.mean(scores))  # mean cross-validated accuracy

在这里插入图片描述
2.随机森林
决策树仅仅是在挑选特征列;如果我们根据不同的列组合和行组合建立多棵不同的决策树,分别进行预测后再综合结果,这就是随机森林
用起来很简单,我们只需要将DecisionTreeClassifier()替换为RandomForestClassifier()就可以了

import pandas as pd
from sklearn.ensemble import RandomForestClassifier #替换为随机森林
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
import numpy as np


# Same features as the marital_status decision tree, but with a random forest.
frame = pd.read_csv('F:/python数据/customer.csv')
y = frame['member_card']  # target column: membership card type

# Lower bound of the income range as an integer. regex=True is explicit
# because pandas 2.0 treats the pattern literally by default.
frame['yearly_income'] = (
    frame['yearly_income'].str.split(' ').str[0]
    .str.replace(r'[^0-9]', '', regex=True)
    .astype(int)
)

encoding = OneHotEncoder()
newData = encoding.fit_transform(np.vstack(frame['marital_status'].values)).toarray()
# String names for the dummy columns: recent scikit-learn rejects frames
# whose column labels mix str and int.
frame_new = pd.DataFrame(
    newData,
    columns=['marital_status_' + str(c) for c in encoding.categories_[0]],
    index=frame.index,
)
frame_full = pd.merge(frame[['yearly_income', 'total_children', 'num_cars_owned']],
                      frame_new, left_index=True, right_index=True)

X = frame_full

clf = RandomForestClassifier()  # drop-in replacement for the decision tree
scores = cross_val_score(clf, X, y, scoring='accuracy')
print(np.mean(scores))  # mean cross-validated accuracy

在这里插入图片描述
随机森林的参数很多,我们可以用网格搜索(GridSearchCV)在给定的参数空间中自动寻找最优的参数组合

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from sklearn.model_selection import GridSearchCV

# Grid-search the random forest hyper-parameters.
frame = pd.read_csv('F:/python数据/customer.csv')
y = frame['member_card']  # target column: membership card type

# Lower bound of the income range as an integer. regex=True is explicit
# because pandas 2.0 treats the pattern literally by default.
frame['yearly_income'] = (
    frame['yearly_income'].str.split(' ').str[0]
    .str.replace(r'[^0-9]', '', regex=True)
    .astype(int)
)

encoding = OneHotEncoder()
newData = encoding.fit_transform(np.vstack(frame['marital_status'].values)).toarray()
# String names for the dummy columns: recent scikit-learn rejects frames
# whose column labels mix str and int.
frame_new = pd.DataFrame(
    newData,
    columns=['marital_status_' + str(c) for c in encoding.categories_[0]],
    index=frame.index,
)
frame_full = pd.merge(frame[['yearly_income', 'total_children', 'num_cars_owned']],
                      frame_new, left_index=True, right_index=True)

X = frame_full

parameter_space = {
    # 'sqrt' replaces 'auto': for classifiers they meant the same thing,
    # and 'auto' has been removed from scikit-learn.
    'max_features': [2, 4, 'sqrt'],
    'n_estimators': [100],
    'criterion': ['gini', 'entropy'],
    'min_samples_leaf': [2, 4, 6],
}
# The original instantiated the classifier twice; once is enough.
clf = RandomForestClassifier()
grid = GridSearchCV(clf, parameter_space)
grid.fit(X, y)
print(grid.best_estimator_)  # estimator refit with the best parameter combination
print(grid.best_score_)      # its mean cross-validated score

在这里插入图片描述
把最优参数填进去

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Re-run cross-validation with the parameters found by the grid search.
frame = pd.read_csv('F:/python数据/customer.csv')
y = frame['member_card']  # target column: membership card type

# Lower bound of the income range as an integer. regex=True is explicit
# because pandas 2.0 treats the pattern literally by default.
frame['yearly_income'] = (
    frame['yearly_income'].str.split(' ').str[0]
    .str.replace(r'[^0-9]', '', regex=True)
    .astype(int)
)

encoding = OneHotEncoder()
newData = encoding.fit_transform(np.vstack(frame['marital_status'].values)).toarray()
# String names for the dummy columns: recent scikit-learn rejects frames
# whose column labels mix str and int.
frame_new = pd.DataFrame(
    newData,
    columns=['marital_status_' + str(c) for c in encoding.categories_[0]],
    index=frame.index,
)
frame_full = pd.merge(frame[['yearly_income', 'total_children', 'num_cars_owned']],
                      frame_new, left_index=True, right_index=True)

X = frame_full

# Only the tuned parameters are passed. The original pasted the full repr
# of the best estimator, which includes min_impurity_split and
# max_features='auto'; both have been removed from scikit-learn and make
# the constructor/fit fail. 'sqrt' is what 'auto' meant for classifiers;
# everything else in the pasted repr was a default value.
clf = RandomForestClassifier(
    criterion='gini',
    max_features='sqrt',
    min_samples_leaf=6,
    n_estimators=100,
)
scores = cross_val_score(clf, X, y, scoring='accuracy')
print(np.mean(scores))  # mean cross-validated accuracy

在这里插入图片描述

4.会员卡预测改进

(本节内容的数据见电脑F:/python数据/customer 或腾讯微云文件“python数据\customer”)
在这里插入图片描述
包含27个相关的特征(姓名、地址、教育情况);还有一个会员卡的类型(金卡、银卡、铜卡、普通卡)

1.数据的预处理

import pandas as pd

# Inspect the raw yearly_income column before any cleaning.
frame = pd.read_csv('F:/python数据/customer.csv')
income = frame['yearly_income']
# Summary of the column: total count, the most frequent value and how
# often that value occurs.
print(income.describe())

print('------------------------------------------------')

# The distinct income values, in order of first appearance
# (the original author notes there are eight of them).
print(income.unique())

在这里插入图片描述

import pandas as pd
from sklearn.ensemble import RandomForestClassifier #替换为随机森林
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from sklearn import preprocessing


# Random forest with the income column standardised first.
frame = pd.read_csv('F:/python数据/customer.csv')
y = frame['member_card']  # target column: membership card type

# Lower bound of the income range as digits. regex=True is explicit
# because pandas 2.0 treats the pattern literally by default.
frame['yearly_income'] = (
    frame['yearly_income'].str.split(' ').str[0]
    .str.replace(r'[^0-9]', '', regex=True)
)
# Standardise to zero mean and unit standard deviation. The column is
# converted to float explicitly instead of passing digit strings to scale().
frame['yearly_income_new'] = preprocessing.scale(frame['yearly_income'].astype(float))

encoding = OneHotEncoder()
newData = encoding.fit_transform(np.vstack(frame['marital_status'].values)).toarray()
# String names for the dummy columns: recent scikit-learn rejects frames
# whose column labels mix str and int.
frame_new = pd.DataFrame(
    newData,
    columns=['marital_status_' + str(c) for c in encoding.categories_[0]],
    index=frame.index,
)
frame_full = pd.merge(frame[['yearly_income_new', 'total_children', 'num_cars_owned']],
                      frame_new, left_index=True, right_index=True)

X = frame_full

clf = RandomForestClassifier()
scores = cross_val_score(clf, X, y, scoring='accuracy')
print(np.mean(scores))  # mean cross-validated accuracy

在这里插入图片描述

import pandas as pd

# Look at the distribution of the integer lower-bound income.
frame = pd.read_csv('F:/python数据/customer.csv')
# regex=True is explicit because pandas 2.0 treats the pattern literally
# by default, which would make the astype(int) below fail.
frame['yearly_income'] = (
    frame['yearly_income'].str.split(' ').str[0]
    .str.replace(r'[^0-9]', '', regex=True)
)
frame['yearly_income_new'] = frame['yearly_income'].astype(int)
print(frame['yearly_income_new'].describe())

# The original author observed std = 35.973839 here, i.e. the values are
# widely spread, which motivates rescaling the column in the next step.

在这里插入图片描述

import pandas as pd

# Shrink the income values by integer division and inspect the result.
frame = pd.read_csv('F:/python数据/customer.csv')
# regex=True is explicit because pandas 2.0 treats the pattern literally
# by default, which would make the astype(int) below fail.
frame['yearly_income'] = (
    frame['yearly_income'].str.split(' ').str[0]
    .str.replace(r'[^0-9]', '', regex=True)
)
frame['yearly_income_new'] = frame['yearly_income'].astype(int)
frame['yearly_income_new'] = frame['yearly_income_new'] // 30  # make the values smaller
print(frame['yearly_income_new'].describe())

在这里插入图片描述

import pandas as pd
from sklearn.ensemble import RandomForestClassifier #替换为随机森林
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
import numpy as np


# Random forest using the income column shrunk by integer division.
frame = pd.read_csv('F:/python数据/customer.csv')
y = frame['member_card']  # target column: membership card type

# regex=True is explicit because pandas 2.0 treats the pattern literally
# by default, which would make the astype(int) below fail.
frame['yearly_income'] = (
    frame['yearly_income'].str.split(' ').str[0]
    .str.replace(r'[^0-9]', '', regex=True)
)
frame['yearly_income_new'] = frame['yearly_income'].astype(int)
frame['yearly_income_new'] = frame['yearly_income_new'] // 30  # make the values smaller

encoding = OneHotEncoder()
newData = encoding.fit_transform(np.vstack(frame['marital_status'].values)).toarray()
# String names for the dummy columns: recent scikit-learn rejects frames
# whose column labels mix str and int.
frame_new = pd.DataFrame(
    newData,
    columns=['marital_status_' + str(c) for c in encoding.categories_[0]],
    index=frame.index,
)
frame_full = pd.merge(frame[['yearly_income_new', 'total_children', 'num_cars_owned']],
                      frame_new, left_index=True, right_index=True)

X = frame_full

clf = RandomForestClassifier()
scores = cross_val_score(clf, X, y, scoring='accuracy')
print(np.mean(scores))  # mean cross-validated accuracy

在这里插入图片描述

import pandas as pd

# Derive the customer's age at account opening and bucket it by 20 years.
frame = pd.read_csv('F:/python数据/customer.csv')
opened_year = pd.to_datetime(frame['date_accnt_opened']).dt.year
birth_year = pd.to_datetime(frame['birthdate']).dt.year
# Year the account was opened minus birth year = age when the card was opened.
frame['age'] = opened_year - birth_year
frame['age'] = frame['age'] // 20  # 20-year buckets keep the spread small
print(frame['age'].describe())

在这里插入图片描述

import pandas as pd

# Derive the age at account opening, bucket it by 20 years, then merge buckets.
frame = pd.read_csv('F:/python数据/customer.csv')
opened_year = pd.to_datetime(frame['date_accnt_opened']).dt.year
birth_year = pd.to_datetime(frame['birthdate']).dt.year
frame['age'] = (opened_year - birth_year) // 20  # 20-year buckets
# Buckets >= 3 and buckets <= 1 are all relabelled as 1.
# NOTE(review): the original comment says only the under-20 and over-60
# groups are merged (limited spending power), but `<= 1` also covers the
# 20-39 bucket, so three groups end up in class 1 - confirm the intent.
merge_mask = (frame['age'] >= 3) | (frame['age'] <= 1)
frame.loc[merge_mask, 'age'] = 1
import pandas as pd
from sklearn.ensemble import RandomForestClassifier #替换为随机森林
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
import numpy as np


# Random forest with the bucketed age added as a feature.
frame = pd.read_csv('F:/python数据/customer.csv')
y = frame['member_card']  # target column: membership card type

# Age when the card was opened: opening year minus birth year, in 20-year buckets.
frame['age'] = pd.to_datetime(frame['date_accnt_opened']).dt.year - pd.to_datetime(frame['birthdate']).dt.year
frame['age'] = frame['age'] // 20
# NOTE(review): the original comment says under-20 and over-60 are merged
# into one class, but `<= 1` also covers the 20-39 bucket - confirm intent.
frame.loc[frame['age'] >= 3, 'age'] = 1
frame.loc[frame['age'] <= 1, 'age'] = 1

# Lower bound of the income range as an integer. regex=True is explicit
# because pandas 2.0 treats the pattern literally by default.
frame['yearly_income'] = (
    frame['yearly_income'].str.split(' ').str[0]
    .str.replace(r'[^0-9]', '', regex=True)
    .astype(int)
)

encoding = OneHotEncoder()
newData = encoding.fit_transform(np.vstack(frame['marital_status'].values)).toarray()
# String names for the dummy columns: recent scikit-learn rejects frames
# whose column labels mix str and int.
frame_new = pd.DataFrame(
    newData,
    columns=['marital_status_' + str(c) for c in encoding.categories_[0]],
    index=frame.index,
)
frame_full = pd.merge(frame[['age', 'yearly_income', 'total_children', 'num_cars_owned']],
                      frame_new, left_index=True, right_index=True)

X = frame_full

clf = RandomForestClassifier()
scores = cross_val_score(clf, X, y, scoring='accuracy')
print(np.mean(scores))  # mean cross-validated accuracy

在这里插入图片描述
2.数据的选择
①使用SelectKBest

import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from sklearn.feature_selection import SelectKBest
# The chi-squared scorer must come from sklearn.feature_selection: it takes
# (X, y) and returns (scores, p-values). The original aliased
# scipy.stats.chi2_contingency as chi2, which has a different signature
# and return shape and cannot be used as score_func.
from sklearn.feature_selection import chi2

frame = pd.read_csv('F:/python数据/customer.csv')

# Age when the card was opened: opening year minus birth year, in 20-year buckets.
frame['age'] = pd.to_datetime(frame['date_accnt_opened']).dt.year - pd.to_datetime(frame['birthdate']).dt.year
frame['age'] = frame['age'] // 20
# NOTE(review): the original comment says under-20 and over-60 are merged
# into one class, but `<= 1` also covers the 20-39 bucket - confirm intent.
frame.loc[frame['age'] >= 3, 'age'] = 1
frame.loc[frame['age'] <= 1, 'age'] = 1

# Lower bound of the income range as an integer (chi2 needs non-negative
# numeric features). regex=True is explicit because pandas 2.0 treats the
# pattern literally by default.
frame['yearly_income'] = (
    frame['yearly_income'].str.split(' ').str[0]
    .str.replace(r'[^0-9]', '', regex=True)
    .astype(int)
)

encoding = OneHotEncoder()
newData = encoding.fit_transform(np.vstack(frame['marital_status'].values)).toarray()
# String names for the dummy columns: recent scikit-learn rejects frames
# whose column labels mix str and int.
frame_new = pd.DataFrame(
    newData,
    columns=['marital_status_' + str(c) for c in encoding.categories_[0]],
    index=frame.index,
)
frame_full = pd.merge(frame[['age', 'yearly_income', 'total_children', 'num_cars_owned']],
                      frame_new, left_index=True, right_index=True)

X = frame_full
y = frame['member_card']

# score_func=chi2 selects the chi-squared test; k='all' keeps every feature
# so that the score of each one can be inspected.
transformer = SelectKBest(score_func=chi2, k='all')
Xt_chi2 = transformer.fit_transform(X, y)

print(transformer.scores_)

# The features are, in order: age, yearly_income, total_children,
# num_cars_owned, followed by the one-hot marital_status columns.
# The original author reports that the first feature (age) scores very low,
# i.e. shows little association with the target, while the others score
# noticeably higher.

在这里插入图片描述
②使用PCA

import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from sklearn.decomposition import PCA

# Reduce the feature set to two principal components.
frame = pd.read_csv('F:/python数据/customer.csv')
y = frame['member_card']

# Age when the card was opened: opening year minus birth year, in 20-year buckets.
frame['age'] = pd.to_datetime(frame['date_accnt_opened']).dt.year - pd.to_datetime(frame['birthdate']).dt.year
frame['age'] = frame['age'] // 20
# NOTE(review): the original comment says under-20 and over-60 are merged
# into one class, but `<= 1` also covers the 20-39 bucket - confirm intent.
frame.loc[frame['age'] >= 3, 'age'] = 1
frame.loc[frame['age'] <= 1, 'age'] = 1

# Lower bound of the income range as an integer. regex=True is explicit
# because pandas 2.0 treats the pattern literally by default.
frame['yearly_income'] = (
    frame['yearly_income'].str.split(' ').str[0]
    .str.replace(r'[^0-9]', '', regex=True)
    .astype(int)
)

encoding = OneHotEncoder()
newData = encoding.fit_transform(np.vstack(frame['marital_status'].values)).toarray()
# String names for the dummy columns: recent scikit-learn rejects frames
# whose column labels mix str and int.
frame_new = pd.DataFrame(
    newData,
    columns=['marital_status_' + str(c) for c in encoding.categories_[0]],
    index=frame.index,
)
frame_full = pd.merge(frame[['age', 'yearly_income', 'total_children', 'num_cars_owned']],
                      frame_new, left_index=True, right_index=True)

X = frame_full

pca = PCA(n_components=2)  # project the features onto two components
Xd = pca.fit_transform(X)
np.set_printoptions(precision=3, suppress=True)  # print floats with three decimals
# Share of the total variance captured by each component; the original
# author notes that the first one dominates.
print(pca.explained_variance_ratio_)
print(Xd)

在这里插入图片描述

import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Random forest trained on the two PCA components instead of the raw features.
frame = pd.read_csv('F:/python数据/customer.csv')
y = frame['member_card']

# Age when the card was opened: opening year minus birth year, in 20-year buckets.
frame['age'] = pd.to_datetime(frame['date_accnt_opened']).dt.year - pd.to_datetime(frame['birthdate']).dt.year
frame['age'] = frame['age'] // 20
# NOTE(review): the original comment says under-20 and over-60 are merged
# into one class, but `<= 1` also covers the 20-39 bucket - confirm intent.
frame.loc[frame['age'] >= 3, 'age'] = 1
frame.loc[frame['age'] <= 1, 'age'] = 1

# Lower bound of the income range as an integer. regex=True is explicit
# because pandas 2.0 treats the pattern literally by default.
frame['yearly_income'] = (
    frame['yearly_income'].str.split(' ').str[0]
    .str.replace(r'[^0-9]', '', regex=True)
    .astype(int)
)

encoding = OneHotEncoder()
newData = encoding.fit_transform(np.vstack(frame['marital_status'].values)).toarray()
# String names for the dummy columns: recent scikit-learn rejects frames
# whose column labels mix str and int.
frame_new = pd.DataFrame(
    newData,
    columns=['marital_status_' + str(c) for c in encoding.categories_[0]],
    index=frame.index,
)
frame_full = pd.merge(frame[['age', 'yearly_income', 'total_children', 'num_cars_owned']],
                      frame_new, left_index=True, right_index=True)

X = frame_full

pca = PCA(n_components=2)
Xd = pca.fit_transform(X)

clf = RandomForestClassifier()
# Cross-validate on the two PCA-derived columns rather than on X.
scores = cross_val_score(clf, Xd, y, scoring='accuracy')
print(np.mean(scores))  # mean cross-validated accuracy

在这里插入图片描述

猜你喜欢

转载自blog.csdn.net/weixin_45014721/article/details/114653936
今日推荐