python大数据分析电影评分与时长等等

准备好相关数据

链接:https://pan.baidu.com/s/1EvuEnVhSAUghEkF5rckMoA?pwd=2222 
提取码:2222

一.利用Kmeans分析时长与评分的关系

 导入相关库

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from datetime import datetime
from sklearn.model_selection import train_test_split #划分测试集与训练集
from sklearn.linear_model import LinearRegression as LR #回归模块
from sklearn.metrics import mean_squared_error #MSE
from sklearn.metrics import mean_absolute_error #MAE
from sklearn.metrics import r2_score #R2
# Select the SimHei font for sans-serif text (presumably so that the Chinese
# titles and axis labels used in the plots below render correctly).
plt.rcParams['font.sans-serif']=['SimHei']
# Load the dataset from a hard-coded local path.
data = pd.read_csv('C:\\Users\\wt\\Desktop\\data1.csv')
# Second column of the file; the code below treats each value as a genre string.
mold = data.iloc[:, 1]
# Column-wise mean of the third column (not referenced again in the visible code).
avg_rating_num = np.mean(data.iloc[:, [2]], axis=0)
# First and third columns as a float32 array; they are later plotted with the
# axis labels "时长" and "评分".
X = data.iloc[:, [0, 2]]
X = X.values.astype('float32')

统计各电影类型出现的次数

# Running tally: key -> number of times fetch() has been called with that key.
label = {}


def fetch(s):
    """Add one to the count stored for ``s`` in the module-level ``label`` dict.

    A key that has not been seen before starts at 1. Returns nothing.
    """
    label[s] = label.get(s, 0) + 1
# Tally the genres of every row of ``mold``.
# The parsing is purely positional: a value of length 2, 5, 8, 11 or 14 is read
# as two-character genres with one separator character between them, and a
# value of length 3 is counted as a single genre. Values of any other length
# are printed but not counted.
for genres in mold:
    print(genres)
    size = len(genres)
    if size == 3:
        fetch(genres[0:3])
    elif size in (2, 5, 8, 11, 14):
        # Genres start at offsets 0, 3, 6, ... and are two characters wide.
        for start in range(0, size, 3):
            fetch(genres[start:start + 2])

 打印相关系数

# Print the correlation-coefficient matrix between the two columns of X.
print(np.corrcoef(X[:, 0], X[:, 1]))

# Cluster the rows of X into 4 groups; y_pred holds one cluster index per row.
# NOTE(review): no random_state is passed, so the cluster numbering may differ
# between runs.
clf = KMeans(n_clusters=4)  
y_pred = clf.fit_predict(X)  

# Pie chart of the genre tallies collected in ``label``.
patches, text = plt.pie(label.values(), labels=label.keys(), radius=1)
# Blank the captions of selected wedges; the positions are hard-coded, so the
# chart must have enough wedges for every index listed here.
for position in (-1, -2, -3, -5, -6, -7, -8, -9, 13):
    text[position].set_text('')
# Same font size for every caption.
for caption in text:
    caption.set_size(10)
plt.title("高质量电影类型成分分析")

画图分析电影类型占比情况

# Scatter plot of the two columns of X, coloured by the KMeans cluster index.
x = [row[0] for row in X]
y = [row[1] for row in X]
plt.scatter(x, y, c=y_pred, marker='x')
plt.title("Kmeans分析时长与评分")
plt.xlabel("时长")
plt.ylabel("评分")
plt.show()

 利用 KMeans 分析时长与评分的分布情况

  二.多元回归模型分析播放量

记录电影种类

# Genre vocabulary; it starts with two entries and is extended further down.
item = ['剧情', '犯罪']


def finds(iss):
    """Return the parts of a "/"-separated string that are not covered by ``item``.

    The string is split on "/". Then, for each entry of the module-level
    ``item`` list in order, the first matching element (if any) is removed
    from the split result. Whatever remains is returned as a list.
    """
    remaining = iss.split("/")
    for known in item:
        if known in remaining:
            # list.remove drops only the first occurrence.
            remaining.remove(known)
    return remaining
# Walk the ``mold`` column by position and append to ``item`` every genre that
# finds() reports as not yet present.
for row in range(len(data.mold)):
    item.extend(finds(data.mold[row]))

 自定义独热编码

def my_get_dummies(ser, categories=None):
    """One-hot encode a Series of multi-genre strings.

    Args:
        ser: pandas Series whose values are genre strings. Genres inside one
            value may be separated by "/" (the separator ``finds`` uses when
            building the vocabulary) or by ",".
        categories: optional list of column names for the result. When
            omitted, the module-level ``item`` list is used.

    Returns:
        An integer DataFrame indexed like ``ser`` with one column per
        category, holding 1 where the row's string contains that category
        and 0 elsewhere.

    Tokens that are not in the category list are printed and otherwise
    ignored. Values that are not strings (for example missing values) leave
    their row at all zeros.
    """
    columns = list(item if categories is None else categories)
    # Size the zero matrix from the actual inputs instead of fixed dimensions,
    # and use the builtin ``int`` (the ``np.int`` alias no longer exists in
    # current NumPy releases).
    df = pd.DataFrame(
        np.zeros((len(ser), len(columns)), dtype=int),
        columns=columns,
        index=ser.index,
    )
    known = set(columns)
    for irec, raw in ser.items():
        if not isinstance(raw, str):
            continue
        # Normalise "/" to "," so both separators are accepted.
        for dirt in raw.replace('/', ',').split(','):
            if dirt not in known:
                print(dirt)
            else:
                # Single .loc assignment; chained df[col][row] = ... may write
                # to a temporary copy instead of df.
                df.loc[irec, dirt] = 1
    return df
# Append the one-hot genre columns produced from the ``mold`` column to the
# main frame; DataFrame.join aligns the two frames on their index.
data=data.join(my_get_dummies(data.mold))
# Bare expression (presumably kept so a notebook cell displays the frame).
data

 建立模型

# Hold-out split: 80% of the rows train the model, 20% are kept as a test set to check how well it predicts.
 
from sklearn.model_selection import train_test_split # split into training and test sets
from sklearn.linear_model import LinearRegression as LR # linear regression estimator
## Show figures inline in IPython / Jupyter
%matplotlib inline
# (no font statement follows here; the plot font is set via plt.rcParams near the top of the file)

pd.set_option('display.max_columns', None)
# Feature extraction: select the columns named in `item` (the one-hot genre columns)
film_type=data[item]
film_type
# total_layer=data.<total-floors column>  -- commented-out line, not used by this analysis
# Choose the independent variables (X) and the dependent variable (Y)
X = pd.concat([film_type,data.duration,data.Wtsee_people,data.Rating_people,data.Comments_people,data.year,data.rating_num],axis=1)
Y = data.Watching_people
print(type(X))
# Replace missing feature values with 0 before fitting
X = X.fillna(0)
# Split into training and test sets (20% test, fixed seed)
Xtrain,Xtest,Ytrain,Ytest=train_test_split(X,Y,test_size=0.2,random_state=420)
reg=LR().fit(Xtrain,Ytrain)
# Predict the target for the test rows
Yhat=reg.predict(Xtest)
# Show each feature name next to its regression coefficient
print(list(zip(X.columns,reg.coef_)))
# Show the intercept
print(reg.intercept_)

from sklearn.metrics import mean_squared_error #MSE
from sklearn.metrics import mean_absolute_error #MAE
from sklearn.metrics import r2_score #R2
# Error metrics of the predictions Yhat against the held-out targets Ytest.
mse = mean_squared_error(Ytest, Yhat)
mae = mean_absolute_error(Ytest, Yhat)
r2 = r2_score(Ytest, Yhat)
# Adjusted R2, using n = number of test rows and k = number of feature columns.
n, k = Xtest.shape
adj_r2 = 1 - (1 - r2) * ((n - 1) / (n - k - 1))
for caption, value in (
    ('MSE:', mse),
    ('MAE:', mae),
    ('R2:', r2),
    ('调整后R2:', adj_r2),
):
    print(caption + str(value))

评估模型

 绘制图表评测结果

# Plot the first 50 records
n=50
# Model predictions (Yhat was produced from Xtest)
plt.plot(range(len(Yhat[:n])),Yhat[:n])
# "Actual" values
# NOTE(review): this plots Ytrain, but Yhat holds predictions for Xtest, so the
# two curves do not describe the same samples -- confirm whether Ytest (or a
# prediction on Xtrain) was intended here.
plt.plot(range(len(Ytrain[:n])),Ytrain[:n])
# Axis labels, title and legend (legend entries follow the plotting order)
plt.xlabel('个例')
plt.ylabel('播放量')
plt.title('线性回归预测结果')
plt.legend(["预估","实际"])

 将测试集真实值与模型预测值用折线图的形式表现出来

# Compare the first 50 predictions with the matching test-set targets.
n = 50
predicted = Yhat[:n]
actual = Ytest[:n]
# One line per series, both drawn against a 0-based position on the x axis.
plt.plot(range(len(predicted)), predicted)
plt.plot(range(len(actual)), actual)
# Axis labels, title and legend (legend entries follow the plotting order).
plt.xlabel('个例')
plt.ylabel('播放量')
plt.title('线性回归预测结果')
plt.legend(["预估", "实际"])

三.决策树预测评分

计算 Pearson 相关系数(pearsonr)判断各变量与评分的相关程度

from sklearn import tree#决策树模型
from sklearn.model_selection import train_test_split#划分测试集合与训练集合
from sklearn.model_selection import GridSearchCV#用于找到最优模型
from scipy.stats import pearsonr
# Rule of thumb for reading the strength of a correlation coefficient:
#   0.8-1.0  very strong
#   0.6-0.8  strong
#   0.4-0.6  moderate
#   0.2-0.4  weak
#   0.0-0.2  very weak or none
#
# pearsonr(x, y) gives the coefficient at index 0 and the significance level
# (p-value) at index 1.

# (printed caption, column name) pairs; each column is correlated with the
# rating column 'rating_num'. Every caption names the column actually used.
for caption, column in (
    ('时长', 'duration'),
    ('观看人数', 'Watching_people'),
    ('年份', 'year'),
    ('评价人数', 'Rating_people'),
    ('短评人数', 'Comments_people'),
    ('想看人数', 'Wtsee_people'),
):
    pccs = pearsonr(data[column], data['rating_num'])
    print(caption)
    print("相关系数:", pccs[0])
    print("显著性水平:", pccs[1])

建立树模型

# Feature matrix and target for predicting the rating.
# NOTE(review): 'Watching_people' appears twice in this list while 'duration'
# (correlated with the rating above) is absent -- confirm whether one of the
# two was meant to be 'duration'.
X=pd.concat([data['Watching_people'],data['Wtsee_people'],data['Watching_people'],data['Rating_people'],data['Comments_people'],data['year']],axis=1)
Y=data['rating_num']
# Split into training and test sets (10% test, fixed seed).
Xtrain,Xtest,Ytrain,Ytest=train_test_split(X,Y,test_size=0.1,random_state=420)
# Parameter grid for the search. The criterion names are the ones accepted by
# current scikit-learn ('squared_error' / 'absolute_error' replace the removed
# 'mse' / 'mae'), and max_depth starts at 1 because 0 is not a valid depth.
tree_param={'criterion':['squared_error','friedman_mse','absolute_error'],'max_depth':list(range(1,10))}
# Grid search fits one model per parameter combination and scores it with
# 3-fold cross-validation: the training data is cut into 3 parts, each part is
# used once for validation while the other two train the model, and the
# scores are averaged. The best-scoring combination is kept.
grid=GridSearchCV(tree.DecisionTreeRegressor(),param_grid=tree_param,cv=3)
grid.fit(Xtrain,Ytrain)
# Report the best parameter combination and its cross-validated score.
print(grid.best_params_)
print(grid.best_score_)
# Final tree with fixed settings (Friedman-improved MSE criterion, depth 4),
# fitted on the training split and used to predict the test split.
dtr=tree.DecisionTreeRegressor(criterion='friedman_mse',max_depth =4)
dtr.fit(Xtrain,Ytrain)
pred=dtr.predict(Xtest)

 画图对比测试集中 25 条数据的真实评分与预测评分

# One wide figure with a single axes: predicted ratings in red and the true
# test-set ratings in blue, both plotted against their 0-based position.
fig, ax = plt.subplots(figsize=(15.6, 7.2))
s1 = ax.scatter(range(len(pred)), pred, facecolors="red", label='预测')
s2 = ax.scatter(range(len(Ytest)), Ytest, facecolors="blue", label='实际')
plt.legend()

 可观察到有15条左右的预测评分接近真实值

误差在0.3左右

猜你喜欢

转载自blog.csdn.net/m0_59054762/article/details/130416630