Review analysis of the movie *Lost in Russia* (囧妈)

Web-scraper section

To get access to all the comments you must be logged in to a Douban account. You can attach a login cookie to your requests to get past the anti-scraping checks, or log in with Selenium and crawl from the authenticated browser session. Here I adapt someone else's Selenium-based code directly.

import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd 
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options

# 定义登录函数
def login_douban():
    """Log in to Douban through a Selenium-driven Chrome browser.

    Side effect: binds the module-global ``browser``, which
    ``get_one_page`` later uses to fetch authenticated pages.
    """
    global browser  # shared with get_one_page()
    browser = webdriver.Chrome()

    # Open the login page.
    login_url = 'https://accounts.douban.com/passport/login?source=movie'
    browser.get(login_url)

    # Switch to the password-login tab.
    # NOTE: find_element_by_* was removed in Selenium 4; use find_element(By.…).
    browser.find_element(By.CLASS_NAME, 'account-tab-account').click()

    # Fill in the credentials (replace the placeholder strings with real ones).
    username = browser.find_element(By.ID, 'username')
    username.send_keys('你的电话')
    password = browser.find_element(By.ID, 'password')
    password.send_keys('你的豆瓣密码')

    # Submit the form.
    browser.find_element(By.CLASS_NAME, 'btn-account').click()
    # Leave time in case a slider CAPTCHA appears.
    time.sleep(5)

# 定义函数获取单页数据
def get_one_page(url):
    """Scrape one page of Douban short comments into a DataFrame.

    Requires the module-global ``browser`` created by ``login_douban``.

    Parameters
    ----------
    url : str
        URL of one comments page.

    Returns
    -------
    pd.DataFrame
        One row per comment: username, profile URL, star rating,
        comment time, comment text, vote count.
    """
    # Load the comments page in the authenticated browser session.
    browser.get(url)

    # Parse the rendered HTML with BeautifulSoup.
    bs = BeautifulSoup(browser.page_source, 'lxml')

    # Query the comment headers once and reuse, instead of re-running
    # findAll for every field.
    infos = bs.findAll('span', class_='comment-info')

    username = [i.find('a').text for i in infos]
    user_url = [i.find('a')['href'] for i in infos]

    # Star rating; some comments have none. The original code defaulted
    # missing ratings to '力荐' — kept here for backward compatibility,
    # but note this biases missing values toward 5 stars.
    rating = []
    for i in infos:
        tag = i.find('span', class_='rating')
        rating.append(tag['title'] if tag is not None else '力荐')

    # Named comment_time (not `time`) to avoid shadowing the time module.
    comment_time = [i.find('span', class_='comment-time')['title'] for i in infos]
    short = [i.text for i in bs.findAll('span', class_='short')]
    votes = [i.text for i in bs.findAll('span', class_='votes')]

    # Assemble the page's rows.
    df_one = pd.DataFrame()
    df_one['用户名'] = username
    df_one['用户主页'] = user_url
    df_one['推荐星级'] = rating
    df_one['评论时间'] = comment_time
    df_one['短评信息'] = short
    df_one['投票次数'] = votes

    return df_one

# 定义函数获取25页数据(目前所能获取的最大页数)
def get_25_page(movie_id):
    """Fetch 25 pages (Douban's cap for short comments) for one movie.

    Parameters
    ----------
    movie_id : int
        Douban movie subject id.

    Returns
    -------
    pd.DataFrame
        All scraped comments, concatenated and reindexed.
    """
    # Collect per-page frames and concatenate once at the end:
    # DataFrame.append was removed in pandas 2.0 and was quadratic anyway.
    pages = []

    for i in range(25):
        url = "https://movie.douban.com/subject/{}/comments?start={}&limit=20&sort=new_score&status=P".format(movie_id, i * 20)
        print('我正在抓取第{}页'.format(i + 1), end='\r')
        pages.append(get_one_page(url))
        # Throttle requests (1.5 s) to avoid tripping anti-scraping limits.
        time.sleep(1.5)

    return pd.concat(pages, ignore_index=True)

if __name__ == '__main__':
    # Log in first so the global browser session is authenticated.
    login_douban()
    time.sleep(2)  # short pause after login

    # Crawl all 25 comment pages for Lost in Russia (囧妈) and save them.
    movie_id = 30306570
    df_all = get_25_page(movie_id)
    df_all.to_csv('./囧妈数据.csv')
    print("爬取完成!!!")

Next, crawl each commenter's location from their profile page.

# 得到用户所在城市
from lxml import etree
import requests

def get_city(df):
    """Look up each commenter's city from their Douban profile page.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a '用户主页' column of profile URLs.

    Returns
    -------
    list[str]
        One city per URL; '未知' (unknown) when the page has no location.
    """
    urls = df['用户主页']
    cities = []
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Host': 'www.douban.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
        'Cookie': '自己的Cookie'
    }
    for url in urls:
        # timeout keeps one stalled request from hanging the whole crawl
        html = requests.get(url, headers=headers, timeout=10).text
        selector = etree.HTML(html)
        try:
            # Profiles without a location have no user-info link -> IndexError.
            # Catch only that, so real errors (parse failures, etc.) surface.
            city = selector.xpath("//div[@class='user-info']/a/text()")[0]
            cities.append(city)
        except IndexError:
            cities.append('未知')
    return cities


# Load the scraped comments, look up each commenter's city, and save
# the enriched dataset for the analysis stage.
data=pd.read_csv('./囧妈数据.csv')
cities=get_city(data)
# Sanity check: should equal the number of comment rows.
print(len(cities))
data['城市处理']=cities
data.to_csv('./囧妈处理后数据.csv')

Here Insert Picture Description

Data analysis

This uses the pyecharts 1.x API; the 0.5.x API is incompatible and will raise errors.

import pandas as pd
import numpy as np
from pyecharts import options as opts
from pyecharts.charts import Pie, Page,Geo
from pyecharts.charts import Line
from pyecharts.charts import Bar
from pyecharts.charts import WordCloud
from pyecharts.globals import SymbolType, ThemeType,ChartType

Import Data

# Reload the city-enriched dataset and inspect its schema and first rows.
data=pd.read_csv('./囧妈处理后数据.csv')
data.info()
data.head(10)

Here Insert Picture Description

# 定义函数转换推荐星级字段
def transform_star(x):
    """Map a Douban star label to a numeric score (1-5).

    Any label not in the table (including '很差') falls back to 1.
    """
    star_map = {'力荐': 5, '推荐': 4, '还行': 3, '较差': 2}
    return star_map.get(x, 1)

# 星级转换
# Convert star labels to numeric scores.
data['星级'] = data['推荐星级'].map(transform_star)
# Parse comment timestamps and extract the calendar date.
data['评论时间'] = pd.to_datetime(data['评论时间'])
data['日期'] = data['评论时间'].dt.date

Tokenize the comments and compute keyword weights in preparation for the word cloud.
A Chinese stop-word list must be downloaded first.

# 定义函数-获取短评信息关键词
def get_comment_word(df):
    """Extract the top comment keywords via TF-IDF, minus stop words.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a '短评信息' column of comment strings.

    Returns
    -------
    list[tuple[str, float]]
        (keyword, weight) pairs with stop words filtered out.
    """
    import jieba.analyse

    # Load the stop-word list (one word per line).
    stop_words = set()
    stop_words_path = './中文停用词表.txt'
    with open(stop_words_path, 'r', encoding='utf-8') as sw:
        for line in sw:
            stop_words.add(line.strip())

    # Extra domain-specific stop words for this movie.
    stop_words.update({'6.3', '一张', '一部', '徐峥', '徐导', '电影', '电影票'})

    # Concatenate every comment into one document.
    df_comment_all = df['短评信息'].str.cat()

    # TF-IDF keyword extraction: top 100 words with weights.
    word_num = jieba.analyse.extract_tags(
        df_comment_all, topK=100, withWeight=True, allowPOS=())

    # Drop any extracted keyword that is itself a stop word.
    return [pair for pair in word_num if pair[0] not in stop_words]

# Extract keywords and wrap them in a DataFrame for charting.
key_words = get_comment_word(data)
key_words = pd.DataFrame(key_words, columns=['words','num']) 
key_words.head()

Here Insert Picture Description
Word cloud

# Word cloud of the top comment keywords.
word = WordCloud(init_opts=opts.InitOpts(width='1350px', height='750px'))
cloud_pairs = list(zip(key_words.words, key_words.num))
word.add("", cloud_pairs, word_size_range=[20, 200])
word.set_global_opts(
    title_opts=opts.TitleOpts(title="囧妈电影评论词云图"),
    toolbox_opts=opts.ToolboxOpts())
word.render('囧妈电影评论词云图.html')

Here Insert Picture Description
Ratings Distribution

# Percentage share of each star rating.
score_perc = data.星级.value_counts(normalize=True)
score_perc = np.round(score_perc * 100, 2)

# Donut (pie) chart of the rating distribution.
pie1 = Pie(init_opts=opts.InitOpts(width='1350px', height='750px'))
pie1.add(
    "",
    list(zip(score_perc.index, score_perc.values)),
    radius=["40%", "75%"])
pie1.set_global_opts(
    title_opts=opts.TitleOpts(title='总体评分分布'),
    legend_opts=opts.LegendOpts(orient="vertical", pos_top="15%", pos_left="2%"),
    toolbox_opts=opts.ToolboxOpts())
pie1.set_series_opts(label_opts=opts.LabelOpts(formatter="{c}%"))
pie1.render('总体评分分布.html')

Here Insert Picture Description
Commenter city distribution

# Top 15 commenter cities; drop the unknown bucket.
# NOTE(review): .drop raises KeyError if '未知' is not in the top 15 — confirm.
city_num = data.城市处理.value_counts()[:15]
city_num.drop('未知', inplace=True)

# Effect-scatter map of commenter cities across China.
c1 = Geo(init_opts=opts.InitOpts(width='1350px', height='750px'))
c1.add_schema(maptype='china')
geo_pairs = [list(pair) for pair in zip(city_num.index, city_num.values.astype('str'))]
c1.add('geo', geo_pairs, type_=ChartType.EFFECT_SCATTER)
c1.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
c1.set_global_opts(
    visualmap_opts=opts.VisualMapOpts(),
    title_opts=opts.TitleOpts(title='评论者城市分布'),
    toolbox_opts=opts.ToolboxOpts())
c1.render('评论者城市分布地图.html')

Here Insert Picture Description
Most commented city bar

# Top commenter cities (takes 12 — presumably so ~10 remain after
# dropping the '未知' bucket; confirm against the data).
city_top10 = data.城市处理.value_counts()[:12]
city_top10.drop('未知', inplace=True)

# Bar chart of the most-commenting cities.
bar1 = Bar(init_opts=opts.InitOpts(width='1350px', height='750px'))
bar1.add_xaxis(city_top10.index.tolist())
bar1.add_yaxis("城市", city_top10.values.tolist())
bar1.set_global_opts(
    title_opts=opts.TitleOpts(title="评论者Top10城市分布"),
    toolbox_opts=opts.ToolboxOpts())
bar1.render('评论者Top10城市分布条形图.html')

Here Insert Picture Description
Comment time trend

# Comments per day, in chronological order.
# Named daily_counts instead of `time` so the time module is not shadowed.
daily_counts = data.日期.value_counts().sort_index()

# Area line chart of comment volume over time.
line1 = Line(init_opts=opts.InitOpts(width='1350px', height='750px'))
line1.add_xaxis(daily_counts.index.tolist())
line1.add_yaxis(
    '评论热度', daily_counts.values.tolist(),
    areastyle_opts=opts.AreaStyleOpts(opacity=0.5),
    label_opts=opts.LabelOpts(is_show=False))
line1.set_global_opts(title_opts=opts.TitleOpts(title="时间走势图"),
                      toolbox_opts=opts.ToolboxOpts())
line1.render('评论时间走势图.html')

Here Insert Picture Description
From this chart we can roughly guess that the movie was released around the 25th — and checking online confirms it was indeed the 25th. Data often carries a great deal of hidden information, which is exactly why data mining matters.

Views on the film

First, the score sits a little above 3 — that is, "just okay." My strongest impression is that the film lacks focus: as a comedy it intersperses a great deal of family and emotional-conflict material, yet as a film about family education and affection it never goes deep. Mixing educational content into a comedy is not inherently bad, but the balance was not well struck, so the movie ends up less funny than other comedies and less substantial than a genuinely educational film. I still prefer the earlier *Lost on Journey*: it carried a message yet truly made people laugh — the balance was right.

Published 85 original articles · won praise 55 · views 20000 +

Guess you like

Origin blog.csdn.net/shelgi/article/details/104169515