Python 爬取豆瓣电影 Top250

import csv
import re

import requests
# 导入用于随机User-Agent值的第三方库
from fake_useragent import UserAgent
from lxml import etree


class DBMovie(object):
    """Scrape the Douban Top 250 movie list and save it to ``movie.csv``.

    One call to :meth:`get_page_code` walks every page: each page is handled
    by the call chain ``get_page_code`` -> ``get_content_by_xpath`` ->
    ``write_movie_info`` -> ``get_next_page_url`` -> ``get_page_code`` until
    no "next" link is found.

    The instance can also be used as a context manager, which opens the CSV
    file on entry (if it is not open yet) and closes it on exit.
    """

    # Column order of the CSV file; also the keys of every movie dict.
    FIELD_NAMES = ['movie_rank', 'movie_name', 'movie_member', 'movie_star',
                   'movie_comment', 'movie_quote']
    # Placeholder stored when an item has no one-line quote.
    NO_QUOTE = '影评不存在'
    # Seconds to wait for the server before a request is abandoned.
    REQUEST_TIMEOUT = 10
    # First run of digits in a text; a raw string avoids the invalid "\d"
    # escape, and compiling once avoids recompiling for every item.
    _DIGITS = re.compile(r'(\d+)')

    def __init__(self):
        self.base_url = 'https://movie.douban.com/top250'
        self.ua = UserAgent()
        # Root element of the page fetched most recently.
        self.html_obj = None
        # Output file handle and the DictWriter bound to it (see open_file).
        self.csv_file = None
        self.writer = None
        # Number of pages written so far; used for the progress message.
        self._page_count = 0

    def __enter__(self):
        if self.writer is None:
            self.open_file()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close_file()
        return False

    @staticmethod
    def _first(nodes, default=''):
        """Return the first entry of a result list, or ``default`` if it is empty."""
        return nodes[0] if nodes else default

    @staticmethod
    def _element_text(elements, index=0):
        """Return ``elements[index].text``, or ``''`` if the element or its text is missing."""
        if len(elements) > index and elements[index].text:
            return elements[index].text
        return ''

    @classmethod
    def _extract_number(cls, text):
        """Return the first run of digits in ``text`` as a string, or ``''`` if there is none."""
        match = cls._DIGITS.search(text or '')
        return match.group(1) if match else ''

    def get_page_code(self, url=''):
        """
        Fetch one page, parse it and hand it to :meth:`get_content_by_xpath`.

        :param url: relative address taken from the page's "next" link,
            e.g. ``?start=20&filter=``; empty for the first page
        :return: None
        :raises requests.RequestException: if the request times out or the
            server answers with an HTTP error status
        """
        # Build the absolute address of the requested page.
        abs_url = self.base_url + url
        headers = {'User-Agent': self.ua.random}
        response = requests.get(abs_url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        # Fail loudly instead of parsing an error page as an empty last page.
        response.raise_for_status()
        content = response.content.decode()
        self.html_obj = etree.HTML(content, parser=etree.HTMLParser(encoding='utf-8'))

        self.get_content_by_xpath(self.html_obj)

    def get_content_by_xpath(self, html_obj):
        """
        Extract every movie item of one page with XPath and write the rows.

        Missing nodes yield an empty string (or the "no quote" placeholder)
        instead of aborting the crawl.

        :param html_obj: root element of the page to parse
        :return: None
        """
        movie_list = []
        for item_tag in html_obj.xpath('//div[@class="item"]'):
            # The title is spread over several <span> tags; join the stripped parts.
            title_parts = item_tag.xpath('.//div[@class="hd"]/a/span/text()')
            member_text = self._first(item_tag.xpath('.//p[@class=""]/text()'))
            comment_text = self._first(
                item_tag.xpath('.//div[@class="star"]/span[last()]/text()'))

            movie_list.append({
                'movie_rank': self._first(item_tag.xpath('.//em/text()')),
                'movie_name': ''.join(part.strip() for part in title_parts),
                'movie_member': member_text.strip(),
                'movie_star': self._first(
                    item_tag.xpath('.//span[@class="rating_num"]/text()')),
                'movie_comment': self._extract_number(comment_text),
                'movie_quote': self._first(
                    item_tag.xpath('.//span[@class="inq"]/text()'), self.NO_QUOTE),
            })

        # Persist this page; write_movie_info then moves on to the next page.
        self.write_movie_info(movie_list)

    def write_movie_info(self, movie_list):
        """
        Write all movies of the current page to the CSV file, then continue
        with the next page.

        The output file is opened on demand if :meth:`open_file` has not been
        called yet.

        :param movie_list: list of dicts keyed by ``FIELD_NAMES``
        :return: None
        """
        if self.writer is None:
            self.open_file()
        self.writer.writerows(movie_list)

        # Report once per page with the real page number (the message says "page").
        self._page_count += 1
        print('第{}页写入完成!'.format(self._page_count))

        # The current page is done; follow the "next" link.
        self.get_next_page_url()

    def open_file(self):
        """Open ``movie.csv`` for writing (truncating it) and write the header row."""
        # Release a previously opened handle before replacing it.
        self.close_file()
        self.csv_file = open('movie.csv', 'w', encoding='utf-8', newline='')
        self.writer = csv.DictWriter(self.csv_file, fieldnames=self.FIELD_NAMES)
        self.writer.writeheader()

    def close_file(self):
        """Close the CSV file if it is open; safe to call more than once."""
        if self.csv_file is not None:
            self.csv_file.close()
        self.csv_file = None
        self.writer = None

    def get_next_page_url(self):
        """
        Follow the "next" link of the most recently fetched page.

        When there is no such link, the crawl is over: a message is printed
        and buffered rows are flushed to disk.

        :return: None
        """
        links = self.html_obj.xpath('//span[@class="next"]/a/@href')
        if not links:
            print('最后一页了!')
            if self.csv_file is not None:
                self.csv_file.flush()
            return
        self.get_page_code(links[0])

    def get_content_by_css(self, html_obj):
        """
        Extract every movie item of one page with CSS selectors.

        Unlike :meth:`get_content_by_xpath`, this method does not write
        anything and does not continue to the next page.

        :param html_obj: root element of the page to parse
        :return: list of dicts keyed by ``FIELD_NAMES``
        """
        movie_list = []
        for div_tag in html_obj.cssselect('.item'):
            # Keep everything as str: mixing encoded bytes into a str raises TypeError.
            name = ''.join((span.text or '').strip()
                           for span in div_tag.cssselect('a>span'))

            bd_list = div_tag.cssselect('.bd')
            p_list = bd_list[0].cssselect('p') if bd_list else []
            star_list = bd_list[0].cssselect('.star>span') if bd_list else []

            # Only a second <p> is read as the quote; otherwise use the placeholder.
            quote = self.NO_QUOTE
            if len(p_list) == 2:
                quote = self._element_text(p_list[1].cssselect('span')) or self.NO_QUOTE

            movie_list.append({
                'movie_rank': self._element_text(div_tag.cssselect('em')),
                'movie_name': name,
                'movie_member': self._element_text(p_list).strip(),
                # Index 1 is read as the rating and index 3 as the comment count.
                'movie_star': self._element_text(star_list, 1),
                'movie_comment': self._extract_number(self._element_text(star_list, 3)),
                'movie_quote': quote,
            })

            print('css选择器选择完毕')

        return movie_list


def main():
    """Run the full crawl: create the scraper, open the CSV file, start at the first page."""
    crawler = DBMovie()
    # The output file must be open before any page is processed.
    crawler.open_file()
    # With no argument the crawl starts at the base URL (first page).
    crawler.get_page_code()


if __name__ == '__main__':
    main()

猜你喜欢

转载自blog.csdn.net/DonQuixote_/article/details/81544215