import csv
import re
import requests
# 导入用于随机User-Agent值的第三方库
from fake_useragent import UserAgent
from lxml import etree
class DBMovie(object):
    """Crawl the Douban Top 250 movie list and save it to ``movie.csv``.

    The methods call each other in a chain: ``get_page_code`` downloads a
    page and hands it to ``get_content_by_xpath``, which parses it and calls
    ``write_movie_info``, which writes the rows and calls
    ``get_next_page_url``, which either requests the following page through
    ``get_page_code`` or stops (and closes the CSV file) on the last page.
    """

    # CSV column order; these are also the keys of every movie dict.
    FIELD_NAMES = (
        'movie_rank',
        'movie_name',
        'movie_member',
        'movie_star',
        'movie_comment',
        'movie_quote',
    )
    # Seconds to wait for the server before giving up on a request.
    REQUEST_TIMEOUT = 10
    # Placeholder stored when a movie has no one-line quote.
    MISSING_QUOTE = '影评不存在'
    # Compiled once: first run of digits in a text such as "123456人评价".
    _DIGITS_RE = re.compile(r'(\d+)')

    def __init__(self):
        self.base_url = 'https://movie.douban.com/top250'
        self.ua = UserAgent()
        # Root element of the most recently downloaded page.
        self.html_obj = None
        # Open CSV file handle and the DictWriter bound to it.
        self.csv_file = None
        self.writer = None
        # Number of pages written so far (used for progress output).
        self.page_number = 0

    def get_page_code(self, url=''):
        """Download one list page, parse it and process its movies.

        :param url: relative address taken from the "next page" link,
            e.g. ``?start=25&filter=``; empty for the first page.
        :return: None
        :raises requests.RequestException: on timeout or HTTP error status.
        """
        # Build the absolute address of the requested page.
        abs_url = self.base_url + url
        headers = {'User-Agent': self.ua.random}
        response = requests.get(
            abs_url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        # Fail loudly instead of parsing an error page as if it were data.
        response.raise_for_status()
        # Hand the raw bytes to lxml and let the parser decode them as UTF-8.
        self.html_obj = etree.HTML(
            response.content, parser=etree.HTMLParser(encoding='utf-8'))
        self.get_content_by_xpath(self.html_obj)

    @staticmethod
    def _join_stripped(parts):
        """Strip surrounding whitespace from every part and concatenate."""
        return ''.join(part.strip() for part in parts)

    @classmethod
    def _extract_comment_count(cls, text):
        """Return the first run of digits in ``text``, or ``''`` if none."""
        match = cls._DIGITS_RE.search(text or '')
        return match.group(1) if match else ''

    def _parse_item(self, item_tag):
        """Build the movie dict for one ``<div class="item">`` element."""
        # Ranking shown inside the <em> tag.
        rank = item_tag.xpath('.//em/text()')[0]
        # The title is spread over several <span> tags; join them.
        name = self._join_stripped(
            item_tag.xpath('.//div[@class="hd"]/a/span/text()'))
        # First text node of the details paragraph (cast / crew line).
        member_info = item_tag.xpath('.//p[@class=""]/text()')[0].strip()
        star_number = item_tag.xpath(
            './/span[@class="rating_num"]/text()')[0]
        # The last <span> of the star block holds the number of ratings.
        comment_text = item_tag.xpath(
            './/div[@class="star"]/span[last()]/text()')[0]
        # Not every movie has a one-line quote.
        quote_texts = item_tag.xpath('.//span[@class="inq"]/text()')
        return {
            'movie_rank': rank,
            'movie_name': name,
            'movie_member': member_info,
            'movie_star': star_number,
            'movie_comment': self._extract_comment_count(comment_text),
            'movie_quote': quote_texts[0] if quote_texts else self.MISSING_QUOTE,
        }

    def get_content_by_xpath(self, html_obj):
        """Extract every movie on a page with XPath and write them out.

        :param html_obj: root element of one list page.
        :return: None
        """
        movie_list = [
            self._parse_item(item_tag)
            for item_tag in html_obj.xpath('//div[@class="item"]')
        ]
        # Persist the page; this also continues with the next page.
        self.write_movie_info(movie_list)

    def write_movie_info(self, movie_list):
        """Write all movies of the current page to the CSV file.

        Opens the CSV file first if ``open_file`` has not been called yet,
        then continues the crawl with the next page.

        :param movie_list: list of movie dicts keyed by ``FIELD_NAMES``.
        :return: None
        """
        if self.writer is None:
            self.open_file()
        self.writer.writerows(movie_list)
        # Report the page number (not the index of the last row), which
        # also works when the page contained no movies at all.
        self.page_number += 1
        print('第{}页写入完成!'.format(self.page_number))
        # The current page is done; move on to the next one.
        self.get_next_page_url()

    def open_file(self):
        """Create ``movie.csv`` and write the header row.

        The handle is kept on the instance so it can be closed later.
        """
        # Do not leak a handle that an earlier call may have left open.
        self.close_file()
        self.csv_file = open('movie.csv', 'w', encoding='utf-8', newline='')
        self.writer = csv.DictWriter(
            self.csv_file, fieldnames=self.FIELD_NAMES)
        self.writer.writeheader()

    def close_file(self):
        """Close the CSV file if it is open; safe to call repeatedly."""
        if self.csv_file is not None:
            self.csv_file.close()
        self.csv_file = None
        self.writer = None

    def get_next_page_url(self):
        """Follow the "next page" link of the current page, if any.

        On the last page the CSV file is closed so all rows reach the disk.
        """
        next_links = self.html_obj.xpath('//span[@class="next"]/a/@href')
        if not next_links:
            print('最后一页了!')
            self.close_file()
            return
        self.get_page_code(next_links[0])

    def get_content_by_css(self, html_obj):
        """Extract every movie on a page with CSS selectors.

        Alternative to ``get_content_by_xpath``; it only parses and does not
        write anything to the CSV file.

        :param html_obj: root element of one list page.
        :return: list of movie dicts keyed by ``FIELD_NAMES``.
        """
        movie_list = []
        for div_tag in html_obj.cssselect('.item'):
            # Ranking.
            rank = div_tag.cssselect('em')[0].text
            # Movie title, spread over several <span> tags.
            name = self._join_stripped(
                span.text or '' for span in div_tag.cssselect('a>span'))
            # Details block: first <p> is cast/crew, optional second <p>
            # carries the one-line quote.
            bd = div_tag.cssselect('.bd')[0]
            p_list = bd.cssselect('p')
            member = (p_list[0].text or '').strip()
            if len(p_list) == 2:
                quote = p_list[1].cssselect('span')[0].text
            else:
                quote = self.MISSING_QUOTE
            # Rating and number of ratings.
            star_list = bd.cssselect('.star>span')
            movie_list.append({
                'movie_rank': rank,
                'movie_name': name,
                'movie_member': member,
                'movie_star': star_list[1].text,
                'movie_comment': self._extract_comment_count(
                    star_list[3].text),
                'movie_quote': quote,
            })
        print('css选择器选择完毕')
        return movie_list
if __name__ == '__main__':
    # Create the output file first so the crawl has a CSV writer to use,
    # then start crawling from the first page of the list.
    movie_obj = DBMovie()
    movie_obj.open_file()
    movie_obj.get_page_code()
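# Article title: Scraping the Douban Top 250 with Python (python爬取豆瓣250)
# Reposted from blog.csdn.net/DonQuixote_/article/details/81544215
# Blog page navigation residue: "猜你喜欢" (You may also like),
# "今日推荐" (Today's picks), "周排行" (Weekly ranking)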