# Parse the Douban Top 250 pages with regexes to extract the title, film info,
# rating, vote count, and one-line review, and store them in CSV files.
# A small fraction of entries fail to match and leave room for optimization.
import requests
import re
import csv
# Fetch the Douban Top 250 listing pages and extract movie data with a regex,
# writing one CSV file per 25-movie page (data0.csv, data25.csv, ...).
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36"
}

# Compile the pattern once, hoisted out of the page loop.
# Named groups: name (Chinese title), name2 (original title), other_name (aliases),
# film_ (director/cast/year paragraph), star (average rating), num (vote count),
# evaluate (one-line review quote). Entries missing any field simply do not match.
obj1 = re.compile(
    r'<div class="hd">.*?<span class="title">(?P<name>.*?)</span>'
    r'.*?<span class="title">.*?/ (?P<name2>.*?)</span>'
    r'.*?<span class="other">.*?/ (?P<other_name>.*?)</span>'
    r'.*?<div class="bd">.*?<p class="">(?P<film_>.*?)</p>'
    r'.*? <div class="star">.*?property="v:average">(?P<star>.*?)</span>'
    r'.*?content="10.0"></span>.*?<span>(?P<num>.*?)</span>'
    r'.*?<span class="inq">(?P<evaluate>.*?)</span>',
    re.S,
)

for i in range(0, 250, 25):
    # Each page shows 25 movies; `start` is the offset of the first one.
    url = 'https://movie.douban.com/top250?start={}&filter='.format(i)
    resp = requests.get(url=url, headers=headers)
    try:
        # Fail loudly on HTTP errors instead of regex-parsing an error page.
        resp.raise_for_status()
        connect = resp.text
        # newline='' is required by the csv module; without it every row is
        # followed by a blank line on Windows. `with` guarantees the file is
        # closed even if a write fails.
        with open('data{}.csv'.format(i), mode='w', encoding='utf-8', newline='') as f:
            csv_writer = csv.writer(f)
            for it in obj1.finditer(connect):
                # Strip surrounding whitespace from every captured field at once.
                dic = {key: value.strip() for key, value in it.groupdict().items()}
                csv_writer.writerow(dic.values())
    finally:
        # Release the HTTP connection whether or not parsing succeeded.
        resp.close()
# A web scraper for Douban Top 250 movie information.