import urllib.request
from bs4 import BeautifulSoup
import re
# Base address of the Douban Top 250 listing. It ends in "start=" so that the
# index of the first movie on a page can be appended to form a page URL.
url = "https://movie.douban.com/top250?start="

# Browser-like User-Agent sent with every request.
# NOTE(review): presumably needed because the site rejects urllib's default
# agent -- confirm before changing or removing it.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.6"
}

# No request is issued here. Each listing page is downloaded where its URL is
# built, so fetching the bare base URL up front would cost one extra network
# round-trip whose result is overwritten before anything reads it.
datalist = []  # one 9-field record per movie, in listing order
x = 0  # running count of parsed items (250 expected: 10 pages of 25)

# The patterns never change, so they are compiled once here rather than once
# per movie inside the loop.
re_href = re.compile(r'<a href="(.+)">')  # movie detail link
re_img = re.compile(r'img.*src="(.*?)"', re.S)  # poster image link
re_name = re.compile(r'<span class="title">(.*?)</span>')  # title spans
# `\s` matches a plain space as well as U+00A0, so these two patterns work
# whichever of the two precedes the slash in the serialized markup.
re_english_name = re.compile(r'<span class="title">\s/\xa0(.*)</span>')  # foreign title
re_other_name = re.compile(r'<span class="other">\s/\xa0(.*)</span>')  # aliases
re_Rating = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')  # rating
re_credits = re.compile(r'<p class="">(.*?)</p>', re.S)  # cast/crew paragraph
re_comment_num = re.compile(r"<span>(.*?)人评价</span>")  # number of ratings
re_comment = re.compile(r'<span class="inq">(.*?)</span>')  # one-line quote


def _first(matches):
    """Return the first regex match, or "" when the pattern found nothing."""
    return matches[0] if matches else ""


def _parse_item(item_html):
    """Extract one movie record from the HTML of a single ``div.item``.

    Args:
        item_html: The ``div.item`` element serialized to a string.

    Returns:
        A 9-element list: detail link (str), image link matches (list),
        first title (str), foreign title matches (list), alias matches
        (list), rating matches (list), cleaned cast/crew text (str),
        rating-count matches (list), quote matches (list).  A string field
        whose pattern does not match is "" instead of raising IndexError.
    """
    # Cast/crew paragraph: turn line breaks into single spaces and remove the
    # non-breaking-space separators.
    cast_info = _first(re_credits.findall(item_html))
    cast_info = re.sub(r'<br/>(\s+)?', " ", cast_info)
    cast_info = cast_info.replace("\xa0\xa0\xa0", "")
    cast_info = cast_info.replace("\xa0/\xa0", " ")

    return [
        _first(re_href.findall(item_html)),
        re_img.findall(item_html),
        _first(re_name.findall(item_html)),
        re_english_name.findall(item_html),
        re_other_name.findall(item_html),
        re_Rating.findall(item_html),
        cast_info.strip(),
        re_comment_num.findall(item_html),
        re_comment.findall(item_html),
    ]


# Walk the 10 listing pages; page i starts at movie index i * 25.
for i in range(0, 10):
    xurl = url + str(i * 25)
    request = urllib.request.Request(xurl, headers=headers)
    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(request) as response:
        content = response.read().decode('utf-8')
    bs = BeautifulSoup(content, 'html.parser')
    for item in bs.find_all('div', class_="item"):
        x = x + 1
        data = _parse_item(str(item))
        datalist.append(data)
        print("top", x, data)
# Save the scraped records to an Excel (.xls) workbook.
import xlwt

savepath = "HJY豆瓣电影Top250.xls"
book = xlwt.Workbook(encoding="utf-8", style_compression=0)
sheet = book.add_sheet('HJY豆瓣电影Top250', cell_overwrite_ok=True)
col = ("电影详情链接", "图片链接", "影片中文名", "影片外国名", "别名", "评分", "演职员表", "评价数(人)", "评价")

# Row 0 holds the column headers.
for j, title in enumerate(col):
    sheet.write(0, j, title)

# One row per scraped record. Iterating over datalist itself (rather than a
# fixed range of 250) keeps the export working when fewer records were
# collected.
for i, data in enumerate(datalist):
    print("第%d条" % (i + 1))
    for j, value in enumerate(data):
        sheet.write(i + 1, j, value)

book.save(savepath)  # write the workbook to disk

# Debugging aid:
# print(datalist)
# Tags: reptile doubanTop250
# You may also like
# Reposted from blog.csdn.net/txmmy/article/details/115798467
# Today's recommendations
# Weekly ranking