# Web crawler: Douban Top 250 movies

import urllib.request
from bs4 import BeautifulSoup
import  re
# Base listing URL; the caller appends the start offset (0, 25, 50, ...)
# to select a page.
url="https://movie.douban.com/top250?start="
# Request headers sent with every page fetch (a browser-like User-Agent).
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.6"
}
# NOTE: an earlier version fetched and parsed `url` once here before the
# paging loop. Its request/response/content/bs results were reassigned by the
# loop before ever being read, so that extra network round-trip (and its
# dependency on the optional lxml parser) has been removed.

# Accumulates one row (a list of fields) per scraped movie.
datalist=[]
# Running count of scraped movies; used for progress output.
x=0
# Patterns are compiled once here instead of once per scraped item.
_RE_HREF = re.compile(r'<a href="(.+)">')                      # movie detail link
_RE_IMG = re.compile(r'img.*src="(.*?)"', re.S)                 # poster image link
_RE_NAME = re.compile(r'<span class="title">(.*?)</span>')      # movie title(s)
_RE_ENGLISH_NAME = re.compile(r'<span class="title"> /\xa0(.*)</span>')  # foreign title
_RE_OTHER_NAME = re.compile(r'<span class="other"> /\xa0(.*)</span>')    # aliases
_RE_RATING = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')
_RE_CREDITS = re.compile(r'<p class="">(.*?)</p>', re.S)        # cast and crew paragraph
_RE_COMMENT_NUM = re.compile(r"<span>(.*?)人评价</span>")        # number of ratings
_RE_COMMENT = re.compile(r'<span class="inq">(.*?)</span>')     # one-line review
# Raw string: the previous non-raw '<br/>(\s+)?' relied on an invalid escape.
_RE_BR = re.compile(r'<br/>\s*')


def _first(matches, default=""):
    """Return the first regex match, or *default* when nothing matched."""
    return matches[0] if matches else default


def _parse_item(item_html):
    """Extract one movie's fields from the HTML text of a ``div.item`` element.

    Returns a 9-element list in spreadsheet column order: detail link, image
    link(s), title, foreign title(s), alias(es), rating, cast and crew,
    number of ratings, review.  The link, title and cast-and-crew fields are
    strings (empty when the markup has no match, so one odd item does not
    abort the crawl); the remaining fields are the raw ``findall`` lists,
    which may be empty.
    """
    # Cast and crew: collapse <br/> plus following whitespace into one space,
    # then remove the non-breaking-space padding and separators.
    cast = _first(_RE_CREDITS.findall(item_html))
    cast = _RE_BR.sub(" ", cast)
    cast = cast.replace("\xa0\xa0\xa0", "").replace("\xa0/\xa0", " ")

    return [
        _first(_RE_HREF.findall(item_html)),
        _RE_IMG.findall(item_html),
        _first(_RE_NAME.findall(item_html)),
        _RE_ENGLISH_NAME.findall(item_html),
        _RE_OTHER_NAME.findall(item_html),
        _RE_RATING.findall(item_html),
        cast.strip(),
        _RE_COMMENT_NUM.findall(item_html),
        _RE_COMMENT.findall(item_html),
    ]


# Fetch the 10 listing pages; the start offset advances by 25 per page.
for page in range(10):
    page_url = url + str(page * 25)
    request = urllib.request.Request(page_url, headers=headers)
    # Context manager closes the HTTP response once the body has been read.
    with urllib.request.urlopen(request) as response:
        content = response.read().decode('utf-8')

    bs = BeautifulSoup(content, 'html.parser')
    for item in bs.find_all('div', class_="item"):
        x = x + 1  # running item count, shown in the progress line below
        data = _parse_item(str(item))
        # Collect the row for the export step.
        datalist.append(data)
        print("top", x, data)
# Save the scraped rows to an .xls workbook.
import xlwt
savepath = "HJY豆瓣电影Top250.xls"
book = xlwt.Workbook(encoding="utf-8", style_compression=0)
sheet = book.add_sheet('HJY豆瓣电影Top250', cell_overwrite_ok=True)
col = ("电影详情链接", "图片链接", "影片中文名", "影片外国名", "别名","评分","演职员表","评价数(人)", "评价")
# Header row (row 0): one column title per field.
for j, title in enumerate(col):
    sheet.write(0, j, title)
# Data rows start at row 1. Iterate over what was actually scraped rather
# than assuming exactly 250 rows, so a short crawl does not raise IndexError.
for i, data in enumerate(datalist):
    print("第%d条" % (i + 1))
    for j, value in enumerate(data):
        sheet.write(i + 1, j, value)

book.save(savepath)  # write the workbook to disk

# [Image placeholder from the original blog post: "insert image description here"]

# You may also like

# Reposted from blog.csdn.net/txmmy/article/details/115798467