#爬取的网址:http://dianying.2345.com/top/
#电影的名字,主演,简介,和标题图
'''
爬取最新电影排行榜单
url:http://dianying.2345.com/top/
使用 requests --- bs4 线路
Python版本: 3.7
'''
import os

import bs4
import requests as rs
def get_html(url):
    """Fetch *url* and return its body decoded as GBK text.

    Returns the sentinel string "出错" ("error") on any network failure,
    timeout, or non-2xx status, so callers never see an exception.
    """
    try:
        r = rs.get(url, timeout=30)  # 30-second timeout
        r.raise_for_status()  # raise on 4xx/5xx responses
        r.encoding = 'gbk'  # the site serves GBK-encoded pages
        return r.text
    # Catch only requests' own errors (ConnectionError, Timeout, HTTPError...)
    # instead of a bare `except:` that would also swallow KeyboardInterrupt.
    except rs.RequestException:
        return "出错"
def get_content(url):
    """Scrape the 2345 movie top chart at *url*.

    For each movie, print its title, release date, actors and synopsis,
    and download its poster image into the ``image/`` directory
    (created if missing).
    """
    html = get_html(url)
    # lxml parser; see https://blog.csdn.net/zhangzejia/article/details/79658221
    soup = bs4.BeautifulSoup(html, 'lxml')
    # The ranking is a <ul class="picList clearfix"> of <li> entries.
    movies_list = soup.find('ul', class_='picList clearfix')
    movies = movies_list.find_all('li')
    # Original code crashed with FileNotFoundError if image/ did not exist.
    os.makedirs('image', exist_ok=True)
    for top in movies:
        # Poster link; the src attribute is protocol-relative ("//...").
        img_url = top.find('img')['src']
        name = top.find('span', class_='sTit').a.text
        # Some entries have no release date; find() then returns None and
        # .text raises AttributeError — catch exactly that, not everything.
        try:
            time = top.find('span', class_='sIntro').text
        except AttributeError:
            time = "暂无上映时间"
        # Walk the direct children of the "pActor" paragraph so each actor
        # name comes out as a separate string.
        actors = top.find('p', class_='pActor')
        actor = ''
        for act in actors.contents:
            # A nested tag with mixed content has .string == None; the
            # original crashed with TypeError on `None + ' '`.
            if act.string:
                actor = actor + act.string + ' '
        intro = top.find('p', class_='pTxt pIntroShow').text
        print("片名:{}\t{}\n{}\n{} \n \n ".format(name, time, actor, intro))
        # Prefix the scheme; the page omits it and requests rejects "//..." URLs.
        img_url2 = 'http:' + img_url
        # Fetch first, then open the file, so a failed download does not
        # leave an empty file behind; `with` closes the handle (the original
        # also called f.close() redundantly inside the with-block).
        content = rs.get(img_url2).content
        with open('image/' + name + '.png', 'wb+') as f:
            f.write(content)
        print(img_url2)
def main():
    """Entry point: scrape the fixed 2345 movie top-chart page."""
    target = 'http://dianying.2345.com/top/'
    get_content(target)
# Standard script guard: run the scraper only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()
# Scraper exercise: movie ranking list and batch image download.
# Reposted from blog.csdn.net/honest_boy/article/details/89070643