使用python3 爬取豆瓣电影热映和即将上映

使用python3爬取豆瓣即将上映和正在热映的电影,代码如下


使用requests获取页面,再用bs4解析,通过标签名和class属性获取到对应的信息后,使用字符串拼接的方式,将正在热映和即将上映的信息拼接出来并写入到html页面中,在爬取完毕后自动打开生成的html。


内容比较简单


[码云的地址](https://gitee.com/xlelou/spider/blob/master/doubai.py)

其他地址,我的小社区




import html
import json
import webbrowser
from pathlib import Path

import requests
from bs4 import BeautifulSoup
# URL of the Douban "now playing" page that the script scrapes.
# NOTE(review): the trailing path segment ('dongying') presumably selects the
# city whose cinemas are listed -- confirm before changing it.
nowplayingUrl = 'https://movie.douban.com/cinema/nowplaying/dongying/'


class getM():
    """Scrapers for Douban's "now playing" and "coming soon" movie pages.

    Both public methods are static: call them on the class, e.g.
    ``getM.getNowPlaying(url)``. Each returns one string that is an HTML
    fragment: scraped text is HTML-escaped, links are emitted as ``<a>``
    elements, and lines are terminated with CRLF so the caller can turn them
    into ``<br/>`` tags.
    """

    # Seconds to wait for the server before giving up; without a timeout
    # requests.get() can block forever on a stalled connection.
    _TIMEOUT = 10

    @staticmethod
    def _fetch_soup(url):
        """Download *url* and return the page parsed with html.parser.

        Raises:
            requests.RequestException: on connection problems, timeouts or a
                non-2xx HTTP status.
        """
        response = requests.get(url, timeout=getM._TIMEOUT)
        # Fail loudly on an error page instead of trying to parse it.
        response.raise_for_status()
        return BeautifulSoup(response.text, 'html.parser')

    @staticmethod
    def _link(url):
        """Return an ``<a target="_blank">`` element pointing at *url*.

        The href is quoted and escaped; the URL doubles as the link text.
        """
        safe = html.escape(url, quote=True)
        return '<a target="_blank" href="' + safe + '">' + safe + '</a>'

    @staticmethod
    def _anchor_in(item, css_class):
        """Return the first ``<a>`` inside ``<li class=css_class>`` of *item*.

        Returns None when either the ``<li>`` or the ``<a>`` is missing.
        """
        holder = item.find("li", attrs={"class": css_class})
        if holder is None:
            return None
        return holder.find('a')

    @staticmethod
    def getNowPlaying(url):
        """Scrape the "now playing" list found at *url*.

        For every ``li.list-item`` under the element with id ``nowplaying``
        four lines are produced: title, rating, detail link and ticket link.
        A fixed placeholder text is used for any piece that is missing.

        Args:
            url: address of the "now playing" page.

        Returns:
            The concatenated lines, or an empty string when the page has no
            ``nowplaying`` element.

        Raises:
            requests.RequestException: if the page cannot be downloaded.
        """
        soup = getM._fetch_soup(url)
        container = soup.find(id='nowplaying')
        if container is None:
            return ''

        parts = []
        for item in container.find_all("li", attrs={"class": "list-item"}):
            title_anchor = getM._anchor_in(item, 'stitle')
            title = title_anchor.get('title') if title_anchor is not None else None
            detail_url = title_anchor.get('href') if title_anchor is not None else None

            rating_holder = item.find("li", attrs={"class": 'srating'})
            rating = None
            if rating_holder is not None:
                rating = rating_holder.find('span', 'subject-rate')

            ticket_anchor = getM._anchor_in(item, 'sbtn')
            ticket_url = ticket_anchor.get('href') if ticket_anchor is not None else None

            if title is not None:
                parts.append('电影名称:' + html.escape(title) + '\r\n')
            else:
                parts.append('电影名称:' + '暂无名称' + '\r\n')

            if rating is not None:
                parts.append('评分:' + html.escape(rating.text) + '\r\n')
            else:
                parts.append('评分:' + '暂无评分' + '\r\n')

            if detail_url is not None:
                parts.append('电影简介:' + getM._link(detail_url) + '\r\n')
            else:
                parts.append('电影简介:' + '暂无简介' + '\r\n')

            if ticket_url is not None:
                parts.append('购票地址:' + getM._link(ticket_url) + '\r\n')
            else:
                parts.append('购票地址:' + '暂无地址' + '\r\n')
        return ''.join(parts)

    @staticmethod
    def getComing(url):
        """Scrape the "coming soon" table found at *url*.

        Every row of ``table.coming_list`` that has at least five cells
        yields the release date, title, genre, region and "want to see"
        count (cells 0-4, stripped), followed by the detail link taken from
        the title cell and two blank lines as a separator.

        Args:
            url: address of the "coming soon" page.

        Returns:
            The concatenated lines, or an empty string when the table is not
            present.

        Raises:
            requests.RequestException: if the page cannot be downloaded.
        """
        soup = getM._fetch_soup(url)
        table = soup.find('table', 'coming_list')
        if table is None:
            return ''
        # html.parser does not synthesise <tbody>; fall back to the table
        # itself when the markup omits it.
        body = table.find('tbody')
        if body is None:
            body = table

        parts = []
        for row in body.find_all('tr'):
            cells = row.find_all('td')
            if len(cells) < 5:
                # Header or malformed row: not enough columns to report.
                continue
            date, name, genre, region, wish = (
                html.escape(cell.text.strip()) for cell in cells[:5]
            )
            parts.append('上映日期:' + date + '\r\n')
            parts.append('片名:' + name + '\r\n')
            parts.append('类型:' + genre + '\r\n')
            parts.append('制片地区:' + region + '\r\n')
            parts.append('想看:' + wish + '\r\n')

            anchor = cells[1].find('a')
            href = anchor.get('href') if anchor is not None else None
            if href is not None:
                parts.append('简介:' + getM._link(href.strip()) + '\r\n')
            else:
                parts.append('简介:' + '暂无简介' + '\r\n')
            parts.append('\r\n' + '\r\n')
        return ''.join(parts)






# Name of the generated HTML report; it is written to the current working
# directory.
GEN_HTML = 'asd.html'

# Download each page exactly once and reuse the text for both the console
# output and the HTML report.
nowplaying_text = getM.getNowPlaying(nowplayingUrl)
print(nowplaying_text)


coming_text = getM.getComing('https://movie.douban.com/coming')
print(coming_text)

# The scrapers separate lines with CRLF; turn those into HTML line breaks.
content = nowplaying_text.replace('\r\n','<br/>')
coming = coming_text.replace('\r\n','<br/>')
message = """
<html>
<head>
  <meta name="renderer" content="webkit" />
    <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1" />
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
</head>
<body>
<p>Hello,World!</p>
<p>豆瓣电影</p>
<div>
<p>热映</p>
%s
</div>
<div>
<p>即将上映</p>
%s
</div>
</body>
</html>"""%(content,coming)


# The context manager closes the file even if the write fails.
with open(GEN_HTML,'w',encoding='utf8') as f:
    f.write(message)


# Pass an absolute file:// URI: the webbrowser docs describe opening a bare
# filename as unsupported and non-portable.
webbrowser.open(Path(GEN_HTML).resolve().as_uri(),new = 1)


猜你喜欢

转载自blog.csdn.net/xlelou/article/details/80804981