豆瓣上映电影爬虫

https://study.163.com/course/courseLearn.htm?courseId=1005913008#/learn/video?lessonId=1053258282&courseId=1005913008

课堂上的代码，做个记录。

 1 import requests
 2 from bs4 import BeautifulSoup
 3 import json
 4 
 5 
 6 def get_page():
 7     url = 'https://movie.douban.com/cinema/nowplaying/changsha/'
 8     headers = {
 9         "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
10     }
11     response = requests.get(url, headers=headers, verify=False)
12     text = response.text
13     return text
14 
15 
def parse_page(text):
    """Extract the now-playing movies from the listing page HTML.

    Every ``<li data-category="nowplaying">`` element becomes one dict with
    the keys ``title``, ``score``, ``release``, ``region``, ``director`` and
    ``actors`` (read from the matching ``data-*`` attributes) plus ``img``
    (the ``src`` of the first ``<img>`` inside the item).

    Args:
        text: HTML source of the listing page.

    Returns:
        A list of dicts, one per movie, in document order. A value is
        ``None`` when the corresponding attribute or image is absent, so a
        single incomplete item no longer aborts the whole parse.
    """
    fields = ('title', 'score', 'release', 'region', 'director', 'actors')
    soup = BeautifulSoup(text, 'lxml')
    movies = []
    for li in soup.find_all('li', attrs={"data-category": "nowplaying"}):
        # Tag.get returns None for a missing attribute instead of raising.
        movie = {field: li.get('data-' + field) for field in fields}
        # find() returns None when the item has no <img>; guard before
        # reading its src.
        img = li.find('img')
        movie['img'] = img.get('src') if img is not None else None
        movies.append(movie)
    return movies
39 
40 
def save_data(data):
    """Write ``data`` as JSON to ``douban.json`` in the current directory.

    Args:
        data: A JSON-serializable object (here, the list of movie dicts).
    """
    # ensure_ascii=False keeps non-ASCII characters (e.g. Chinese titles)
    # as-is rather than escaping them as \uXXXX sequences.
    serialized = json.dumps(data, ensure_ascii=False)
    # The file is opened as UTF-8 so those characters are written correctly;
    # the with-block closes the handle on exit.
    with open('douban.json', 'w', encoding='utf-8') as out_file:
        out_file.write(serialized)
48 
49 
if __name__ == '__main__':
    # Script entry point: fetch the listing page, parse the movies out of
    # it, then persist them as JSON.
    save_data(parse_page(get_page()))

猜你喜欢

转载自www.cnblogs.com/weiwei2016/p/10162280.html
今日推荐