Homework 4:
Scrape the Douban Top 250
import requests
import re

url = 'https://movie.douban.com/top250'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
}

index = 0
while index < 10:
    # Each page shows 25 movies; the "start" query parameter pages through them
    number = 25 * index
    response = requests.get(url, headers=headers, params={"start": number})
    # Extract: detail-page url, poster url, title, movie details, rating, vote count, tagline
    movie_content_list = re.findall(
        '<div class="item">.*?href="(.*?)">.*?src="(.*?)".*?<span class="title">(.*?)</span>.*?<p class="">(.*?)</p>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价.*?<span class="inq">(.*?)</span>',
        response.text, re.S)
    for movie_content in movie_content_list:
        # Unpack one movie per tuple
        detail_url, movie_jpg, name, detail_info, point, num, slogan = movie_content
        data = f'Title: {name}, detail page url: {detail_url}, poster url: {movie_jpg}, details: {detail_info}, rating: {point}, votes: {num}, tagline: {slogan}\n'
        print(data)
        with open('douban.txt', 'a', encoding='utf-8') as f:
            f.write(data)
    index += 1
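To see what the params argument actually appends to the url, the throwaway loop below (an illustrative sketch, not part of the homework) prints the ten page urls without fetching anything:

from urllib.parse import urlencode

base = 'https://movie.douban.com/top250'
for index in range(10):
    # start=0, 25, 50, ... selects each page of 25 movies
    print(base + '?' + urlencode({"start": 25 * index}))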
Class notes:
# Review of yesterday
'''
Crawler basics:
    What is a crawler?
        A crawler is a program that scrapes data.
    What is the internet?
        A pile of network devices connecting individual computers together.
    Why was the internet built?
        To transmit and share data.
    What happens when you browse the web:
        - An ordinary user:
            open a browser --> send a request to the target site --> receive the response data --> render it on the page
        - A crawler program:
            simulate a browser --> send a request to the target site --> receive the response data --> extract the useful data

    What does the browser actually send? An HTTP request:
        - request url
        - request method: GET, POST
        - request headers: cookies, user-agent, host

    The full crawling workflow:
        1. Send a request (request libraries)
            - requests module
            - selenium module
        2. Get the response data (returned by the server)
        3. Parse and extract the data (parsing libraries)
            - bs4 (BeautifulSoup4)
            - XPath
        4. Save the data (storage libraries)
            - MongoDB
        (steps 1, 3 and 4 are the ones you write by hand)
        - Crawler framework: Scrapy (object-oriented)

    Scraping pearvideo:
        1. Find the site's video source address
        2. Send a request to the video source address with requests
        3. Get the video's binary stream and save it locally
'''

'''
Today's topic: the requests module
'''

# Scraping pearvideo
'''
Use the Chrome browser.
'''
'''
Request url: https://www.pearvideo.com/
Request method: GET
Request headers:
    user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36
'''
# import requests
# import re  # regex module
#
# # Send a request to the pearvideo homepage and get the response
# response = requests.get(url='https://www.pearvideo.com/')
# print(response.status_code)
# print(response.text)
#
# # re.findall('pattern', 'text to parse', flags)
# # re.S: let . match newlines too, so the pattern can span the whole text
# # . matches any single character
# # * repeats the previous pattern zero or more times
# '''
# <a href="video_(.*?)"  # capture the id after the prefix
# '''
# res = re.findall('<a href="video_(.*?)"', response.text, re.S)
# print(res)
#
# for m_id in res:
#     # Build the detail-page url
#     detail_url = 'https://www.pearvideo.com/video_' + m_id
#     print(detail_url)

import requests
import re  # regex module
import uuid

# The three steps of a crawler

# 1. Send a request
def get_page(url):
    response = requests.get(url)
    return response

# 2. Parse the data
# Parse the homepage and collect the video detail-page IDs
def parse_index(text):
    res = re.findall('<a href="video_(.*?)"', text, re.S)
    print(res)

    detail_url_list = []
    for m_id in res:
        # Build the detail-page url
        detail_url = 'https://www.pearvideo.com/video_' + m_id
        # print(detail_url)
        detail_url_list.append(detail_url)
    return detail_url_list

# re.findall('pattern', 'text to parse', flags)
# re.S: let . match newlines too, so the pattern can span the whole text
# . matches any single character
# * repeats the previous pattern zero or more times
'''
<a href="video_(.*?)"  # capture the id after the prefix
'''

# Parse a detail page and extract the video url
def parse_detail(text):
    '''
    (.*?): match and capture the content inside the parentheses
    .*?: match without capturing
    <video webkit-playsinline="" playsinline="" x-webkit-airplay="" autoplay="autoplay"
        src="https://video.pearvideo.com/mp4/adshort/20190613/cont-1566066-14015478_adpkg-ad_hd.mp4"
        style="width: 100%; height: 100%;"></video>
    pattern: <video.*?src="(.*?)"
    pattern: srcUrl="(.*?)"
    '''
    movie_url = re.findall('srcUrl="(.*?)"', text, re.S)[0]
    return movie_url

# 3. Save the data
def save_movie(movie_url):
    response = requests.get(movie_url)
    # uuid.uuid4() gives a practically unique random filename
    with open(f'{uuid.uuid4()}.mp4', 'wb') as f:
        f.write(response.content)
        f.flush()

if __name__ == '__main__':
    # 1. Request the homepage
    index_res = get_page(url='https://www.pearvideo.com/')

    # 2. Parse the homepage and collect the detail-page urls
    detail_url_list = parse_index(index_res.text)
    # print(detail_url_list)

    # 3. Request each detail page
    for detail_url in detail_url_list:
        detail_res = get_page(url=detail_url)
        # print(detail_res.text)

        # 4. Parse the detail page and extract the video url
        movie_url = parse_detail(detail_res.text)
        print(movie_url)

        # 5. Save the video
        save_movie(movie_url)
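The review above lists bs4 (BeautifulSoup4) as an alternative parsing library. Below is a minimal sketch of parse_index rewritten with it, assuming beautifulsoup4 is installed and that every wanted link's href starts with "video_" (the name parse_index_bs4 is illustrative):

from bs4 import BeautifulSoup

def parse_index_bs4(text):
    soup = BeautifulSoup(text, 'html.parser')
    detail_url_list = []
    # Select every <a> whose href starts with "video_" (assumed page structure)
    for a in soup.select('a[href^="video_"]'):
        detail_url_list.append('https://www.pearvideo.com/' + a['href'])
    return detail_url_list

Called as parse_index_bs4(index_res.text), it should return the same list as the regex version, without hand-writing a pattern.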
import requests
import re  # regex module
# uuid.uuid4() generates a practically unique random string
import uuid
# Import the thread pool
from concurrent.futures import ThreadPoolExecutor

# Limit the pool to 50 threads
pool = ThreadPoolExecutor(50)

# The three steps of a crawler

# 1. Send a request
def get_page(url):
    print(f'Starting async task: {url}')
    response = requests.get(url)
    return response

# 2. Parse the data
# Parse the homepage and collect the video detail-page IDs
def parse_index(res):
    response = res.result()
    # Extract all the IDs on the homepage
    id_list = re.findall('<a href="video_(.*?)"', response.text, re.S)
    # print(res)
    for m_id in id_list:
        # Build the detail-page url
        detail_url = 'https://www.pearvideo.com/video_' + m_id
        # print(detail_url)
        # Submit the detail-page url to get_page
        pool.submit(get_page, detail_url).add_done_callback(parse_detail)

# re.findall('pattern', 'text to parse', flags)
# re.S: let . match newlines too, so the pattern can span the whole text
# . matches any single character
# * repeats the previous pattern zero or more times
'''
<a href="video_(.*?)"  # capture the id after the prefix
'''

# Parse a detail page and extract the video url
def parse_detail(res):
    response = res.result()
    movie_url = re.findall('srcUrl="(.*?)"', response.text, re.S)[0]
    # Asynchronously submit the video url to get_page and hand the result to save_movie
    pool.submit(get_page, movie_url).add_done_callback(save_movie)

# 3. Save the data
def save_movie(res):
    movie_res = res.result()
    # Write the video to a local file
    with open(f'{uuid.uuid4()}.mp4', 'wb') as f:
        f.write(movie_res.content)
        print(f'Video download finished: {movie_res.url}')
        f.flush()

if __name__ == '__main__':
    # 1. Send an async request via get_page and hand the result to parse_index
    url = 'https://www.pearvideo.com/'
    pool.submit(get_page, url).add_done_callback(parse_index)
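The submit/callback chain above is easier to see on a toy case. A minimal sketch with made-up names square and on_done: the callback receives the finished Future, and .result() yields the submitted function's return value:

from concurrent.futures import ThreadPoolExecutor

demo_pool = ThreadPoolExecutor(2)

def square(n):
    return n * n

def on_done(fut):
    # The callback gets the Future itself; .result() is square's return value
    print(fut.result())

demo_pool.submit(square, 5).add_done_callback(on_done)  # prints 25
demo_pool.shutdown(wait=True)  # wait for all submitted work to finish

This is exactly why get_page can simply return response: the next stage in the chain pulls it back out with res.result().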
# requests in detail
'''
Visiting Zhihu Explore
    Request url: https://www.zhihu.com/explore
    Request method: GET
    Request headers:
        user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36
'''
# # Visit Zhihu without headers
# import requests
# response = requests.get(url='https://www.zhihu.com/explore')
# print(response.status_code)  # 400
# print(response.text)  # returns an error page
#
# # Visit Zhihu carrying a request-headers dict
# headers = {
#     'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
# }
# # Add the user-agent to the GET request
# response = requests.get(url='https://www.zhihu.com/explore', headers=headers)
# print(response.status_code)
# with open('zhihu.html', 'w', encoding='utf-8') as f:
#     f.write(response.text)

'''
The params argument
    Searching Baidu for 蔡徐坤
'''
# import requests
# from urllib.parse import urlencode
# # url = 'https://www.baidu.com/s?wd=%E8%94%A1%E5%BE%90%E5%9D%A4'
# # url = 'https://www.baidu.com/s?' + urlencode({"wd": "蔡徐坤"})
# url = 'https://www.baidu.com/s?'
# headers = {
#     'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
# }
# # print(url)
# # Pass the query string through the params argument of the get method
# # response = requests.get(url, headers=headers, params={"wd": '安徽工程大学'})
# response = requests.get(url, headers=headers, params={"wd": "安徽工程大学", "pn": '20'})
# # print(response.text)
# with open('ahpu.html', 'w', encoding='utf-8') as f:
#     f.write(response.text)

'''
Carrying cookies
    Use a logged-in account's cookies to get past GitHub's login check

    Request url: https://github.com/settings/emails
    Request method: GET
    Request headers:
        user-agent
        Cookie: has_recent_activity=1; _ga=GA1.2.637317055.1560497549; tz=Asia%2FShanghai; _octo=GH1.1.1390785586.1560497554; _device_id=ca2487362e89d9b8b40e3c55105f19d8; user_session=aRhxdrD1JIMG-svN8QAwtIHs5_NSRZI0Ajtp10FnKOi-IUmv; __Host-user_session_same_site=aRhxdrD1JIMG-svN8QAwtIHs5_NSRZI0Ajtp10FnKOi-IUmv; logged_in=yes; dotcom_user=lyj68; _gh_sess=OW9aaXVQbG1wY3JMWFhWdlg0MlVMSm5nZjlCQ250QnVNWHhUYU9ybHloem45OG1leFArQXc0SUttcnE2M3lvT3gvckVZVThtVXRQTnZjbGFNenF0YUxjZHNwdUlIeHhuQ2V2Q2xEOTFqaDFpNnhDNEJwNzFOQXNvcUdSN1FrYVNGUXpNMUZFWVd2aE56OSthd2dkMmQwVEp3M0E1Q21YQ2hjelRGMndkQ1NpcWJidExMb0tEL0NuQU9ZNmtYNkRNSjBpaC80b0FVc05yVmlEbkV3VUhKL3IrN2dHak9RR1ZVVFNTZU1YU1RWUlVDMVdkYklWUjRYU3A0NExOb2NEMnI0YlJheHg5enBGdzdMQmtYZThnVkFHM2ltcjFPS3ovVEdLZnVyUzNkeUk9LS1oSFdnK1R3R0JkRmJ6VGZoRW5TNVBRPT0%3D--067d19150a40df9a4afc9a0cd26e87624f41d623
'''
# # Variant 1: send the cookie as a plain request header
# import requests
# # Request url
# url = 'https://github.com/settings/emails'
# # Request headers
# headers = {
#     'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36',
#     'Cookie': 'has_recent_activity=1; _ga=GA1.2.637317055.1560497549; tz=Asia%2FShanghai; _octo=GH1.1.1390785586.1560497554; _device_id=ca2487362e89d9b8b40e3c55105f19d8; user_session=aRhxdrD1JIMG-svN8QAwtIHs5_NSRZI0Ajtp10FnKOi-IUmv; __Host-user_session_same_site=aRhxdrD1JIMG-svN8QAwtIHs5_NSRZI0Ajtp10FnKOi-IUmv; logged_in=yes; dotcom_user=lyj68; _gh_sess=OW9aaXVQbG1wY3JMWFhWdlg0MlVMSm5nZjlCQ250QnVNWHhUYU9ybHloem45OG1leFArQXc0SUttcnE2M3lvT3gvckVZVThtVXRQTnZjbGFNenF0YUxjZHNwdUlIeHhuQ2V2Q2xEOTFqaDFpNnhDNEJwNzFOQXNvcUdSN1FrYVNGUXpNMUZFWVd2aE56OSthd2dkMmQwVEp3M0E1Q21YQ2hjelRGMndkQ1NpcWJidExMb0tEL0NuQU9ZNmtYNkRNSjBpaC80b0FVc05yVmlEbkV3VUhKL3IrN2dHak9RR1ZVVFNTZU1YU1RWUlVDMVdkYklWUjRYU3A0NExOb2NEMnI0YlJheHg5enBGdzdMQmtYZThnVkFHM2ltcjFPS3ovVEdLZnVyUzNkeUk9LS1oSFdnK1R3R0JkRmJ6VGZoRW5TNVBRPT0%3D--067d19150a40df9a4afc9a0cd26e87624f41d623'
# }
# github_res = requests.get(url, headers=headers)

# Variant 2: pass the cookie through the cookies argument instead of the headers.
# Note: cookies= normally expects a dict of individual cookie names to values;
# here the whole raw header string is passed as a single value.
import requests

url = 'https://github.com/settings/emails'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
}
cookie = {
    'Cookie': 'has_recent_activity=1; _ga=GA1.2.637317055.1560497549; tz=Asia%2FShanghai; _octo=GH1.1.1390785586.1560497554; _device_id=ca2487362e89d9b8b40e3c55105f19d8; user_session=aRhxdrD1JIMG-svN8QAwtIHs5_NSRZI0Ajtp10FnKOi-IUmv; __Host-user_session_same_site=aRhxdrD1JIMG-svN8QAwtIHs5_NSRZI0Ajtp10FnKOi-IUmv; logged_in=yes; dotcom_user=lyj68; _gh_sess=OW9aaXVQbG1wY3JMWFhWdlg0MlVMSm5nZjlCQ250QnVNWHhUYU9ybHloem45OG1leFArQXc0SUttcnE2M3lvT3gvckVZVThtVXRQTnZjbGFNenF0YUxjZHNwdUlIeHhuQ2V2Q2xEOTFqaDFpNnhDNEJwNzFOQXNvcUdSN1FrYVNGUXpNMUZFWVd2aE56OSthd2dkMmQwVEp3M0E1Q21YQ2hjelRGMndkQ1NpcWJidExMb0tEL0NuQU9ZNmtYNkRNSjBpaC80b0FVc05yVmlEbkV3VUhKL3IrN2dHak9RR1ZVVFNTZU1YU1RWUlVDMVdkYklWUjRYU3A0NExOb2NEMnI0YlJheHg5enBGdzdMQmtYZThnVkFHM2ltcjFPS3ovVEdLZnVyUzNkeUk9LS1oSFdnK1R3R0JkRmJ6VGZoRW5TNVBRPT0%3D--067d19150a40df9a4afc9a0cd26e87624f41d623'
}
github_res = requests.get(url, headers=headers, cookies=cookie)
# If the session cookie is valid we are logged in, so this account-bound string
# shows up in the settings page
print("540335019" in github_res.text)
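Since cookies= is meant to take a dict of individual cookie names to values, a cleaner variant is to split the copied header string first. A minimal sketch (cookie_str is shortened here for illustration):

import requests

# Raw value copied from the browser's Cookie header (shortened for the example)
cookie_str = 'has_recent_activity=1; logged_in=yes; dotcom_user=lyj68'

# Turn "name=value; name=value" into {name: value}
cookies = dict(pair.split('=', 1) for pair in cookie_str.split('; '))

response = requests.get('https://github.com/settings/emails',
                        headers={'user-agent': 'Mozilla/5.0'},
                        cookies=cookies)
print(response.status_code)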
'''
Homepage: https://movie.douban.com/top250
    GET
    User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36

re pattern:
    # detail-page url, poster url, title, rating, vote count
    <div class="item">.*?href="(.*?)">.*?src="(.*?)".*?<span class="title">(.*?)</span>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价
'''
import requests
import re

url = 'https://movie.douban.com/top250'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
}

# 1. Send a request to the Douban TOP250 page and get the response
response = requests.get(url, headers=headers)
# print(response.text)

# 2. Extract the data with a regex:
#    detail-page url, poster url, title, rating, vote count
movie_content_list = re.findall(
    # the pattern
    '<div class="item">.*?href="(.*?)">.*?src="(.*?)".*?<span class="title">(.*?)</span>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价',
    # the text to parse
    response.text,
    # the matching mode
    re.S)

for movie_content in movie_content_list:
    # Unpack one movie per tuple
    detail_url, movie_jpg, name, point, num = movie_content
    data = f'Title: {name}, detail page url: {detail_url}, poster url: {movie_jpg}, rating: {point}, votes: {num}\n'
    print(data)

    # 3. Save the data: append each movie's info to a file
    with open('douban.txt', 'a', encoding='utf-8') as f:
        f.write(data)
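Why re.S matters here can be checked on a tiny made-up sample: without it, . refuses to cross the newline between the two spans and findall comes back empty.

import re

sample = '<span class="title">肖申克的救赎</span>\n<span class="rating_num">9.7</span>'
pattern = '<span class="title">(.*?)</span>.*?<span class="rating_num">(.*?)</span>'

print(re.findall(pattern, sample))        # [] -- . cannot cross the \n
print(re.findall(pattern, sample, re.S))  # [('肖申克的救赎', '9.7')]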
Summary: learned a lot today, which is great.