使用 async 异步抓取花瓣网高清大图:30 秒爬取 500 张

废话不多说,直接上代码,不懂的地方请看注释。

先安装依赖(代码同时用到了 requests 和 aiohttp):pip install requests aiohttp

 1 "异步抓取花瓣网图片"
 2 
 3 # pip install aiohttp
 4 import requests
 5 import aiohttp
 6 import asyncio
 7 import time
 8 import os
 9 
# HTTP headers sent with the board-listing request and with every image
# download. NOTE(review): the Accept / X-Request / X-Requested-With entries
# presumably make the site answer with JSON (the listing response is parsed
# with .json()) -- confirm against the site before changing them.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/75.0.3770.90 Safari/537.36"
    ),
    "X-Request": "JSON",
    "Accept": "application/json",
    "X-Requested-With": "XMLHttpRequest",
}
16 
17 
def get_image_urls():
    """Collect image URLs from the Huaban board into the module-level ``urls``.

    Requests up to 25 pages of 20 pins each from board 41743806. Each request
    carries ``max=<pin_id>``; the ``pin_id`` of the last pin on a page becomes
    the ``max`` value of the next request. For every pin,
    ``'http://hbimg.huabanimg.com/' + pin['file']['key']`` is appended to
    ``urls``. Paging stops early when a page contains no pins.

    Returns:
        None. Results are appended to the global ``urls`` list, which must
        already exist when this function is called.

    Raises:
        requests.RequestException: on a network failure or timeout.
        ValueError: if a response body cannot be decoded as JSON.
        KeyError: if the JSON lacks the ``board`` / ``pins`` / ``file`` /
            ``key`` / ``pin_id`` entries read below.
    """
    print('开始获取图片链接,请耐心等待......')
    image_id = 2551285279
    for _ in range(1, 26):
        url = "https://huaban.com/boards/41743806/?jzwfs8ej&max=" + str(image_id) + "&limit=20&wfl=1"
        # A timeout keeps one stalled connection from hanging the whole script.
        response = requests.get(url, headers=headers, timeout=30).json()
        pins = response['board']['pins']
        if not pins:
            # An empty page has no last pin to continue from; indexing
            # pins[-1] here would raise IndexError.
            break
        urls.extend('http://hbimg.huabanimg.com/' + pin['file']['key'] for pin in pins)
        # The last pin's id is the paging cursor for the next request.
        image_id = pins[-1]['pin_id']
31 
32 
async def get_audio_data(url):
    """Download ``url`` with aiohttp and return the raw response body.

    This is a coroutine function: calling it does not execute the body, it
    only returns a coroutine object that must be awaited or scheduled on an
    event loop. Despite the name, the script uses it to download images.

    Args:
        url: Address to fetch; sent with the module-level ``headers``.

    Returns:
        dict: ``{'data': <response body as bytes>, 'url': url}``.
    """
    # A dedicated ClientSession is opened and closed for every call.
    async with aiohttp.ClientSession() as session:
        # session.get() is already an async context manager, so it is entered
        # directly instead of being awaited first.
        async with session.get(url=url, headers=headers) as response:
            # read() yields the body as bytes.
            body = await response.read()
            return {'data': body, 'url': url}
42 
43 
44 
def saveData(task):
    """Done-callback that stores a finished download as ``images/<name>.jpg``.

    ``<name>`` is the last ``/``-separated segment of the downloaded URL.
    The ``images`` directory is created in the current working directory
    when it does not exist yet.

    Args:
        task: A finished task/future. On success its result must be a dict
            with a ``'url'`` string and a ``'data'`` bytes payload.

    Returns:
        None. A cancelled or failed task is reported on stdout and nothing
        is written for it.
    """
    # task.result() would re-raise inside the callback for these two cases,
    # so they are reported explicitly instead.
    if task.cancelled():
        print('下载已取消')
        return
    error = task.exception()
    if error is not None:
        print('下载失败: {!r}'.format(error))
        return

    dic_obj = task.result()
    name = dic_obj['url'].split('/')[-1]
    data = dic_obj['data']
    image_dir = 'images'
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(image_dir, exist_ok=True)
    with open(os.path.join(image_dir, name) + '.jpg', 'wb') as fp:
        fp.write(data)
    print(name+'下载完毕!')
56 
57 
if __name__ == '__main__':
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # replacement for measuring elapsed time.
    start_time = time.perf_counter()
    urls = []   # filled in place by get_image_urls()
    tasks = []
    get_image_urls()

    # Create the event loop explicitly: scheduling tasks through an implicit
    # "current" loop before any loop is running is deprecated.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        for url in urls:
            # Calling the coroutine function only builds a coroutine object;
            # wrapping it in a task schedules it on the loop.
            task = loop.create_task(get_audio_data(url))
            # saveData persists the result as soon as this download finishes.
            task.add_done_callback(saveData)
            tasks.append(task)
        # asyncio.wait() raises ValueError on an empty collection.
        if tasks:
            loop.run_until_complete(asyncio.wait(tasks))
    finally:
        loop.close()

    end_time = time.perf_counter()
    print('抓取{}张图片,共计用时{}秒'.format(len(tasks),end_time-start_time))

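注:在 Windows 上,基于 select 的事件循环(SelectorEventLoop)最多只能同时监听 512 个套接字(这是套接字数量的限制,而不是线程数),所以同时运行的任务数不要超过这个值,否则会抛出异常(ValueError: too many file descriptors in select())。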

猜你喜欢

转载自www.cnblogs.com/lvye001/p/11431332.html