python网络数据抓取二(bing图片抓取)

  上一回尝试抓取了百度热点数据,这次继续选择利用 bing 搜索抓取图片练习,代码放在最下供大家参考。程序需要传入三个参数:图片关键词、图片保存路径、需要抓取的数量。运行过程中可能会出现一些错误(大部分是网络错误,比如超时等),我这里捕获到后只打印出来然后跳过。代码中翻页的 url 请求是抓包获取到的(没有写全,有几个参数不知道什么意思就去掉了),然后分析返回的 html 提取想要的数据。下面是我运行的一次结果,供参考:

 结果:

源码:

 1 # -*- coding: UTF-8 -*-
 2 
 3 from bs4 import BeautifulSoup
 4 import urllib.request
 5 import requests
 6 import time
 7 import json
 8 import sys
 9 import re
10 import os
11 
12 
# Bing image-search "async" endpoint used for page-by-page crawling.
# Format placeholders: keyword, offset of the first result, page size,
# page size again (relp). Other captured parameters of unknown meaning
# were dropped by the original author.
CRAWL_TARGET_URL = 'https://cn.bing.com/images/async?q=%s&first=%d&count=%d&relp=%d&lostate=r&mmasync=1'

# Number of images fetched per request (this endpoint pages 35 at a time).
NUMS_PER_CRAWL = 35

# Minimum acceptable image size in bytes; smaller downloads are discarded.
MIN_IMAGE_SIZE = 50000
22 
def get_image(url, path, count, min_size=None):
    """Download one image from *url* and save it under *path* as "<count>.<ext>".

    Args:
        url: direct URL of the image.
        path: existing directory to save into.
        count: sequence number used as the file name.
        min_size: minimum acceptable payload size in bytes; defaults to the
            module-level MIN_IMAGE_SIZE when None.

    Returns:
        0 on success, -1 if the image is smaller than *min_size*,
        -2 on a download error, -3 on a write error.
    """
    if min_size is None:
        min_size = MIN_IMAGE_SIZE
    try:
        # timeout guards against hung connections; failures are reported
        # and signalled via the return code rather than raised
        u = urllib.request.urlopen(url, timeout=5)
        try:
            data = u.read()
        finally:
            u.close()
        # len(data) is the actual byte count of the payload; the original
        # sys.getsizeof(t) measured Python object overhead instead.
        if len(data) < min_size:
            return -1
    except Exception as e:
        print(url, e)
        return -2
    # Derive the file extension from the URL; fall back to ".jpg" when the
    # URL has no recognizable extension (the original crashed on m=None).
    tail = url[url.rfind('.'):]
    m = re.match(r"^\.[a-zA-Z]+", tail)
    ext = m.group(0) if m else ".jpg"
    target = os.path.join(path, str(count) + ext)
    try:
        with open(target, 'wb') as f:
            f.write(data)
    except Exception as e:
        print(target, e)
        return -3
    return 0
45 
46 
def crawl_data(info, path, num):
    """Crawl up to *num* images matching keyword *info* from Bing image
    search and save them into directory *path*.

    Pages through the async endpoint NUMS_PER_CRAWL results at a time;
    stops early when a page comes back with no results (the original
    looped forever in that case).
    """
    first = 0
    count = 0
    # one session reuses the TCP connection across page requests,
    # and the with-block guarantees it is closed on exit
    with requests.Session() as session:
        while count < num:
            page_url = CRAWL_TARGET_URL % (info, first, NUMS_PER_CRAWL, NUMS_PER_CRAWL)
            # 3.05s connect timeout, 10s read timeout
            resp = session.get(url=page_url, timeout=(3.05, 10))
            soup = BeautifulSoup(resp.text, "html.parser")
            img_tags = soup.find_all("a", class_="iusc")
            # empty page -> no more matches; break instead of looping forever
            if not img_tags:
                break
            for tag in img_tags:
                if count == num:
                    return
                # each result anchor carries a JSON blob in its "m" attribute;
                # "murl" is the direct image URL
                meta = json.loads(tag.get('m'))
                if get_image(meta["murl"], path, count) < 0:
                    continue
                print("第%d张图片下载完成,总进度%d%%" % (count + 1, (count + 1) * 100 / num))
                sys.stdout.flush()
                count = count + 1
                time.sleep(0.01)
            first = first + NUMS_PER_CRAWL
            time.sleep(0.1)
70 
71 
'''
    argv[1]: search keyword
    argv[2]: directory to save images into
    argv[3]: number of images to crawl
'''
if __name__ == '__main__':
    # validate the command line up front instead of crashing with
    # IndexError/ValueError on missing or malformed arguments
    if len(sys.argv) != 4 or not sys.argv[3].isdigit():
        print("用法: %s <关键词> <保存路径> <数量>" % sys.argv[0])
        sys.exit(1)
    tstart = time.time()
    crawl_data(sys.argv[1], sys.argv[2], int(sys.argv[3]))
    print("所有图片下载完毕,总用时%.2fs" % (time.time() - tstart))

猜你喜欢

转载自www.cnblogs.com/prophet-ss/p/9270665.html