Batch-crawling images from Baidu Image Search

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Create a list.txt in the script's working directory (edit it in Notepad, save as UTF-8).
# Put the keywords you want to crawl images for, one keyword per line, without commas, e.g.:
# 蓝天
# 花朵
import os
import re
import json
import socket
import time
import urllib.request
import urllib.parse
import urllib.error
from collections import defaultdict

import imagehash
from PIL import Image
from PIL import ImageFile

# Give up on slow connections and tolerate truncated downloads.
timeout = 5
socket.setdefaulttimeout(timeout)
ImageFile.LOAD_TRUNCATED_IMAGES = True


class Crawler:
    # Fake a desktop browser; Baidu's image search returns 60 results per page.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}

    def __init__(self, t=0.1):
        # Pause between requests, in seconds.
        self.time_sleep = t
        self.__amount = 0
        self.__start_amount = 0
        self.__counter = 0

    def __save_image(self, rsp_data, word):
        # Download every image listed in one page of the JSON response into ./<word>/.
        if not os.path.exists("./" + word):
            os.mkdir("./" + word)
        # Continue numbering after whatever is already in the folder.
        self.__counter = len(os.listdir('./' + word)) + 1
        for image_info in rsp_data['imgs']:
            try:
                time.sleep(self.time_sleep)
                fix = self.__get_suffix(image_info['objURL'])
                urllib.request.urlretrieve(image_info['objURL'],
                                           './' + word + '/' + str(self.__counter) + str(fix))
            except urllib.error.HTTPError as urllib_err:
                print(urllib_err)
                continue
            except Exception as err:
                time.sleep(1)
                print(err)
                print("unknown error, not saved")
                continue
            else:
                print("Picture +1, already have " + str(self.__counter) + " pictures")
                self.__counter += 1
        return

    @staticmethod
    def __get_suffix(name):
        # Keep the original file extension when it looks sane, otherwise default to .jpeg.
        m = re.search(r'\.[^\.]*$', name)
        if m and len(m.group(0)) <= 5:
            return m.group(0)
        return '.jpeg'

    @staticmethod
    def __get_prefix(name):
        return name[:name.find('.')]

    def __get_images(self, word='sky'):
        search = urllib.parse.quote(word)
        pn = self.__start_amount
        while pn < self.__amount:
            url = ('http://image.baidu.com/search/avatarjson?tn=resultjsonavatarnew&ie=utf-8&word='
                   + search + '&cg=girl&pn=' + str(pn)
                   + '&rn=60&itg=0&z=0&fr=&width=&height=&lm=-1&ic=0&s=0&st=-1&gsm=1e0000001e')
            try:
                time.sleep(self.time_sleep)
                req = urllib.request.Request(url=url, headers=self.headers)
                page = urllib.request.urlopen(req)
                rsp = page.read().decode('unicode_escape')
            except UnicodeDecodeError as e:
                print(e)
                print('-----UnicodeDecodeError url:', url)
            except urllib.error.URLError as e:
                print(e)
                print("-----urlError url:", url)
            except socket.timeout as e:
                print(e)
                print("-----socket timeout:", url)
            else:
                try:
                    rsp_data = json.loads(rsp, strict=False)
                    self.__save_image(rsp_data, word)
                    print("next page")
                except json.decoder.JSONDecodeError as e:
                    print(e)
                    print("json decode problem")
                finally:
                    pn += 60
                    page.close()
        print("finished")
        return

    def start(self, word, spider_page_num=10, start_page=1):
        # Each page holds 60 images; crawl spider_page_num pages starting from start_page.
        self.__start_amount = (start_page - 1) * 60
        self.__amount = spider_page_num * 60 + self.__start_amount
        self.__get_images(word)


def removeDup(path):
    # Remove duplicate downloads: group files by perceptual (average) hash
    # and keep only the first file of each group.
    if not os.path.isdir(path):
        print(path + " is not a directory!")
        return
    hash_paths = []
    for file in os.listdir(path):
        file_path = os.path.join(path, file)
        ext = file_path.split('.')[-1]
        if ext in ('jpg', 'jpeg', 'JPG'):
            try:
                hash_value = imagehash.average_hash(Image.open(file_path))
                hash_paths.append((str(hash_value), file_path))
            except IOError:
                continue
    dd = defaultdict(list)
    for k, v in hash_paths:
        dd[k].append(v)
    num_removed = 0
    for list_paths in dd.values():
        for index, image_path in enumerate(list_paths):
            if index > 0:
                os.remove(image_path)
                print("Remove: " + image_path)
                num_removed += 1
    print("Removed {} images.".format(num_removed))


if __name__ == '__main__':
    with open("list.txt", 'r', encoding='utf-8') as keyword_list:
        lines = keyword_list.readlines()
    for keyword in lines:
        keyword = keyword.strip()
        if not keyword:
            continue
        print(keyword)
        crawler = Crawler(0.05)
        crawler.start(keyword)
        removeDup(keyword)

Reposted from www.cnblogs.com/ajie-linda/p/11590560.html