Python百度图片批量下载器
环境
Python 3.7.4
urllib(标准库,无需额外安装;注意不是第三方包urllib3)
re
os
抓取页面
从https://image.baidu.com/
抓取想要的页面
def crawlPages(self):
    """Fetch the HTML of the first `self.number` Baidu image result pages.

    Returns:
        list[str]: decoded HTML text of each result page.
    """
    pages = []
    for i in range(self.number):
        # Page i starts at record i * offset.  The parentheses matter:
        # '&pn=%d' % i * self.offset repeated the formatted STRING
        # `offset` times, because % binds tighter than * — every request
        # ended up with 20 copies of '&pn=i' instead of the real offset.
        url = self.url + '&pn=%d' % (i * self.offset)
        request = urllib.request.Request(headers=self.headers, url=url)
        response = urllib.request.urlopen(request)
        pages.append(response.read().decode('utf-8'))
    return pages
提取图片URL
利用re
库提取图片的URL
,观察获取的页面,选择正则表达式为self.pattern = re.compile(r'"hoverURL":"(.*?)", "pageNum"', re.S)
def extractLinks(self, pages):
    """Collect every non-empty picture URL matched by self.pattern.

    Args:
        pages: iterable of HTML page strings

    Returns:
        list[str]: matched, non-empty picture URLs.
    """
    return [
        match
        for page in pages
        for match in re.findall(self.pattern, str(page))
        if match  # drop empty hoverURL entries
    ]
下载图片
利用urllib.request.urlretrieve()
下载图片并保存到指定路径,需要from urllib import request
,需要from urllib import request
,由于代码中用了request
作为变量名,会与urllib的request子模块混淆,所以导入包语句改为from urllib import request as req
def download(self, link, filename):
    """Download one picture and save it under `filename`.

    Args:
        link:     URL of the picture
        filename: target path (directory + file name)
    """
    try:
        print("downloading...")
        req.urlretrieve(str(link), filename=filename)
        print("downloaded successfully")
    # In Python 3 the exception lives in urllib.error;
    # `urllib.ContentTooShortError` would raise AttributeError instead
    # of catching the truncated-download error.
    except urllib.error.ContentTooShortError:
        # NOTE(review): retries forever on a persistently truncated
        # download — consider capping the recursion depth.
        print("retrying...")
        self.download(link, filename)
实现代码
# -*- coding:utf-8 -*-
import os
import re
import urllib
import urllib.error
import urllib.parse
from urllib import request as req
class Downloader():
    """Download pictures from https://image.baidu.com/ in batches.

    Attributes:
        url:     search URL built from the percent-encoded keyword
        pattern: compiled regex extracting "hoverURL" picture links
        headers: request headers carrying a browser User-Agent
        path:    directory where downloaded pictures are saved
        number:  number of result pages to crawl
        offset:  records per result page (Baidu's `pn` step)
    """

    def __init__(self, keyword, path, number):
        """Prepare search URL, extraction regex and save directory.

        Args:
            keyword: keyword of the pictures to download
            path:    directory to save the downloaded pictures
            number:  number of result pages to crawl
        """
        # Percent-encode the keyword so non-ASCII searches form a valid URL.
        self.url = ('http://image.baidu.com/search/flip?tn=baiduimage'
                    '&ie=utf-8&word=%s' % urllib.parse.quote(keyword))
        # "hoverURL" entries in the page's embedded JSON hold the image links.
        self.pattern = re.compile(r'"hoverURL":"(.*?)", "pageNum"', re.S)
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36"}
        self.path = path
        self.createDirectory()
        self.number = number
        self.offset = 20  # Baidu flip pages step the `pn` parameter by 20

    def createDirectory(self):
        """Create the save directory (including parents) if it is missing."""
        # makedirs creates intermediate directories too; the original
        # os.mkdir would fail on nested paths such as './a/b'.
        os.makedirs(self.path, exist_ok=True)

    def crawlPages(self):
        """Fetch the HTML of the first `self.number` result pages.

        Returns:
            list[str]: decoded HTML text of each result page.
        """
        pages = []
        for i in range(self.number):
            # Page i starts at record i * offset.  The parentheses matter:
            # '&pn=%d' % i * self.offset repeated the formatted STRING
            # `offset` times because % binds tighter than *.
            url = self.url + '&pn=%d' % (i * self.offset)
            request = urllib.request.Request(headers=self.headers, url=url)
            response = urllib.request.urlopen(request)
            pages.append(response.read().decode('utf-8'))
        return pages

    def extractLinks(self, pages):
        """Extract every non-empty picture URL from the crawled pages.

        Args:
            pages: list of HTML page strings

        Returns:
            list[str]: picture URLs matched by self.pattern.
        """
        linkList = []
        for page in pages:
            for link in re.findall(self.pattern, str(page)):
                if link:  # skip empty hoverURL entries
                    linkList.append(link)
        return linkList

    def download(self, link, filename):
        """Download one picture and save it under `filename`.

        Args:
            link:     URL of the picture
            filename: target path (directory + file name)
        """
        try:
            print("downloading...")
            req.urlretrieve(str(link), filename=filename)
            print("downloaded successfully")
        # In Python 3 the exception lives in urllib.error;
        # `urllib.ContentTooShortError` would raise AttributeError
        # instead of catching the truncated-download error.
        except urllib.error.ContentTooShortError:
            # NOTE(review): retries forever on a persistently truncated
            # download — consider capping the recursion depth.
            print("retrying...")
            self.download(link, filename)

    def downloadAll(self, linkList):
        """Download every picture in linkList, named 0.jpg, 1.jpg, ...

        Args:
            linkList: list of picture URLs
        """
        # enumerate replaces linkList.index(link), which was O(n) per
        # item (O(n^2) overall) and wrong for duplicate links — index()
        # always returns the FIRST occurrence, overwriting earlier files.
        for index, link in enumerate(linkList):
            self.download(link, self.path + '/%d.jpg' % index)

    def startDownload(self):
        """Crawl the result pages, extract the links, download everything."""
        pages = self.crawlPages()
        linkList = self.extractLinks(pages)
        self.downloadAll(linkList)
def _main():
    """Interactively collect the parameters and run the downloader."""
    keyword = input("Please input the keyword of picture you wanna download:")
    path = './' + input("Please input the path of pictures you wanna save:")
    number = int(input("Please input the number of page you wanna get:"))
    Downloader(keyword, path, number).startDownload()


if __name__ == "__main__":
    _main()
最后
- 由于博主水平有限,不免有疏漏之处,欢迎读者随时批评指正,以免造成不必要的误解!