Python百度图片批量下载器
环境
Python 3.7.4
urllib(标准库,无需额外安装;注意不是第三方包urllib3)
re
os
抓取页面
从https://image.baidu.com/
抓取想要的页面
def crawlPages(self):
    """Fetch the HTML of the first `self.number` Baidu image result pages.

    Returns:
        list[str]: decoded HTML text of each result page.
    """
    pages = []
    for i in range(self.number):
        # Page i starts at record i * offset.  The parentheses matter:
        # '&pn=%d' % i * self.offset repeated the formatted STRING
        # `offset` times, because % binds tighter than * — every request
        # ended up with 20 copies of '&pn=i' instead of the real offset.
        url = self.url + '&pn=%d' % (i * self.offset)
        request = urllib.request.Request(headers=self.headers, url=url)
        response = urllib.request.urlopen(request)
        pages.append(response.read().decode('utf-8'))
    return pages
提取图片URL
利用re
库提取图片的URL
,观察获取的页面,选择正则表达式为self.pattern = re.compile(r'"hoverURL":"(.*?)", "pageNum"', re.S)
def extractLinks(self, pages):
    """Collect every non-empty picture URL matched by self.pattern.

    Args:
        pages: iterable of HTML page strings

    Returns:
        list[str]: matched, non-empty picture URLs.
    """
    return [
        match
        for page in pages
        for match in re.findall(self.pattern, str(page))
        if match  # drop empty hoverURL entries
    ]
下载图片
利用urllib.request.urlretrieve()
下载图片并保存到指定路径,需要from urllib import request
,需要from urllib import request
,由于代码中用了request
作为变量名,会与urllib的request子模块混淆,所以导入包语句改为from urllib import request as req
def download(self, link, filename):
    """Download one picture and save it under `filename`.

    Args:
        link:     URL of the picture
        filename: target path (directory + file name)
    """
    try:
        print("downloading...")
        req.urlretrieve(str(link), filename=filename)
        print("downloaded successfully")
    # In Python 3 the exception lives in urllib.error;
    # `urllib.ContentTooShortError` would raise AttributeError instead
    # of catching the truncated-download error.
    except urllib.error.ContentTooShortError:
        # NOTE(review): retries forever on a persistently truncated
        # download — consider capping the recursion depth.
        print("retrying...")
        self.download(link, filename)
实现代码
# -*- coding:utf-8 -*-
import os
import re
import urllib
import urllib.error
import urllib.parse
from urllib import request as req
class Downloader():
    """Download pictures from https://image.baidu.com/ in batches.

    Attributes:
        url:     search URL built from the percent-encoded keyword
        pattern: compiled regex extracting "hoverURL" picture links
        headers: request headers carrying a browser User-Agent
        path:    directory where downloaded pictures are saved
        number:  number of result pages to crawl
        offset:  records per result page (Baidu's `pn` step)
    """

    def __init__(self, keyword, path, number):
        """Prepare search URL, extraction regex and save directory.

        Args:
            keyword: keyword of the pictures to download
            path:    directory to save the downloaded pictures
            number:  number of result pages to crawl
        """
        # Percent-encode the keyword so non-ASCII searches form a valid URL.
        self.url = ('http://image.baidu.com/search/flip?tn=baiduimage'
                    '&ie=utf-8&word=%s' % urllib.parse.quote(keyword))
        # "hoverURL" entries in the page's embedded JSON hold the image links.
        self.pattern = re.compile(r'"hoverURL":"(.*?)", "pageNum"', re.S)
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36"}
        self.path = path
        self.createDirectory()
        self.number = number
        self.offset = 20  # Baidu flip pages step the `pn` parameter by 20

    def createDirectory(self):
        """Create the save directory (including parents) if it is missing."""
        # makedirs creates intermediate directories too; the original
        # os.mkdir would fail on nested paths such as './a/b'.
        os.makedirs(self.path, exist_ok=True)

    def crawlPages(self):
        """Fetch the HTML of the first `self.number` result pages.

        Returns:
            list[str]: decoded HTML text of each result page.
        """
        pages = []
        for i in range(self.number):
            # Page i starts at record i * offset.  The parentheses matter:
            # '&pn=%d' % i * self.offset repeated the formatted STRING
            # `offset` times because % binds tighter than *.
            url = self.url + '&pn=%d' % (i * self.offset)
            request = urllib.request.Request(headers=self.headers, url=url)
            response = urllib.request.urlopen(request)
            pages.append(response.read().decode('utf-8'))
        return pages

    def extractLinks(self, pages):
        """Extract every non-empty picture URL from the crawled pages.

        Args:
            pages: list of HTML page strings

        Returns:
            list[str]: picture URLs matched by self.pattern.
        """
        linkList = []
        for page in pages:
            for link in re.findall(self.pattern, str(page)):
                if link:  # skip empty hoverURL entries
                    linkList.append(link)
        return linkList

    def download(self, link, filename):
        """Download one picture and save it under `filename`.

        Args:
            link:     URL of the picture
            filename: target path (directory + file name)
        """
        try:
            print("downloading...")
            req.urlretrieve(str(link), filename=filename)
            print("downloaded successfully")
        # In Python 3 the exception lives in urllib.error;
        # `urllib.ContentTooShortError` would raise AttributeError
        # instead of catching the truncated-download error.
        except urllib.error.ContentTooShortError:
            # NOTE(review): retries forever on a persistently truncated
            # download — consider capping the recursion depth.
            print("retrying...")
            self.download(link, filename)

    def downloadAll(self, linkList):
        """Download every picture in linkList, named 0.jpg, 1.jpg, ...

        Args:
            linkList: list of picture URLs
        """
        # enumerate replaces linkList.index(link), which was O(n) per
        # item (O(n^2) overall) and wrong for duplicate links — index()
        # always returns the FIRST occurrence, overwriting earlier files.
        for index, link in enumerate(linkList):
            self.download(link, self.path + '/%d.jpg' % index)

    def startDownload(self):
        """Crawl the result pages, extract the links, download everything."""
        pages = self.crawlPages()
        linkList = self.extractLinks(pages)
        self.downloadAll(linkList)
def _main():
    """Interactively collect the parameters and run the downloader."""
    keyword = input("Please input the keyword of picture you wanna download:")
    path = './' + input("Please input the path of pictures you wanna save:")
    number = int(input("Please input the number of page you wanna get:"))
    Downloader(keyword, path, number).startDownload()


if __name__ == "__main__":
    _main()
最后
- 由于博主水平有限,不免有疏漏之处,欢迎读者随时批评指正,以免造成不必要的误解!