A simple scraper that downloads images from a web page

import os
import re
from urllib.request import urlopen
from urllib.error import URLError


def get_page(url):
    """
    Fetch the raw content of a page.

    :param url: address to request
    :return: response body as bytes, or None if the request fails
    """
    try:
        urlObj = urlopen(url)  # open the target URL
    except URLError as e:
        print("Failed to fetch %s: %s" % (url, e.reason))
    else:
        # read() returns bytes; image data is binary and must not be decoded
        content = urlObj.read()
        return content
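
Tieba and similar sites sometimes reject requests carrying the default urllib user agent. The variant below is a minimal sketch, not part of the original script: it assumes that sending a browser-like User-Agent header via urllib.request.Request is enough, and the header value is only an example.

from urllib.request import Request

def get_page_with_headers(url):
    # hypothetical helper: same idea as get_page, but with an explicit User-Agent header
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        return urlopen(req).read()
    except URLError as e:
        print("Failed to fetch %s: %s" % (url, e.reason))
        return None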


def parser_content(content):
    """
    Parse the page content and collect all image links.

    :param content: page content as bytes
    :return: list of image URLs found in the page
    """
    content = content.decode('utf-8').replace('\n', ' ')
    pattern = re.compile(r'<img class="BDE_Image".*? src="(https://.*?\.jpg)".*?">')
    imgList = re.findall(pattern, content)
    return imgList
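
To sanity-check the regular expression, parser_content can be run on a small hand-written snippet; the HTML below is a made-up example rather than real Tieba markup.

sample = b'<img class="BDE_Image" pic_type="0" src="https://imgsa.baidu.com/forum/example.jpg" size="1234">'
print(parser_content(sample))
# expected output with the pattern above: ['https://imgsa.baidu.com/forum/example.jpg']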


def get_page_img(page):
    url = "https://tieba.baidu.com/p/5752826839?pn=%s" % (page)
    content = get_page(url)
    # print(content)  # debug: dump the raw page

    # with open('tieba.html', 'w') as f:
    #     f.write(content)
    if content:
        imgList = parser_content(content)
        for imgUrl in imgList:
            # fetch each image link in turn and save its content
            imgContent = get_page(imgUrl)
            if not imgContent:
                continue
            # e.g. https://imgsa.baidu.com/forum/w%3D580/sign=a05cc58f2ca446237ecaa56aa8237246/94cd6c224f4a20a48b5d83329c529822700ed0e4.jpg
            imgName = imgUrl.split('/')[-1]  # use the last path segment as the file name
            with open('img/%s' % (imgName), 'wb') as f:
                f.write(imgContent)
                print("Downloaded %s" % (imgName))

if __name__ == '__main__':
    os.makedirs('img', exist_ok=True)  # make sure the output directory exists before writing images
    for page in range(1, 11):
        print("Crawling images on page %s ..." % (page))
        get_page_img(page)
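
When the script runs against the live site, it may be worth pausing between pages so the server is not hit in a tight loop. The block below is an optional variant of the main block; the 1-second delay is an arbitrary choice, not something the original post specifies.

import time

if __name__ == '__main__':
    os.makedirs('img', exist_ok=True)
    for page in range(1, 11):
        print("Crawling images on page %s ..." % (page))
        get_page_img(page)
        time.sleep(1)  # brief pause between page requests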

Reposted from blog.csdn.net/qq_43279936/article/details/88093650