本地HTML中图片下载

单个文件中所有图片下载

import requests
from lxml import etree
import os

本地html文件读取到内存

这里需要注意下编码方式!

# Read the saved page into memory as text, decoding it explicitly as UTF-8.
with open('爬虫与API(上).html', encoding='utf-8') as page:
    html = page.read()

页面解析

# Parse the page text and collect the src attribute of every <img> element.
img_list = etree.HTML(html).xpath('//img/@src')
img_list
['https://pic2.zhimg.com/v2-92e8bf502b2a8cb1c972215297161e40_b.jpg',
 'https://pic3.zhimg.com/v2-8a64c355393635e51f486e8f77a31b11_b.jpg',
 'https://pic3.zhimg.com/v2-b0b7e8426f7abe8bba55748830e1fedb_b.jpg',
 'https://pic3.zhimg.com/v2-1ad5fce7304021d5e8240513242b1842_b.jpg',
 'https://pic2.zhimg.com/v2-c4b13d820e724740b6d22d26cd1f78e4_b.jpg']

图片下载

# Download every image referenced by the page into a folder next to the script.
# Files are numbered from 1 in the order the images appear in the page.
img_dir = os.path.join(os.getcwd(), '爬虫与API(上)')
for num, img_url in enumerate(img_list, start=1):
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        img = requests.get(img_url, timeout=10)
    except requests.RequestException as exc:
        # Report the failure and move on instead of aborting the remaining downloads.
        print('request failed:', img_url, exc)
        continue
    # Only a 200 response carries image data; saving an error body under an
    # image name produces a file that cannot be opened.
    if img.status_code != 200:
        print('skipped:', img_url, img.status_code)
        continue
    # exist_ok makes this safe to repeat; the folder appears with the first saved image.
    os.makedirs(img_dir, exist_ok=True)
    # The image URLs listed for this page all end in .jpg, so save with that extension.
    file_name = os.path.join(img_dir, str(num) + '.jpg')
    with open(file_name, 'wb') as f:
        f.write(img.content)

批量下载本目录所有文件的图片

import requests
from lxml import etree
import os
import glob

获取本目录下所有的.html文件名。

# Names of all .html files in the current directory.
# glob.glob returns matches in arbitrary order, so sort them to process pages
# in the same order on every run.
file_list = sorted(glob.glob('*.html'))
file_list
['xpath+mongodb抓取伯乐在线实战.html', '代理IP设置.html', '多线程爬虫实现(上).html', '爬虫基本原理.html']

下面是批量下载所有图片过程。

# For each saved page: read it, collect its image URLs, and store every
# response body as <page name>/<n>.jpg under the current directory.
for page_name in file_list:
    with open(page_name, encoding='utf-8') as page:
        html = page.read()
    img_list = etree.HTML(html).xpath('//img/@src')

    # Target folder is named after the page, with the 5-character ".html" suffix cut off.
    img_dir = os.getcwd() + '/' + page_name[:-5] + '/'
    for num, img_url in enumerate(img_list, start=1):
        img = requests.get(img_url)
        # Show which URL was requested and the status code that came back.
        print(img_url)
        print(img.status_code)
        # Create the folder the first time an image is about to be written.
        if not os.path.exists(img_dir):
            os.makedirs(img_dir)
        # The response body is written whatever the status code was.
        with open(img_dir + str(num) + ".jpg", 'wb') as out:
            out.write(img.content)
https://pic1.zhimg.com/v2-1adc1eb4791afceffe35cd726cd1ee1c_b.jpg
200
https://pic3.zhimg.com/v2-7f577f74b40e98f6c31430b8e884837e_b.jpg
200
https://pic2.zhimg.com/v2-4d9ab580eec66877f4f90688ee856675_b.jpg
200
https://pic3.zhimg.com/v2-dc7f2877b020191711c67b5c059cb7b6_b.jpg
200
https://pic1.zhimg.com/v2-e7b6728b7a35bbe2c035755ad776c89c_b.jpg
200
https://pic2.zhimg.com/v2-0be8b34bd0bf2611715ff1fcd1b32651_b.jpg
200
https://pic4.zhimg.com/v2-782237911b1a2146b07dc5b790f27363_b.jpg
200
https://pic3.zhimg.com/v2-6dd15768a0d9303f6af923440705b346_b.jpg
200
https://pic3.zhimg.com/v2-efb63eef398fb9f3c89ff7a7bf624a96_b.jpg
200
https://pic3.zhimg.com/v2-60b94f730a916a010ee9969233d26b1a_b.jpg
200
https://pic4.zhimg.com/v2-1c42198298f2ed0191c0c8c9bcc1c83f_b.jpg
200
https://pic3.zhimg.com/v2-152abf7e81663e83091507574c579176_b.jpg
200
https://pic2.zhimg.com/v2-5aefef22c1315ea30494576fd7a8fe49_b.jpg
200
https://pic1.zhimg.com/v2-137a8ec31194a86c562dafb9f8886bac_b.jpg
200
https://pic2.zhimg.com/v2-616f2b58e1709c54f5eb73a302f2a64a_b.jpg
400
https://pic4.zhimg.com/v2-a4933e53972df61721540cd84b28d1b8_b.jpg
200
https://pic4.zhimg.com/v2-17a19920c1fb8771f076a38014c88cd0_b.jpg
200
https://pic4.zhimg.com/v2-6841a49976a11bbd6cadd54530edc2f0_b.jpg
200
https://pic3.zhimg.com/v2-cfb2e2d1ba89674777f37cc354f04a30_b.jpg
400
https://pic3.zhimg.com/v2-0778cca50a17f1f9d35d56bd0bedebfd_b.jpg
200
https://pic3.zhimg.com/v2-5f31d4e31af4ec37c56d0266fa26fc93_b.jpg
200
https://pic2.zhimg.com/v2-1b7f1861e6dbf85866fdc540675366d4_b.jpg
400
https://pic1.zhimg.com/v2-c420c79953b45aaedba381445bc5be78_b.jpg
400
https://pic2.zhimg.com/v2-48cc47aff189b5c722862ecd32a4516a_b.jpg
400
https://pic3.zhimg.com/v2-a2580253cde081db3e3f1b8b66dddf93_b.jpg
200
https://pic4.zhimg.com/v2-6841a49976a11bbd6cadd54530edc2f0_b.jpg
200
https://pic1.zhimg.com/v2-569c1425597defc7f2fd5b54e7e3c3d2_b.jpg
400
https://pic1.zhimg.com/v2-850dd573365d9c9a1c9d58fa7f27532c_b.jpg
400
https://pic2.zhimg.com/v2-13c20a4c25725fb9d363c567ab4eb08d_b.jpg
400
https://pic1.zhimg.com/v2-c0235ab217e08e205305de260bea60e0_b.jpg
400
https://pic2.zhimg.com/v2-99be53d259d1d0c0755a63b578816f05_b.jpg
400
https://pic4.zhimg.com/v2-bb0040576245087202432c2c4ebbc88b_b.jpg
200
https://pic3.zhimg.com/v2-184bf0e862d37b5e2297f2c4289d8662_b.jpg
200
https://pic2.zhimg.com/v2-7d761d77317867021fd59e4e90c1bddd_b.jpg
400
https://pic4.zhimg.com/v2-9a315bb94c08e58ed5f63202e8a25d5b_b.jpg
200
https://pic2.zhimg.com/v2-8499b2eb6e641620474641daedb61931_b.jpg
400
https://pic1.zhimg.com/v2-11e49b3e1474035316b4bd2ae4d59a4c_b.jpg
400
https://pic2.zhimg.com/v2-88b64ae8861ace4172d54a6cdb81da31_b.jpg
400

问题

用上面的代码下载下来的图片经常无法打开,应该是没有下载成功。于是我检查了下面这两行代码的输出:

# Request the image URL without any custom headers.
img = requests.get(img_url)
# Print the HTTP status code of the response.
print(img.status_code)

发现很多请求返回的状态码是400。对照下载下来的图片,无法打开的正是这些状态码为400的图片:

https://pic2.zhimg.com/v2-8499b2eb6e641620474641daedb61931_b.jpg
400
https://pic1.zhimg.com/v2-11e49b3e1474035316b4bd2ae4d59a4c_b.jpg
400
https://pic2.zhimg.com/v2-88b64ae8861ace4172d54a6cdb81da31_b.jpg
400

然后我手动打开了这些链接,发现会报如下错误:

You do not have permission to get URL '/v2-88b64ae8861ace4172d54a6cdb81da31_b.jpg' from this server.

我猜测是因为上面的代码没有在请求头(headers)中设置Referer。下面是改进版本:

# Request headers sent with every image request: a browser-style User-Agent
# and a Referer pointing at the Zhihu site.
user_agent = (
    "Mozilla/5.0 (Windows NT 6.1; WOW64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/55.0.2883.87 safari/537.36"
)
referer = "https://www.zhihu.com/"
headers = {
    'User-Agent': user_agent,
    'Referer': referer,
}

# For each saved page: read it, collect its image URLs, and download every
# image (sending the custom headers) into a folder named after the page.
for file in file_list:
    with open(file, 'r', encoding='utf-8') as f:
        html = f.read()
    selector = etree.HTML(html)
    img_list = selector.xpath('//img/@src')

    # Folder name is the page's file name without its extension.
    img_dir = os.path.join(os.getcwd(), os.path.splitext(file)[0])

    # Number the images from 1 in the order they appear in the page.
    for num, img_url in enumerate(img_list, start=1):
        print(img_url)
        try:
            # A timeout keeps one stalled connection from hanging the whole batch.
            img = requests.get(img_url, headers=headers, timeout=10)
        except requests.RequestException as exc:
            # Report the failure and carry on with the remaining images.
            print('request failed:', exc)
            continue
        print(img.status_code)
        # Only a 200 response carries image data; saving an error body as
        # .jpg produces a file that cannot be opened.
        if img.status_code != 200:
            continue
        # exist_ok makes this safe to repeat; the folder appears with the first saved image.
        os.makedirs(img_dir, exist_ok=True)
        file_name = os.path.join(img_dir, str(num) + '.jpg')
        # Separate handle name so the page handle `f` above is not shadowed.
        with open(file_name, 'wb') as out:
            out.write(img.content)
https://pic2.zhimg.com/v2-616f2b58e1709c54f5eb73a302f2a64a_b.jpg
200
https://pic4.zhimg.com/v2-a4933e53972df61721540cd84b28d1b8_b.jpg
200
https://pic4.zhimg.com/v2-17a19920c1fb8771f076a38014c88cd0_b.jpg
200
https://pic4.zhimg.com/v2-6841a49976a11bbd6cadd54530edc2f0_b.jpg
200
https://pic3.zhimg.com/v2-cfb2e2d1ba89674777f37cc354f04a30_b.jpg
200
https://pic3.zhimg.com/v2-0778cca50a17f1f9d35d56bd0bedebfd_b.jpg
200
https://pic3.zhimg.com/v2-5f31d4e31af4ec37c56d0266fa26fc93_b.jpg
200
https://pic2.zhimg.com/v2-1b7f1861e6dbf85866fdc540675366d4_b.jpg
200
https://pic1.zhimg.com/v2-c420c79953b45aaedba381445bc5be78_b.jpg
200
https://pic2.zhimg.com/v2-48cc47aff189b5c722862ecd32a4516a_b.jpg
200
https://pic3.zhimg.com/v2-a2580253cde081db3e3f1b8b66dddf93_b.jpg
200
https://pic4.zhimg.com/v2-6841a49976a11bbd6cadd54530edc2f0_b.jpg
200
https://pic1.zhimg.com/v2-569c1425597defc7f2fd5b54e7e3c3d2_b.jpg
200
https://pic1.zhimg.com/v2-850dd573365d9c9a1c9d58fa7f27532c_b.jpg
200
https://pic2.zhimg.com/v2-13c20a4c25725fb9d363c567ab4eb08d_b.jpg
200
https://pic1.zhimg.com/v2-c0235ab217e08e205305de260bea60e0_b.jpg
200
https://pic2.zhimg.com/v2-99be53d259d1d0c0755a63b578816f05_b.jpg
200
https://pic4.zhimg.com/v2-bb0040576245087202432c2c4ebbc88b_b.jpg
200
https://pic3.zhimg.com/v2-184bf0e862d37b5e2297f2c4289d8662_b.jpg
200
https://pic2.zhimg.com/v2-7d761d77317867021fd59e4e90c1bddd_b.jpg
200
https://pic4.zhimg.com/v2-9a315bb94c08e58ed5f63202e8a25d5b_b.jpg
200
https://pic2.zhimg.com/v2-8499b2eb6e641620474641daedb61931_b.jpg
200
https://pic1.zhimg.com/v2-11e49b3e1474035316b4bd2ae4d59a4c_b.jpg
200
https://pic2.zhimg.com/v2-88b64ae8861ace4172d54a6cdb81da31_b.jpg
200

问题解决!

猜你喜欢

转载自blog.csdn.net/dta0502/article/details/82054974
今日推荐