单个文件中所有图片下载
import requests
from lxml import etree
import os
本地html文件读取到内存
这里需要注意下编码方式!
# Load the locally saved page into memory as text. The encoding is given
# explicitly so the Chinese content decodes the same way on every platform.
with open('爬虫与API(上).html', encoding='utf-8') as page:
    html = page.read()
页面解析
# Parse the HTML text into an lxml element tree.
selector = etree.HTML(html)
# Collect the src attribute of every <img> element in the document.
img_list = selector.xpath('//img/@src')
# Bare expression: in an interactive session this displays the extracted URLs
# (the list that follows is that output).
img_list
['https://pic2.zhimg.com/v2-92e8bf502b2a8cb1c972215297161e40_b.jpg',
'https://pic3.zhimg.com/v2-8a64c355393635e51f486e8f77a31b11_b.jpg',
'https://pic3.zhimg.com/v2-b0b7e8426f7abe8bba55748830e1fedb_b.jpg',
'https://pic3.zhimg.com/v2-1ad5fce7304021d5e8240513242b1842_b.jpg',
'https://pic2.zhimg.com/v2-c4b13d820e724740b6d22d26cd1f78e4_b.jpg']
图片下载
# Download every image found on the page into ./爬虫与API(上)/, naming the
# files 1.png, 2.png, ... after the image's position in img_list.
img_dir = os.getcwd() + '/爬虫与API(上)/'
# The target folder does not depend on the image, so create it once up front
# instead of re-checking it on every iteration.
os.makedirs(img_dir, exist_ok=True)

for num, img_url in enumerate(img_list, start=1):
    try:
        # Without a timeout a stalled connection would block the loop forever.
        img = requests.get(img_url, timeout=10)
    except requests.RequestException as err:
        # Best effort: report the failed URL and carry on with the rest.
        print(img_url, err)
        continue
    if img.status_code != 200:
        # The body of an error response is not image data; writing it out
        # would leave a file that cannot be opened.
        print(img_url, img.status_code)
        continue
    # NOTE(review): the URLs listed above end in .jpg but the files are saved
    # as .png; the original naming is kept so existing file names do not change.
    file_name = img_dir + str(num) + ".png"
    with open(file_name, 'wb') as out:
        out.write(img.content)
批量下载本目录所有文件的图片
import requests
from lxml import etree
import os
import glob
获取本目录下所有 .html 文件的文件名。
# Names of all .html files in the current working directory. glob returns
# matches in arbitrary order, so sort them to make the processing order (and
# therefore the printed output) the same on every run.
file_list = sorted(glob.glob('*.html'))
# Bare expression: in an interactive session this displays the list.
file_list
['xpath+mongodb抓取伯乐在线实战.html', '代理IP设置.html', '多线程爬虫实现(上).html', '爬虫基本原理.html']
下面是批量下载所有图片过程。
# For every saved page, fetch each <img> URL and store the response body as
# <page name>/<n>.jpg, printing the URL and HTTP status code of each request.
for file in file_list:
    with open(file, encoding='utf-8') as page:
        html = page.read()
    selector = etree.HTML(html)
    img_list = selector.xpath('//img/@src')

    # Target folder: the page's file name with its last five characters
    # (the ".html" suffix) dropped, under the current working directory.
    img_dir = '{}/{}/'.format(os.getcwd(), file[:-5])

    for num, img_url in enumerate(img_list, start=1):
        img = requests.get(img_url)
        print(img_url, img.status_code, sep='\n')

        # Create the folder lazily, on the first image that needs it.
        if not os.path.exists(img_dir):
            os.makedirs(img_dir)

        # The response body is written whatever the status code was.
        file_name = img_dir + '{}.jpg'.format(num)
        with open(file_name, 'wb') as out:
            out.write(img.content)
https://pic1.zhimg.com/v2-1adc1eb4791afceffe35cd726cd1ee1c_b.jpg
200
https://pic3.zhimg.com/v2-7f577f74b40e98f6c31430b8e884837e_b.jpg
200
https://pic2.zhimg.com/v2-4d9ab580eec66877f4f90688ee856675_b.jpg
200
https://pic3.zhimg.com/v2-dc7f2877b020191711c67b5c059cb7b6_b.jpg
200
https://pic1.zhimg.com/v2-e7b6728b7a35bbe2c035755ad776c89c_b.jpg
200
https://pic2.zhimg.com/v2-0be8b34bd0bf2611715ff1fcd1b32651_b.jpg
200
https://pic4.zhimg.com/v2-782237911b1a2146b07dc5b790f27363_b.jpg
200
https://pic3.zhimg.com/v2-6dd15768a0d9303f6af923440705b346_b.jpg
200
https://pic3.zhimg.com/v2-efb63eef398fb9f3c89ff7a7bf624a96_b.jpg
200
https://pic3.zhimg.com/v2-60b94f730a916a010ee9969233d26b1a_b.jpg
200
https://pic4.zhimg.com/v2-1c42198298f2ed0191c0c8c9bcc1c83f_b.jpg
200
https://pic3.zhimg.com/v2-152abf7e81663e83091507574c579176_b.jpg
200
https://pic2.zhimg.com/v2-5aefef22c1315ea30494576fd7a8fe49_b.jpg
200
https://pic1.zhimg.com/v2-137a8ec31194a86c562dafb9f8886bac_b.jpg
200
https://pic2.zhimg.com/v2-616f2b58e1709c54f5eb73a302f2a64a_b.jpg
400
https://pic4.zhimg.com/v2-a4933e53972df61721540cd84b28d1b8_b.jpg
200
https://pic4.zhimg.com/v2-17a19920c1fb8771f076a38014c88cd0_b.jpg
200
https://pic4.zhimg.com/v2-6841a49976a11bbd6cadd54530edc2f0_b.jpg
200
https://pic3.zhimg.com/v2-cfb2e2d1ba89674777f37cc354f04a30_b.jpg
400
https://pic3.zhimg.com/v2-0778cca50a17f1f9d35d56bd0bedebfd_b.jpg
200
https://pic3.zhimg.com/v2-5f31d4e31af4ec37c56d0266fa26fc93_b.jpg
200
https://pic2.zhimg.com/v2-1b7f1861e6dbf85866fdc540675366d4_b.jpg
400
https://pic1.zhimg.com/v2-c420c79953b45aaedba381445bc5be78_b.jpg
400
https://pic2.zhimg.com/v2-48cc47aff189b5c722862ecd32a4516a_b.jpg
400
https://pic3.zhimg.com/v2-a2580253cde081db3e3f1b8b66dddf93_b.jpg
200
https://pic4.zhimg.com/v2-6841a49976a11bbd6cadd54530edc2f0_b.jpg
200
https://pic1.zhimg.com/v2-569c1425597defc7f2fd5b54e7e3c3d2_b.jpg
400
https://pic1.zhimg.com/v2-850dd573365d9c9a1c9d58fa7f27532c_b.jpg
400
https://pic2.zhimg.com/v2-13c20a4c25725fb9d363c567ab4eb08d_b.jpg
400
https://pic1.zhimg.com/v2-c0235ab217e08e205305de260bea60e0_b.jpg
400
https://pic2.zhimg.com/v2-99be53d259d1d0c0755a63b578816f05_b.jpg
400
https://pic4.zhimg.com/v2-bb0040576245087202432c2c4ebbc88b_b.jpg
200
https://pic3.zhimg.com/v2-184bf0e862d37b5e2297f2c4289d8662_b.jpg
200
https://pic2.zhimg.com/v2-7d761d77317867021fd59e4e90c1bddd_b.jpg
400
https://pic4.zhimg.com/v2-9a315bb94c08e58ed5f63202e8a25d5b_b.jpg
200
https://pic2.zhimg.com/v2-8499b2eb6e641620474641daedb61931_b.jpg
400
https://pic1.zhimg.com/v2-11e49b3e1474035316b4bd2ae4d59a4c_b.jpg
400
https://pic2.zhimg.com/v2-88b64ae8861ace4172d54a6cdb81da31_b.jpg
400
问题
用上面的代码下载下来的图片经常无法打开,应该是没有下载成功。于是我检查了下面这两行代码的输出:
img = requests.get(img_url)
print(img.status_code)
发现很多请求返回的状态码是400,然后我看了下载下来的图片,确实正是那些返回的状态码为400的不能打开:
https://pic2.zhimg.com/v2-8499b2eb6e641620474641daedb61931_b.jpg
400
https://pic1.zhimg.com/v2-11e49b3e1474035316b4bd2ae4d59a4c_b.jpg
400
https://pic2.zhimg.com/v2-88b64ae8861ace4172d54a6cdb81da31_b.jpg
400
然后我手动点了链接,发现会报:
You do not have permission to get URL '/v2-88b64ae8861ace4172d54a6cdb81da31_b.jpg' from this server.
我猜测原因是请求头(headers)中没有设置 Referer,图片服务器的防盗链检查因此拒绝了请求。下面是改进版本:
# Request headers sent with every image download. Requests made without them
# were frequently answered with HTTP 400, so a browser-like User-Agent and a
# zhihu.com Referer are supplied.
user_agent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 safari/537.36"
referer = "https://www.zhihu.com/"
headers = {'User-Agent': user_agent, 'Referer': referer}

# For every saved page, download each referenced image into a folder named
# after the page, as 1.jpg, 2.jpg, ... by position in the page.
for file in file_list:
    with open(file, 'r', encoding='utf-8') as page:
        html = page.read()
    selector = etree.HTML(html)
    img_list = selector.xpath('//img/@src')
    if not img_list:
        # Nothing to download: do not leave an empty folder behind.
        continue

    # Folder name is the page's file name without its extension. It is the
    # same for every image of the page, so create it once per page.
    img_dir = os.getcwd() + '/' + os.path.splitext(file)[0] + '/'
    os.makedirs(img_dir, exist_ok=True)

    for num, img_url in enumerate(img_list, start=1):
        try:
            # Without a timeout a stalled connection would block the run.
            img = requests.get(img_url, headers=headers, timeout=10)
        except requests.RequestException as err:
            # Best effort: report the failed URL and carry on with the rest.
            print(img_url)
            print(err)
            continue
        print(img_url)
        print(img.status_code)
        if img.status_code != 200:
            # An error body is not image data; saving it would produce a
            # file that cannot be opened.
            continue
        file_name = img_dir + str(num) + ".jpg"
        with open(file_name, 'wb') as out:
            out.write(img.content)
https://pic2.zhimg.com/v2-616f2b58e1709c54f5eb73a302f2a64a_b.jpg
200
https://pic4.zhimg.com/v2-a4933e53972df61721540cd84b28d1b8_b.jpg
200
https://pic4.zhimg.com/v2-17a19920c1fb8771f076a38014c88cd0_b.jpg
200
https://pic4.zhimg.com/v2-6841a49976a11bbd6cadd54530edc2f0_b.jpg
200
https://pic3.zhimg.com/v2-cfb2e2d1ba89674777f37cc354f04a30_b.jpg
200
https://pic3.zhimg.com/v2-0778cca50a17f1f9d35d56bd0bedebfd_b.jpg
200
https://pic3.zhimg.com/v2-5f31d4e31af4ec37c56d0266fa26fc93_b.jpg
200
https://pic2.zhimg.com/v2-1b7f1861e6dbf85866fdc540675366d4_b.jpg
200
https://pic1.zhimg.com/v2-c420c79953b45aaedba381445bc5be78_b.jpg
200
https://pic2.zhimg.com/v2-48cc47aff189b5c722862ecd32a4516a_b.jpg
200
https://pic3.zhimg.com/v2-a2580253cde081db3e3f1b8b66dddf93_b.jpg
200
https://pic4.zhimg.com/v2-6841a49976a11bbd6cadd54530edc2f0_b.jpg
200
https://pic1.zhimg.com/v2-569c1425597defc7f2fd5b54e7e3c3d2_b.jpg
200
https://pic1.zhimg.com/v2-850dd573365d9c9a1c9d58fa7f27532c_b.jpg
200
https://pic2.zhimg.com/v2-13c20a4c25725fb9d363c567ab4eb08d_b.jpg
200
https://pic1.zhimg.com/v2-c0235ab217e08e205305de260bea60e0_b.jpg
200
https://pic2.zhimg.com/v2-99be53d259d1d0c0755a63b578816f05_b.jpg
200
https://pic4.zhimg.com/v2-bb0040576245087202432c2c4ebbc88b_b.jpg
200
https://pic3.zhimg.com/v2-184bf0e862d37b5e2297f2c4289d8662_b.jpg
200
https://pic2.zhimg.com/v2-7d761d77317867021fd59e4e90c1bddd_b.jpg
200
https://pic4.zhimg.com/v2-9a315bb94c08e58ed5f63202e8a25d5b_b.jpg
200
https://pic2.zhimg.com/v2-8499b2eb6e641620474641daedb61931_b.jpg
200
https://pic1.zhimg.com/v2-11e49b3e1474035316b4bd2ae4d59a4c_b.jpg
200
https://pic2.zhimg.com/v2-88b64ae8861ace4172d54a6cdb81da31_b.jpg
200
问题解决!