搞Python爬虫这么久了,是时候搞点不务正业的东西了
今天就来爬取妹子图全站图片
来要网址的已经可以出去了,因为后期不会再讲到网址了
妹子图网站容易封ip,所以这里最好用代理ip池,具体的可以看我之前博客爬虫ip总被封?教你构造代理ip池
库准备:requests lxml os fake_useragent
这里我随便找了一个代理ip用,你们也可以用其他的(直接从数据库里取就好了)
完整代码如下
# -*- coding: utf-8 -*-
# D:\Program\Pycharm\PyCharm Community Edition 2019.3.3\Project
import requests
from lxml import etree
import os
from fake_useragent import UserAgent
def get_url_list():
    """Fetch the site's index page and return the list of album page URLs.

    Relies on the module-level ``headers`` and ``proxies`` dicts defined in
    the ``__main__`` block.

    Returns:
        list[str]: href values of ``//ul[@id="pins"]/li/a``; an empty list
        when the request or the parse fails, so callers can iterate the
        result unconditionally (the original returned ``None`` on failure,
        which crashed the caller's ``for`` loop).
    """
    try:
        response = requests.get('https://www.mzitu.com/',
                                headers=headers, proxies=proxies)
        html = etree.HTML(response.text)
        return html.xpath('//ul[@id="pins"]/li/a/@href')
    except Exception as exc:  # network/parse failure: report, don't crash
        print('get_url_list failed: %s' % exc)
        return []
def download(url):
    """Download every image of one album into a directory named after it.

    Args:
        url: an album page URL as returned by ``get_url_list()``.

    Uses the module-level ``headers`` and ``proxies`` dicts. Images are
    saved as ``<album-name><index>.jpg`` inside ``./<album-name>/`` under
    the current working directory.
    """
    try:
        html = etree.HTML(requests.get(url, proxies=proxies, headers=headers).text)
        name = html.xpath('//h2/text()')[0]
        # Last entry of the pagination bar == number of images in the album.
        number = int(html.xpath('//div[@class="pagenavi"]/a[5]/span/text()')[0])
        # Slice [22:33] extracts the date/id path segment of the cover-image
        # URL (e.g. '2020/03/06/xx') -- brittle, tied to the CDN URL layout.
        image_id = html.xpath('//div[@class="main-image"]//img/@src')[0][22:33]
        if not os.path.exists(name):
            os.mkdir(name)
        # %02d zero-pads 1..9 and leaves 10+ alone; the original's
        # `if i < 10: '0%d'` branch wrongly produced '010' for image 10.
        urls = ['https://i3.mmzztt.com/%s%02d.jpg' % (image_id, i + 1)
                for i in range(number)]
        for i, img_url in enumerate(urls, start=1):
            # Write via a joined path instead of os.chdir/os.chdir('..'),
            # so an exception can never leave the process in the wrong cwd.
            path = os.path.join(name, '%s%d.jpg' % (name, i))
            with open(path, 'wb') as fh:
                fh.write(requests.get(img_url, headers=headers).content)
            print("%s%d 保存成功" % (name, i))
    except Exception as exc:  # keep the crawl going past one broken album
        print('download failed for %s: %s' % (url, exc))
if __name__ == '__main__':
    # Random desktop User-Agent per run; Referer is required by the image CDN.
    ua = UserAgent()
    headers = {
        'user-agent': ua.random,
        'Referer': 'https://www.mzitu.com/',
    }
    # requests selects a proxy by matching keys against the *lowercase* URL
    # scheme, so the key must be 'https' (the original 'HTTPS' key was never
    # matched); the proxy URL should also carry an explicit scheme.
    proxies = {
        'https': 'http://61.164.39.67:53281',
    }
    # All albums are saved under ./image/.
    if not os.path.exists('image'):
        os.mkdir('image')
    os.chdir('image')
    # `or []` guards against a None/empty result from get_url_list() so the
    # loop never raises TypeError when the index request fails.
    for item in get_url_list() or []:
        download(item)