Scraping Chinaz (站长之家)
Scraping images
https://sc.chinaz.com/tupian/
Pick any category you like and use it to practice scraping.
Note: the src attribute here is lazy-loaded. Inspect the markup for the first image, then for the last one: you'll find the attribute is still src2. But if you scroll the page down to the last image and inspect it again, it has become src. Always read the page source carefully; a minimal fallback sketch follows below.
I haven't implemented paginated crawling here; you can try it yourself (see the pagination sketch after the script).
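Since only the images that have scrolled into view carry a real src, a robust extractor can fall back between the two attributes. This is a minimal sketch under that assumption, where img stands for one <img> element already selected with XPath:

# Minimal lazy-load fallback sketch: prefer src2 (before scrolling),
# fall back to src (once the image has actually been rendered).
src_list = img.xpath('./@src2') or img.xpath('./@src')
src = 'https:' + src_list[0]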
import requests
from lxml import etree
from urllib import request
import os


def chinaz(url):
    res = requests.get(url=url, headers=headers)
    res.encoding = 'utf-8'
    content = res.text
    tree = etree.HTML(content)
    div_list = tree.xpath('//div[contains(@class,"box")]')
    # print(len(div_list))
    path = 'chinaz'
    if not os.path.exists(path):
        os.mkdir(path)
    for div in div_list:
        # The thumbnails are lazy-loaded, so the real URL sits in src2 -- check the page source
        src = 'https:' + div.xpath('./div//img/@src2')[0]
        # os.path.splitext() splits a path into (name, extension)
        alt = div.xpath('./div//img/@alt')[0] + os.path.splitext(src)[1]
        try:
            request.urlretrieve(url=src, filename=f'{path}/{alt}')
        except Exception as e:
            print(e)
            print(alt, 'download failed')
        else:
            print(alt, 'downloaded successfully')


if __name__ == '__main__':
    url = 'https://sc.chinaz.com/tupian/shuaigetupian.html'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'
    }
    chinaz(url)
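If you want to try pagination, here is a minimal sketch. It assumes page N of this category lives at shuaigetupian_N.html while page 1 keeps the plain name; that pattern is an assumption, so verify it against the site's own "next page" links before relying on it.

def chinaz_pages(last_page):
    base = 'https://sc.chinaz.com/tupian/shuaigetupian{}.html'
    for page in range(1, last_page + 1):
        # Page 1 has no numeric suffix -- assumed URL pattern, check the real links
        suffix = '' if page == 1 else f'_{page}'
        chinaz(base.format(suffix))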
Scraping free resume templates
https://aspx.sc.chinaz.com/query.aspx?keyword=%E5%85%8D%E8%B4%B9&classID=864
import os
import requests
from lxml import etree
import threading


def get_resume(url, page):
    content = requests.get(url=url, headers=headers).text
    mytree = etree.HTML(content)
    div_list = mytree.xpath('//div[@class="box col3 ws_block"]/a')
    path = f'resume{page}'
    if not os.path.exists(path):
        os.mkdir(path)
    for div in div_list:
        href = div.xpath('./@href')[0]
        name = div.xpath('./img/@alt')[0]
        # Request the detail page behind each template's href
        response = requests.get(url='https:' + href, headers=headers)
        response.encoding = 'utf8'
        content1 = response.text
        tree = etree.HTML(content1)
        # Grab the href of the first download link
        down_href = tree.xpath('//div[@class="clearfix mt20 downlist"]//li[1]/a/@href')[0]
        suffix = os.path.splitext(down_href)[1]
        # For images, audio and archives, fetch the binary data via .content
        down_content = requests.get(url=down_href, headers=headers).content
        # ...and write it back out in binary mode as well
        with open(f'{path}/{name}{suffix}', 'wb') as fp:
            fp.write(down_content)
            fp.flush()
    print(f'Page {page} downloaded successfully')


if __name__ == '__main__':
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'
    }
    for page in range(1, 6):
        # These are the free resume templates
        url = f'https://aspx.sc.chinaz.com/query.aspx?keyword=%E5%85%8D%E8%B4%B9&issale=&classID=864&page={page}'
        threading.Thread(target=get_resume, args=(url, page)).start()
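Starting one bare thread per page works fine for five pages, but a thread pool caps concurrency and makes it explicit that the main thread waits for every download to finish. A variant sketch using the standard library's concurrent.futures, reusing get_resume and headers from the script above:

from concurrent.futures import ThreadPoolExecutor

with ThreadPoolExecutor(max_workers=5) as pool:
    for page in range(1, 6):
        url = f'https://aspx.sc.chinaz.com/query.aspx?keyword=%E5%85%8D%E8%B4%B9&issale=&classID=864&page={page}'
        # submit() schedules the call; the with-block joins all workers on exit
        pool.submit(get_resume, url, page)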