# 爬取网站:https://sc.chinaz.com/jianli/biaoge.html 中的模板下载到本地(压缩包.rar格式)
# 这里只挑了一个免费的进行下载
# 注意:
# 压缩包为二进制格式数据,需要使用content来接收
# 爬取网站:https://sc.chinaz.com/jianli/biaoge.html
# 中的模板下载到本地(压缩包.rar格式) 这里只挑了一个免费的进行下载
import os

import requests
from lxml import etree
if __name__ == '__main__':
    # Scrape https://sc.chinaz.com/jianli/biaoge.html and download one free
    # resume template to disk as a .rar archive.
    # NOTE: the archive is binary data, so it must be read via response.content.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36"
    }
    url = "https://sc.chinaz.com/jianli/biaoge.html"
    # Fetch the template-listing page; timeout so a dead server can't hang us.
    list_resp = requests.get(url=url, headers=headers, timeout=15)
    list_resp.raise_for_status()
    tree = etree.HTML(list_resp.text)
    # hrefs to individual template detail pages (protocol-relative, i.e. //sc.chinaz.com/...).
    detail_hrefs = tree.xpath('//div[@class="sc_warp mt20"]/div/div/div[4]/a/@href')
    if not detail_hrefs:
        raise SystemExit('No template links found - the page layout may have changed.')
    # Only download the first (free) template.
    detail_resp = requests.get(url="https:" + detail_hrefs[0], headers=headers, timeout=15)
    detail_resp.raise_for_status()
    # Fix mojibake: let requests sniff the real encoding from the body.
    detail_resp.encoding = detail_resp.apparent_encoding
    detail_tree = etree.HTML(detail_resp.text)
    # Candidate download mirrors for the archive; take the first one.
    download_links = detail_tree.xpath('//div[@class="clearfix mt20 downlist"]/ul/li/a/@href')
    if not download_links:
        raise SystemExit('No download links found - the page layout may have changed.')
    # Binary payload: use .content, never .text.
    rar_resp = requests.get(url=download_links[0], headers=headers, timeout=60)
    rar_resp.raise_for_status()
    file_rar = rar_resp.content
    # The template title on the detail page becomes the local file name;
    # strip stray whitespace so it is a clean filename.
    file_name = detail_tree.xpath('//div[@class="ppt_tit clearfix"]/h1/text()')[0].strip()
    # Persist to disk; create the target directory first, otherwise open() fails.
    save_dir = './第三章:数据解析/code/'
    os.makedirs(save_dir, exist_ok=True)
    file_save_url = save_dir + file_name + '.rar'
    with open(file_save_url, 'wb') as fp:
        fp.write(file_rar)
    print('爬取成功!')