Scraping Chinaz (站长之家)

Scraping images

https://sc.chinaz.com/tupian/
You can pick any category for crawling practice.
Note: the src attribute here is lazy-loaded. Inspect the markup of the first image, then jump to the last image's markup: the URL sits in a src2 attribute, not src. Scroll the page down to that last image and inspect again, and src2 has become src, because the page's JavaScript swaps the attribute in as each image comes into view. requests fetches the raw HTML without running any JavaScript, so the script must read src2. Always study the page source.
I haven't implemented paginated crawling here; you can try it yourself (see the pagination sketch after the code).
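To make the lazy-load point concrete, here is a minimal extraction sketch. It targets the same "box" divs as the script below; the fallback from src2 to src is my own safeguard for images that happen to be served eagerly, not something the site is guaranteed to need:

import requests
from lxml import etree

headers = {'user-agent': 'Mozilla/5.0'}
html = requests.get('https://sc.chinaz.com/tupian/', headers=headers).text
tree = etree.HTML(html)
for img in tree.xpath('//div[contains(@class,"box")]//img'):
    # In the raw HTML the real URL sits in src2; fall back to src just in case
    src = img.get('src2') or img.get('src')
    if src:
        print('https:' + src)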

import requests
from lxml import etree
from urllib import request
import os

def chinaz(url):
    res = requests.get(url=url,headers=headers)
    res.encoding = 'utf-8'
    content = res.text
    tree = etree.HTML(content)
    div_list = tree.xpath('//div[contains(@class,"box")]')
    # print(len(div_list))
    path = 'chinaz'
    if not os.path.exists(path):
        os.mkdir(path)
    for div in div_list:
        # Lazy loading: the real URL is in src2, not src (check the page source)
        src = 'https:' + div.xpath('./div//img/@src2')[0]
        # os.path.splitext() splits a path into (name, extension)
        alt = div.xpath('./div//img/@alt')[0] + os.path.splitext(src)[1]
        try:
            request.urlretrieve(url=src,filename=f'{path}/{alt}')
        except Exception as e:
            print(e)
            print(alt, 'download failed')
        else:
            print(alt, 'downloaded successfully')

if __name__ == '__main__':
    url = 'https://sc.chinaz.com/tupian/shuaigetupian.html'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'
    }
    chinaz(url)
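For the pagination exercise mentioned above, a minimal sketch might look like the following. The URL pattern (page N at shuaigetupian_N.html for N >= 2) is an assumption based on how the site's pager links typically look; confirm it in the page source before relying on it.

# Pagination sketch: replace the __main__ block above with this.
# Assumption: page N lives at shuaigetupian_N.html for N >= 2;
# check the pager links in the page source to confirm.
if __name__ == '__main__':
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'
    }
    base = 'https://sc.chinaz.com/tupian/shuaigetupian'
    for page in range(1, 4):
        url = f'{base}.html' if page == 1 else f'{base}_{page}.html'
        chinaz(url)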

Scraping free resume templates

https://aspx.sc.chinaz.com/query.aspx?keyword=%E5%85%8D%E8%B4%B9&classID=864
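The keyword parameter is URL-encoded: %E5%85%8D%E8%B4%B9 decodes to 免费 ("free"). You can check this with the standard library:

from urllib.parse import quote, unquote

print(unquote('%E5%85%8D%E8%B4%B9'))  # 免费 ("free")
print(quote('免费'))                   # %E5%85%8D%E8%B4%B9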

import os
import requests
from lxml import etree
import threading

def get_resume(url, page):
    content = requests.get(url=url,headers=headers).text
    mytree = etree.HTML(content)
    div_list = mytree.xpath('//div[@class="box col3 ws_block"]/a')
    path = f'resume{page}'
    if not os.path.exists(path):
        os.mkdir(path)
    for div in div_list:
        href = div.xpath('./@href')[0]
        name = div.xpath('./img/@alt')[0]
        # Request the template's detail page via its href
        response = requests.get(url='https:' + href, headers=headers)
        response.encoding = 'utf8'
        content1 = response.text
        tree = etree.HTML(content1)
        # Grab the first download link on the detail page
        down_href = tree.xpath('//div[@class="clearfix mt20 downlist"]//li[1]/a/@href')[0]
        suffix = os.path.splitext(down_href)[1]
        # Images, audio, and archives are binary data, so fetch .content rather than .text
        down_content = requests.get(url=down_href, headers=headers).content
        # ...and write the file in binary mode as well
        with open(f'{path}/{name}{suffix}', 'wb') as fp:
            fp.write(down_content)
            fp.flush()
    print(f'Page {page} downloaded successfully')

if __name__ == '__main__':

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'
    }
    for page in range(1, 6):
        # These are the free resume templates (keyword=免费, URL-encoded)
        url = f'https://aspx.sc.chinaz.com/query.aspx?keyword=%E5%85%8D%E8%B4%B9&issale=&classID=864&page={page}'
        threading.Thread(target=get_resume,args=(url,page)).start()
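One thread per page works fine for five pages, but it spawns an unbounded number of threads as the page count grows. A minimal alternative sketch using a capped thread pool (reusing get_resume() and headers from the script above):

from concurrent.futures import ThreadPoolExecutor

# Minimal sketch: cap concurrency with a pool instead of one thread per page.
# Assumes get_resume() and headers from the script above are already defined.
if __name__ == '__main__':
    with ThreadPoolExecutor(max_workers=5) as pool:
        for page in range(1, 6):
            url = f'https://aspx.sc.chinaz.com/query.aspx?keyword=%E5%85%8D%E8%B4%B9&issale=&classID=864&page={page}'
            pool.submit(get_resume, url, page)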

Reposted from blog.csdn.net/hmh4640219/article/details/114890871