Scraping the No. 1 ranked novel from Shuquge (书趣阁) with requests

import os
import re
import time

import requests
from lxml import etree

'''Purpose: scrape the novel ranked first on the site's leaderboard'''


# Output directory for the chapter files; create it if it does not already exist
file_path = './top1_neval/'
os.makedirs(file_path, exist_ok=True)

headers = {
    'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36'
}
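
# A small fetch helper with a timeout and basic retries could make the
# script more robust against flaky connections (a sketch, assuming three
# attempts and a 10-second timeout are reasonable; the functions below
# still call requests.get directly):
def fetch(url, retries=3):
    for attempt in range(retries):
        try:
            return requests.get(url, headers=headers, timeout=10)
        except requests.RequestException:
            if attempt == retries - 1:
                raise
            time.sleep(2)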

'''Download the text of one chapter'''
def download_text(url):
    res3 = requests.get(url, headers=headers)
    res3.encoding = res3.apparent_encoding  # fix garbled text by guessing the real encoding
    res3_text = etree.HTML(res3.text)
    title = res3_text.xpath('//*[@id="wrapper"]/div[4]/div[2]/h1/text()')
    print(title)
    content = res3_text.xpath('//*[@id="content"]//text()')
    content = ''.join(content)
    with open(file_path + title[0] + '.txt', 'a', encoding='utf-8') as f:
        f.write(content)
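
# Chapter titles can contain characters that are illegal in file names
# (e.g. '?' or '/'). A minimal sanitizing helper, shown as a sketch
# (hypothetical; the script above writes title[0] unmodified):
def safe_filename(name):
    # Replace characters Windows forbids in file names with underscores
    return re.sub(r'[\\/:*?"<>|]', '_', name)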

'''Get the detail-page URL of the novel ranked first overall'''
def get_link(url):
    res = requests.get(url, headers=headers)
    res = etree.HTML(res.text)
    url2 = res.xpath('//div[4]/div[1]/ul/li[1]/a/@href')[0]
    return url2

'''Collect the URL of every chapter of the novel'''
def get_detail_link(url2):
    res2 = requests.get(url=url2, headers=headers)
    res2_text = etree.HTML(res2.text)
    dd_list = res2_text.xpath('//div[5]/dl/dd')
    detail_urls = []
    for dd in dd_list:
        content_url = dd.xpath('./a/@href')[0]

        # Build the chapter-page URL from the catalog-page URL by swapping
        # 'index.html' for the chapter file name (note the escaped dot)
        content_url = re.sub(r'index\.html', content_url, url2)

        detail_urls.append(content_url)
    detail_urls = list(set(detail_urls))  # dedupe: the "latest chapters" block repeats entries from the full list
    return detail_urls
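
# The re.sub approach assumes the catalog URL always ends in 'index.html'.
# urllib.parse.urljoin resolves relative hrefs against the catalog URL
# without that assumption (an alternative sketch, not used above):
from urllib.parse import urljoin

def resolve_chapter_url(catalog_url, href):
    # urljoin replaces the last path segment of catalog_url with href
    return urljoin(catalog_url, href)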

'''Entry point'''
def main():
    top_url = 'http://www.shuquge.com/top.html'  # URL of the ranking page

    url2 = get_link(top_url)

    detail_urls = get_detail_link(url2)

    for detail_url in detail_urls:
        print('Crawling:', detail_url)
        download_text(detail_url)

        time.sleep(3)  # slow down a little to reduce the chance of being blocked


if __name__ == '__main__':
    main()
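
# A randomized delay is slightly gentler on the server than a fixed one
# (a sketch; main() above uses a fixed 3-second sleep, and the 2-5 second
# range here is an assumption):
def polite_sleep(low=2.0, high=5.0):
    import random
    # Sleep for a random interval so request timing looks less mechanical
    time.sleep(random.uniform(low, high))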