import requests
from lxml import etree
import time
import re
'''功能: 爬取排行榜第一的小说'''
# Purpose (translated): scrape the novel ranked #1 on the site's top list.
# Output directory for the downloaded chapters. The original author created
# this folder by hand — the script does NOT create it if it is missing.
fill_path = './top1_neval/'
# Desktop-Chrome User-Agent header so the site serves normal pages instead
# of rejecting the default python-requests UA.
headers = {
'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36'
}
'''用于下载小说内容'''
def download_text(url):
    """Download one chapter page and save its text under ``fill_path``.

    Parameters
    ----------
    url : str
        Absolute URL of a chapter (content) page.

    The output file is named after the chapter title scraped from the page.
    Does nothing (prints a warning) if the expected title element is absent.
    """
    res3 = requests.get(url, headers=headers, timeout=30)
    # Decode with the charset detected from the body to avoid mojibake.
    res3.encoding = res3.apparent_encoding
    page = etree.HTML(res3.text)
    title = page.xpath('//*[@id="wrapper"]/div[4]/div[2]/h1/text()')
    if not title:
        # Layout changed or the request was blocked — skip instead of crashing
        # on title[0] (the original raised IndexError here).
        print('no chapter title found, skipping:', url)
        return
    print(title)
    content = ''.join(page.xpath('//*[@id="content"]//text()'))
    # Strip characters that are illegal in Windows file names.
    safe_title = re.sub(r'[\\/:*?"<>|]', '', title[0])
    # Mode 'w', not 'a': each chapter has its own file, and append mode
    # duplicated the chapter text every time the script was re-run.
    with open(fill_path + safe_title + '.txt', 'w', encoding='utf-8') as f:
        f.write(content)
'''用于获取总榜第一名的小说的详情页url'''
def get_link(url):
    """Return the detail-page URL of the #1 novel on the ranking page.

    Parameters
    ----------
    url : str
        URL of the site's ranking ("top") page.

    Returns
    -------
    str
        The href of the first entry in the overall ranking list.

    Raises
    ------
    ValueError
        If the expected top-1 link is not found (page layout changed).
    """
    res = requests.get(url, headers=headers, timeout=30)
    tree = etree.HTML(res.text)
    links = tree.xpath('//div[4]/div[1]/ul/li[1]/a/@href')
    if not links:
        # Fail with a clear message instead of an opaque IndexError on [0].
        raise ValueError('ranking page layout changed: top-1 link not found at ' + url)
    return links[0]
'''用于获取每一章小说的url'''
def get_detail_link(url2):
    """Collect the URL of every chapter listed on a novel's detail page.

    Parameters
    ----------
    url2 : str
        Detail-page URL, expected to end in ``index.html``.

    Returns
    -------
    list[str]
        De-duplicated absolute chapter URLs, in page order.
    """
    res2 = requests.get(url=url2, headers=headers, timeout=30)
    tree = etree.HTML(res2.text)
    chapter_urls = []
    for dd in tree.xpath('//div[5]/dl/dd'):
        href = dd.xpath('./a/@href')
        if not href:
            continue  # decorative <dd> without a link
        # Build the chapter URL from the index-page URL. Use str.replace,
        # not re.sub: the original passed the href as a regex *replacement*
        # string, so a backslash or '\1' in it would raise or corrupt the URL
        # (and '.' in the pattern was an unintended wildcard).
        chapter_urls.append(url2.replace('index.html', href[0]))
    # De-duplicate (the "latest chapters" box repeats entries from the full
    # list) while preserving page order — list(set(...)) scrambled the
    # chapter order on every run.
    return list(dict.fromkeys(chapter_urls))
'''主体'''
def main():
    """Entry point: resolve the top-ranked novel, then fetch every chapter."""
    ranking_url = 'http://www.shuquge.com/top.html'  # site ranking page
    novel_url = get_link(ranking_url)
    for chapter_url in get_detail_link(novel_url):
        download_text(chapter_url)
        print('正在爬取:', chapter_url)
        time.sleep(3)  # throttle requests to lower the chance of a ban

if __name__ == '__main__':
    main()
# Scrape the #1-ranked novel from shuquge using requests.
# Adapted from: blog.csdn.net/medusee/article/details/105696506
# (Stray page-navigation text from the original blog paste removed.)