爬取起点中文网小说(这里只爬取了第一页的小说;同样的方法可以爬取所有小说,但数量太多,所以只爬取第一页)
源代码:
""" created on Web Jan 02 2019 @author: Super Huan """ # Python爬取起点小说 import requests from lxml import etree import os class spider(): def startRequest(self): response = requests.get('https://www.qidian.com/all') html = etree.HTML(response.content.decode()) bigTitleList = html.xpath('//div[@class="book-mid-info"]/h4/a/text()') bigSrcList = html.xpath('//div[@class="book-mid-info"]/h4/a/@href') for bigTitle, bigSrc in zip(bigTitleList, bigSrcList): if os.path.exists(bigTitle) == False: os.mkdir(bigTitle) self.fileData(bigTitle, bigSrc) def fileData(self, bigTitle, bigSrc): response = requests.get('http:' + bigSrc) html = etree.HTML(response.content.decode()) litTitleList = html.xpath('//ul[@class="cf"]/li/a/text()') litSrcList = html.xpath('//ul[@class="cf"]/li/a/@href') for litTitle, litSrc in zip(litTitleList, litSrcList): self.finallyFile(litTitle, litSrc, bigTitle) def finallyFile(self, title, url, bigTitle): response = requests.get('https:' + url) html = etree.HTML(response.content.decode()) text = ' '.join(html.xpath('//div[@class="read-content j_readContent"]/p/text()')) fileName = bigTitle + '/' + title + '.txt' print('正在抓取文章', fileName) if os.path.exists(fileName) == False: with open(fileName, 'a', encoding='utf-8') as f: f.write(text) spider = spider() spider.startRequest()
进群:960410445 即可获取书十套PDF!