强化:
爬取最新的小说圣墟
代码:
#coding=utf-8
import os
import sys
reload(sys)
sys.setdefaultencoding('utf8')
from Spider import getHtmlCode
from bs4 import BeautifulSoup
import re
# URL of the first chapter (the scrape starts here and follows "next chapter" links)
url = 'https://www.biquge5200.com/52_52542/20380548.html'
def getTree(url):
    """Download the page at *url* and return it parsed as a BeautifulSoup tree."""
    html = getHtmlCode(url)
    return BeautifulSoup(html, 'html.parser')
#输入为:章节网页地址
#输出为:(章节名,内容)
def getAll(url):
temp = getTree(url)
chaptername = temp.h1.string
print u'章节名:',chaptername
content = temp.find_all('div',id='content')
content = str(content[0])
content = content.replace('<br />','\n')
pattern = re.compile('<(.*)>')
list_line = pattern.findall(content)
for line in list_line:
line = '<' + line +'>'
content = content.replace(line,'')
# print u'内容 :',content,'\n'
return(chaptername,content)
# Input : URL of one chapter
# Output: creates/appends a .txt file named after the chapter
def creatFile(url):
    """Save one chapter to a text file named '<chapter title>.txt'."""
    (fileName,txt) = getAll(url)
    fileName = fileName + '.txt'
    # BUGFIX: use a with-block so the file handle is closed even when
    # write() raises, instead of the leak-prone open()/write()/close().
    # NOTE(review): the chapter title may contain characters that are
    # invalid in filenames on some systems — TODO confirm/sanitize.
    with open(fileName, 'a+') as f:
        f.write(txt)
def nextUrl(url):
tree = getTree(url)
aSpan = tree('a',href=re.compile('.*52_52542'))
for nextChapter in aSpan:
# print type(nextChapter.string)
if u'下一章' == nextChapter.string:
pathUrl = nextChapter['href']
print pathUrl
break
else:
pathUrl = ''
return pathUrl
# nextUrl(url)
# Input : URL of the first chapter
# Output: the whole novel on disk, grouped 100 chapters per numbered folder
def main(url):
    # 1-based counter of chapters fetched so far.
    count = 1
    flag = True
    # NOTE(review): Windows-only shell command — recursively DELETES every
    # *.txt under the current directory before starting. Destructive.
    cmd = 'del /q /s *.txt'
    os.system(cmd)
    while flag:
        # Write the current chapter to its own .txt file.
        creatFile(url)
        print 'adress = ',url
        # Follow the "next chapter" link; '' (no '.html') means we reached the end.
        url = nextUrl(url)
        count = count + 1
        if 0 == (count % 100) :
            # Every 100 chapters: make a numbered folder (Python 2 integer
            # division) and move the accumulated .txt files into it.
            filename = count / 100
            cmd_md = 'md ' + str(filename)        # Windows mkdir
            cmd_mv = 'move *.txt ' + str(filename)
            os.system(cmd_md)
            os.system(cmd_mv)
        if -1 == url.find('.html'):
            # No further chapter: sweep the remaining files into one last
            # folder and stop the loop.
            filename = count / 100 + 1
            cmd_md = 'md ' + str(filename)
            cmd_mv = 'move *.txt ' + str(filename)
            os.system(cmd_md)
            os.system(cmd_mv)
            flag = False
# Module-level entry point: starts scraping from the first-chapter URL above.
main(url)
结果截图:
在执行时发现这种方式容易报错,后面改为所有章节合并为一本书。
代码:
#coding=utf-8
import os
import sys
reload(sys)
sys.setdefaultencoding('utf8')
from Spider import getHtmlCode
from bs4 import BeautifulSoup
import re
# URL of the first chapter (the scrape starts here and follows "next chapter" links)
url = 'https://www.biquge5200.com/52_52542/20380548.html'
def getTree(url):
    """Fetch *url* and parse the returned HTML into a BeautifulSoup document."""
    return BeautifulSoup(getHtmlCode(url), 'html.parser')
#输入为:章节网页地址
#输出为:(章节名,内容)
def getAll(url):
temp = getTree(url)
chaptername = temp.h1.string
print u'章节名:',chaptername
content = temp.find_all('div',id='content')
content = str(content[0])
content = content.replace('<br />','\n')
pattern = re.compile('<(.*)>')
list_line = pattern.findall(content)
for line in list_line:
line = '<' + line +'>'
content = content.replace(line,'')
# print u'内容 :',content,'\n'
return(chaptername,content)
# Input : URL of one chapter
# Output: appends the chapter to the single book file 圣墟.txt
def creatFile(url):
    """Append one chapter (title line followed by its body) to 圣墟.txt."""
    (fileName,txt) = getAll(url)
    txt = fileName + '\n' + txt
    storyFileName = u'圣墟.txt'
    # BUGFIX: with-block guarantees the handle is closed even if write()
    # raises, unlike the previous open()/write()/close() sequence.
    with open(storyFileName, 'a+') as f:
        f.write(txt)
def nextUrl(url):
tree = getTree(url)
aSpan = tree('a',href=re.compile('.*52_52542'))
for nextChapter in aSpan:
# print type(nextChapter.string)
if u'下一章' == nextChapter.string:
pathUrl = nextChapter['href']
print pathUrl
break
else:
pathUrl = ''
return pathUrl
# nextUrl(url)
# Input : URL of the first chapter
# Output: the complete novel appended chapter-by-chapter into one file
def main(url):
    flag = True
    # NOTE(review): Windows-only shell command — recursively DELETES every
    # *.txt under the current directory so the book file starts empty.
    cmd = 'del /q /s *.txt'
    os.system(cmd)
    while flag:
        # Append the current chapter to the single book file.
        creatFile(url)
        print 'adress = ',url
        # Follow the "next chapter" link; a value without '.html'
        # (i.e. '') means the last chapter was reached.
        url = nextUrl(url)
        if -1 == url.find('.html'):
            flag = False
# Module-level entry point: starts scraping from the first-chapter URL above.
main(url)
结果:
(划重点)所有代码以及小说见我的下载资源,没有积分的qq私聊我