Scraping novels with Python

I'm new to Python web scraping and wrote this script to grab every novel on 笔趣阁 (biquge). It does manage to download them, but it runs painfully slowly, so I suspect there's a bug somewhere. Could anyone take a look?


# -*- coding:utf-8 -*-
import urllib
from bs4 import BeautifulSoup
import sys
reload(sys)
sys.setdefaultencoding("gbk")  # Python 2-only hack so implicit str/unicode conversions don't crash
f = open("r.txt", "w+")        # single output file for every synopsis and chapter

def get_synopsis(url):
    # download a novel's index page and append its info block (title, author, intro) to the file
    page = urllib.urlopen(url)
    soup = BeautifulSoup(page, "html.parser")
    for tag in soup.find_all('div', id="info"):
        item = tag.get_text().encode('utf-8')
        print item
        f.write('简介' + '\n' + item)

def get_content(url):
    # download one chapter page and append its title (<h1>) and body (div#content) to the file
    page = urllib.urlopen(url)
    soup = BeautifulSoup(page, "html.parser")
    title = ''
    for h1 in soup.find_all('h1'):
        title = h1.get_text().encode('utf-8')
        print title
    for div in soup.find_all('div', id="content"):
        text = div.get_text('\n').encode('utf-8')   # '\n' keeps the <br/> line breaks
        print text
        f.write('章节名称' + '\n' + title + '\n' + '章节内容' + '\n' + text)

def get_big_list():
    # scrape the biquge front page and build the list of novel index-page URLs
    page = urllib.urlopen("http://www.biquge.com/")
    soup = BeautifulSoup(page, "html.parser")
    big_list = []
    links = soup.select("ul,a")      # every <ul> and <a> tag on the front page
    del links[0:14]                  # skip the leading navigation entries
    for i in links:
        big_list.append('http://www.biquge.com' + str(i.get("href")))
    return big_list[9:-1]            # trim the remaining non-novel entries

def get_url_list():
    # visit every novel's index page, save its synopsis, and collect all chapter URLs
    url_list = []
    for p in get_big_list():
        page = urllib.urlopen(p)
        soup = BeautifulSoup(page, "html.parser")
        get_synopsis(p)              # note: this downloads the same index page a second time
        links = soup.select("dl,a")  # every <dl> and <a> tag on the index page
        del links[0:18]              # skip the leading navigation entries
        chapters = []
        for i in links:
            chapters.append('http://www.biquge.com' + str(i.get("href")))
        url_list.extend(chapters[9:-1])
    return url_list                  # every chapter URL of every novel

url_ls = get_url_list()
for url in url_ls:
    get_content(url)
f.close()
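
A guess at where the time goes: every index page and every chapter is downloaded sequentially, one urlopen call at a time, and each novel's index page is actually fetched twice (once in get_url_list and once more inside get_synopsis). Below is a rough, untested sketch of replacing the final loop over url_ls (before f.close()) with a thread pool so the downloads overlap; fetch_chapter is a hypothetical helper and the pool size of 8 is an arbitrary choice.

from multiprocessing.dummy import Pool as ThreadPool   # thread-backed Pool from the stdlib

def fetch_chapter(url):
    # hypothetical helper: download and parse one chapter, returning (title, body)
    # instead of writing to the shared file from inside a worker thread
    page = urllib.urlopen(url)
    soup = BeautifulSoup(page, "html.parser")
    h1 = soup.find('h1')
    div = soup.find('div', id="content")
    title = h1.get_text() if h1 else u''
    body = div.get_text('\n') if div else u''
    return title, body

pool = ThreadPool(8)                                    # 8 workers is just a guess; tune it
for title, body in pool.map(fetch_chapter, url_ls):
    # write from the main thread only, so chapters don't interleave in the file
    f.write((u'章节名称\n%s\n章节内容\n%s\n' % (title, body)).encode('utf-8'))
pool.close()
pool.join()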

Reposted from blog.csdn.net/qq_40024605/article/details/78241842