初学python爬虫,爬了笔趣阁的全部小说,倒是能爬下来,可是运行速度贼慢,怀疑有Bug,各位大佬可以帮忙看看
# -*- coding:utf-8 -*- import urllib import re from bs4 import BeautifulSoup import sys reload(sys) sys.setdefaultencoding( "gbk" ) f = open(" r.txt", "w+") def get_synopsis(url): page=urllib.urlopen(url) soup = BeautifulSoup(page,"html.parser") for tag in soup.find_all('div',id="info"): item=tag.get_text().encode('utf-8') item=str(item) item=re.sub("</?\w+[^>]*>",'',item) print item f.write('简介'+'\n'+item) def get_content(url): page=urllib.urlopen(url) soup = BeautifulSoup(page,"html.parser") for content1 in soup.find_all('h1'): content1=str(content1) content1=re.sub("</?\w+[^>]*>",'',content1) print content1 for content2 in soup.find_all('div',id="content"): content2=content2.get_text().replace('<br /><br /> ','\n\t').encode('utf-8') content2=str(content2) content2=content2.decode('utf-8') content2=re.sub("</?\w+[^>]*>",'',content2) print content2.encode('utf-8') f.write('章节名称'+'\n'+content1+'\n'+'章节内容'+'\n'+content2.encode('utf-8')) def get_big_list(): page=urllib.urlopen("http://www.biquge.com/") big_list=[] soup = BeautifulSoup(page, "html.parser") list=soup.select("ul,a") del(list[0:14]) for i in list: i = i.get("href") i=str(i) i='http://www.biquge.com'+i big_list.append(i) big_list =big_list[9:-1] return big_list def get_url_list(): get_big_list() for p in get_big_list(): page=urllib.urlopen(p) soup = BeautifulSoup(page, "html.parser") get_synopsis(p) url_list=[] list=soup.select("dl,a") del(list[0:18]) for i in list: i = i.get("href") i=str(i) i='http://www.biquge.com'+i url_list.append(i) url_list = url_list[9:-1] return url_list url_ls=get_url_list() for url in url_ls: get_content(url) f.close()