第一次用python写爬虫

刚开始接触 Python，看到网上很多人都说可以写写爬虫来练手，于是周末加班的时候边查边学地写了个抓取轻之国度小说的脚本。其中会有很多不足之处，仅仅是尝试用 Python 写代码的一次练习~~
# -*- coding: UTF-8 -*- 
# Python:      2.7.8
# Platform:    Windows
# Program:     Get Novels From Internet
# Author:      dxl
# Description: Get Novels
# Version:     1.0
# History:     
import urllib2,os,codecs
from bs4 import BeautifulSoup
# Root address to crawl, e.g. http://lknovel.lightnovel.cn/main/vollist/66.html
url = ''

# Storage path.
title_path = ''

# Extraction regex: an http link sitting between `a href="` and a closing quote.
reg = r'(?<=a href=")http:[^\s].+(?=")'

# Request headers handed to the opener in getHtml().
# Note that 'Referer' captures the value `url` holds at this point.
heads = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'GB2312,utf-8;q=0.7,*;q=0.7',
    'Accept-Language': 'zh-cn,zh;q=0.5',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Host': 'John',
    'Keep-Alive': '115',
    'Referer': url,
    'User-Agent': (
        'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.14) '
        'Gecko/20110221 Ubuntu/10.10 (maverick) Firefox/3.6.14'
    ),
}
def getHtml(url):
    """Download *url* and return the raw response body.

    A new cookie-aware opener is built on every call, given the module-level
    ``heads`` as its default headers, and installed as urllib2's global
    opener.

    :param url: address of the page to fetch.
    :return: the response body exactly as read from the connection (not decoded).
    :raises urllib2.URLError: propagated unchanged if the request fails.
    """
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
    opener.addheaders = heads.items()
    # Kept from the original: this makes the opener the process-wide default
    # used by urllib2.urlopen().
    urllib2.install_opener(opener)
    response = opener.open(urllib2.Request(url))
    try:
        return response.read()
    finally:
        # Always release the connection, even if read() fails part-way.
        response.close()
def getTextList(url):
    """Collect the novel links listed on the page at *url*.

    Every ``<dd>`` element on the page is inspected and the ``href`` of its
    second ``<a>`` tag is collected.  Entries that have fewer than two
    anchors, or whose second anchor has no ``href``, are skipped so that one
    irregular entry does not abort the whole run with an IndexError.

    :param url: address of the listing page.
    :return: list of href values, in document order.
    """
    soup = BeautifulSoup(getHtml(url))
    urls = []
    for entry in soup.find_all('dd'):
        anchors = entry.find_all('a')
        if len(anchors) < 2:
            continue
        href = anchors[1].get('href')
        if href:
            urls.append(href)
    return urls
def loadText(urls):
    """Call load() on each address in *urls*, in order."""
    for page_url in urls:
        load(page_url)
# Characters removed from names before they are used on disk: the characters
# Windows forbids in file names plus the CR/LF/tab noise around page titles.
_FILENAME_JUNK = u'\r\n\t\\/:*?"<>|'


def _clean_name(text):
    """Return *text* with every character listed in _FILENAME_JUNK removed."""
    return u''.join(ch for ch in text if ch not in _FILENAME_JUNK)


def load(url, base_dir='D:/pms_branches/MyPython/src/com/dxl'):
    """Download every chapter linked from the page at *url*.

    The text of the first ``<strong>`` inside the first ``h1.ft-24`` names a
    directory under *base_dir*.  Each ``li.span3`` item contributes one file,
    named after the first ``<span>`` of its first link and filled with the
    text of all ``div.lk-view-line`` elements of the linked page, encoded as
    GB18030.  Files that already exist are left untouched, so an interrupted
    run can be resumed.

    :param url: address of the page that lists the chapters.
    :param base_dir: directory under which the per-title directory is
        created; defaults to the location the script has always used.
    :raises IndexError: if the page lacks the expected heading or an item
        lacks the expected link/span structure.
    """
    soup = BeautifulSoup(getHtml(url))
    heading = soup.find_all("h1", class_='ft-24')[0]
    # Sanitise the directory name the same way as the chapter names below.
    title_name = _clean_name(heading.find_all('strong')[0].get_text())
    title_path = os.path.join(base_dir, title_name)
    if not os.path.exists(title_path):
        # makedirs also creates missing parents of base_dir.
        os.makedirs(title_path)

    for item in soup.find_all("li", class_='span3'):
        link = item.find_all('a')[0]
        title_list_name = _clean_name(link.find_all('span')[0].get_text())
        title_list_path = os.path.join(title_path, title_list_name)
        if not os.path.exists(title_list_path):
            # The chapter page is fetched before the file is opened, so a
            # failed download does not leave an empty file behind.
            chapter = BeautifulSoup(getHtml(link.get('href')))
            text_lines = chapter.find_all("div", class_='lk-view-line')
            with codecs.open(title_list_path, 'wb', 'GB18030') as fp:
                for line in text_lines:
                    fp.write(line.get_text())
        # Unicode literal: formatting a non-ASCII byte string with a unicode
        # argument triggers an implicit ASCII decode and raises
        # UnicodeDecodeError.
        print(u'完成%s' % title_list_name)
if __name__ == "__main__":
    # Listing page to download.  To ask for the address interactively
    # instead, read it with raw_input().
    url = 'http://lknovel.lightnovel.cn/main/vollist/573.html'
    urls = getTextList(url)
    loadText(urls)
第一次用python写爬虫

猜你喜欢