# -*- coding:UTF-8 -*-
import re
from urllib import parse, request

import chardet
from bs4 import BeautifulSoup
def download(urls):
    """Download each chapter in *urls* to a text file in the current directory.

    Each item is a regex match tuple from find_url(): item[1] is the
    chapter's relative URL on biquge.com.tw and item[3] is the chapter
    title (used as the output filename). Asks for confirmation first.
    """
    # List the chapters that were found together with their direct URLs.
    for item in urls:
        print(item[3] + ':' + 'http://www.biquge.com.tw' + item[1])
    sure = input("是否下载这些章节y/n(下载在当前目录下,访问上面链接可以直接观看):")
    if sure == 'y' or sure == 'Y':
        for item in urls:
            download_url = 'http://www.biquge.com.tw' + item[1]
            head = {}
            head['User-Agent'] = 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19'
            download_req = request.Request(url=download_url, headers=head)
            download_response = request.urlopen(download_req)
            # Site serves GBK; 'ignore' drops the occasional bad byte.
            download_html = download_response.read().decode('gbk', 'ignore')
            soup_texts = BeautifulSoup(download_html, 'lxml')
            # The chapter text lives in the element with id="content".
            texts = soup_texts.find_all(id="content")
            soup_text = BeautifulSoup(str(texts), 'lxml')
            # Use a context manager so the file is always closed
            # (the original `f.close` was missing its call parentheses,
            # so the handle was never explicitly closed).
            with open(item[3], mode='w+', encoding='utf-8') as f:
                # Strip non-breaking spaces before writing.
                f.write(soup_text.div.text.replace('\xa0', ''))
            print(item[3] + " 下载完成\n")
    print('all down')
    return
def find_url(target_url):
    """Extract every chapter link from a book's table-of-contents page.

    Fetches *target_url*, pulls the element with id="list", regex-matches
    all anchor tags inside it, and passes the match tuples to download().
    """
    head = {}
    head[
        'User-Agent'] = "Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19"
    # Custom UA header — the site may reject Python's default one.
    req = request.Request(url=target_url, headers=head)
    response = request.urlopen(req)
    html = response.read()
    # Decode with gbk directly instead of chardet: automatic detection
    # misidentifies pages that mix Simplified and Traditional Chinese,
    # while GBK covers both.
    html = html.decode('gbk')
    soup = BeautifulSoup(html, 'html.parser')
    # Raw string regex; '=' and '"' need no backslash escaping (the
    # original non-raw escapes triggered invalid-escape warnings).
    result = re.findall(r'<a(.*?)href="(.*?)"(.*?)>(.*?)</a>', str(soup.find_all(id='list')))
    download(urls=result)
def search(txt_name):
    """Search biquge.com.tw for a novel titled *txt_name*.

    The site's search is a GET endpoint whose query must be GBK
    percent-encoded. If exactly one book matches, the site redirects
    straight to the book page; otherwise the result table is scanned
    for an exact title match. Prints 找不到 when nothing matches.
    """
    # urllib.parse.quote with encoding='gbk' produces the %XX-encoded
    # query the site expects. (The original hand-rolled conversion via
    # str(bytes).upper() also uppercased any ASCII letters in the title.)
    target_url = ('http://www.biquge.com.tw/modules/article/soshu.php?searchkey='
                  + parse.quote(txt_name, encoding='gbk'))
    head = {}
    head[
        'User-Agent'] = 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19'
    req = request.Request(url=target_url, headers=head)
    html = request.urlopen(req)
    if target_url != html.geturl():
        # A single search hit makes the site redirect directly to the
        # book page, so the final URL differs from the one requested.
        print("已经找到,正在提取\n")
        target_url = html.geturl()
        find_url(target_url)
        return
    html = html.read().decode('GBK')
    # Each result row: <td class="odd"><a href="BOOK_URL">TITLE</a>
    result = re.finditer(r'<td class="odd"><a href="(.*?)">(.*?)</a>', html)
    for item in result:
        # Exact-title match only; relax this to support fuzzy search.
        if item[2] == txt_name:
            target_url = item[1]
            print("已经找到,正在提取\n")
            find_url(target_url=target_url)
            return
    print('找不到')
if __name__ == '__main__':
    # Entry point: prompt for a novel title and start the search.
    search(txt_name=input("输入要找的小说名称:"))
# python 人生的第一个小爬虫 爬小说 保存一下 自动化搜索
# 转载自 blog.csdn.net/qq_30754565/article/details/81084576