# Multi-threaded novel batch downloader.
# Original changelog (translated):
#   1. restructured as a class  2. exception handling so one failure does not
#   abort the run  3. skips novels whose .txt already exists
#   4. multi-threaded: downloads several novels at once (2018.1.11)

import os
import threading
import time

import requests
from bs4 import BeautifulSoup


class Book1:
    """Crawl a quanshuwang.com listing page and save each novel as a .txt file."""

    def __init__(self, start_url):
        # Browser User-Agent header: many sites reject requests without one.
        self.headers = {
            'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
                          "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}
        self.start_url = start_url

    def star_url(self):
        """Parse the listing page, then download every novel found on it.

        Each novel is written to G:/小说/<name>.txt; novels whose file already
        exists are skipped.  Errors are reported and the crawl continues.
        """
        try:
            books = self.requests(self.start_url)
            # Locate the target-novel tiles on the listing page.
            books_url = books.find('div', class_="tab-item clearfix").find_all(
                'div', class_="yd-book-item yd-book-item-pull-left")
        except Exception:
            print('目标小说解析错误!!!!!')
            return  # bug fix: original fell through and hit an undefined books_url

        for a in books_url:
            name = None  # bug fix: original's except printed `name` before assignment
            try:
                book1_href = a.find('a')['href']
                book1_gy = self.requests(book1_href)          # novel summary page
                allbook_url = book1_gy.find('div', class_="b-oper").find('a')['href']
                open_url = self.requests(allbook_url)         # table-of-contents page
                name = open_url.find('div', class_="chapName").find('strong').get_text()
                path = 'G:/小说/' + name + '.txt'
                if os.path.exists(path):
                    print('\n《' + name + '》' + ',已存在\n')
                    continue
                print('《' + name + '》' + ',下载开始')
                # Chapter links of the novel's table of contents.
                chapter = open_url.find('div', class_="clearfix dirconone").find_all('a')
            except Exception:
                print((name if name is not None else '未知') + ',目录读取错误!')
                continue
            self._download_chapters(name, path, chapter)

    def _download_chapters(self, name, path, chapter):
        """Fetch each chapter link in `chapter` and append its text to `path`."""
        for i in chapter:
            title = '未知'  # so the except handler never hits an unbound name
            try:
                title = i.get_text()          # chapter title
                href = i['href']              # chapter URL
                html = self.requests(href)
                content = html.find('div', class_="mainContenr").get_text()
                # bug fix: original never closed the file handle; `with` guarantees
                # the buffered chapter text is flushed even on a later error.
                with open(path, 'a', encoding='utf-8') as text:
                    text.write('\n' + title + '\n\n\n' + content + '\n\n\n')
                print(name + ' ' + '《' + title + '》' + '下载完成')
            except Exception:
                print(name + ': ' + title + ',章节内容读取错误!')
                continue
        print('《' + name + '》' + ',下载完成' + '\n\n\n')

    def requests(self, url):
        """GET `url` and return a BeautifulSoup tree (site is gbk-encoded).

        Returns None on any network/parse error (callers run inside try blocks).
        NOTE: this method shadows the `requests` module name on the instance;
        inside the body the global module is still resolved correctly.
        """
        try:
            content = requests.get(url, headers=self.headers)
            content.encoding = 'gbk'
            return BeautifulSoup(content.text, 'lxml')
        except Exception:
            print('网页解析发生错误!!!!')
            return None


def threads(count):
    """Start `count` daemon-less threads each running star_url, 5 s apart.

    NOTE(review): depends on the module-level name `Book1` being rebound to an
    *instance* before this is called (see the driver code); all threads crawl
    the same listing, so the existing-file check is the only dedup — confirm
    this racy behavior is acceptable.
    """
    for i in range(count):
        threading.Thread(target=Book1.star_url, args=()).start()
        time.sleep(5)  # stagger thread starts to avoid hammering the site
# Run the multi-threaded download — add as many threads as you like.
# NOTE(review): `Book1` is deliberately rebound from the class to an instance,
# because threads() looks up the module-level name `Book1`; renaming this
# variable would break threads().  TODO: give the instance its own name in both
# places.
if __name__ == '__main__':  # guard: importing this module no longer starts a crawl
    Book1 = Book1('http://www.quanshuwang.com/all/lastupdate_5_0_0_0_1_0_1.html')
    threads(10)
# Article title: python爬虫-多线程小说批量下载
#   (Python web scraper — multi-threaded batch download of novels)
# Reposted from: blog.csdn.net/zou407479250/article/details/80251573
# (The surrounding page-navigation text from the scraped blog page —
#  "猜你喜欢" / "今日推荐" / "周排行" — was non-content residue.)