Python crawler: batch-downloading novels with multiple threads

# Changes: 1. refactored into a class  2. added exception handling so one error does not abort the whole run  3. if a book's txt file already exists, skip it and move on to the next one
# Changes: added multithreading, so several files can be downloaded at once  2018.1.11

import requests
from bs4 import BeautifulSoup
import time
import os
import threading

class Book1:
    def __init__(self, start_url):
        # Browser User-Agent header (many sites reject requests without one, so be sure to include it)
        self.headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}
        self.start_url = start_url

    # Start the crawl
    def crawl(self):
        try:
            books = self.fetch(self.start_url)
            # Locate the target novel list and parse it
            books_url = books.find('div', class_="tab-item clearfix").find_all('div', class_="yd-book-item yd-book-item-pull-left")
        except Exception:
            print('Failed to parse the novel list!')
            return  # without this, the loop below would hit an undefined books_url
        for a in books_url:
            name = ''  # predefine so the except branch below can print it safely
            try:
                book1 = a.find('a')
                book1_href = book1['href']
                book1_gy = self.fetch(book1_href)  # fetch the novel's summary page
                allbook_url = book1_gy.find('div', class_="b-oper").find('a')['href']
                open_url = self.fetch(allbook_url)
                name = open_url.find('div', class_="chapName").find('strong').get_text()  # the novel's title
                path = 'G:/小说/' + name + '.txt'
                if os.path.exists(path):
                    print('\n《' + name + '》 already exists, skipping\n')
                    continue
                print('《' + name + '》 download started')
                chapter = open_url.find('div', class_="clearfix dirconone").find_all('a')  # the chapter list
            except Exception:
                print(name + ': failed to read the chapter list!')
                continue
            for i in chapter:
                title = ''  # predefine so the except branch below can print it safely
                try:
                    title = i.get_text()  # chapter title
                    href = i['href']  # href attribute of the <a> tag
                    html = self.fetch(href)
                    # Parse the chapter page itself
                    content = html.find('div', class_="mainContenr").get_text()
                    # `with` closes the file even if write() raises
                    with open(path, 'a', encoding='utf-8') as text:
                        text.write('\n' + title + '\n\n\n' + content + '\n\n\n')
                    print(name + ' 《' + title + '》 downloaded')
                except Exception:
                    print(name + ':   ' + title + ', failed to read chapter content!')
                    continue
            print('《' + name + '》 download finished\n\n\n')

    # Fetch a URL and parse it into a BeautifulSoup tree
    # (renamed from `requests` so the method no longer shadows the requests module)
    def fetch(self, url):
        try:
            content = requests.get(url, headers=self.headers)
            content.encoding = 'gbk'  # the site serves GBK-encoded pages
            soup = BeautifulSoup(content.text, 'lxml')
            return soup
        except Exception:
            print('Failed to fetch or parse the page!')

# Launch `count` worker threads; each runs the same full crawl, and the
# file-existence check above is what keeps finished books from being redone
def threads(count):
    for i in range(count):
        threading.Thread(target=book.crawl).start()
        time.sleep(5)

# Run with several threads -- add as many as you like (a tasty efficiency boost O(∩_∩)O)
book = Book1('http://www.quanshuwang.com/all/lastupdate_5_0_0_0_1_0_1.html')
threads(10)
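
One caveat with the threading above: all ten threads run the same full crawl over the same book list, so the file-existence check is the only thing preventing duplicate work, and two threads can still race between checking for the file and writing it. A minimal alternative sketch, assuming the Book1 class above, hands each book to a thread pool exactly once; download_book here is a hypothetical helper that would hold the per-book body of crawl:

# Sketch only: per-book parallelism with a thread pool (assumes Book1 above).
from concurrent.futures import ThreadPoolExecutor

def download_book(book_div):
    # hypothetical helper: would contain the per-book body of Book1.crawl
    # (fetch the summary page, resolve the chapter list, write the txt file)
    ...

def crawl_parallel(books_url, workers=10):
    # each book is submitted exactly once, so no two workers touch the same file
    with ThreadPoolExecutor(max_workers=workers) as pool:
        pool.map(download_book, books_url)

With one task per book, workers never compete over the same txt file, and the time.sleep(5) staggering is no longer needed.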

Reposted from blog.csdn.net/zou407479250/article/details/80251573