A multithreaded Python novel crawler with a built-in search feature

PyCharm + chromedriver + bs4 + re + threading + queue: a multithreaded novel crawler that searches the site through an automated (headless) browser

First, install the selenium and BeautifulSoup libraries and download the chromedriver build that matches your Chrome version.

They can usually be installed from the command line with pip install <package>. The author uses the PyCharm interpreter, where you can also search for a library by name and install it from the IDE; this setup is generally recommended for writing Python crawlers.
The mapping between Chrome and chromedriver versions can easily be looked up online. After downloading, put chromedriver in a directory of your choice, and remember to update that path in the code below.
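
The libraries can be installed with pip install selenium beautifulsoup4 lxml (lxml is the parser BeautifulSoup is given later in the code). As a quick environment check, a minimal sketch along the following lines (the chromedriver path is only an assumption, point it at your own copy) should open the site headlessly and print its title. The executable_path/chrome_options keywords match the Selenium 3 style used throughout this article; newer Selenium 4 releases pass the driver path through a Service object instead.

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

driver_path = r'E:\py\chromedriver\chromedriver.exe'   # assumed path -- change to your own

chrome_options = Options()
chrome_options.add_argument('--headless')       # run Chrome without opening a window
chrome_options.add_argument('--disable-gpu')

driver = webdriver.Chrome(executable_path=driver_path, chrome_options=chrome_options)
driver.get('http://www.biquge.tv/')
print(driver.title)                             # a printed page title means the setup works
driver.quit()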

The steps are as follows:

① First, the crawler opens http://www.biquge.tv/ in an automated browser and searches for the novel name you typed in. If there are several possible matches it prints a selection table (if the search returns only one result, this selection step is skipped and the crawler moves straight on to the next step).

            pattern1 = re.compile(r'<td class="odd"><a href="(.*?)">(.*?)</a>', re.S)
            contents1 = re.findall(pattern1, driver.page_source)
            pattern2 = re.compile(r'<td class="odd">(.*?)</td>', re.S)
            contents2 = re.findall(pattern2, driver.page_source)
            if len(contents2) and len(contents1):
                URLlist = []
                namelist = []
                authorlist = []
                for content in contents1:
                    URLlist.append(content[0])
                    namelist.append(content[1])
                flag = False
                for content in contents2:
                    if flag == True:
                        authorlist.append(content)
                        flag = False
                    else:
                        flag = True
                print('小说网站搜索的结果如下:')
                print('\t'+'编号'+'\t\t'+'小说'+'\t\t'+'作者'+'\t')
                num = 1
                for name, author in zip(namelist, authorlist):
                    print('\t'+str(num)+'\t\t'+name+'\t\t'+author+'\t')
                    num += 1
                step = int(input('请选择所需的小说,输入对应的编号:'))
                want_url = str(URLlist[step-1])
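
To make the two regular expressions above concrete, here is what they pull out of a hand-written fragment shaped like one row of the search-result table (the HTML below is only an illustration of the expected structure, not copied from the site):

import re

sample = ('<td class="odd"><a href="http://www.biquge.tv/0_123/">某小说</a></td>'
          '<td class="odd">某作者</td>')

pattern1 = re.compile(r'<td class="odd"><a href="(.*?)">(.*?)</a>', re.S)
pattern2 = re.compile(r'<td class="odd">(.*?)</td>', re.S)

print(re.findall(pattern1, sample))
# [('http://www.biquge.tv/0_123/', '某小说')]   -> (book URL, book name) pairs
print(re.findall(pattern2, sample))
# ['<a href="...">某小说</a>', '某作者']         -> every second item is the author name,
#                                                which is why the code above toggles a flag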

② Next, crawl the URL of every chapter of the novel and push them into a queue one by one.
The table-of-contents URL obtained in step ① is opened here, and each chapter link is stored in the queue, as shown below.

                driver.get(want_url)  # want_url is the table-of-contents URL of the chosen novel, obtained in step ①
                page_source = driver.page_source
                pattern2 = re.compile(r'<dd><a href="(.*?)">(.*?)</a></dd>', re.S)
                All_html = re.findall(pattern2, page_source)

                for ones in All_html[9:]:
                    part_url = ones[0]
                    title = ones[1]
                    print(title + '+' + base_url+part_url)
                    q.put(title + '+' + base_url+part_url)
                driver.close()
                driver.quit()
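
Each queue entry is a single string of the form chapter title + '+' + full chapter URL, which the worker threads later split on '+' again. A minimal round trip looks like this (the title and chapter path are made up for illustration):

import queue

q = queue.Queue()
base_url = 'http://www.biquge.tv'
part_url = '/0_123/456789.html'                   # hypothetical chapter path

q.put('第一章 示例' + '+' + base_url + part_url)    # producer side: "title+url"

item = q.get()                                    # consumer side: split it back apart
title = item.split('+')[0]
href = item.split('+')[1]
print(title, href)

Note that this format only works as long as neither the title nor the URL contains an extra '+'; if that cannot be guaranteed, putting a (title, url) tuple into the queue is the safer choice.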

③ The chapter titles and their URLs are printed first, so you can check whether the crawler actually found the novel you wanted. Then choose the number of threads; this depends largely on your CPU, and 20-40 is usually enough.
Below is the code that creates the threads and, at the end, shuts them down:

 threadnum = int(input('请输入所要开启的爬虫线程数量:'))
    start_time = time.time()
    for i in range(1, threadnum+1, 1):
        threadList.append('Spider_Thread-'+str(i))
    queueLock = threading.Lock()  # lock to prevent errors when several threads touch the same data at once
    threads = []
    threadID = 1
    # create the new threads
    for tName in threadList:
        thread = myThread(threadID, tName, q)
        thread.start()
        threads.append(thread)
        threadID += 1
    # wait for the queue to empty
    while not q.empty():
        pass
    # tell the threads it is time to exit
    exitFlag = 1
    # wait for all threads to finish
    for t in threads:
        t.join()
        print(t.name+'退出成功')
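
One thing to be aware of: the main thread waits for the workers with a busy loop (while not q.empty(): pass) plus a global exitFlag, which keeps one CPU core spinning. A gentler alternative, sketched below, lets queue.Queue do the bookkeeping with task_done()/join(); this is only shown for comparison, not what the full code further down uses:

import queue
import threading

q = queue.Queue()
for n in range(5):
    q.put('chapter-%d' % n)              # stand-ins for the real "title+url" items

def worker():
    while True:
        try:
            item = q.get(timeout=3)      # block briefly instead of spinning on q.empty()
        except queue.Empty:
            return                       # queue drained -> let the thread exit
        print(threading.current_thread().name, 'handling', item)
        q.task_done()                    # mark this item as finished

threads = [threading.Thread(target=worker, name='Spider_Thread-%d' % i) for i in range(1, 4)]
for t in threads:
    t.start()
q.join()         # returns once every item has been marked with task_done()
for t in threads:
    t.join()     # each worker exits after its get() times out on the empty queue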

④ Once the thread count is chosen and the threads have started successfully, the crawler downloads each chapter into the specified folder.
Finally, the most important part is the threading.Thread subclass below; as a rule, the main work goes into the overridden run(self) method.

class myThread(threading.Thread):  # subclass threading.Thread

    def __init__(self, threadID, name, counter):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.counter = counter

    def run(self):
        # put the code to execute inside run(); it is invoked once the thread is started
        print(self.name+'启动成功')
        while not exitFlag:
            queueLock.acquire()  # acquire the lock before touching the shared queue
            if not q.empty():
                item = q.get()
                queueLock.release()  # release the lock as soon as the item has been taken
                title = item.split('+')[0]
                href = item.split('+')[1]
                get_content(title, href)
            else:
                print('数据全部结束')
                queueLock.release()  # release the lock

Also remember to create the thread lock to avoid conflicts; define it once, then use it inside the run function as shown above:

queueLock = threading.Lock()  # lock to prevent errors when several threads touch the same data at once
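
For readers new to threading.Lock, here is a generic, self-contained sketch of the same pattern (counter and bump are just placeholder names); the with form is equivalent to calling acquire()/release() by hand, but also releases the lock if an exception is raised:

import threading

counter = 0
counter_lock = threading.Lock()

def bump():
    global counter
    with counter_lock:          # same as acquire()/release(), but exception-safe
        counter += 1

threads = [threading.Thread(target=bump) for _ in range(100)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print(counter)                  # always 100, because the lock serializes the updates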

That's it. The complete code is below (you only need to change the chromedriver path and the folder where the novel is saved, and it will run):

import queue
import threading
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import re
from bs4 import BeautifulSoup
import time

# Multithreaded crawler for biquge (笔趣阁) novels; you can manually search for the novel you want
#@author Himit_ZH
#qq:372347736

exitFlag = 0
q = queue.Queue()
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
# path where chromedriver is installed
driver_path = r'E:\py\chromedriver\chromedriver.exe'

base_url = r'http://www.biquge.tv'

# folder where the downloaded chapters are saved
txt_path = r'E://py//小说//'

# total number of chapters in the novel
Sum_Chapters = 0.0

# name of the novel to search for
novel_name = str()

class scrapy_biquge():

    def get_url(self):

        driver = webdriver.Chrome(executable_path=driver_path, chrome_options=chrome_options)
        driver.get('http://www.biquge.tv/')
        driver.find_element_by_id('wd').send_keys(novel_name)
        driver.find_element_by_id('sss').click()
        # switch the window handle to the newly opened results page
        handles = driver.window_handles
        driver.switch_to.window(handles[1])
        if '出现错误!' in driver.page_source:
            driver.close()
            driver.quit()
            print('输入错误,请重新输入')
            return False
        current_url = driver.current_url
        if 'search.php?' in current_url :
            pattern1 = re.compile(r'<td class="odd"><a href="(.*?)">(.*?)</a>', re.S)
            contents1 = re.findall(pattern1, driver.page_source)
            pattern2 = re.compile(r'<td class="odd">(.*?)</td>', re.S)
            contents2 = re.findall(pattern2, driver.page_source)
            if len(contents2) and len(contents1):
                URLlist = []
                namelist = []
                authorlist = []
                for content in contents1:
                    URLlist.append(content[0])
                    namelist.append(content[1])
                flag = False
                for content in contents2:
                    if flag == True:
                        authorlist.append(content)
                        flag = False
                    else:
                        flag = True
                print('小说网站搜索的结果如下:')
                print('\t'+'编号'+'\t\t'+'小说'+'\t\t'+'作者'+'\t')
                num = 1
                for name, author in zip(namelist, authorlist):
                    print('\t'+str(num)+'\t\t'+name+'\t\t'+author+'\t')
                    num += 1
                step = int(input('请选择所需的小说,输入对应的编号:'))
                want_url = str(URLlist[step-1])
                driver.get(want_url)
                page_source = driver.page_source
                pattern2 = re.compile(r'<dd><a href="(.*?)">(.*?)</a></dd>', re.S)
                All_html = re.findall(pattern2, page_source)

                for ones in All_html[9:]:
                    part_url = ones[0]
                    title = ones[1]
                    print(title + '+' + base_url+part_url)
                    q.put(title + '+' + base_url+part_url)
                driver.close()
                driver.quit()
                return True

        if '抱歉,搜索没有结果^_^' in driver.page_source:
            driver.close()
            driver.quit()
            print('抱歉,搜索没有结果,请重新输入')
            return False

class myThread(threading.Thread):  # subclass threading.Thread

    def __init__(self, threadID, name, counter):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.counter = counter

    def run(self):
        # put the code to execute inside run(); it is invoked once the thread is started
        print(self.name+'启动成功')
        while not exitFlag:
            queueLock.acquire()  # acquire the lock before touching the shared queue
            if not q.empty():
                item = q.get()
                queueLock.release()  # release the lock as soon as the item has been taken
                title = item.split('+')[0]
                href = item.split('+')[1]
                get_content(title, href)
            else:
                print('数据全部结束')
                queueLock.release()  # release the lock
def get_content(title, href):
        driver = webdriver.Chrome(executable_path=driver_path,chrome_options=chrome_options)
        driver.get(href)
        bs4 = BeautifulSoup(driver.page_source, 'lxml')
        title = bs4.h1.get_text()  # chapter title
        filename = txt_path+''.join(title.split()[0])+'.txt'
        content = bs4.find('div', id='content')
        content = content.get_text()
        with open(filename, 'w', encoding='utf-8') as f:
            f.write("\r"+title+"\r\n")
            f.write(content)
        print('['+title+']  成功下载,'+'现已下载总章节数的{:.2f}%'.format(((1.0 - q.qsize()/Sum_Chapters))*100))
        driver.close()
        driver.quit()


if __name__ == '__main__':
    # once all the URLs are in the queue, start the threads
    while True:
        try:
            novel_name = input('请输入你想要搜索的小说名字:')
            if scrapy_biquge().get_url():
                break
        except KeyError:
            pass
    Sum_Chapters = q.qsize()
    threadList = []
    threadnum = int(input('请输入所要开启的爬虫线程数量:'))
    start_time = time.time()
    for i in range(1, threadnum+1, 1):
        threadList.append('Spider_Thread-'+str(i))
    queueLock = threading.Lock()  # lock to prevent errors when several threads touch the same data at once
    threads = []
    threadID = 1
    # create the new threads
    for tName in threadList:
        thread = myThread(threadID, tName, q)
        thread.start()
        threads.append(thread)
        threadID += 1
    # wait for the queue to empty
    while not q.empty():
        pass
    # tell the threads it is time to exit
    exitFlag = 1
    # wait for all threads to finish
    for t in threads:
        t.join()
        print(t.name+'退出成功')
    end_time = time.time()
    print('本次爬取小说耗时为'+str(round(end_time-start_time, 2))+'秒')