简易小说爬虫(带有GUI界面)

简易小说爬虫(带有GUI界面)

效果:

在这里插入图片描述
在这里插入图片描述

特点:

  • 可实现简单的小说搜索功能
  • 可选择下载目录
  • 下载进度可视化
  • 多线程下载

代码部分:

Python 部分:

import random
from threading import Thread
from urllib.parse import quote

from PyQt5.QtCore import QThread, pyqtSignal, QFile, Qt
from PyQt5.QtGui import QIcon, QPalette, QBrush, QPixmap
from PyQt5.QtWidgets import QGridLayout, QLabel, QLineEdit, QPushButton, QListWidget, QProgressBar, QMessageBox, \
    QApplication, QFileDialog, QWidget
from bs4 import BeautifulSoup
import requests
import win # 引入qrc资源文件,代码在后面
from lxml import etree
import sys


def dataGet(url):
    """Download a page and return its source decoded as GBK.

    A User-Agent picked at random from a built-in list is sent with the
    request. The request is attempted up to four times, each with a
    4-second timeout.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as text, or None when every attempt raised a
        ``requests`` exception.
    """

    user_agent_list = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
        'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0',
        'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    ]
    headers = {'User-Agent': random.choice(user_agent_list)}
    for _ in range(4):  # up to 4 attempts, 4 s timeout each
        try:
            # ``headers`` must be passed by keyword: the second positional
            # parameter of requests.get is ``params`` (the query string), so
            # passing it positionally never sent the User-Agent header.
            response = requests.get(url, headers=headers, timeout=4)
        except requests.exceptions.RequestException:
            continue
        response.encoding = 'gbk'
        return response.text
    # Every attempt failed; callers receive None.
    return None


def novelSearch(data):
    """Extract novel entries from the source of a search-result page.

    Every ``<li>`` element is inspected; an entry is kept only when all
    seven fields (category, name, link, latest chapter, author, update time,
    status) are present, so a partially filled row can no longer raise
    IndexError.

    Args:
        data: HTML source of the search page. None or an empty string
            (e.g. a failed download) yields empty result lists.

    Returns:
        A three-item list ``[names, info_rows, links]`` where ``info_rows``
        holds one 7-item list per novel in the field order given above.
    """

    if not data:
        return [[], [], []]

    field_xpaths = (
        '//span[@class="s1"]/text()',    # category
        '//span[@class="s2"]/a/text()',  # name
        '//span[@class="s2"]/a/@href',   # link
        '//span[@class="s3"]/a/text()',  # latest chapter
        '//span[@class="s4"]/text()',    # author
        '//span[@class="s5"]/text()',    # update time
        '//span[@class="s7"]/text()',    # status
    )

    soup = BeautifulSoup(data, features='lxml')
    novelList = []
    novelInfoList = []
    linkList = []
    for li in soup.find_all('li'):
        html = etree.HTML(str(li))
        matches = [html.xpath(expr) for expr in field_xpaths]
        # Skip rows (such as a header row) that lack any of the fields.
        if not all(matches):
            continue
        row = [found[0] for found in matches]
        novelList.append(row[1])
        novelInfoList.append(row)
        linkList.append(row[2])
    return [novelList, novelInfoList, linkList]


def chapterGet(data):
    """Extract the chapter list from the source of a table-of-contents page.

    Args:
        data: HTML source of the page. None or an empty string (e.g. a
            failed download) yields an empty list instead of an exception.

    Returns:
        A list of ``[chapter_name, chapter_href]`` pairs taken from the
        ``<dl><dd><a>`` elements, in document order.
    """

    if not data:
        return []
    html = etree.HTML(data)
    if html is None:  # nothing parseable in the document
        return []
    chapters_name = html.xpath('//dl/dd/a/text()')
    chapters_link = html.xpath('//dl/dd/a/@href')
    return [[name, link] for name, link in zip(chapters_name, chapters_link)]


def contentGet(data):
    """Extract the title and body text from the source of a chapter page.

    ``<br />`` and ``<br>`` tags are stripped before parsing. The title is
    read from ``div.bookname > h1`` and the body from the first text node of
    ``div#content``.

    Args:
        data: HTML source of the chapter page. None or an empty string
            (e.g. a failed download) is tolerated.

    Returns:
        A two-item list ``[title, content]``. A part that cannot be found is
        returned as an empty string instead of raising IndexError.
    """

    if not data:
        return ['', '']
    html = etree.HTML(data.replace('<br />', '').replace('<br>', ''))
    if html is None:  # nothing parseable in the document
        return ['', '']
    title = html.xpath('//div[@class="bookname"]/h1/text()')
    content = html.xpath('//div[@id="content"]/text()')
    return [title[0] if title else '', content[0] if content else '']


def Del_line(file_path):
    """Rewrite a UTF-8 text file in place without its blank lines.

    A line is dropped when it contains nothing but whitespace.

    Args:
        file_path: Path of the file to clean up.
    """

    with open(file_path, "r", encoding='utf-8') as source:
        kept = [line for line in source if line.strip()]
    with open(file_path, "w", encoding='utf-8') as target:
        target.writelines(kept)


class WorkThread(Thread):
    """Thread that calls a function and keeps its return value."""

    def __init__(self, func, args=()):
        """Store the callable and the positional arguments to call it with."""
        Thread.__init__(self)
        self.func = func
        self.args = args

    def run(self):
        """Call ``func(*args)`` and store the outcome in ``self.result``."""
        outcome = self.func(*self.args)
        self.result = outcome

    def get_result(self):
        """Return the stored result, or None when no result has been stored."""
        return getattr(self, 'result', None)


class SearchThread(QThread):
    """Background thread that searches the site for a novel name.

    Signals:
        _signal1(list): names of the novels found.
        _signal2(list): one 7-item info row per novel found.
        _signal3(): emitted when nothing was found or the page could not
            be downloaded.
    """

    _signal1 = pyqtSignal(list)
    _signal2 = pyqtSignal(list)
    _signal3 = pyqtSignal()

    def __init__(self):
        super(SearchThread, self).__init__()
        # Initialised so run() cannot hit AttributeError before set_name().
        self.novelName = ''

    def __del__(self):
        # Block until the thread has finished before the object goes away.
        self.wait()

    def set_name(self, string):
        """Set the keyword used by the next run()."""
        self.novelName = string

    def run(self):
        """Download the search page for the keyword and emit the results."""
        # Only the keyword is percent-encoded (as GBK). Quoting the whole URL
        # with a wide ``safe`` set left characters such as '&' or '=' inside
        # the keyword unescaped, which changes the query string.
        # errors='replace' keeps characters GBK cannot encode from raising.
        keyword = quote(self.novelName, safe='', encoding='gbk', errors='replace')
        url = 'https://www.52bqg.com/modules/article/search.php?searchkey=' + keyword
        data = dataGet(url)
        if not data:
            # dataGet returns None after all retries failed; report it as
            # "no results" rather than crashing the parser.
            self._signal3.emit()
            return
        lists = novelSearch(data)
        if lists[0]:
            self._signal1.emit(lists[0])
            self._signal2.emit(lists[1])
        else:
            self._signal3.emit()


class DownThread(QThread):
    """Background thread that downloads every chapter of a novel to a file.

    Signals:
        _signal1(int): progress in percent; the value 1000 is emitted once
            after the file has been written.
        _signal2(): emitted when the download has finished.
    """

    _signal1 = pyqtSignal(int)
    _signal2 = pyqtSignal()

    def __init__(self):
        super(DownThread, self).__init__()
        # Initialised so run() cannot hit AttributeError before the setters.
        self.link = ''
        self.name = ''
        self.path = ''

    def __del__(self):
        # Block until the thread has finished before the object goes away.
        self.wait()

    def set_link(self, string):
        """Set the URL of the novel's table-of-contents page."""
        self.link = string

    def set_name(self, string):
        """Set the novel name, used as the output file name."""
        self.name = string

    def set_path(self, string):
        """Set the directory the output file is written to."""
        self.path = string

    def run(self):
        """Fetch all chapters concurrently and append them to ``<path>/<name>.txt``."""
        # NOTE(review): a failed catalogue download (dataGet returning None)
        # is passed on to chapterGet unchanged - confirm the desired
        # behaviour, since there is no failure signal to report it with.
        catalog = dataGet(self.link)
        chapters = chapterGet(catalog)

        # One worker per chapter; chapter hrefs are relative to the catalogue URL.
        # NOTE(review): the number of threads is unbounded - consider a cap.
        workers = [WorkThread(dataGet, args=(self.link + href,))
                   for _title, href in chapters]
        for worker in workers:
            worker.start()

        total = len(workers)
        pages = []
        for done, worker in enumerate(workers, start=1):
            worker.join()
            # Report after the join so the value reflects finished chapters.
            self._signal1.emit(int(100 * done / total))
            pages.append(worker.get_result())

        # Chapters whose download failed (None) are skipped rather than
        # crashing the parser. Everything is parsed before the file is opened.
        contents = [contentGet(page) for page in pages if page is not None]

        path = self.path + '/' + self.name + '.txt'
        with open(path, 'a', encoding='utf-8') as f:
            for title, body in contents:
                f.write(title + '\n')
                f.write(body + '\n')

        Del_line(path)

        # 1000 is the sentinel the progress slot maps to a full bar.
        self._signal1.emit(1000)

        self._signal2.emit()


class MainWin(QWidget):
    """Main window: search box, result list, target directory and progress bar."""

    # Class-level defaults; __init__ gives each instance its own lists.
    novelList = []
    novelInfoList = []
    dir_path = ''

    def __init__(self):
        super(MainWin, self).__init__()

        # Per-instance state. The class-level lists above would otherwise be
        # shared (and appended to) by every instance.
        self.novelList = []
        self.novelInfoList = []
        self.linList = []
        # Set by choice(); empty until the user confirms a novel.
        self.novel_name = ''
        self.link = ''

        self.setWindowTitle("Downloader")
        self.setWindowIcon(QIcon(":/sources/images/icon.png"))
        self.setFixedSize(400, 600)

        self.layout = QGridLayout()
        self.layout.setSpacing(5)

        self.label = QLabel("小说名称:")
        self.layout.addWidget(self.label, 0, 0)

        self.input = QLineEdit()
        self.input.setPlaceholderText("输入小说名称")
        self.layout.addWidget(self.input, 0, 1, 1, 5)

        self.button = QPushButton("搜索")
        self.button.clicked.connect(self.search)
        self.layout.addWidget(self.button, 0, 6)

        self.label1 = QLabel("搜索结果:")
        self.layout.addWidget(self.label1, 1, 0)

        self.list = QListWidget()
        self.list.clicked.connect(self.choice)
        self.layout.addWidget(self.list, 2, 0, 1, 7)

        self.label2 = QLabel("下载地址:")
        self.label2.setObjectName("address")
        self.layout.addWidget(self.label2, 3, 0)

        # Display-only field: the directory is chosen through the "..." button.
        self.input1 = QLineEdit()
        self.input1.setPlaceholderText("下载地址")
        self.input1.setFocusPolicy(Qt.NoFocus)
        self.layout.addWidget(self.input1, 3, 1, 1, 5)

        self.button1 = QPushButton("...")
        self.button1.clicked.connect(self.path)
        self.layout.addWidget(self.button1, 3, 6)

        self.button2 = QPushButton("开始下载")
        self.button2.clicked.connect(self.download)
        self.layout.addWidget(self.button2, 4, 0, 1, 7)

        self.label3 = QLabel("下载进度:")
        self.label3.setObjectName("pbar")
        self.layout.addWidget(self.label3, 5, 0)

        self.pbar = QProgressBar()
        self.layout.addWidget(self.pbar, 5, 1, 1, 6)

        self.setLayout(self.layout)

        # Search worker and its result slots.
        self.thread = SearchThread()
        self.thread._signal1.connect(self.callback_1)
        self.thread._signal2.connect(self.callback_2)
        self.thread._signal3.connect(self.nothing)

        # Download worker and its progress/finished slots.
        self.thread_1 = DownThread()
        self.thread_1._signal1.connect(self.pbarindex)
        self.thread_1._signal2.connect(self.finish)

        self.palette = QPalette()
        self.palette.setBrush(QPalette.Background, QBrush(QPixmap(":/sources/images/bg.jpg")))
        self.setPalette(self.palette)

        self.show()

    def search(self):
        """Start a background search for the text in the name field."""
        self.pbar.setValue(0)
        name = self.input.text()
        if name == '':
            QMessageBox.information(self, "提示", "请输入关键字", QMessageBox.Ok, QMessageBox.Ok)
            return
        self.thread.set_name(name)
        self.thread.start()

    def addItem(self):
        """Append every name in ``self.novelList`` to the result list widget."""
        for name in self.novelList:
            self.list.addItem(name)
        QApplication.processEvents()

    def callback_1(self, msg):
        """Slot for the found novel names: replace the displayed results."""
        self.list.clear()
        self.novelList = msg
        self.addItem()

    def callback_2(self, msg):
        """Slot for the per-novel info rows: store links and labelled details.

        Both lists are rebuilt on every search so their indices stay aligned
        with the rows of the result list; previously the detail list only
        grew, so later searches showed details from earlier ones.
        """
        labels = ('小说分类:', '小说名称:', '小说链接:', '最新章节:',
                  '小说作者:', '最近更新时间:', '更新状态:')
        self.linList = [row[2] for row in msg]
        self.novelInfoList = [
            [label + value for label, value in zip(labels, row)]
            for row in msg
        ]

    def nothing(self):
        """Slot for an empty search: tell the user nothing was found."""
        QMessageBox.information(self, "提示", "未搜索到任何结果", QMessageBox.Ok, QMessageBox.Ok)

    def choice(self, index):
        """Show the details of the clicked novel and remember it on "Yes"."""
        r = index.row()
        # The details arrive in a separate signal; ignore a click on a row
        # whose details are not stored yet.
        if r >= len(self.novelInfoList) or r >= len(self.linList):
            return
        string = '\n'.join(self.novelInfoList[r])
        box = QMessageBox.information(self, "详细信息", string, QMessageBox.No | QMessageBox.Yes, QMessageBox.Yes)
        if box == QMessageBox.Yes:
            self.novel_name = self.novelList[r]
            self.link = self.linList[r]

    def path(self):
        """Let the user pick the download directory and display it."""
        self.dir_path = QFileDialog.getExistingDirectory(self, "choose directory", "D:\\")
        self.input1.setText(self.dir_path)

    def download(self):
        """Start downloading the chosen novel into the chosen directory."""
        if self.dir_path == '':
            QMessageBox.information(self, "提示", "未选择下载路径", QMessageBox.Ok, QMessageBox.Ok)
            return
        if not self.link:
            # No novel confirmed in choice() yet; this used to raise
            # AttributeError on the unset attributes.
            QMessageBox.information(self, "提示", "未选择小说", QMessageBox.Ok, QMessageBox.Ok)
            return
        if self.thread_1.isRunning():
            # Changing link/name/path now would affect the download in progress.
            QMessageBox.information(self, "提示", "正在下载中", QMessageBox.Ok, QMessageBox.Ok)
            return
        self.thread_1.set_link(self.link)
        self.thread_1.set_name(self.novel_name)
        self.thread_1.set_path(self.dir_path)
        self.thread_1.start()

    def pbarindex(self, msg):
        """Slot for download progress; the sentinel 1000 means completely done."""
        if msg == 1000:
            self.pbar.setValue(100)
            return
        self.pbar.setValue(msg)

    def finish(self):
        """Slot for a finished download: notify the user and reset the bar."""
        QMessageBox.information(self, "提示", "下载完成", QMessageBox.Ok, QMessageBox.Ok)
        self.pbar.setValue(0)


if __name__ == '__main__':
    app = QApplication(sys.argv)
    # Named ``window`` so the imported ``win`` resource module is not rebound.
    window = MainWin()
    # Load the stylesheet from the Qt resource system; it is applied only
    # when the resource could actually be opened.
    style_file = QFile(':/sources/qss/style.css')
    if style_file.open(QFile.ReadOnly):
        qss = str(style_file.readAll(), encoding='utf-8')
        style_file.close()
        window.setStyleSheet(qss)
    sys.exit(app.exec_())

QSS 部分(个人缺少艺术细菌):

/* Text inside message boxes: black. */
QMessageBox > QLabel {
    font-size: 17px;
    font-family: 微软雅黑;
    font-weight: bold;
    color: black;
}

/* Default label style: white text. */
QLabel {
    font-size: 17px;
    font-family: 微软雅黑;
    font-weight: bold;
    color: white;
}

/* Label with object name "address": black text. */
QLabel#address {
    font-size: 17px;
    font-family: 微软雅黑;
    font-weight: bold;
    color: black;
}

/* Label with object name "pbar": black text. */
QLabel#pbar {
    font-size: 17px;
    font-family: 微软雅黑;
    font-weight: bold;
    color: black;
}

/* Colours with an alpha component use rgba(r, g, b, a). */
QLineEdit {
    background-color: rgba(255, 255, 255, 180);
    border: 2px groove gray;
    border-radius: 10px;
    padding: 2px 4px;
}

/* Same declarations as the base QLineEdit rule. */
QLineEdit:hover {
    border: 2px groove gray;
    background-color: rgba(255, 255, 255, 180);
    border-radius: 10px;
    padding: 2px 4px;
}

QPushButton {
    min-width: 50px;
    background-color: rgba(255, 255, 255, 180);
    border: 2px groove gray;
    border-radius: 10px;
    padding: 2px 4px;
}

/* Hover: blue border and fill. The earlier "background-color: gainsboro"
   was overridden by the later declaration and has been dropped. */
QPushButton:hover {
    min-width: 50px;
    border: 2px groove #007bff;
    background-color: #007bff;
    border-radius: 10px;
    padding: 2px 4px;
}

QPushButton:pressed {
    min-width: 30px;
    border: 2px groove #007bff;
    background-color: azure;
    border-radius: 10px;
    padding: 2px 4px;
}

QListWidget {
    border: 2px groove gray;
    background-color: rgba(255, 255, 255, 180);
    border-radius: 10px;
    padding: 2px 4px;
}

QProgressBar {
    background-color: rgba(255, 255, 255, 180);
    max-height: 15px;
    border: 2px groove gray;
    border-radius: 10px;
    padding: 2px 4px;
}

win.py:
win.py 的内容太多,不便在此贴出,稍后以文件形式附上。
点击下载
提取码:cj8q
附上我的目录结构:
在这里插入图片描述

发布了3 篇原创文章 · 获赞 1 · 访问量 104

猜你喜欢

转载自blog.csdn.net/HosheaDi/article/details/105174242