Python Multithreaded Crawler: Inter-Thread Communication Example

# -*- coding: utf-8 -*-

import threading
import requests
from queue import Queue
from bs4 import BeautifulSoup

# Sentinel object that signals the consumer thread to stop
_sentinel = object()


class GetHtml(threading.Thread):
    """获取Html信息,输出文章标题"""
    def __init__(self, queue):
        super().__init__()
        self.queue = queue

    def run(self):
        while True:
            url = self.queue.get()  # blocks until a URL (or the sentinel) is available
            if url is _sentinel:
                break
            con = requests.get(url)
            print(BeautifulSoup(con.text, 'html.parser').find('title').text)


class GetUrl(threading.Thread):
    """获取目标URL列表"""
    def __init__(self, queue):
        super().__init__()
        self.queue = queue

    def run(self):
        i = 1
        while True:
            context = requests.get('http://python.jobbole.com/category/project/page/{}/'.format(i))
            if context.status_code == 404:
                break
            urls = [url_text['href'] for url_text in BeautifulSoup(context.text, 'html.parser').find_all('a', class_='archive-title')]
            for url in urls:
                self.queue.put(url)
            i += 1
        # Put the sentinel once every URL has been enqueued, so the consumer
        # thread terminates after it has drained the queue
        self.queue.put(_sentinel)


if __name__ == '__main__':
    # Use a Queue for communication between the two threads; maxsize=3 makes the
    # producer block on put() when the queue is full, giving natural back-pressure
    queue_main = Queue(maxsize=3)
    thread1 = GetUrl(queue_main)
    thread2 = GetHtml(queue_main)

    thread2.start()
    thread1.start()
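
The script above uses a single consumer. A natural extension is to run several GetHtml consumers against the same queue. The sketch below is a minimal variant assuming the classes defined above are unchanged; the thread count num_consumers and the join() calls are illustrative additions, and because GetUrl puts only one sentinel, the main thread adds one more per extra consumer so that every GetHtml thread eventually sees a stop marker.

# Minimal multi-consumer sketch (assumption: GetUrl/GetHtml as defined above)
if __name__ == '__main__':
    queue_main = Queue(maxsize=3)
    num_consumers = 3  # illustrative thread count, not from the original script

    producer = GetUrl(queue_main)
    consumers = [GetHtml(queue_main) for _ in range(num_consumers)]

    for consumer in consumers:
        consumer.start()
    producer.start()

    producer.join()  # all URLs (plus one sentinel) are now enqueued
    for _ in range(num_consumers - 1):
        queue_main.put(_sentinel)  # one extra sentinel per remaining consumer
    for consumer in consumers:
        consumer.join()

The join() calls simply let the main thread wait for a clean shutdown; the original script instead relies on the non-daemon threads keeping the process alive until they finish.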


Reposted from blog.csdn.net/wei_bo_cai/article/details/88109141