# -*- coding: utf-8 -*-
import threading
import requests
from queue import Queue
from bs4 import BeautifulSoup
_sentinel = object()
class GetHtml(threading.Thread):
    """Consumer thread: pull article URLs off the queue and print each page's title.

    Terminates when it receives the module-level ``_sentinel`` object,
    which the producer pushes after the last URL.
    """

    def __init__(self, queue):
        super().__init__()
        # Shared queue used for communication with the producer thread.
        self.queue = queue

    def run(self):
        while True:
            url = self.queue.get()
            if url is _sentinel:
                break
            try:
                # Timeout keeps a dead server from hanging this thread forever.
                con = requests.get(url, timeout=10)
                title = BeautifulSoup(con.text, 'html.parser').find('title')
                # Guard: a page without a <title> tag would otherwise raise
                # AttributeError on ``None.text``.
                print(title.text if title is not None else url)
            except requests.RequestException as exc:
                # Best-effort crawler: report the failure and keep consuming.
                print('failed to fetch {}: {}'.format(url, exc))
class GetUrl(threading.Thread):
    """Producer thread: crawl paginated listing pages and enqueue article URLs.

    Walks page 1, 2, ... until the site answers 404, then pushes the
    module-level ``_sentinel`` so the consumer thread knows to stop.
    """

    def __init__(self, queue):
        super().__init__()
        # Shared queue used for communication with the consumer thread.
        self.queue = queue

    def run(self):
        page = 1
        while True:
            try:
                # Timeout keeps a dead server from hanging this thread forever.
                context = requests.get(
                    'http://python.jobbole.com/category/project/page/{}/'.format(page),
                    timeout=10)
            except requests.RequestException:
                # Stop producing on a network failure instead of looping forever;
                # falling through still emits the sentinel so the consumer exits.
                break
            if context.status_code == 404:
                break  # past the last listing page
            soup = BeautifulSoup(context.text, 'html.parser')
            for anchor in soup.find_all('a', class_='archive-title'):
                self.queue.put(anchor['href'])
            page += 1
        # Always emit the sentinel — even on error — otherwise the consumer
        # would block on queue.get() forever.
        self.queue.put(_sentinel)
if __name__ == '__main__':
    # The bounded queue is the communication channel between the two threads;
    # maxsize=3 applies back-pressure on the producer while the consumer works.
    queue_main = Queue(maxsize=3)
    thread1 = GetUrl(queue_main)
    thread2 = GetHtml(queue_main)
    thread2.start()
    thread1.start()
    # Join both threads so the script shuts down explicitly and cleanly.
    thread1.join()
    thread2.join()
# Python multithreading crawler inter-thread communication example.
# Reposted from blog.csdn.net/wei_bo_cai/article/details/88109141