- 使用的是生产者和消费者模式。刚开始我的 img_queue 中始终为空,取不出值,那是因为主线程执行太快,生产者线程还没有执行完,消费者就已经结束了。解决办法是使用线程等待 join。
代码
import os
import re
import threading
from queue import Empty, Queue
from urllib import request
from urllib.parse import urlsplit

import requests
# Browser-like request headers passed to both the page and the image requests.
headers = {
    'User-Agent': (
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) "
        "AppleWebKit/534.50 (KHTML, like Gecko) "
        "Version/5.1 Safari/534.50"
    ),
}
class Prouducer(threading.Thread):
    """Producer thread: turns listing-page URLs into image entries.

    Each worker drains ``page_url_queue``, downloads the page, extracts the
    image entries with a regular expression and puts every
    ``(image_url, alt_text)`` tuple on ``img_queue``.
    """

    # Compiled once for the class instead of once per downloaded page.
    _IMG_PATTERN = re.compile(
        r'<a.*?class="col-xs-4 col-md-3">.*?<img.*?data-original="(.*?)".*?alt="(.*?)".*?</a>',
        re.S)

    def __init__(self, page_url_queue, img_queue, *args, **kwargs):
        """Store the input queue of page URLs and the output queue of images."""
        super().__init__(*args, **kwargs)
        self.page_url_queue = page_url_queue
        self.img_queue = img_queue

    def get_response(self, url, headers=None, timeout=10):
        """Download ``url`` and return the response body as text.

        Args:
            url: Address of the page to fetch.
            headers: Optional HTTP headers sent with the request.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot hang the worker forever.

        Returns:
            The response text, or ``None`` when the request failed or the
            status code was not 200 (the error is printed).
        """
        try:
            resp = requests.get(url, headers=headers, timeout=timeout)
            if resp.status_code != 200:
                raise RequestException("The request failed")
        except (requests.RequestException, RequestException) as e:
            print(e)
            return None
        return resp.text

    @classmethod
    def _parse_images(cls, html):
        """Return the ``(image_url, alt_text)`` tuples found in ``html``.

        An empty or missing page (``None``) yields an empty list.
        """
        if not html:
            return []
        return cls._IMG_PATTERN.findall(html)

    def run(self):
        """Process page URLs until the page queue is empty."""
        while True:
            # get_nowait() avoids the empty()/get() race: with several
            # workers, another thread may take the last item between the
            # two calls and leave this one blocked forever.
            try:
                url = self.page_url_queue.get_nowait()
            except Empty:
                break
            try:
                html = self.get_response(url, headers)
                for img in self._parse_images(html):
                    self.img_queue.put(img)
            finally:
                # Mark the page as handled even if processing failed.
                self.page_url_queue.task_done()
class Consumer(threading.Thread):
    """Consumer thread: downloads queued images and stores them in ``images/``.

    Each item taken from ``img_queue`` is an ``(image_url, name)`` tuple.
    """

    # Characters removed from an image name before it is used as a file name.
    _UNSAFE_NAME_CHARS = re.compile(r"[|\\/?:*<>!!]+")

    def __init__(self, img_queue, *args, **kwargs):
        """Store the queue of ``(image_url, name)`` tuples to download."""
        super().__init__(*args, **kwargs)
        self.img_queue = img_queue

    def get_response(self, url, headers, timeout=10):
        """Download ``url`` and return the raw response bytes.

        Args:
            url: Address of the image.
            headers: HTTP headers sent with the request.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The response body as ``bytes``.
        """
        req = request.Request(url, headers=headers)
        # The context manager closes the connection once the body is read.
        with request.urlopen(req, timeout=timeout) as resp:
            content = resp.read()
        print(url)
        return content

    @classmethod
    def _build_file_name(cls, url, name):
        """Return the ``images/<name><suffix>`` path for an image."""
        safe_name = cls._UNSAFE_NAME_CHARS.sub("", name)
        # Take the suffix from the URL path only, so a query string such as
        # "?v=1" does not end up in the file name.
        suffix = os.path.splitext(urlsplit(url).path)[1]
        return "".join(['images/', safe_name, suffix])

    def save(self, url, name, content):
        """Write ``content`` to ``images/<sanitized name><url suffix>``.

        Args:
            url: Image address; only its file extension is used.
            name: Image title used as the base file name.
            content: Raw image bytes to write.
        """
        file_name = self._build_file_name(url, name)
        os.makedirs("images", exist_ok=True)
        with open(file_name, "wb") as f:
            f.write(content)

    def run(self):
        """Download and save images until the image queue is empty."""
        while True:
            # get_nowait() avoids the empty()/get() race between workers.
            try:
                url, name = self.img_queue.get_nowait()
            except Empty:
                break
            try:
                print(url, name)
                content = self.get_response(url, headers)
                self.save(url, name, content)
            except Exception as exc:
                # Worker boundary: one broken image must not kill the thread.
                print(f"Failed to download {url}: {exc}")
            finally:
                self.img_queue.task_done()
class RequestException(Exception):
    """Custom exception raised when a page request does not return status 200."""
def main(page_count=1, worker_count=5):
    """Crawl the listing pages and download every image found on them.

    Args:
        page_count: Number of listing pages to crawl, starting at page 1.
        worker_count: Number of producer threads and of consumer threads.
    """
    page_url_queue = Queue()
    img_queue = Queue()
    for page in range(1, page_count + 1):
        page_url_queue.put(f"https://www.doutula.com/zz/list?page={page}")

    # Start every producer before joining any of them; calling join() right
    # after start() on the same thread would run the workers one at a time.
    producers = [Prouducer(page_url_queue, img_queue)
                 for _ in range(worker_count)]
    for producer in producers:
        producer.start()
    for producer in producers:
        producer.join()

    # Consumers exit as soon as they see an empty image queue, so they are
    # started only after the producers have finished filling it.
    consumers = [Consumer(img_queue) for _ in range(worker_count)]
    for consumer in consumers:
        consumer.start()
    for consumer in consumers:
        consumer.join()


if __name__ == '__main__':
    main()