python小白学习记录 生产者消费者模型爬取斗图啦网站 doutula.com (源码有待修改)

import os
import re
import threading
import time
from queue import Empty, Queue
from urllib import request

import requests
from lxml import etree

class Procuder(threading.Thread):
    """Producer thread: takes package-page URLs from ``page_queue``, parses
    each page and puts ``(save_path, image_url)`` tuples on ``image_queue``.

    (The misspelled class name "Procuder" is kept because main() refers to
    it by this name.)
    """

    # Shared request headers instead of rebuilding the dict per request.
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36"
    }

    def __init__(self, page_queue, image_queue, *args, **kwargs):
        super(Procuder, self).__init__(*args, **kwargs)
        self.image_queue = image_queue
        self.page_queue = page_queue

    @staticmethod
    def _sanitize_title(title):
        """Remove/replace characters that are illegal in a Windows folder name.

        Keeps the original mapping ('.' '?' '|' removed, ',' '<' -> '[',
        '>' -> ']') and additionally strips ':' '*' '"'.  The original line
        ``path.replace("", "[")`` was a bug -- replacing the empty string
        inserts '[' between every character -- so it is dropped here.
        """
        for old, new in ((".", ""), (",", "["), ("<", "["), (">", "]"),
                         ("?", ""), ("|", ""), (":", ""), ("*", ""), ('"', "")):
            title = title.replace(old, new)
        return title

    def run(self):
        # get_nowait() is atomic.  The original empty()-then-get() pair raced
        # between threads: another producer could drain the queue right after
        # this thread's empty() check, leaving it blocked forever in get().
        while True:
            try:
                href = self.page_queue.get_nowait()
            except Empty:
                break
            print(href)
            self.get_package(href)

    def get_package(self, href):
        """Fetch one package page, create its folder and enqueue its images."""
        time.sleep(0.4)  # throttle so we do not hammer the server
        resp = requests.get(href, headers=self.HEADERS)
        html = etree.HTML(resp.text)

        imagetitle = html.xpath('//div[@class]/h1/a/text()')[0]
        imagenames = html.xpath('//div[@class="artile_des"]//img/@alt')
        imagecontentes = html.xpath('//div[@class="artile_des"]//img/@src')

        path = "f:/testimages/" + "--" + self._sanitize_title(str(imagetitle))
        # exist_ok avoids the crash the original had when the folder already
        # existed (e.g. on a re-run, or two pages sharing a title).
        os.makedirs(path, exist_ok=True)
        for index, src in enumerate(imagecontentes):
            suffix = os.path.splitext(src)[1]
            indexname = str(imagenames[index]).replace("?", "")
            self.image_queue.put(
                (path + "/" + indexname + "--" + str(index) + "--" + suffix, src))
class Consumer(threading.Thread):
    """Consumer thread: downloads each ``(save_path, image_url)`` tuple
    taken from ``image_queue``; stops once producers are done and the
    image queue stays empty."""

    def __init__(self, page_queue, image_queue, *args, **kwargs):
        super(Consumer, self).__init__(*args, **kwargs)
        self.image_queue = image_queue
        self.page_queue = page_queue

    def run(self):
        while True:
            try:
                # The original checked empty() and then called a blocking
                # get(): a consumer could exit early while a producer was
                # still parsing a page, or hang forever after the last item.
                # A timed get() makes the exit check race-free enough: on
                # timeout we only quit when the page queue is also drained.
                image_path, image_content = self.image_queue.get(timeout=3)
            except Empty:
                if self.page_queue.empty():
                    break  # producers finished and nothing left to download
                continue
            try:
                request.urlretrieve(image_content, image_path)
            except OSError as exc:
                # One bad URL/file must not kill the whole worker thread.
                print(image_content, " 下载失败:", exc)
                continue
            print(image_content, " 已完成")


def getpackagetag(url,page_queue):
    # Dead stub: its only call site in main() is commented out, and the body
    # never touches `url` or `page_queue` -- it only sleeps.  Kept as-is in
    # case the author restores the call; consider removing it otherwise.
    time.sleep(0.4)

def main():
    """Crawl list pages 1-9 of doutula.com, queue every package URL, then
    run 3 producer and 3 consumer threads to download all images."""
    image_queue = Queue(1000)
    page_queue = Queue(1000)
    # Hoisted out of the loop: the headers are identical for every request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36"
    }
    for page in range(1, 10):
        url = "https://www.doutula.com/article/list/?page=%s" % page
        print(url)
        resp = requests.get(url, headers=headers)
        time.sleep(0.4)  # throttle between list-page requests
        # NOTE: the original also did print(resp.text), dumping whole HTML
        # pages to stdout -- debug leftover, removed.
        html = etree.HTML(resp.text)
        listhref = html.xpath('//a[(@class="list-group-item random_list tg-article" or @class="list-group-item random_list") and @href]/@href')
        for href in listhref:
            print(href)
            page_queue.put(href)

    workers = []
    for _ in range(3):
        t = Procuder(page_queue, image_queue)
        t.start()
        workers.append(t)
    for _ in range(3):
        t = Consumer(page_queue, image_queue)
        t.start()
        workers.append(t)
    # Wait for the crawl to finish instead of returning with threads live.
    for t in workers:
        t.join()

if __name__ == '__main__':
    main()

源码已知问题

  使用单线程时不会出错

 使用多线程时会报错

问题并不是“在 queue 队列中同一个位置放置多个 url”——Queue 的 put/get 本身是线程安全的。真正的原因是 empty() 检查与随后的 get() 不是原子操作:多个线程可能在 empty() 返回 False 后同时调用 get(),导致某个线程在已被掏空的队列上永久阻塞;消费者也可能在生产者尚未入队下一批图片时误判两个队列都为空而提前退出

从而导致解析/下载 url 时报错

猜你喜欢

转载自www.cnblogs.com/jswf/p/12316909.html