多线程爬虫

多线程爬虫

多线程基础

import threading

多线程基础

import threading

class A(threading.Thread):
    """Demo thread that prints its own label ten times."""

    def __init__(self):
        super().__init__()

    def run(self):
        """Thread body: print the label once per pass, ten passes in total."""
        remaining = 10
        while remaining > 0:
            print("我是线程A")
            remaining -= 1
class B(threading.Thread):
    """Demo thread that prints its own label ten times."""

    def __init__(self):
        super().__init__()

    def run(self):
        """Thread body: print the label once per pass, ten passes in total."""
        remaining = 10
        while remaining > 0:
            print("我是线程B")
            remaining -= 1

# Create both demo threads, then launch them so they run concurrently.
t1, t2 = A(), B()
t1.start()
t2.start()

队列基础

依次访问放入队列的url

import queue

# Minimal FIFO queue round trip: enqueue one item, take it back out, and only
# then acknowledge it.
a = queue.Queue()
a.put("hello")
print(a.get())
# task_done() is the consumer's acknowledgement for an item obtained via get();
# calling it before get() (as a "put finished" marker) misuses the API.
a.task_done()

多线程爬虫

线程1爬url并处理成真实url，放入队列urlqueue
线程2从urlqueue中取url爬对应信息
线程3判断程序是否完成，延迟60s执行
注意：这段代码在字符编码处理上还有问题，尚未解决。

import threading
import queue
import re
import urllib.request
import time
import urllib.error

# Queue through which the URL-producing thread hands article URLs to the
# content-fetching thread.
urlqueue = queue.Queue()

# (header name, header value) pair sent with every request made via the opener.
headers = (
    "User-Agent",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36",
)

# Install the opener globally so plain urllib.request.urlopen() calls use it.
opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)

# Accumulates, per search-result page, the list of article links found on it.
listurl = []
# Page download helper (originally meant to go through a proxy server).
def use_proxy(proxy_addr, url):
    """Download *url* and return its body decoded as UTF-8.

    Args:
        proxy_addr: "host:port" of an HTTP proxy. Currently NOT used: the
            proxy wiring below is commented out, so the request goes through
            whatever opener is installed globally.
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or None when the request or the decoding
        failed (the error is printed and the call pauses before returning).
    """
    try:
        # Proxy support is disabled; re-enable these lines to route via proxy_addr:
        # proxy = urllib.request.ProxyHandler({'http':proxy_addr})
        # opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
        # urllib.request.install_opener(opener)
        # The context manager closes the response even if read/decode raises.
        with urllib.request.urlopen(url) as response:
            return response.read().decode('utf-8')
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        # Back off before the caller issues its next request.
        time.sleep(10)
    except Exception as e:
        print("exception:" + str(e))
        time.sleep(1)
    # Explicit failure result so callers can test for it.
    return None

# Thread 1: collects article links from the search-result pages and turns them
# into real URLs.
class geturl(threading.Thread):
    """Producer thread: scrape search-result pages and enqueue article URLs.

    Args:
        key: Search keyword (URL-quoted before use).
        pagestart: First result page to fetch (inclusive).
        pageend: Last result page to fetch (inclusive).
        proxy: Proxy address forwarded to use_proxy().
        urlqueue: Queue that receives the cleaned article URLs.
    """

    def __init__(self, key, pagestart, pageend, proxy, urlqueue):
        super().__init__()
        self.pagestart = pagestart
        self.pageend = pageend
        self.proxy = proxy
        self.urlqueue = urlqueue
        self.key = key

    def run(self):
        """Fetch every result page, then enqueue the links found on each."""
        # URL-encode the search keyword.
        keycode = urllib.request.quote(self.key)
        # Compiled once; re.S lets ".*?" span the line breaks inside a result box.
        listurlpat = re.compile('<div class="txt-box">.*?(http://.*?)"', re.S)
        for page in range(self.pagestart, self.pageend+1):
            url = "http://weixin.sogou.com/weixin?type=2&query="+keycode+"&page="+str(page)
            data1 = use_proxy(self.proxy, url)
            if data1 is None:
                # The download failed (use_proxy reports the error itself);
                # skip the page instead of crashing findall() with a TypeError.
                continue
            # NOTE: listurl is the module-level list, so entries accumulate
            # across every geturl instance that runs.
            listurl.append(listurlpat.findall(data1))
        # Debug aid.
        print("获取到"+str(len(listurl))+"页")
        for i, pagelinks in enumerate(listurl):
            # Pause so the consumer thread can keep up.
            time.sleep(7)
            for j, link in enumerate(pagelinks):
                try:
                    # The scraped link carries HTML-escaped "&amp;" separators;
                    # dropping "amp;" yields the real URL.
                    link = link.replace("amp;","")
                    print("第"+str(i)+"i"+str(j)+"j次入队")
                    # No task_done() here: that call is the consumer's
                    # acknowledgement after get(), not part of enqueueing.
                    self.urlqueue.put(link)
                except Exception as e:
                    # No network I/O happens in this loop, so a single generic
                    # best-effort handler is enough.
                    print("exception:"+str(e))
                    time.sleep(1)

# Thread 2: runs alongside thread 1, taking the article URLs it provides one by
# one, downloading each article and saving the extracted information.
class getcontent(threading.Thread):
    """Consumer thread: download queued article URLs and save them as HTML.

    Args:
        urlqueue: Queue of article URLs to process.
        proxy: Proxy address; stored but not used by this class.

    Each successfully processed article is written to "../weixin/<n>.html",
    with n counting up from 1. A file is written only once its article has
    been fetched, so failed downloads leave no header-only stub behind.
    run() never returns: it blocks waiting for the next URL.
    """

    # Compiled once instead of per article.
    _TITLE_RE = re.compile("<title>(.*?)</title>")
    # re.S so the article body, which spans many lines, can match.
    _CONTENT_RE = re.compile('id="js_content">(.*?)id="js_sg_bar"', re.S)

    def __init__(self, urlqueue, proxy):
        super().__init__()
        self.urlqueue = urlqueue
        self.proxy = proxy

    @staticmethod
    def _format_article(data):
        """Return the HTML fragment (title + content) extracted from *data*."""
        title = getcontent._TITLE_RE.findall(data)
        content = getcontent._CONTENT_RE.findall(data)
        # Fall back to a placeholder when a part cannot be found.
        thistitle = title[0] if title else "此次没有获取到"
        thiscontent = content[0] if content else "此次没有获取到"
        return "<p>标题为："+thistitle+"</p><p>内容为："+thiscontent+"</p><br>"

    def run(self):
        """Process URLs from the queue forever, one output file per article."""
        html1='''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
        "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
        <html xmlns="http://www.w3.org/1999/xhtml">
        <head>
        <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
        <title>微信文章页面</title>
        </head>
        <body>'''
        html2 = '''</body></html>'''
        i = 1
        while True:
            # Blocking get(): sleeps until a URL is available instead of
            # spinning on empty() and burning a CPU core.
            url = self.urlqueue.get()
            try:
                with urllib.request.urlopen(url) as response:
                    data = response.read().decode('utf-8')
                dataall = self._format_article(data)
                # "with" closes the file even if a write fails.
                with open("../weixin/"+ str(i) +".html", "wb") as fh:
                    fh.write(html1.encode('utf-8'))
                    fh.write(dataall.encode('utf-8'))
                    fh.write(html2.encode('utf-8'))
                print("第"+str(i)+"个页面处理") # debug aid
                i += 1
            except urllib.error.URLError as e:
                if hasattr(e, "code"):
                    print(e.code)
                if hasattr(e, "reason"):
                    print(e.reason)
                time.sleep(10)
            except Exception as e:
                print("exception:" + str(e))
                time.sleep(1)

# Monitor thread: after each wait interval (60 s by default), an empty URL
# queue is taken to mean the crawl has finished.
class conrl(threading.Thread):
    """Periodically check the URL queue and report completion once it is empty.

    Args:
        urlqueue: The queue to watch.
        interval: Seconds to wait between checks (default 60).

    Note: finishing run() ends only this monitor thread. The previous exit()
    call had the same reach (in a thread, SystemExit stops just that thread)
    but additionally closed stdin and relied on the interactive helper
    installed by the site module.
    """

    def __init__(self, urlqueue, interval=60):
        super().__init__()
        self.urlqueue = urlqueue
        self.interval = interval

    def run(self):
        """Loop until a check finds the queue empty, then return."""
        while True:
            print("程序执行中")
            time.sleep(self.interval)
            if self.urlqueue.empty():
                print("程序执行完毕！")
                return


# Crawl settings: search keyword, proxy address and the result-page range.
key = "人工智能"
proxy = "101.236.35.98:8866"
proxy2 = ""
pagestart, pageend = 1, 2

# Start the URL producer, the content consumer and the monitor, in that order.
t1 = geturl(key=key, pagestart=pagestart, pageend=pageend, proxy=proxy, urlqueue=urlqueue)
t1.start()
t2 = getcontent(urlqueue=urlqueue, proxy=proxy)
t2.start()
t3 = conrl(urlqueue=urlqueue)
t3.start()

《精通python网络爬虫》学习笔记四——多线程爬虫

多线程爬虫

多线程基础

队列基础

多线程爬虫

猜你喜欢