Python爬虫---队列模拟递归遍历(广度遍历)

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/hiphopxiao/article/details/82633952

1.导入re模块:正则

2.导入urllib.request模块:爬虫

3.从collections模块导入deque类:双向队列

4.extend():列表末尾一次性追加另一个序列中的多个值

5.findall():相匹配的全部字串,返回形式为数组

6.compile():将正则表达式字符串编译为可复用的正则对象(Pattern),便于重复匹配

7.popleft():队列为先进先出(FIFO),popleft()弹出并返回队列最左端(最早加入)的元素

import urllib.request
import urllib
import re
from collections import deque
#http://bbs.tianya.cn/m/post-140-393974-4.shtml
#http://bbs.tianya.cn
#<a class="u-btn pre-btn" href="/m/post-140-393974-4.shtml"></a>
# 广度遍历(BFS)使用队列实现

def  geteveryurl(data):
    """Collect every URL found in the page text *data*.

    Returns the absolute "http://" links found by getallhttp(), followed by
    relative href targets resolved by getabsurl() against the hostname of
    the first absolute link.  Returns [] when no absolute link exists.
    """
    http_links = getallhttp(data)
    # Relative hrefs can only be resolved when at least one absolute
    # link is available to supply a hostname.
    relative_links = getabsurl(http_links[0], data) if http_links else []
    return http_links + relative_links


#<a class="u-btn pre-btn" href="/m/post-140-393974-4.shtml"></a>
def  getabsurl(url,data):
    """Extract relative href="..." targets from *data* and make them
    absolute using the hostname taken from *url*.

    Hrefs that are already absolute (contain "http://") or are javascript
    pseudo-links are skipped.  Returns [] when nothing matches or on error.
    """
    try:
        regex = re.compile(r"href=\"(.*?)\"", re.IGNORECASE)
        # BUG FIX: the original iterated a copy and called remove() once per
        # matching condition, so an href containing BOTH "http://" and
        # "javascript" was removed twice -- the second remove() raised
        # ValueError, the bare except swallowed it, and ALL links were lost.
        # A single-pass comprehension filters safely.
        httplist = [
            href for href in regex.findall(data)
            if "http://" not in href and "javascript" not in href
        ]
        hostname = gethostname(url)
        if hostname is not None:
            # NOTE(review): assumes each href is host-relative (starts with
            # "/"); plain relative paths would resolve incorrectly -- confirm.
            httplist = [hostname + href for href in httplist]
        return httplist
    except Exception:
        # Best-effort parsing: any failure yields an empty result.
        return []


#http://bbs.tianya.cn/post-140-393974-1.shtml'
#http://bbs.tianya.cn
def  gethostname(httpstr):
    """Return the scheme+host prefix of *httpstr*.

    Example: "http://bbs.tianya.cn/post-140-393974-1.shtml"
          -> "http://bbs.tianya.cn"

    Returns None when no "http://host/" prefix is present or on any error.
    """
    try:
        matches = re.findall(r"(http://\S*?)/", httpstr, re.IGNORECASE)
        return matches[0] if matches else None
    except:  # deliberately broad: any failure means "no hostname"
        return None


def  getallhttp(data):
    """Return every absolute "http://..." URL found in *data*.

    A URL is terminated by the first quote, '>' or ')' that follows it.
    Returns [] when nothing matches or on error.
    """
    try:
        # BUG FIX: the original class [\"|>|)] accidentally contained a
        # literal '|' (alternation has no meaning inside [...]), so URLs
        # were wrongly truncated at any '|' character.
        mailregex = re.compile(r"(http://\S*?)[\">)]", re.IGNORECASE)
        return mailregex.findall(data)
    except Exception:
        # Best-effort: treat any failure as "no links found".
        return []





def  getallemail(data):
    """Return every email address found in *data* ([] on error).

    Matches the common user@domain.tld shape, case-insensitively, with a
    2-4 letter top-level domain.
    """
    try:
        found = re.findall(
            r"([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4})",
            data,
            re.IGNORECASE,
        )
        return found
    except:  # deliberately broad: any failure means "no emails"
        return []




def  getdata(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    Returns "" on any failure (network error, HTTP error, decode error).

    BUG FIX: the original never closed the response object returned by
    urlopen(); the `with` block releases the connection deterministically.
    """
    try:
        with urllib.request.urlopen(url) as response:
            return response.read().decode("utf-8")
    except Exception:
        # Best-effort fetch: callers treat "" as "page unavailable".
        return ""



def  BFS(urlstr):
    """Breadth-first crawl starting at *urlstr*.

    For each page: prints its URL, prints every email address found on it,
    then enqueues every URL extracted from it that has not been seen before.

    BUG FIX: the original deduplicated only against the *pending* queue, so
    a URL already processed (popped) could be enqueued again and again --
    an infinite loop on any cyclic link graph.  A `seen` set now records
    every URL ever enqueued, which also replaces the O(n) queue membership
    scan with an O(1) set lookup.
    """
    urlqueue = deque([urlstr])   # FIFO frontier of pages to visit
    seen = {urlstr}              # every URL ever enqueued, visited or pending
    while urlqueue:
        url = urlqueue.popleft()         # oldest pending URL first (BFS order)
        print(url)
        pagedata = getdata(url)          # "" on fetch failure
        for email in getallemail(pagedata):
            print(email)
        for newurl in geteveryurl(pagedata):
            if newurl not in seen:
                seen.add(newurl)
                urlqueue.append(newurl)



# Guard the crawl behind the standard entry-point check so importing this
# module (e.g. to reuse the helper functions) does not start crawling.
if __name__ == "__main__":
    # BFS("http://bbs.tianya.cn/m/post-140-393974-5.shtml")
    BFS("http://www.baidu.com/")

猜你喜欢

转载自blog.csdn.net/hiphopxiao/article/details/82633952