版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/hiphopxiao/article/details/82633952
1.导入re模块:正则
2.导入urllib.request模块:爬虫
3.导入deque模块:双向队列
4.extend():列表末尾一次性追加另一个序列中的多个值
5.findall():相匹配的全部字串,返回形式为数组
6.compile():将正则表达式字符串编译为正则模式对象,便于重复使用(并非"编译为字节")
7.popleft():从双向队列左端弹出元素;配合 append() 在右端插入,实现先进先出(FIFO)队列
import urllib.request
import urllib
import re
from collections import deque
#http://bbs.tianya.cn/m/post-140-393974-4.shtml
#http://bbs.tianya.cn
#<a class="u-btn pre-btn" href="/m/post-140-393974-4.shtml"></a>
#广度遍历使用队列
def geteveryurl(data):
    """Extract every crawlable URL from a page's HTML source.

    data: HTML source text of one page.
    Returns the absolute http(s) links found in *data*, followed by
    the page's relative hrefs resolved against the host of the first
    absolute link.  Returns [] when no absolute link is present
    (there is then no host to resolve relative links against).
    """
    absolute = getallhttp(data)
    if not absolute:
        # Original extended an empty list with two empty lists here;
        # the result was [] either way.
        return []
    # Use the first absolute link's host as the base for relative hrefs.
    return absolute + getabsurl(absolute[0], data)
#<a class="u-btn pre-btn" href="/m/post-140-393974-4.shtml"></a>
def getabsurl(url, data):
    """Resolve the relative hrefs in *data* against the host of *url*.

    url:  a page URL whose scheme+host will prefix each relative link.
    data: HTML source text.
    Returns a list of absolutized URLs.  If no host can be extracted
    from *url* the relative hrefs are returned unchanged (original
    behavior).  Returns [] on any unexpected error — this crawler is
    deliberately best-effort.
    """
    try:
        href_re = re.compile("href=\"(.*?)\"", re.IGNORECASE)
        # Keep only genuinely relative links.  The original removed
        # matches from the list while iterating a copy, which raised a
        # (silently swallowed) ValueError when an href contained both
        # filtered substrings; a comprehension avoids that.  "://" also
        # excludes https:// and other absolute schemes that the old
        # "http://" test missed, and "//" catches protocol-relative
        # hrefs — prefixing either with the host produced broken URLs.
        relative = [
            href for href in href_re.findall(data)
            if "://" not in href
            and not href.startswith("//")
            and "javascript" not in href
        ]
        hostname = gethostname(url)
        if hostname is None:
            return relative
        return [hostname + href for href in relative]
    except Exception:
        # Preserve the original contract: never let a bad page stop the crawl.
        return []
#http://bbs.tianya.cn/post-140-393974-1.shtml'
#http://bbs.tianya.cn
def gethostname(httpstr):
    """Return the scheme+host prefix of *httpstr*, or None.

    e.g. "http://bbs.tianya.cn/post-140-1.shtml" -> "http://bbs.tianya.cn"

    httpstr: a URL-bearing string.
    Returns None when no host can be found or the input is not a string.
    """
    try:
        # Accept https as well as http (the original matched http only,
        # so every https page lost its relative links).
        host_re = re.compile(r"(https?://\S*?)/", re.IGNORECASE)
        matches = host_re.findall(httpstr)
        return matches[0] if matches else None
    except TypeError:
        # non-string input (e.g. None) — keep the old "return None" contract
        return None
def getallhttp(data):
    """Return every absolute http(s) URL found in *data*.

    data: HTML source text.
    Each URL runs up to (but not including) a terminating quote, '|',
    '>' or ')'.  NOTE: inside a character class '|' is a literal, not
    alternation — the original class ["|>|)] meant {", |, >, )}; kept
    here (deduplicated) so URLs truncate identically.  https added
    alongside http.  Returns [] for non-string input.
    """
    try:
        url_re = re.compile(r"(https?://\S*?)[\"|>)]", re.IGNORECASE)
        return url_re.findall(data)
    except TypeError:
        return []
def getallemail(data):
    """Return every e-mail address found in *data*.

    data: page text to scan.
    Matches user@host.tld with a 2-4 letter TLD (the original pattern;
    longer TLDs such as .museum are not matched).  Returns [] for
    non-string input instead of swallowing every exception with a
    bare except.
    """
    try:
        mail_re = re.compile(r"([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4})", re.IGNORECASE)
        return mail_re.findall(data)
    except TypeError:
        return []
def getdata(url):
    """Download *url* and return its body decoded as UTF-8.

    url: the page URL to fetch.
    Returns "" on any failure (bad URL, network error, non-UTF-8
    content) so the crawl keeps going — callers rely on this.
    """
    try:
        # timeout: without it one unresponsive host hangs the whole
        # crawl; the context manager closes the connection on all paths.
        with urllib.request.urlopen(url, timeout=10) as response:
            return response.read().decode("utf-8")
    except Exception:
        return ""  # best-effort: failure is reported as an empty page
def BFS(urlstr):
    """Breadth-first crawl starting from *urlstr*.

    Prints every URL visited and every e-mail address found on the way.
    urlstr: the seed URL.
    Runs until the frontier is exhausted (which, on a real site, may be
    effectively forever).
    """
    visited = {urlstr}           # every URL ever enqueued
    urlqueue = deque([urlstr])   # FIFO frontier: popleft + append = BFS
    while urlqueue:
        url = urlqueue.popleft()
        print(url)
        pagedata = getdata(url)              # "" on fetch failure
        for email in getallemail(pagedata):
            print(email)
        for newurl in geteveryurl(pagedata):
            # BUG FIX: the original tested `not in urlqueue`, so a URL
            # already popped could be enqueued again — on any link cycle
            # the crawl revisited pages forever.  A visited set also
            # turns the O(n) queue membership test into O(1).
            if newurl not in visited:
                visited.add(newurl)
                urlqueue.append(newurl)
#BFS("http://bbs.tianya.cn/m/post-140-393974-5.shtml")
if __name__ == "__main__":
    # Guard the crawl so importing this module has no side effects.
    BFS("http://www.baidu.com/")