import re
import urllib
import urllib.request as request
def getlink(url):
    """Fetch *url* and return the distinct http(s) links found in its body.

    The request is sent with a desktop-browser ``User-Agent`` header. The
    opener carrying that header is also installed as urllib's global
    default, so later ``urllib.request.urlopen`` calls in the same process
    reuse it.

    Args:
        url: Address of the page to download.

    Returns:
        A list of ``(link, last_char)`` tuples, one per distinct match, in
        first-seen order. Element 0 is the full matched link; element 1 is
        the last character captured by the pattern's inner group (an empty
        string when that group matched nothing).

    Raises:
        Whatever the opener raises when the page cannot be retrieved,
        e.g. ``urllib.error.URLError``.
    """
    # Identify as a desktop browser.
    headers = (
        'User-Agent',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 '
        'SE 2.X MetaSr 1.0',
    )
    opener = request.build_opener()
    opener.addheaders = [headers]
    # Install the opener globally (kept so existing callers see no change).
    request.install_opener(opener)

    # The context manager closes the response even if reading fails.
    with request.urlopen(url) as response:
        raw = response.read()
        charset = response.headers.get_content_charset() or "utf-8"

    # Decode the bytes properly: str(bytes) would yield the "b'...'" repr,
    # in which newlines are a literal backslash-n and therefore no longer
    # terminate a link, and non-ASCII text turns into \x.. escapes.
    try:
        data = raw.decode(charset, errors="replace")
    except LookupError:
        # The server announced a charset name Python does not know.
        data = raw.decode("utf-8", errors="replace")

    # Raw string so that \s, \. and \w reach the regex engine untouched.
    pat = r'(https?://[^\s)";]+\.(\w|/)*)'
    matches = re.compile(pat).findall(data)

    # De-duplicate while keeping first-seen order (a set would shuffle it).
    return list(dict.fromkeys(matches))
# Demo run: fetch the CSDN blog front page and list every link found on it.
url = "http://blog.csdn.net/"
link = getlink(url)
for full_url, _ in link:
    # Each entry is a (full match, inner group) pair; only the URL is shown.
    print(full_url)
结果:
http://blog.csdn.net
https://avatar.csdn.net/5/1/1/1_liumiaocn.jpg
https://avatar.csdn.net/C/3/E/1_csdnnews.jpg
https://avatar.csdn.net/4/C/8/1_super828.jpg
https://avatar.csdn.net/5/4/0/1_qq_43168841.jpg
https://blog.csdn.net/turingbooks/article/details/82995901
https://avatar.csdn.net/2/8/1/1_qq_40196321.jpg
https://avatar.csdn.net/A/E/8/1_zwjweb.jpg
https://csdnimg.cn/feed/20181122/78125a13a79e15dfeb089c2c0148f79e.png
https://blog.csdn.net/n994298535/article/details/84451828
https://blog.csdn.net/valada/article/details/84660265
https://avatar.csdn.net/6/3/D/1_weixin_42882439.jpg
https://blog.csdn.net/u012999985/article/details/80877671
https://avatar.csdn.net/5/C/1/1_j_java1.jpg
https://blog.csdn.net/qq_43202482
https://csdnimg.cn/feed/20181009/d0cb3efb1bb3ea90705dcbefeb17884e.jpg
。。。
其中正则表达式匹配的链接基本格式为 http(s)://xxx.yyy/