链接爬虫实战

import re
import urllib
import urllib.request as request

def getlink(url):
    """Fetch *url* and return the unique links found in the response body.

    Args:
        url: Address of the page to download; anything accepted by
            ``urllib.request.urlopen``.

    Returns:
        A list of 2-tuples, one per distinct regex match, in order of first
        appearance. Element 0 is the full matched URL; element 1 is the last
        character consumed by the trailing ``(\\w|/)*`` group (``''`` when
        that group matched nothing).

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the request
            fails.
    """
    # Present a browser User-Agent instead of urllib's default one.
    headers = ('User-Agent','Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0')
    opener = request.build_opener()
    opener.addheaders = [headers]

    # Install the opener globally so request.urlopen() below uses it.
    # NOTE: this also affects every later urlopen() call in the process.
    request.install_opener(opener)

    # The context manager closes the response even if read() raises.
    with request.urlopen(url) as response:
        raw = response.read()
        charset = response.headers.get_content_charset() or 'utf-8'

    # Decode the bytes into real text. str(bytes) would yield the "b'...'"
    # repr, in which newlines are the two characters "\n" and therefore no
    # longer terminate a URL for the [^\s...] class below.
    try:
        data = raw.decode(charset, errors='replace')
    except LookupError:
        # The server advertised a codec name Python does not know.
        data = raw.decode('utf-8', errors='replace')

    # Scheme, then everything up to whitespace / ')' / '"' / ';', which must
    # contain a dot followed by optional word characters or slashes.
    pat = r'(https?://[^\s)";]+\.(\w|/)*)'
    matches = re.compile(pat).findall(data)

    # De-duplicate while keeping first-seen order (dicts preserve insertion
    # order), so the result is deterministic from run to run.
    return list(dict.fromkeys(matches))

# Demo driver: crawl the CSDN blog front page and print each link found.
url = "http://blog.csdn.net/"
link = getlink(url)
# Each entry is a (full_url, trailing_group) pair; only the URL is printed.
for matched_url, _trailing in link:
    print(matched_url)

结果:

http://blog.csdn.net
https://avatar.csdn.net/5/1/1/1_liumiaocn.jpg
https://avatar.csdn.net/C/3/E/1_csdnnews.jpg
https://avatar.csdn.net/4/C/8/1_super828.jpg
https://avatar.csdn.net/5/4/0/1_qq_43168841.jpg
https://blog.csdn.net/turingbooks/article/details/82995901
https://avatar.csdn.net/2/8/1/1_qq_40196321.jpg
https://avatar.csdn.net/A/E/8/1_zwjweb.jpg
https://csdnimg.cn/feed/20181122/78125a13a79e15dfeb089c2c0148f79e.png
https://blog.csdn.net/n994298535/article/details/84451828
https://blog.csdn.net/valada/article/details/84660265
https://avatar.csdn.net/6/3/D/1_weixin_42882439.jpg
https://blog.csdn.net/u012999985/article/details/80877671
https://avatar.csdn.net/5/C/1/1_j_java1.jpg
https://blog.csdn.net/qq_43202482
https://csdnimg.cn/feed/20181009/d0cb3efb1bb3ea90705dcbefeb17884e.jpg
。。。

其中正则表达式匹配的链接基本格式为 http://xxx.yyy/ 或 https://xxx.yyy/



 

猜你喜欢

转载自blog.csdn.net/qq_41359265/article/details/84674707