Notes on "Web Scraping with Python", Chapter 3: Starting to Crawl

The chapter's running example below performs a random walk across the web: starting from one site, it collects the external links on the current page and hops to a randomly chosen one, repeating indefinitely.

from urllib.request import urlopen
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import re
import random
import datetime

pages = set()  # visited pages (not used in this snippet; kept from the book's fuller example)
# Seed with the current time so every run takes a different walk;
# Python 3.11+ requires an int/float/str seed, hence the timestamp
random.seed(datetime.datetime.now().timestamp())

# Retrieve a list of all internal links found on a page
def getInternalLinks(bsObj, includeUrl):
    internalLinks = []
    # Find all links that begin with "/" or contain the site's own URL
    for link in bsObj.findAll("a", href=re.compile("^(/|.*"+includeUrl+")")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                internalLinks.append(link.attrs['href'])

    return internalLinks

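As a quick check of the pattern above, here is a tiny, made-up HTML snippet run through getInternalLinks (the markup and the example.com domain are hypothetical):

sampleHtml = """
<a href="/about">About</a>
<a href="http://example.com/blog">Blog</a>
<a href="http://other.com/">Elsewhere</a>
"""
sampleBs = BeautifulSoup(sampleHtml, "html.parser")
# Catches the relative link and the absolute example.com link,
# but not the link pointing at other.com
print(getInternalLinks(sampleBs, "example.com"))
# ['/about', 'http://example.com/blog']
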
# Retrieve a list of all external links found on a page
def getExternalLinks(bsObj, excludeUrl):
    externalLinks = []
    # Find all links that start with "http" or "www" and do not
    # contain the current domain anywhere in the URL
    for link in bsObj.findAll("a",
                              href=re.compile("^(http|www)((?!"+excludeUrl+").)*$")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in externalLinks:
                externalLinks.append(link.attrs['href'])

    return externalLinks

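The external-link regex is the trickier of the two: the negative lookahead in ((?!...).)* lets the pattern reach the end of the string only if the excluded domain never appears. A small sanity check, assuming the walk started on oreilly.com:

pattern = re.compile("^(http|www)((?!oreilly.com).)*$")
print(bool(pattern.search("http://example.com/page")))  # True:  external
print(bool(pattern.search("http://www.oreilly.com/")))  # False: same site
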
# Strip the scheme and split on "/"; element 0 is the bare domain,
# e.g. splitAddress("http://oreilly.com/about") -> ['oreilly.com', 'about']
def splitAddress(address):
    addressParts = address.replace("http://", "").split("/")
    return addressParts

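Note that splitAddress only strips the literal prefix "http://", so an https URL comes back with "https:" as its first element instead of the domain. A more robust sketch uses urllib.parse.urlparse from the standard library:

from urllib.parse import urlparse

print(splitAddress("https://oreilly.com/about")[0])  # 'https:' -- not a domain
print(urlparse("https://oreilly.com/about").netloc)  # 'oreilly.com'
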
def getRandomExternalLink(startingPage):
    html = urlopen(startingPage)
    bsObj = BeautifulSoup(html, "html.parser")
    externalLinks = getExternalLinks(bsObj, splitAddress(startingPage)[0])
    if len(externalLinks) == 0:
        # No external links on this page: pick a random internal link
        # (resolved to an absolute URL) and search again from there
        internalLinks = getInternalLinks(bsObj, splitAddress(startingPage)[0])
        return getRandomExternalLink(urljoin(startingPage,
                internalLinks[random.randint(0, len(internalLinks)-1)]))
    else:
        return externalLinks[random.randint(0, len(externalLinks)-1)]

# Print a random external link, then keep walking from that link
def followExternalOnly(startingSite):
    externalLink = getRandomExternalLink(startingSite)
    print("Random external link is: " + externalLink)
    followExternalOnly(externalLink)

followExternalOnly("http://oreilly.com")
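
Because followExternalOnly calls itself on every hop, a long run will eventually hit Python's default recursion limit (roughly 1000 frames), or die on the first page that fails to open. A minimal loop-based sketch that bounds the walk (followExternalOnlyLoop and maxHops are my own names, not from the book):

def followExternalOnlyLoop(startingSite, maxHops=50):
    # Iterative walk: no recursion limit, stops after maxHops pages
    site = startingSite
    for _ in range(maxHops):
        site = getRandomExternalLink(site)
        print("Random external link is: " + site)

followExternalOnlyLoop("http://oreilly.com")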

Reposted from www.cnblogs.com/chengchengaqin/p/9508860.html