from urllib.request import urlopen
from urllib.error import URLError,HTTPError
import re
import time
#url = 'http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E8%A5%BF%E5%AE%89&kw=python&sm=0&p=1'
#url = 'http://httpstat.us/500'
def download(url, retries_num=3):
    """Fetch *url* and return its body decoded as UTF-8.

    Retries up to *retries_num* additional times when the server answers
    with a 5xx status (server-side error). Returns ``None`` when the
    download ultimately fails or the URL cannot be reached.
    """
    try:
        print('download... %s' % url)
        # Context manager closes the response even if read/decode raises.
        with urlopen(url) as res:
            html = res.read().decode('utf-8')
    except HTTPError as e:
        print(e.code)
        html = None
        if retries_num > 0:
            # Only server-side (5xx) errors are worth retrying; 4xx are
            # client errors and will not go away on a retry.
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # Remaining retry budget, correct for any caller-supplied value
                # (the original hardcoded `4 - retries_num`, assuming 3).
                print('[E]HTTPError!,retry times %d' % retries_num)
                html = download(url, retries_num - 1)
        else:
            print('[E]Failed!')
    except URLError as e:
        html = None
        print('[E]Unlocated URL!', url)
    return html
def get_links(html):
    """Return the href target of every anchor tag found in *html*."""
    # Case-insensitive scan for <a ... href="..."> / <a ... href='...'>.
    return re.findall(r'<a[^>]+href=["\'](.*?)["\']', html, flags=re.IGNORECASE)
def link_crawler(home_url, link_regex, depth_regex=None):
    """Breadth-first crawl starting from *home_url*.

    Follows every link matching *link_regex*. Pages whose URL matches
    *depth_regex* are downloaded but treated as leaves: their links are
    not followed. Returns the set of discovered (followed) links.
    """
    # Compile once, not on every loop iteration.
    link_pat = re.compile(link_regex, re.IGNORECASE)
    depth_pat = re.compile(depth_regex, re.IGNORECASE) if depth_regex else None
    crawl_queue = [home_url]
    seen = set()
    while crawl_queue:
        url = crawl_queue.pop(0)
        time.sleep(0.5)  # throttle so we don't hammer the server
        html = download(url)
        if html is None:
            # Download failed; the original passed None straight into
            # get_links() and crashed with a TypeError.
            continue
        if depth_pat and depth_pat.match(url):
            continue  # leaf page: content fetched, links not followed
        for link in get_links(html):
            if link_pat.match(link) and link not in seen:
                seen.add(link)
                crawl_queue.append(link)
    return seen
def main():
    """Crawl Zhaopin search results for python jobs in Xi'an (西安)."""
    home_url = 'http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E8%A5%BF%E5%AE%89&kw=python&p=1&isadv=0'
    # Raw strings: '\d' in a plain string is an invalid escape
    # (SyntaxWarning on modern Python); the pattern value is unchanged.
    link_regex = r'http://jobs.zhaopin.com/[\d]{15}.htm|http://sou.zhaopin.com/jobs/searchresult.ashx\?jl=%e8%a5%bf%e5%ae%89&kw=python'
    # Job-detail pages are leaves: fetched but their links not followed.
    depth_regex = r'http://jobs.zhaopin.com/[\d]{15}.htm'
    link_crawler(home_url=home_url, link_regex=link_regex, depth_regex=depth_regex)
# Guard the entry point so importing this module does not start a crawl.
if __name__ == '__main__':
    main()
# 爬虫-链接深度 — crawler with link-depth control
# Adapted from: blog.csdn.net/ywf331/article/details/79688926
# (Remaining lines were CSDN page chrome accidentally pasted in:
#  猜你喜欢 / 今日推荐 / 周排行 — not part of the program.)