Crawler: Link Depth

A small breadth-first crawler built on urllib: download() fetches a page and retries on server errors, get_links() pulls the href values out of the HTML, and link_crawler() follows links matching link_regex while treating pages that match depth_regex as leaves, so the crawl stops one level below the search-result pages.

from urllib.request import urlopen
from urllib.error import URLError,HTTPError
import re
import time

# Sample start URL, plus a test URL (httpstat.us/500) that always returns HTTP 500,
# useful for exercising the retry logic in download().
#url = 'http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E8%A5%BF%E5%AE%89&kw=python&sm=0&p=1'
#url = 'http://httpstat.us/500'
def download(url, retries_num=3):
    """Fetch url and return the decoded HTML, or None on failure.
    Retries up to retries_num times when the server answers with a 5xx error."""
    try:
        print('download... %s' % url)
        res = urlopen(url)
        html = res.read().decode('utf-8')
    except HTTPError as e:
        print(e.code)
        html = None
        if retries_num > 0:
            print('[E]HTTPError! retry times %d' % (4 - retries_num))
            # Only server errors (5xx) are worth retrying; a 4xx will not change.
            if 500 <= e.code < 600:
                html = download(url, retries_num - 1)
        else:
            print('[E]Failed!')
    except URLError as e:
        html = None
        print('[E]Unreachable URL!', url, e.reason)

    return html
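
A minimal way to exercise the retry branch is to point download() at the httpstat.us/500 test URL noted in the comments above. Assuming that service is reachable, it always answers with HTTP 500, so the call should print the retry messages and finally return None:

page = download('http://httpstat.us/500')
print(page is None)  # expected: True once the retries are exhausted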

def get_links(html):
    """Return every href value found in the <a> tags of an HTML page."""
    webpage_regex = re.compile(r'<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)
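
A quick, made-up illustration of what get_links() returns (the snippet below is invented for the example, not taken from the target site):

sample = '<a href="http://jobs.zhaopin.com/012345678901234.htm">job</a> <a href="/about">about</a>'
print(get_links(sample))  # ['http://jobs.zhaopin.com/012345678901234.htm', '/about']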

def link_crawler(home_url, link_regex, depth_regex=None):
    """Breadth-first crawl starting from home_url.
    Links matching link_regex are queued for crawling; pages whose URL matches
    depth_regex are downloaded but not expanded further, which caps the link depth."""
    crawl_queue = [home_url]
    seen = {home_url}  # remember the start page so it is not queued again
    while crawl_queue:
        url = crawl_queue.pop(0)
        time.sleep(0.5)  # be polite: throttle requests
        html = download(url)
        if html is None:  # download failed even after retries; nothing to parse
            continue
        if depth_regex and re.match(re.compile(depth_regex, re.IGNORECASE), url):
            continue  # leaf page: keep the URL, but do not follow its links
        for link in get_links(html):
            if re.match(re.compile(link_regex, re.IGNORECASE), link):
                if link not in seen:
                    crawl_queue.append(link)
                    seen.add(link)
    return seen
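
One design note on the queue: list.pop(0) shifts every remaining element on each pop, while collections.deque gives the same FIFO order with O(1) pops from the left. A minimal sketch with made-up URLs, separate from the crawler above:

from collections import deque

queue = deque(['http://example.com/a', 'http://example.com/b'])  # hypothetical stand-in for crawl_queue
queue.append('http://example.com/c')
print(queue.popleft())  # prints 'http://example.com/a', the oldest entry first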

def main():
    home_url = 'http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E8%A5%BF%E5%AE%89&kw=python&p=1&isadv=0'
    # Raw strings so the \d escapes reach the regex engine untouched.
    # link_regex accepts both job-detail pages and further search-result pages; the &amp; is
    # deliberate, because that is how the ampersand appears in the page's href attributes.
    link_regex = r'http://jobs.zhaopin.com/[\d]{15}\.htm|http://sou.zhaopin.com/jobs/searchresult\.ashx\?jl=%e8%a5%bf%e5%ae%89&amp;kw=python'
    # Job-detail pages are leaves: they are downloaded but their links are not followed.
    depth_regex = r'http://jobs.zhaopin.com/[\d]{15}\.htm'
    res = link_crawler(home_url=home_url, link_regex=link_regex, depth_regex=depth_regex)
    print('%d links collected' % len(res))


if __name__ == '__main__':
    main()







Reposted from blog.csdn.net/ywf331/article/details/79688926