Please credit this link when reposting.
This is a first pass at Python web crawling. The post goes straight to the code; basic HTTP and Python background is not covered here.
Environment: Ubuntu 14.04 + Python 3.4 + PyCharm
from urllib import request
import urllib.error
import requests
# import zhihuspider  # the author's own module; unused in this example, so commented out
# This function crawls with the urllib.request module
def urllibTest(head):
    try:
        # A proxy is configured here; note that only an http proxy is registered
        # If the password contains special characters such as @, they must be
        # percent-encoded, e.g. @ becomes %40
        proxyHandler = request.ProxyHandler({'http': 'http://username:1qaz%[email protected]:8080/'})
        opener = request.build_opener(proxyHandler)
        # opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19')]
        # opener.addheaders = [('user_agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')]
        request.install_opener(opener)
        # The URL must include the scheme, otherwise urllib rejects it as an unknown url type
        req = request.Request("http://www.baidu.com", headers=head)
        # response = opener.open(req)
        response = request.urlopen(req)
        html = response.read()
    except urllib.error.HTTPError as e:
        print(e)
    else:
        print(html.decode("utf8"))
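# Note (a sketch, not part of the original script): the ProxyHandler above only
# registers an 'http' key, so https:// URLs would bypass the proxy. To proxy
# https traffic through urllib as well, an 'https' entry can be added:
# proxyHandler = request.ProxyHandler({
#     'http': 'http://username:1qaz%[email protected]:8080/',
#     'https': 'http://username:1qaz%[email protected]:8080/',
# })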
# This function crawls with the requests module
def requestsTest(head):
    # data = {'word': 'asd'}
    # Note that proxies are supplied for both http and https here
    proxy = {'http': 'http://username:1qaz%[email protected]:8080/',
             'https': 'http://username:1qaz%[email protected]:8080/'}
    # A GET request; a params dict can also be passed to build the query string
    response = requests.get('https://www.zhihu.com/topic/19606591/hot', headers=head, proxies=proxy)
    print(response.url)
    print(response.text)
# Save the crawled page source to a file
def saveToFile(data):
    path = "/home/user/pythonspider/"
    with open(path + 'test', 'w') as f:
        f.write(data)
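# Example usage (a sketch; saveToFile is never called in the original script,
# and the head dict is assumed to be defined as in __main__ below):
# saveToFile(requests.get('https://www.baidu.com', headers=head).text)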
if __name__ == '__main__':
    # Set the User-Agent; some sites block requests whose User-Agent looks
    # invalid. Common User-Agent strings can be looked up here:
    # https://tool.lu/useragent
    head = {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0'
        #'Connection': 'keep-alive',
        #'Host': 'zhihu-web-analytics.zhihu.com:443',
        #'Proxy-Connection': 'keep-alive'
    }
    #urllibTest(head)
    requestsTest(head)
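The commented-out data dict in requestsTest hints at building a GET query string; in requests this is done with the params argument. A minimal sketch reusing that dict (httpbin.org is used here only because it echoes the request back, and head is the dict defined above; neither is part of the original script):
params = {'word': 'asd'}
response = requests.get('https://httpbin.org/get', params=params, headers=head)
print(response.url)  # the query string is appended automatically: ...?word=asd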