Please credit this link when reposting.
This is a first pass at Python web crawling. The post goes straight to the code; basic HTTP and Python background is not covered here.
Environment: Ubuntu 14.04 + Python 3.4 + PyCharm
from urllib import request
import urllib.error
import requests
# import zhihuspider  # the author's own module; unused in this example, so commented out
# This function crawls with the urllib.request module
def urllibTest(head):
    try:
        # A proxy is configured here; note that only an http proxy is registered
        # If the password contains special characters such as @, they must be
        # percent-encoded, e.g. @ becomes %40
        proxyHandler = request.ProxyHandler({'http': 'http://username:1qaz%[email protected]:8080/'})
        opener = request.build_opener(proxyHandler)
        # opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19')]
        # opener.addheaders = [('user_agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')]
        request.install_opener(opener)
        # The URL must include the scheme, otherwise urllib rejects it as an unknown url type
        req = request.Request("http://www.baidu.com", headers=head)
        # response = opener.open(req)
        response = request.urlopen(req)
        html = response.read()
    except urllib.error.HTTPError as e:
        print(e)
    else:
        print(html.decode("utf8"))
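# Note (a sketch, not part of the original script): the ProxyHandler above only
# registers an 'http' key, so https:// URLs would bypass the proxy. To proxy
# https traffic through urllib as well, an 'https' entry can be added:
# proxyHandler = request.ProxyHandler({
#     'http': 'http://username:1qaz%[email protected]:8080/',
#     'https': 'http://username:1qaz%[email protected]:8080/',
# })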
# This function crawls with the requests module
def requestsTest(head):
    # data = {'word': 'asd'}
    # Note that proxies are supplied for both http and https here
    proxy = {'http': 'http://username:1qaz%[email protected]:8080/',
             'https': 'http://username:1qaz%[email protected]:8080/'}
    # A GET request; a params dict can also be passed to build the query string
    response = requests.get('https://www.zhihu.com/topic/19606591/hot', headers=head, proxies=proxy)
    print(response.url)
    print(response.text)
# Save the crawled page source to a file
def saveToFile(data):
    path = "/home/user/pythonspider/"
    with open(path + 'test', 'w') as f:
        f.write(data)
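# Example usage (a sketch; saveToFile is never called in the original script,
# and the head dict is assumed to be defined as in __main__ below):
# saveToFile(requests.get('https://www.baidu.com', headers=head).text)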
if __name__ == '__main__':
    # Set the User-Agent; some sites block requests whose User-Agent looks
    # invalid. Common User-Agent strings can be looked up here:
    # https://tool.lu/useragent
    head = {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0'
        #'Connection': 'keep-alive',
        #'Host': 'zhihu-web-analytics.zhihu.com:443',
        #'Proxy-Connection': 'keep-alive'
    }
    #urllibTest(head)
    requestsTest(head)
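The commented-out data dict in requestsTest hints at building a GET query string; in requests this is done with the params argument. A minimal sketch reusing that dict (httpbin.org is used here only because it echoes the request back, and head is the dict defined above; neither is part of the original script):
params = {'word': 'asd'}
response = requests.get('https://httpbin.org/get', params=params, headers=head)
print(response.url)  # the query string is appended automatically: ...?word=asd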