python 爬虫urllib基础示例

环境:Python 3.5.2、urllib3 1.22(注意:下文示例实际使用的是标准库 urllib.request / urllib.parse / urllib.error,并不依赖 urllib3)

下载安装

# Download and unpack the Python 3.5.2 source tarball.
wget https://www.python.org/ftp/python/3.5.2/Python-3.5.2.tgz

tar -zxf Python-3.5.2.tgz

cd Python-3.5.2/

# Build and install under /usr/local/python.
./configure --prefix=/usr/local/python

make && make install

# Rename the existing /usr/bin/python, then point /usr/bin/python at the new python3.
# NOTE(review): other tools that invoke /usr/bin/python may expect the previous interpreter — confirm before running this on a shared host.
mv /usr/bin/python /usr/bin/python275

ln -s /usr/local/python/bin/python3 /usr/bin/python

# Download, unpack and install urllib3 1.22 from its source tarball.
wget https://files.pythonhosted.org/packages/ee/11/7c59620aceedcc1ef65e156cc5ce5a24ef87be4107c2b74458464e437a5d/urllib3-1.22.tar.gz

tar zxf urllib3-1.22.tar.gz 

cd urllib3-1.22/

# Installs using whichever interpreter `python` resolves to at this point.
python setup.py install


浏览器模拟示例

添加headers一:build_opener()
import urllib.request

# Fetch a page with a browser-like User-Agent set on an opener, then save the
# raw response bytes to disk.
url = "http://www.baidu.com"
headers = ("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36")
opener = urllib.request.build_opener()
# addheaders takes a list of (name, value) tuples.
opener.addheaders = [headers]
# The with-blocks close the response and the output file even if
# read() or write() raises.
with opener.open(url) as response:
    data = response.read()
with open("/home/urllib/test/1.html", "wb") as fl:
    fl.write(data)
添加headers二:add_header()
import urllib.request

# Fetch a page with a browser-like User-Agent attached to a Request object,
# then save the raw response bytes to disk.
url = "http://www.baidu.com"
req = urllib.request.Request(url)
req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36")
# The with-blocks close the response and the output file even if
# read() or write() raises.
with urllib.request.urlopen(req) as response:
    data = response.read()
with open("/home/urllib/test/2.html", "wb") as fl:
    fl.write(data)


增加超时设置

timeout超时
import urllib.request

# Request the page 99 times with a 1-second timeout and print either the body
# length or the error that occurred.
for i in range(1, 100):
    try:
        # Close each response as soon as its body is read so the loop does
        # not accumulate open connections.
        with urllib.request.urlopen("http://www.baidu.com", timeout=1) as file:
            data = file.read()
        print(len(data))
    except Exception as e:
        # Deliberately broad: the demo reports whatever went wrong and moves
        # on to the next attempt.
        print("出现异常---->" + str(e))
		


HTTP协议GET请求一

get请求
import urllib.request

# Issue a GET request with the keyword appended to the query string and save
# the raw response bytes to disk.
keywd = "hello"
url = "http://www.baidu.com/s?wd=" + keywd
req = urllib.request.Request(url)
req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36")
# The original applied a stray unary minus to the bytes result, which raises
# TypeError; read the body directly and let the with-blocks handle cleanup.
with urllib.request.urlopen(req) as response:
    data = response.read()
with open("/home/urllib/test/3.html", "wb") as fl:
    fl.write(data)

HTTP协议GET请求二

get请求 (编码)
import urllib.parse
import urllib.request

# Issue a GET request whose keyword contains non-ASCII characters: the keyword
# is percent-encoded before being appended to the query string.
keywd = "中国"
url = "http://www.baidu.com/s?wd="
# quote() is documented in urllib.parse; use it from there rather than through
# the name that urllib.request happens to import.
key_code = urllib.parse.quote(keywd)
url_all = url + key_code
req = urllib.request.Request(url_all)
req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36")
# The original applied a stray unary minus to the bytes result, which raises
# TypeError; read the body directly and let the with-blocks handle cleanup.
with urllib.request.urlopen(req) as response:
    data = response.read()
with open("/home/urllib/test/4.html", "wb") as fl:
    fl.write(data)


HTTP协议POST请求

post请求
import urllib.request
import urllib.parse

# Issue a POST request: the form fields are URL-encoded and converted to bytes,
# and passing them as the Request's data makes urllib send a POST.
url = "http://www.baidu.com/mypost/"
postdata = urllib.parse.urlencode({
    "user": "testname",
    "passwd": "123456",
}).encode('utf-8')
req = urllib.request.Request(url, postdata)
# The original called add_header on the undefined name `red` (NameError).
req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36")
# The with-blocks close the response and the output file even if
# read() or write() raises.
with urllib.request.urlopen(req) as response:
    data = response.read()
with open("/home/urllib/test/5.html", "wb") as fl:
    fl.write(data)


使用代理服务器

def use_proxy(proxy_addr, url):
    """Open *url* through an HTTP proxy and return the body as text.

    Args:
        proxy_addr: Proxy address used for the 'http' scheme, e.g. "host:port".
        url: The URL to fetch.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        UnicodeDecodeError: If the body is not valid UTF-8.

    Note:
        The proxy-enabled opener is installed globally via install_opener, so
        subsequent urllib.request.urlopen calls also go through it.
    """
    import urllib.request

    proxy = urllib.request.ProxyHandler({'http': proxy_addr})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    urllib.request.install_opener(opener)
    # Close the response once the body has been read instead of leaving it open.
    with urllib.request.urlopen(url) as response:
        return response.read().decode('utf-8')
# Fetch the page through the proxy and save it to disk.
proxy_addr = "201.25.210.23:7623"
url = "http://www.baidu.com"
data = use_proxy(proxy_addr, url)
# use_proxy returns str, but the file is opened in binary mode; writing the
# str directly raises TypeError, so encode it back to UTF-8 bytes first.
with open("/home/urllib/test/6.html", "wb") as fl:
    fl.write(data.encode("utf-8"))


开启DebugLog

import urllib.request

# Enable urllib's HTTP/HTTPS debug output, fetch a page, and save the raw
# response bytes to disk.
url = "http://www.baidu.com"
httpd = urllib.request.HTTPHandler(debuglevel=1)
httpsd = urllib.request.HTTPSHandler(debuglevel=1)
# Build the opener from the two debug-enabled handlers; the original passed
# `opener` to its own constructor, so the handlers were never used.
opener = urllib.request.build_opener(httpd, httpsd)
urllib.request.install_opener(opener)
# Read the body: the original wrote the response object itself, which
# file.write() rejects with TypeError.
with urllib.request.urlopen(url) as response:
    data = response.read()
with open("/home/urllib/test/7.html", "wb") as fl:
    fl.write(data)


URLError异常处理

URLError异常处理
import urllib.error
import urllib.request

# Open the page and, if urllib reports a URLError, print its reason.
target = "http://blog.csdn.net"
try:
    urllib.request.urlopen(target)
except urllib.error.URLError as err:
    print(err.reason)

HTTPError处理	
import urllib.error
import urllib.request

# Open the page and, if the server answers with an HTTP error, print the
# status code followed by the reason on separate lines.
target = "http://blog.csdn.net"
try:
    urllib.request.urlopen(target)
except urllib.error.HTTPError as err:
    print(err.code, err.reason, sep="\n")

结合使用
import urllib.error
import urllib.request

# Handle both error kinds in one place: HTTPError is a subclass of URLError,
# so a single handler can print the status code first when one is present and
# then the reason in either case.
target = "http://blog.csdn.net"
try:
    urllib.request.urlopen(target)
except urllib.error.URLError as err:
    if isinstance(err, urllib.error.HTTPError):
        print(err.code)
    print(err.reason)

推荐方法:
import urllib.error
import urllib.request

# Catch URLError and print whichever of `code` and `reason` the exception
# carries, in that order.
target = "http://blog.csdn.net"
try:
    urllib.request.urlopen(target)
except urllib.error.URLError as err:
    for attr in ("code", "reason"):
        if hasattr(err, attr):
            print(getattr(err, attr))
		


示例仅供参考


猜你喜欢

转载自blog.51cto.com/superleedo/2121859