Python Crawler (Part 1) | urllib

urllib contains four modules:

  • request: sends HTTP requests
  • error: exception handling module
  • parse: a utility module for URL handling
  • robotparser: parses robots.txt files (a small sketch follows this list)
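
As a quick illustration of the last module (it is not covered again in this post), here is a minimal sketch using urllib.robotparser; python.org is just an assumed example target:

from urllib.robotparser import RobotFileParser

# point the parser at a site's robots.txt and download it
rp = RobotFileParser()
rp.set_url('https://www.python.org/robots.txt')
rp.read()
# can_fetch() reports whether the given user agent may crawl the given URL
print(rp.can_fetch('*', 'https://www.python.org/'))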

1. Sending requests: urllib.request

import urllib.request
# send the request
response = urllib.request.urlopen(url='http://python.org')
print(type(response))
# print(response.read().decode('utf-8'))
print(response.status)
print(response.getheaders())


<class 'http.client.HTTPResponse'>
200
[('Server', 'nginx'), ('Content-Type', 'text/html; charset=utf-8'), ('X-Frame-Options', 'DENY'), ('Via', '1.1 vegur'), ('Via', '1.1 varnish'), ('Content-Length', '48820'), ('Accept-Ranges', 'bytes'), ('Date', 'Sat, 23 Mar 2019 09:26:41 GMT'), ('Via', '1.1 varnish'), ('Age', '970'), ('Connection', 'close'), ('X-Served-By', 'cache-iad2140-IAD, cache-hnd18744-HND'), ('X-Cache', 'HIT, HIT'), ('X-Cache-Hits', '2, 184'), ('X-Timer', 'S1553333202.865127,VS0,VE0'), ('Vary', 'Cookie'), ('Strict-Transport-Security', 'max-age=63072000; includeSubDomains')]



urlopen() parameters:
                    url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,*,
                    cafile=None, capath=None, cadefault=False, context=None
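
The cafile, capath, cadefault and context parameters control SSL certificate handling. They are not used again in this post, but as a minimal sketch, a custom ssl context can be passed like this (assuming the default certificate store is sufficient):

import ssl
import urllib.request

# build a default SSL context and hand it to urlopen via the context parameter
context = ssl.create_default_context()
response = urllib.request.urlopen('https://www.python.org', context=context)
print(response.status)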

     1.1 data

# adding the data parameter turns this into a POST request
# data must be converted to a bytes object
data = bytes(urllib.parse.urlencode({'name':'tom','age':17}),encoding='utf-8')
response = urllib.request.urlopen(url='http://httpbin.org/post',data=data)
print(response.read().decode('utf-8'))


{
  "args": {}, 
  "data": "", 
  "files": {}, 
  "form": {
    "age": "17", 
    "name": "tom"
  }, 
  "headers": {
    "Accept-Encoding": "identity", 
    "Content-Length": "15", 
    "Content-Type": "application/x-www-form-urlencoded", 
    "Host": "httpbin.org", 
    "User-Agent": "Python-urllib/3.7"
  }, 
  "json": null, 
  "origin": "124.234.225.3, 124.234.225.3", 
  "url": "https://httpbin.org/post"
}

    1.2 timeout: sets a timeout for the request; if the time limit is exceeded, urllib.error.URLError: <urlopen error timed out> is raised (a sketch of handling this exception follows the traceback below)

#timeout
response = urllib.request.urlopen(url='http://httpbin.org/get',timeout=0.1)
print(response.read().decode('utf-8'))


During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:/Users/404NoFound/Desktop/workhose/test/3_23urllib/urllib_t.py", line 19, in <module>
    response = urllib.request.urlopen(url='http://httpbin.org/get',timeout=0.1)
  File "C:\Users\404NoFound\AppData\Local\Programs\Python\Python37\lib\urllib\request.py", line 222, in urlopen
    return opener.open(url, data, timeout)
  File "C:\Users\404NoFound\AppData\Local\Programs\Python\Python37\lib\urllib\request.py", line 1345, in http_open
    return self.do_open(http.client.HTTPConnection, req)
  File "C:\Users\404NoFound\AppData\Local\Programs\Python\Python37\lib\urllib\request.py", line 1319, in do_open
    raise URLError(err)
urllib.error.URLError: <urlopen error timed out>
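
To handle the timeout instead of letting the traceback propagate, a minimal sketch is to catch URLError and check whether the underlying reason is a socket timeout:

import socket
import urllib.error
import urllib.request

try:
    response = urllib.request.urlopen(url='http://httpbin.org/get', timeout=0.1)
    print(response.read().decode('utf-8'))
except urllib.error.URLError as e:
    # urlopen wraps the low-level socket timeout in a URLError
    if isinstance(e.reason, socket.timeout):
        print('request timed out')
    else:
        raise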

2. urllib.request.Request: builds more complex requests (setting proxies, adding request headers, ...)

Request parameters:

  • url
  • data=None: must be a bytes object; used to submit form data
  • headers={}: request headers; can be passed here directly or added later with add_header() (see the sketch after the example output below)
  • origin_req_host=None: the host name or IP address of the party originating the request
  • unverifiable=False: indicates whether the request is unverifiable, i.e. the user has no opportunity to approve it
  • method=None: the HTTP method to use, e.g. GET, POST, PUT
# build request headers with Request
# Request parameters: url, data=None, headers={}, origin_req_host=None, unverifiable=False, method=None
data = bytes(urllib.parse.urlencode({'name':'tom','age':17}),encoding='utf-8')
headers={
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36'
}
req = urllib.request.Request(url='http://httpbin.org/post',data=data,headers=headers,method='POST')
response = urllib.request.urlopen(req)
print(response.read().decode('utf-8'))



{
  "args": {}, 
  "data": "", 
  "files": {}, 
  "form": {
    "age": "17", 
    "name": "tom"
  }, 
  "headers": {
    "Accept-Encoding": "identity", 
    "Content-Length": "15", 
    "Content-Type": "application/x-www-form-urlencoded", 
    "Host": "httpbin.org", 
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36"
  }, 
  "json": null, 
  "origin": "124.234.225.3, 124.234.225.3", 
  "url": "https://httpbin.org/post"
}
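
As mentioned in the parameter list above, headers can also be attached after constructing the Request. A minimal sketch, reusing the same form data:

import urllib.parse
import urllib.request

data = bytes(urllib.parse.urlencode({'name': 'tom', 'age': 17}), encoding='utf-8')

# build the Request first, then attach the header with add_header()
req = urllib.request.Request(url='http://httpbin.org/post', data=data, method='POST')
req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36')
response = urllib.request.urlopen(req)
print(response.read().decode('utf-8'))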

3. Advanced usage: Handlers

1. Adding a proxy

# advanced usage: add a proxy, handle cookies
from urllib.request import ProxyHandler, build_opener
from urllib.error import URLError

proxy_handle = ProxyHandler({
    'http':'http://110.52.235.163:9999',
    'https':'https://110.52.235.163:9999',
})
opener = build_opener(proxy_handle)
try:
    response = opener.open('http://httpbin.org/get')
    print(response.read().decode('utf-8'))
except URLError as e:
    print('error:', e.reason)


{
  "args": {}, 
  "headers": {
    "Accept-Encoding": "identity", 
    "Cache-Control": "max-age=259200", 
    "Host": "httpbin.org", 
    "User-Agent": "Python-urllib/3.7"
  }, 
  "origin": "110.52.235.163, 110.52.235.163", 
  "url": "https://httpbin.org/get"
}

2. Cookies

  • Retrieve cookies and print them
# capture cookies with a CookieJar
import http.cookiejar,urllib.request
cookie = http.cookiejar.CookieJar()
handle = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handle)
response = opener.open('https://baidu.com')
for item in cookie:
    print(item.name+'='+item.value)



BAIDUID=78E646426124E1F7E5A8F76BC7DA9EFE:FG=1
BIDUPSID=78E646426124E1F7E5A8F76BC7DA9EFE
H_PS_PSSID=1469_21119_28721_28558_28697_28584_28604_20719
PSTM=1553336743
delPer=0
BDSVRTM=0
BD_HOME=0
  • Save the cookies in the Mozilla browser cookie file format
cookie = http.cookiejar.MozillaCookieJar('cookies.txt')
handle = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handle)
response = opener.open('https://baidu.com')
print(response.status)
cookie.save(ignore_discard=True,ignore_expires=True)



# Netscape HTTP Cookie File
# http://curl.haxx.se/rfc/cookie_spec.html
# This is a generated file!  Do not edit.

.baidu.com	TRUE	/	FALSE	3700820687	BAIDUID	1B02BEC8A89901648C1A184BBF5A9BB7:FG=1
.baidu.com	TRUE	/	FALSE	3700820687	BIDUPSID	1B02BEC8A89901648C1A184BBF5A9BB7
.baidu.com	TRUE	/	FALSE		H_PS_PSSID	1425_21127_28722_28558_28697_28585_28641_26350_28603_28625_28606
.baidu.com	TRUE	/	FALSE	3700820687	PSTM	1553337036
.baidu.com	TRUE	/	FALSE		delPer	0
www.baidu.com	FALSE	/	FALSE		BDSVRTM	0
www.baidu.com	FALSE	/	FALSE		BD_HOME	0

ignore_discard: save even cookies that are set to be discarded.

ignore_expires: save even cookies that have already expired. The file is overwritten if it already exists.

In other words, ignore_discard means a cookie is kept even if it is marked to be discarded, and ignore_expires means expired cookies are still written out, overwriting the file if it already exists. Here we set both to True. After running, the cookies are saved to cookies.txt, whose contents are shown above.

  • Save the cookies in the LWP cookie file format
cookie = http.cookiejar.LWPCookieJar('cookies2.txt')
handle = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handle)
response = opener.open('https://baidu.com')
print(response.status)
cookie.save(ignore_discard=True,ignore_expires=True)


#LWP-Cookies-2.0
Set-Cookie3: BAIDUID="5BCF4AA8ED0DC6EF39DE705D5FBF8CFC:FG=1"; path="/"; domain=".baidu.com"; path_spec; domain_dot; expires="2087-04-10 13:54:10Z"; version=0
Set-Cookie3: BIDUPSID=5BCF4AA8ED0DC6EF39DE705D5FBF8CFC; path="/"; domain=".baidu.com"; path_spec; domain_dot; expires="2087-04-10 13:54:10Z"; version=0
Set-Cookie3: H_PS_PSSID=1465_21125_28721_28558_28697_28585_28518_28625_28606; path="/"; domain=".baidu.com"; path_spec; domain_dot; discard; version=0
Set-Cookie3: PSTM=1553337599; path="/"; domain=".baidu.com"; path_spec; domain_dot; expires="2087-04-10 13:54:10Z"; version=0
Set-Cookie3: delPer=0; path="/"; domain=".baidu.com"; path_spec; domain_dot; discard; version=0
Set-Cookie3: BDSVRTM=0; path="/"; domain="www.baidu.com"; path_spec; discard; version=0
Set-Cookie3: BD_HOME=0; path="/"; domain="www.baidu.com"; path_spec; discard; version=0
  • Load cookies from a file, using the LWP format as an example
# load LWP-format cookies
import http.cookiejar,urllib.request
cookie = http.cookiejar.LWPCookieJar()
cookie.load('cookies2.txt',ignore_discard=True,ignore_expires=True)
handle = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handle)
response = opener.open('https://baidu.com')
print(response.read().decode('utf-8'))

4. Handling exceptions

# handle URLError
import urllib.error,urllib.request
try:
    response = urllib.request.urlopen('https://cuiqingcai/index.htm')
except urllib.error.URLError as e:
    print(e.reason)
# handle HTTPError
from urllib import error,request
try:
    response = request.urlopen('https://www.baidu.com/a.htm')
except error.HTTPError as e:
    print(e.reason,e.code)
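
Since HTTPError is a subclass of URLError, a common pattern (a minimal sketch) is to catch the more specific HTTPError first and fall back to URLError:

from urllib import error, request

try:
    response = request.urlopen('https://www.baidu.com/a.htm')
except error.HTTPError as e:
    # the server responded, but with an error status code
    print('HTTPError:', e.code, e.reason)
except error.URLError as e:
    # the request never got a response (DNS failure, refused connection, timeout, ...)
    print('URLError:', e.reason)
else:
    print('request succeeded')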

5. Parsing links: urllib.parse

1.urlparse()

from urllib.parse import urlparse
url = 'https://www.baidu.com/index.html;user?id=5#comment'
result = urlparse(url)
print(type(result))
print(result)



<class 'urllib.parse.ParseResult'>
ParseResult(scheme='https', netloc='www.baidu.com', path='/index.html', params='user', query='id=5', fragment='comment')

A parsed URL is split into six components:

  • scheme: protocol
  • netloc: domain name
  • path: access path
  • params: parameters
  • query: query string
  • fragment: anchor

So the general format of a URL is: scheme://netloc/path;params?query#fragment
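
ParseResult is a named tuple, so the components above can be read either as attributes or by index; a quick sketch:

from urllib.parse import urlparse

result = urlparse('https://www.baidu.com/index.html;user?id=5#comment')
print(result.scheme, result[0])   # https https
print(result.netloc, result[1])   # www.baidu.com www.baidu.com
print(result.query, result[4])    # id=5 id=5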

2.urlunparse()

from  urllib.parse import urlunparse
data = ['https', 'www.baidu.com', '/index.html','user', 'id=5', 'comment']
print(urlunparse(data))

https://www.baidu.com/index.html;user?id=5#comment

3.urlsplit()

Compared with urlparse(), the result has no separate params component; it remains part of path.

from  urllib.parse import urlsplit
url = 'https://www.baidu.com/index.html;user?id=5#comment'
print(urlsplit(url))

SplitResult(scheme='https', netloc='www.baidu.com', path='/index.html;user', query='id=5', fragment='comment')

4.urlunsplit()

from urllib.parse import urlunsplit
data = ['https','www.baidu.com', '/index.html;user', 'id=5', 'comment']
print(urlunsplit(data))

https://www.baidu.com/index.html;user?id=5#comment

5.urljoin(base, url, allow_fragments=True)

from urllib.parse import urljoin
# the first argument is the base URL; the second supplements or overrides it
new_url=urljoin(base='http://www.baidu.com',url='/index.html;a?id=5')
print(new_url)
new_url=urljoin(base='http://www.baidu.com',url='http://www.caixukun.com/index.html')
print(new_url)
new_url=urljoin(base='http://www.baidu.com',url='https://www.caixukun.com/index.html')
print(new_url)
new_url=urljoin(base='http://www.baidu.com/index.html',url='http://www.caixukun.com/')
print(new_url)
new_url=urljoin(base='www.baidu.com',url='?id=5')
print(new_url)


http://www.baidu.com/index.html;a?id=5
http://www.caixukun.com/index.html
https://www.caixukun.com/index.html
http://www.caixukun.com/
www.baidu.com?id=5

6.urlencode()

Serializes the parameters of a GET request into a query string.

from urllib.parse import urlencode
data={
    'name':'tom',
    'age':20,
}
base_url='https://www.baidu.com?'
new_url=base_url+urlencode(data)
print(new_url)

https://www.baidu.com?name=tom&age=20

7.parse_qs(),parse_qsl()

Deserializes a query string back into a dictionary (parse_qs) or a list of tuples (parse_qsl).

from  urllib.parse import parse_qs,parse_qsl
query = 'name=tom&age=20'
print(parse_qs(query))
print(parse_qsl(query))

{'name': ['tom'], 'age': ['20']}
[('name', 'tom'), ('age', '20')]

8.quote()

Converts Chinese (non-ASCII) characters into URL-encoded form.

from urllib.parse import quote
k='练习'
print(quote(k))

%E7%BB%83%E4%B9%A0
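
A typical use (a small sketch; the baidu search endpoint is only an illustrative target) is splicing an encoded Chinese keyword into a URL:

from urllib.parse import quote

keyword = '练习'
# percent-encode the keyword before appending it to the URL
url = 'https://www.baidu.com/s?wd=' + quote(keyword)
print(url)   # https://www.baidu.com/s?wd=%E7%BB%83%E4%B9%A0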

9.unquote()

Converts URL-encoded text back into Chinese (decodes it).

from urllib.parse import unquote
k='%E7%BB%83%E4%B9%A0'
print(unquote(k))

练习


Reposted from blog.csdn.net/qq_41179280/article/details/88764467