Python crawler -- the urllib library and URLError exception handling


Day one of learning basic crawling. There are not many comments here, since these are mainly my own study notes.
For reference only!!!

import urllib.request
file = urllib.request.urlopen("http://www.baidu.com")
data=file.read()
dataline=file.readline()
dataall=file.readlines()
dataline
b''
data
b'<!DOCTYPE html><!--STATUS OK--><html><head><meta http-equiv="Content-Type" content="text/html;charset=utf-8"><meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"><meta content="always" name="referrer"><meta name="theme-color" content="#2932e1"><link rel="shortcut icon" href="/favicon.ico" type="image/x-icon" /><link rel="search" type="application/opensearchdescription+xml" href="/content-search.xml" title="\xe7\x99\xbe\xe5\xba\xa6\xe6\x90\x9c\xe7\xb4\xa2" /><link rel="icon" sizes="any" mask href="//www.baidu.com/img/baidu_85beaf5496f291521eb75ba38eacbd87.svg"><link rel="dns-prefetch" href="//dss0.bdstatic.com"/><link rel="dns-prefetch" href="//dss1.bdstatic.com"/><link rel="dns-prefetch" href="//ss1.bdstatic.com"/><link rel="dns-prefetch" href="//sp0.baidu.com"/><link rel="dns-prefetch" href="//sp1.baidu.com"/><link rel="dns-prefetch" href="//sp2.baidu.com"/>(fetched content truncated here; there was too much to show)
dataall
[]
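Note: readline() and readlines() come back empty above because read() had already consumed the response body; an HTTPResponse can only be read once. A minimal sketch of reading line by line from a fresh response (same URL as above):

import urllib.request

# open a fresh response; the body can only be consumed once
with urllib.request.urlopen("http://www.baidu.com") as resp:
    first_line = resp.readline()   # first line of the body, as bytes
    rest = resp.readlines()        # remaining lines, as a list of bytes
print(first_line[:60])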
fhandle = open("C:/Users/Administrator/Desktop/爬虫学习/baidu.html","wb")
fhandle.write(data)
224993

1. Close the file

fhandle.close()
filename=urllib.request.urlretrieve("http://i.mooc.chaoxing.com/space/index?t=1584514561359",filename="C:/Users/Administrator/Desktop/爬虫学习/学习通.html")
filename=urllib.request.urlretrieve("https://blog.csdn.net/qq_40651017/article/details/105405031",filename="C:/Users/Administrator/Desktop/爬虫学习/51.html")

2. Clear the cache left by urlretrieve

urllib.request.urlcleanup()

3. Return header information about the response

file.info()
<http.client.HTTPMessage at 0x175605ee188>
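info() returns an http.client.HTTPMessage holding the response headers, so individual fields can be looked up by name; a small sketch (which headers are present depends on the server):

headers = file.info()
print(headers.get("Content-Type"))   # e.g. text/html;charset=utf-8
print(headers.get("Server"))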

4. Get the status code of the current page (200 means the request succeeded)

file.getcode()
200
file.geturl()
'http://www.baidu.com'

Encoding and decoding URLs

urllib.request.quote("http://www.baidu.com")
'http%3A//www.baidu.com'
urllib.request.unquote("http%3A//www.baidu.com")
'http://www.baidu.com'
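quote() percent-encodes characters that are not legal in a URL (its canonical home is urllib.parse; urllib.request merely re-exports it). By default it leaves '/' alone, controlled by the safe parameter. Chinese text is encoded as its UTF-8 bytes:

import urllib.parse

encoded = urllib.parse.quote("佳诚")
print(encoded)                          # %E4%BD%B3%E8%AF%9A
print(urllib.parse.unquote(encoded))    # 佳诚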

The Headers property

import urllib.request
url="http://blog.csdn.net"
file = urllib.request.urlopen(url)
data=file.read()
fhandle = open("C:/Users/Administrator/Desktop/爬虫学习/1.html","wb")
fhandle.write(data)
346882
fhandle.close()

1. Modify headers with build_opener()

import urllib.request
url="http://blog.csdn.net/weiwei_pig/article/details/51178226"
headers=("User-Agent","Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36")
opener = urllib.request.build_opener()
opener.addheaders=[headers]
data = opener.open(url).read()
fhandle = open("C:/Users/Administrator/Desktop/爬虫学习/2.html","wb")
fhandle.write(data)
fhandle.close()
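To make the custom opener apply to every later urlopen() call, instead of calling opener.open() each time, it can be installed globally; a short sketch:

import urllib.request

opener = urllib.request.build_opener()
opener.addheaders = [("User-Agent", "Mozilla/5.0")]
urllib.request.install_opener(opener)   # from now on urlopen() sends these headers
data = urllib.request.urlopen("http://www.baidu.com").read()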

2. Add headers with add_header() on urllib.request.Request()

import urllib.request
url="http://blog.csdn.net/weiwei_pig/article/details/51178226"
req=urllib.request.Request(url)
req.add_header('User-Agent','Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36')
data=urllib.request.urlopen(req).read()
fhandle = open("C:/Users/Administrator/Desktop/爬虫学习/3.html","wb")
fhandle.write(data)
fhandle.close()
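Equivalently, the headers can be handed to the Request constructor as a dict, which reads a little cleaner when there are several of them; a sketch:

import urllib.request

req = urllib.request.Request(
    "http://blog.csdn.net",
    headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64)"}
)
data = urllib.request.urlopen(req).read()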

Timeout settings (timeout)

import urllib.request
for i in range(1,100):   # fire 99 requests in a row
    try:
        file = urllib.request.urlopen("https://blog.csdn.net/lina_acm/article/details/54808910",timeout=0.5)   # give up after 0.5 seconds
        data=file.read()
        print(len(data))
    except Exception as e:
        print("Exception occurred --> "+str(e))

Exception occurred --> The read operation timed out
241858
241419
241788
241769
244861
241559
241699
241559
243140
243229
243210
241699
241788
241788
241858
241699
244931
241559
241699
241769
241741
243210
241559
244861
241769
243280
241559
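Catching a bare Exception works, but it is more precise to name the failure: a read that times out raises socket.timeout, while connection-level problems surface as urllib.error.URLError (which may itself wrap a timeout in e.reason). A sketch of that distinction, under those assumptions:

import socket
import urllib.request
import urllib.error

try:
    resp = urllib.request.urlopen("https://blog.csdn.net", timeout=0.5)
    print(len(resp.read()))
except socket.timeout:
    print("the read operation timed out")
except urllib.error.URLError as e:
    print("URL error:", e.reason)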



---------------------------------------------------------------------------

HTTP requests in practice

import urllib.request
keywd="hello"
url = "http://www.baidu.com/s?wd="+keywd
req=urllib.request.Request(url)
req.add_header("User-Agent","Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36")

data = urllib.request.urlopen(req).read()
fhandle = open("C:/Users/Administrator/Desktop/爬虫学习/4.html","wb")
fhandle.write(data)
fhandle.close()

Encoding issues

import urllib.request
keywd="佳诚"
url = "http://www.baidu.com/s?wd="
key_code=urllib.request.quote(keywd)
url_all=url+key_code
req = urllib.request.Request(url_all)
data = urllib.request.urlopen(req).read()
fhandle = open("C:/Users/Administrator/Desktop/爬虫学习/5.html","wb")
fhandle.write(data)
fhandle.close()
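When a query has more than one parameter, urllib.parse.urlencode() builds and percent-encodes the whole query string in one step; a sketch (pn is just an example second parameter):

import urllib.parse

params = urllib.parse.urlencode({"wd": "佳诚", "pn": "10"})
url_all = "http://www.baidu.com/s?" + params
print(url_all)   # http://www.baidu.com/s?wd=%E4%BD%B3%E8%AF%9A&pn=10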

A POST request example

import urllib.request
import urllib.parse
url="https://www.iqianyue.com/mypost/"
postdata = urllib.parse.urlencode({"name":"zjc","pass":"123456"}).encode('utf-8')
req = urllib.request.Request(url,postdata)
#req.add_header("User-Agent","Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36")
data = urllib.request.urlopen(req).read()
fhandle = open("C:/Users/Administrator/Desktop/爬虫学习/6.html","wb")
fhandle.write(data)
fhandle.close()

Setting up a proxy server

def use_proxy(proxy_addr,url):
    import urllib.request
    proxy = urllib.request.ProxyHandler({'http':proxy_addr})
    opener = urllib.request.build_opener(proxy,urllib.request.HTTPHandler)
    urllib.request.install_opener(opener)
    data = urllib.request.urlopen(url).read().decode('utf-8')
    return data
proxy_addr="1.227.56.236:3128"
url = "http://www.baidu.com"
data = use_proxy(proxy_addr,url)
print(len(data))
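The handler above only routes plain http traffic through the proxy; to proxy https URLs too, ProxyHandler takes one entry per scheme. A sketch (the address is the same placeholder as above, not a proxy known to work):

import urllib.request

proxy = urllib.request.ProxyHandler({
    "http": "1.227.56.236:3128",    # placeholder proxy address
    "https": "1.227.56.236:3128",
})
opener = urllib.request.build_opener(proxy)
urllib.request.install_opener(opener)
data = urllib.request.urlopen("https://www.baidu.com").read()
print(len(data))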

DebugLog in practice

import urllib.request
http_handler = urllib.request.HTTPHandler(debuglevel=1)
https_handler = urllib.request.HTTPSHandler(debuglevel=1)   # HTTPSHandler here, not a second HTTPHandler
opener = urllib.request.build_opener(http_handler,https_handler)
urllib.request.install_opener(opener)
data = urllib.request.urlopen("http://edu.51cto.com")   # request/response lines are printed to stdout

The exception-handling workhorse: URLError in practice

import urllib.request
import urllib.error

# 1. Catch URLError only
try:
    urllib.request.urlopen("http://blog.baidusss.net")
except urllib.error.URLError as e:
    print(e.reason)

# 2. Catch HTTPError first, then URLError. HTTPError is a subclass of URLError,
#    so it must come first; it carries a status code, while URLError only has a reason.
try:
    urllib.request.urlopen("http://blog.baidusss.net")
except urllib.error.HTTPError as e:
    print(e.code)
    print(e.reason)
except urllib.error.URLError as e:
    print(e.reason)

# 3. Unified handling: catch URLError alone and check which attributes exist
try:
    urllib.request.urlopen("http://www.baisddu.com")
except urllib.error.URLError as e:
    if hasattr(e,"code"):
        print(e.code)
    if hasattr(e,"reason"):
        print(e.reason)

Reposted from blog.csdn.net/qq_40651017/article/details/105785489