爬虫遇到的编码问题


import requests
import sys
import chardet #检测字符编码,但是有时候会有误差

# Fetch two URLs and print the character encoding chardet guesses for each body.
# An explicit timeout is passed because requests applies none by default, so an
# unresponsive server would otherwise block this script indefinitely.
response1 = requests.get("https://github.com/favicon.ico", timeout=10)
# chardet.detect() expects bytes, so pass the raw body (.content).
print(chardet.detect(response1.content))
response2 = requests.get("http://www.baidu.com/", timeout=10)
print(chardet.detect(response2.content))

--------------结果----------------

 {'encoding': None, 'confidence': 0.0, 'language': None}
 {'encoding': 'utf-8', 'confidence': 0.99, 'language': ''}

#造成这种情况的原因:favicon.ico 是二进制图片数据,并不是文本,所以 chardet 检测不到字符编码(encoding 为 None),按任何文本编码去解码也无效;如果网页内容经过压缩而没有先解压,也可能出现类似的乱码

--------------------编码-----------------------

# Demonstration: try to turn each response body (bytes) into str as UTF-8.
# `data` is the body fetched from the favicon.ico URL; per the results section
# below, decoding it as UTF-8 raises UnicodeDecodeError.
data = response1.content
print(sys.getdefaultencoding()) # show the interpreter's default string encoding
print(type(data))
# Two equivalent ways to convert bytes to str
print(str(data,encoding='utf-8'))
print(data.decode('utf-8'))

# `data2` is the body fetched from the baidu.com URL; per the results section
# below, it decodes as UTF-8 and prints the page's HTML.
data2= response2.content
print(type(data2))
# Two equivalent ways to convert bytes to str
print(str(data2,encoding='utf-8'))
print(data2.decode('utf-8'))

-----------------结果-------------
data报错:

Traceback (most recent call last):
File "E:/PythonStudy/练习题/Request_ex.py", line 52, in <module>
print(str(data,encoding='utf-8'))
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc5 in position 101: invalid continuation byte

  data2正确返回:

<!DOCTYPE html><html><head><meta http-equiv="content-type" content="text/html;charset=utf-8"><meta http-equiv="X-UA-Compatible" content="IE=Edge"><meta content="never" name="referrer"><title>百度一下,你就知道</title><style>html,body{height:100%}html{overflow-y:auto}body{font:12px arial;background:#fff}body,p,form,ul,li{margin:0;padding:0;list-style:none}body,form{position:relative}td{text-align:left}img{border:0}a{color:#00c}a:active{color:#f60}input{border:0;padding:0}#wrapper{position:relative;_position:;min-height:100%}#head{padding-bottom:100px;text-align:center;}#ftCon{height:100px;position:absolute;bottom:23px;text-align:left;width:100%;margin:0 auto;z-index:0;overflow:hidden}.ftCon-Wrapper{overflow:hidden;margin:0 auto;text-align:center;}#qrcode{display:inline-block;;}#qrcode .qrcode-item{float:left}#qrcode .qrcode-item-2{margin-left:33px}#qrcode .qrcode-img{float:left;width:60px;height:60px}#qrcode .qrcode-item-1 .qrcode-img{background:url(https://ss1.bdstatic.com/5eN1bjq8AAUYm2zgoY3K/r/www/cache/static/protocol/https/home/img/qrcode/zbios_a4b2d86f.png) 0 0 no-repeat}#qrcode .qrcode-item-2 .qrcode-img{background:url(https://ss1.bdstatic.com/5eN1bjq8AAUYm2zgoY3K/r/www/cache/static/protocol/https/home/img/qrcode/nuomi_510f7472.png) 0 0 no-repeat}#qrcode .qrcode-text{float:left;color:#999;line-height:23px;margin:8px 0 0 10px}#qrcode .qrcode-text a{color:#999;text-decoration:none}#qrcode .qrcode-text p{text-align:left}#qrcode .qrcode-text b{color:#666;font-weight:700}#qrcode .qrcode-text span{letter-spacing:1px}#ftConw{display:inline-block;text-align:left;margin-left:33px;line-height:22px;position:relative;top:-2px;;;}#ftConw,#ftConw a{color:#999}#lh a{margin-left:25px}#lh #seth,#lh #setf{margin-left:0}#wrapper{min-width:810px;height:100%;min-height:600px}#head{position:relative;padding-bottom:0;height:100%;min-height:600px}#head .head_wrapper{height:100%}#form{margin:22px auto 0;width:641px;text-align:left;z-index:100}#form .bdsug{top:35px}#kw{position:relative}#cp 
.c-icon-icrlogo,#jgwab .c-icon-jgwablogo{width:14px;height:17px;display:inline-block;overflow:hidden;background:url(https://ss1.bdstatic.com/5eN1bjq8AAUYm2zgoY3K/r/www/cache/static/protocol/https/global/img/icons_0e814c16.png) no-repeat;}#cp .c-icon-icrlogo{background-position:-600px -96px;position:relative;top:3px}#jgwab .c-icon-jgwablogo{background-position:-623px -96px;position:relative;top:3px;margin-right:6px}.s_btn{width:95px;height:32px;padding-top:2px\9;font-size:14px;background-color:#ddd;background-position:0 -48px;cursor:pointer}.s_btn{width:100px;height:36px;color:#fff;font-size:15px;letter-spacing:1px;background:#3385ff;border-bottom:1px solid #2d78f4;outline:medium;;-webkit-appearance:none;-webkit-border-radius:0}.s_btn.btnhover{background:#317ef3;border-bottom:1px solid #2868c8;;box-shadow:1px 1px 1px #ccc}.s_btn_wr{width:97px;height:34px;display:inline-block;background-position:-120px -48px;;z-index:0;vertical-align:top}.s_btn_wr{width:auto;height:auto;border-bottom:1px solid transparent;}.s_ipt_wr{height:34px}.s_ipt_wr.bg,.s_btn_wr.bg,#su.bg{background-image:none}.s_ipt_wr{border:1px solid #b6b6b6;border-color:#7b7b7b #b6b6b6 #b6b6b6 #7b7b7b;background:#fff;display:inline-block;vertical-align:top;width:539px;margin-right:0;border-right-width:0;border-color:#b8b8b8 transparent #ccc #b8b8b8;overflow:hidden}.s_ipt{width:526px;height:22px;font:16px/18px arial;line-height:22px\9;margin:6px 0 0 7px;padding:0;background:0 0;border:0;outline:0;-webkit-appearance:none}.bdsug{position:absolute;width:418px;background:#fff;display:none;border:1px solid #817f82}.bdsug li{width:511px;color:#000;font:14px arial;line-height:25px;padding:0 8px;position:relative;cursor:default}.bdsug{top:35px;width:538px;border-color:#ccc;box-shadow:1px 1px 3px #ededed;;-webkit-box-shadow:1px 1px 3px #ededed;-moz-box-shadow:1px 1px 3px #ededed;-o-box-shadow:1px 1px 3px 
#ededed}.s_form{position:relative;top:38.2%}.s_form_wrapper{position:relative;top:-191px}#u1{z-index:2;color:#fff;position:absolute;right:0;top:0;margin:19px 0 5px 0;padding:0 96px 0 0}#u1 a:link,#u1 a:visited{color:#666;text-decoration:none}#u1 a:hover,#u1 a:active{text-decoration:underline}#u1 a:active{color:#00c}#u1 a.bri,#u1 a.bri:visited{display:inline-block;position:absolute;right:10px;width:60px;height:23px;float:left;color:#fff;background:#38f;line-height:24px;font-size:13px;text-align:center;overflow:hidden;border-bottom:1px solid #38f;margin-left:19px;margin-right:2px}#u1 a.mnav,#u1 a.mnav:visited{float:left;color:#333;font-weight:700;line-height:24px;margin-left:20px;font-size:13px;text-decoration:underline}</style></head><body><div id="wrapper"><div id="head"><div class="head_wrapper"><div class="s_form"><div class="s_form_wrapper"><div id="lg"><img src="http://www.baidu.com/img/bd_logo1.png" width="270" height="129"></div><form id="form" name="f" action="https://www.baidu.com/s" class="fm" method="get"><span class="bg s_ipt_wr"><span id="ipt_photo"></span> <input id="kw" name="wd" class="s_ipt" value="" maxlength="255" autocomplete="off"> <input type="hidden" name="ie" value="utf-8"> <input type="hidden" name="rsv_op" value=""> <input type="hidden" name="tn" value="93879309_hao_pg"> <input type="hidden" name="ch" value=""> <input type="hidden" name="rsv_su" value=""></span><span class="bg s_btn_wr"><input type="submit" id="su" value="百度一下" class="bg s_btn"></span></form></div></div><div id="u1"><a href="http://www.nuomi.com" class="mnav">糯米</a><a href="http://news.baidu.com" class="mnav">新闻</a><a href="http://www.hao123.com" class="mnav">hao123</a><a href="http://map.baidu.com" class="mnav">地图</a><a href="http://v.baidu.com" class="mnav">视频</a><a href="http://tieba.baidu.com" class="mnav">贴吧</a><a href="https://passport.baidu.com/v2/?login&tpl=mn&u=https%3A%2F%2Fwww.baidu.com%2F" class="mnav">登录</a><a href="http://www.baidu.com/gaoji/preferences.html" 
class="mnav">设置</a><a href="http://www.baidu.com/more/" class="bri" style="display:block">更多产品</a></div></div></div><div id="ftCon"><div class="ftCon-Wrapper"><div id="qrcode"><div class="qrcode-item qrcode-item-1"><div class="qrcode-img"></div><div class="qrcode-text"><p><b>手机百度</b></p><p><span>快人一步</span></p></div></div><div class="qrcode-item qrcode-item-2"><div class="qrcode-img"></div><div class="qrcode-text"><p><b>百度糯米</b></p><p><span>一元大餐</span></p></div></div></div><div id="ftConw"><p id="lh"><a id="setf" href="//www.baidu.com/cache/sethelp/help.html" onmousedown="return ns_c({fm:&quot;behs&quot;,tab:&quot;favorites&quot;,pos:0})" target="_blank">把百度设为主页</a><a href="http://home.baidu.com">关于百度</a><a href="http://ir.baidu.com">About&nbsp;&nbsp;Baidu</a><a href="http://e.baidu.com/?refer=888">百度推广</a></p><p id="cp">©2016&nbsp;Baidu&nbsp;<a href="http://www.baidu.com/duty/">使用百度前必读</a>&nbsp;<a href="http://jianyi.baidu.com/" class="cp-feedback">意见反馈</a>&nbsp;京ICP证030173号&nbsp;<i class="c-icon-icrlogo"></i></p><a id="jgwab" target="_blank" href="http://www.beian.gov.cn/portal/registerSystemInfo?recordcode=11000002000001"><i class="c-icon-jgwablogo"></i>京公网安备11000002000001号</a></div></div></div><div id="wrapper_wrapper"></div></div><script type="text/javascript">function gen(len){var s="",r,t;for(var i=len;i--;){r=Math.random();t=r.toString(36).substr(2,1);s+=r>0.5?t.toUpperCase():t}return s.substr(0,len)}function $n(n,p){p=(p||D);var s=p.getElementsByName(n);if(s.length)return s[0]}function $sa(e,o){if(o)for(var n in o)e.setAttribute(n,o[n]),e.n=o[n]}function fc(){var h=location.host,x='=;expires='+new Date(0).toUTCString(),y=x+';path=',z=y+'/;domain=',l=[x,y,y+'/',z+h,z+h.substr(h.indexOf('.'))],o=D.cookie.match(/[^ =;]+(?=\=)/g);if(o&&S)for(var i=o.length;i--;)for(var j=5;j--;)D.cookie=o[i]+l[j];if(window.localStorage)localStorage.clear();setTimeout(fc,500)}var D=document,d=D,S=!D.cookie.match(/home=s/i);D.oncontextmenu=function(){return 
false};try{fc();if(D.URL.match('#'))location.replace('http://www.baidu.com/s?'+location.hash.replace(/^#/,''));$sa($n("rsv_op"),{value:gen(96)});$sa($n("rsv_su"),{value:gen(96)})}catch(e){}</script></body></html>

猜你喜欢

转载自www.cnblogs.com/wljlxx/p/12116758.html
今日推荐