Resolving encoding errors (garbled text) when scraping

1. Check the charset the page declares, e.g. <meta charset="utf-8">. 'utf-8' is the most common, but others such as 'gbk' are also used.

Then set the encoding of the returned response to match it.

2. Try using the response's content property (raw bytes) instead of the text property

3. For localized garbling (mostly Chinese text), e.g. in an img tag's "alt" attribute, try encoding the string with 'iso-8859-1' and then decoding it with 'gbk'

import os
from urllib import request

import requests
from bs4 import BeautifulSoup
# Listing-page URL template; ``index`` is the page number.
url_model = 'http://pic.netbian.com/4kdongman/index_{index}.html'
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'}
# Site root, prepended to the relative links/paths found in the pages.
pre = 'http://pic.netbian.com'
# Directory the downloaded images are written to.
IMAGE_DIR = './图片/'
# Seconds to wait on each HTTP request before giving up instead of hanging.
REQUEST_TIMEOUT = 10

# Characters that are path separators or are rejected in file names on
# common file systems; each is replaced with an underscore.
_UNSAFE_CHARS = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})


def safe_filename(name):
    """Return *name* made safe to use as a single file-name component.

    Path separators and other reserved characters are replaced with ``_``,
    surrounding whitespace and leading/trailing dots are removed, and
    ``'untitled'`` is returned when nothing usable is left. This keeps text
    scraped from a web page from escaping the target directory.
    """
    cleaned = name.translate(_UNSAFE_CHARS).strip().strip('.')
    return cleaned or 'untitled'


def main():
    """Download the images linked from listing pages 2-5 into IMAGE_DIR.

    For every listing page, each link matched by ``.slist > ul > li > a`` is
    followed to its detail page; the ``#img > img`` element there supplies
    the image path (``src``) and the file name (``alt``).
    """
    # urlretrieve does not create missing directories.
    os.makedirs(IMAGE_DIR, exist_ok=True)

    for page in range(2, 6):
        list_url = url_model.format(index=page)
        list_resp = requests.get(url=list_url, headers=headers,
                                 timeout=REQUEST_TIMEOUT)
        # Tip 1: set the encoding explicitly so ``.text`` is decoded as gbk.
        list_resp.encoding = 'gbk'
        list_soup = BeautifulSoup(list_resp.text, 'lxml')

        for link in list_soup.select('.slist > ul > li > a'):
            detail_url = pre + link['href']
            detail_resp = requests.get(url=detail_url, headers=headers,
                                       timeout=REQUEST_TIMEOUT)
            # Tip 2: ``.text`` comes out garbled here (unless the encoding is
            # first set to gbk), so hand the raw bytes to BeautifulSoup and
            # let it work out the encoding.
            detail_soup = BeautifulSoup(detail_resp.content, 'lxml')

            matches = detail_soup.select('#img > img')
            if not matches:
                # Unexpected page layout: skip it rather than crash the run.
                continue
            img_tag = matches[0]

            final_url = pre + img_tag['src']
            # Tip 3: the alt text becomes the file name, so sanitize it.
            img_name = safe_filename(img_tag['alt']) + '.jpg'
            img_path = os.path.join(IMAGE_DIR, img_name)
            request.urlretrieve(final_url, img_path)


if __name__ == '__main__':
    main()
        

Guess you like

Origin www.cnblogs.com/notfind/p/11490999.html