SVG反爬虫绕过-Python Spider

In [1]: import re

In [2]: import requests

In [3]: url_css = 'http://www.porters.vip/confusion/css/food.css'

In [4]: url_svg = 'http://www.porters.vip/confusion/font/food.svg'

In [5]: css_resp = requests.get(url_css).text

In [6]: svg_resp = requests.get(url_svg).text

In [7]: # 提取css对应的坐标值

In [8]: css_class_name = 'vhkbvu' #要查找的css类名,用它在css中匹配对应的background坐标

In [9]: pile = '.%s{background:-(\d+)px-(\d+)px;}'%css_class_name

In [10]: pattern = re.compile(pile) #先编译

In [11]: css = css_resp.replace('\n','').replace(' ','')

In [12]: css
Out[12]: '#tips{color:#c72222;}d[class^="vhk"]{width:14px;height:30px;margin-top:-9px;background-image:url(../font/food.svg);background-repeat:no-repeat;display:inline-block;vertical-align:middle;margin-left:-6px;}.vhk08k{background:-274px-141px;}.vhk6zl{background:-7px-15px;}.vhk0ao{background:-133px-97px;}.vhk9or{background:-330px-141px;}.vhkfln{background:-428px-15px;}.vhkbvu{background:-386px-97px;}.vhk84t{background:-176px-141px;}.vhkvxd{background:-246px-141px;}.vhkqsc{background:-288px-141px;}.vhkjj4{background:-316px-141px;}.vhk0f1{background:-316px-97px;}.col.action{padding:0px;width:120px;height:40px;text-align:center;background-color:#66a3ff;display:table-cell;vertical-align:middle;}.write_action{color:white;}.lay1{height:10%;}.lay2{height:80%;}.lay3{height:10%;}.lay4{height:100%;}.lay5{height:100%;}.details{height:40%;border:1pxsolid#d6d1d1;padding:20px;font-size:15px;}.ad{margin-top:5px;height:10%;border:1pxsolid#d6d1d1;}.photo{height:28%;border:1pxsolid#d6d1d1;padding:10px;}.photo_head{height:80%;}.photo_headimg{width:100%;height:100%;}.photo_list{height:20%;}.photo_listul{margin-top:10px;width:100%;height:100%;list-style:none;}.photo_listulli{margin-left:10px;width:20%;height:80%;float:left;}.photo_listulliimg{width:100%;height:100%;}.map{margin-top:5px;height:10%;border:1pxsolid#d6d1d1;}.title{height:20%;font-size:30px;margin-left:10px;}.titlea{font-size:10px;}.score{height:12%;font-size:10px;}.score.comments{margin-left:10px;}.score.avgPriceTitle{margin-left:10px;}.score.comment_score.item{margin-left:10px;}.address{height:12%;}.tel{height:12%;}.characteristic{height:12%;}.more{height:12%;}.threes{background:-246px-141px;height:30px;width:14px;background-image:url(number.svg);}'

In [13]: coord = pattern.findall(css)

In [14]: coord
Out[14]: [('386', '97')]

In [15]: if coord:
    ...:     x,y = coord[0]
    ...:     x,y = int(x),int(y)
    ...:

In [16]: print(x,y)
386 97

In [17]: #svg中有4个text元素,需要确定该css类对应的是哪一个text元素

In [18]: from parsel import Selector

In [19]: svg_data = Selector(svg_resp)

In [20]: texts = svg_data.xpath('//text')

In [21]: texts
Out[21]:
[<Selector xpath='//text' data='<text x="14 28 42 56 70 84 98 112 126...'>,
 <Selector xpath='//text' data='<text x="14 28 42 56 70 84 98 112 126...'>,
 <Selector xpath='//text' data='<text x="14 28 42 56 70 84 98 112 126...'>,
 <Selector xpath='//text' data='<text x="14 28 42 56 70 84 98 112 126...'>]

In [22]: axiy = [i.attrib.get('y') for i in texts if y<=int(i.attrib.ge
    ...: t('y')) ][0] #根据y值来确定:筛选出y值>=css中y偏移量的text(即css的y<=svg的y),取其中第一个;text按y升序排列时,它就是最接近的一行

In [23]: axiy
Out[23]: '120'

In [24]: #提取对应y的text

In [25]: svg_text = svg_data.xpath('//text[@y=%s]/text()'%axiy).extract
    ...: _first()

In [26]: svg_text
Out[26]: '671260781104096663000892328440489239185923'

In [27]: #提取字体大小

In [28]: font_size =re.search('font-size:(\d+)px',svg_resp).group(1)

In [29]: font_size
Out[29]: '14'

In [30]: position = x//int(font_size) #css的x偏移量整除字体大小(每个字符的宽度),得到该字符在svg文本中的下标

In [31]: number = svg_text[position]

In [32]: number
Out[32]: '4'

In [33]:                            
发布了101 篇原创文章 · 获赞 46 · 访问量 3万+

猜你喜欢

转载自blog.csdn.net/weixin_40539952/article/details/104648464