In [1]: import re
In [2]: import requests
In [3]: url_css = 'http://www.porters.vip/confusion/css/food.css'  # stylesheet holding the per-class background offsets
In [4]: url_svg = 'http://www.porters.vip/confusion/font/food.svg'  # SVG whose <text> rows hold the digits
In [5]: css_resp = requests.get(url_css, timeout=10).text  # explicit timeout: requests waits indefinitely by default
In [6]: svg_resp = requests.get(url_svg, timeout=10).text
In [7]: # 提取css对应的坐标值
In [8]: css_class_name = 'vhkbvu' #查找所对应的css数据
In [9]: pile = '.%s{background:-(\d+)px-(\d+)px;}'%css_class_name
In [10]: pattern = re.compile(pile) #先编译
In [11]: css = css_resp.replace('\n','').replace(' ','')
In [12]: css
Out[12]: '#tips{color:#c72222;}d[class^="vhk"]{width:14px;height:30px;margin-top:-9px;background-image:url(../font/food.svg);background-repeat:no-repeat;display:inline-block;vertical-align:middle;margin-left:-6px;}.vhk08k{background:-274px-141px;}.vhk6zl{background:-7px-15px;}.vhk0ao{background:-133px-97px;}.vhk9or{background:-330px-141px;}.vhkfln{background:-428px-15px;}.vhkbvu{background:-386px-97px;}.vhk84t{background:-176px-141px;}.vhkvxd{background:-246px-141px;}.vhkqsc{background:-288px-141px;}.vhkjj4{background:-316px-141px;}.vhk0f1{background:-316px-97px;}.col.action{padding:0px;width:120px;height:40px;text-align:center;background-color:#66a3ff;display:table-cell;vertical-align:middle;}.write_action{color:white;}.lay1{height:10%;}.lay2{height:80%;}.lay3{height:10%;}.lay4{height:100%;}.lay5{height:100%;}.details{height:40%;border:1pxsolid#d6d1d1;padding:20px;font-size:15px;}.ad{margin-top:5px;height:10%;border:1pxsolid#d6d1d1;}.photo{height:28%;border:1pxsolid#d6d1d1;padding:10px;}.photo_head{height:80%;}.photo_headimg{width:100%;height:100%;}.photo_list{height:20%;}.photo_listul{margin-top:10px;width:100%;height:100%;list-style:none;}.photo_listulli{margin-left:10px;width:20%;height:80%;float:left;}.photo_listulliimg{width:100%;height:100%;}.map{margin-top:5px;height:10%;border:1pxsolid#d6d1d1;}.title{height:20%;font-size:30px;margin-left:10px;}.titlea{font-size:10px;}.score{height:12%;font-size:10px;}.score.comments{margin-left:10px;}.score.avgPriceTitle{margin-left:10px;}.score.comment_score.item{margin-left:10px;}.address{height:12%;}.tel{height:12%;}.characteristic{height:12%;}.more{height:12%;}.threes{background:-246px-141px;height:30px;width:14px;background-image:url(number.svg);}'
In [13]: coord = re.findall(pattern, css)  # list of (x, y) digit-string tuples, one per matching rule
In [14]: coord
Out[14]: [('386', '97')]
In [15]: if coord:
...: x,y = coord[0]
...: x,y = int(x),int(y)
...:
In [16]: print(x,y)
386 97
In [17]: #因为svg有4个text,需要寻找css标签所对应的哪一个svg图像
In [18]: from parsel import Selector
In [19]: svg_data = Selector(svg_resp)
In [20]: texts = svg_data.xpath('//text')
In [21]: texts
Out[21]:
[<Selector xpath='//text' data='<text x="14 28 42 56 70 84 98 112 126...'>,
<Selector xpath='//text' data='<text x="14 28 42 56 70 84 98 112 126...'>,
<Selector xpath='//text' data='<text x="14 28 42 56 70 84 98 112 126...'>,
<Selector xpath='//text' data='<text x="14 28 42 56 70 84 98 112 126...'>]
In [22]: axiy = [i.attrib.get('y') for i in texts if y<=int(i.attrib.ge
...: t('y')) ][0] #根据y值来确定,css布局位置》=svg对应的y值,取最近的一个值
In [23]: axiy
Out[23]: '120'
In [24]: #提取对应y的text
In [25]: svg_text = svg_data.xpath('//text[@y=%s]/text()'%axiy).extract
...: _first()
In [26]: svg_text
Out[26]: '671260781104096663000892328440489239185923'
In [27]: #提取字体大小
In [28]: font_size =re.search('font-size:(\d+)px',svg_resp).group(1)
In [29]: font_size
Out[29]: '14'
In [30]: position = divmod(x, int(font_size))[0]  # CSS x offset / font size = character index within the SVG row
In [31]: number = svg_text[position]
In [32]: number
Out[32]: '4'
In [33]:
SVG反爬虫绕过-Python Spider
猜你喜欢
转载自blog.csdn.net/weixin_40539952/article/details/104648464
今日推荐
周排行