'''
爬虫前奏:
1.明确目的
2.找到数据相对应的网页
3.分析网页结构,找到数据的所在标签的位置
操作:
模拟HTTP请求,向服务器发送这个请求,获取到服务器返回的页面
正则表达式提取我们需要的数据
'''
'''
虎牙主页主播和人气 (示例页面结构):

<span class="txt">
    <span class="avatar fl">
        <img data-original="https://huyaimg.msstatic.com/avatar/1084/b7/896bc815db9560eabbcb4a227f62ba_180_135.jpg" src="https://huyaimg.msstatic.com/avatar/1084/b7/896bc815db9560eabbcb4a227f62ba_180_135.jpg" onerror="this.onerror=null; this.src='//a.msstatic.com/huya/main/assets/img/default/84x84.jpg';" alt="卡尔" title="卡尔">
        <i class="nick" title="卡尔">卡尔</i>          <-- 数据1: 主播名
    </span>
    <span class="num">
        <i class="num-icon"></i>
        <i class="js-num">107.3万</i>                 <-- 数据2: 人气
    </span>
</span>

代码实现:
Created on 2018年6月14日
@author: Administrator
'''
# encoding:utf-8
from urllib import request
import re
class Spider():
    """Scrape anchor names and popularity numbers from Huya's LoL page.

    Flow: fetch the page HTML, regex-extract (name, popularity) pairs,
    sort by popularity descending, and print one record per line as
    {'name': ..., 'num': ...}.
    """

    # Page to scrape.
    url = 'https://www.huya.com/g/lol'
    # NOTE: the greedy [\s\S]* is deliberate — one findall hit spans from the
    # first txt-span open tag to the last closing tag, and the name/num
    # patterns below then collect every anchor from that single chunk.
    root_pattern = r'<span class="txt">([\s\S]*)</span>'
    name_pattern = r'<i class="nick" title="[\s\S]*?">([\s\S]*?)</i>'
    num_pattern = r'<i class="js-num">([\s\S]*?)</i>'

    def __fetch_content(self):
        """Download the page and return it decoded as a UTF-8 string."""
        # Context manager closes the connection (the original leaked it).
        with request.urlopen(Spider.url) as response:
            return str(response.read(), encoding='utf-8')

    def __to_number(self, text):
        """Convert a popularity string to a float, e.g. '107.3万' -> 1073000.0.

        Bug fix: the original extracted only the leading integer digits,
        dropping the decimal part, so '107.3万' parsed as 1070000.0 instead
        of 1073000.0.
        """
        if '万' in text:
            # Pull the full decimal number before the 万 (= 10,000) suffix.
            return float(re.findall(r'[\d.]+', text)[0]) * 10000
        return float(text)

    def __anaysis(self, htmls):
        """Extract [{'name': ..., 'num': ...}, ...] from the raw page HTML."""
        datas = []
        # Bug fix: the original returned inside this loop, so only the first
        # root match was ever processed; all matches are now aggregated.
        for chunk in re.findall(self.root_pattern, htmls):
            names = re.findall(self.name_pattern, chunk)
            nums = re.findall(self.num_pattern, chunk)
            # zip pairs each name with its popularity and stops at the
            # shorter list, matching the original two-iterable map() behavior.
            for name, num in zip(names, nums):
                datas.append({'name': name, 'num': self.__to_number(num)})
        return datas

    def __sortData(self, datas):
        """Sort records by popularity, highest first (in place), and return them."""
        datas.sort(key=lambda record: record['num'], reverse=True)
        return datas

    def __showData(self, datas):
        """Print one record per line."""
        for data in datas:
            print(data)

    def go(self):
        """Entry point: fetch, parse, sort, and display."""
        htmls = self.__fetch_content()
        datas = self.__anaysis(htmls)
        sort_datas = self.__sortData(datas)
        self.__showData(sort_datas)
# Guard the script entry point so importing this module does not trigger
# a network request (the original ran the scrape unconditionally on import).
if __name__ == '__main__':
    spider = Spider()
    spider.go()