Python初学12-爬虫

'''
爬虫前奏:
    1.明确目的
    2.找到数据相对应的网页
    3.分析网页结构,找到数据的所在标签的位置
    
操作:
            模拟HTTP请求,向服务器发送这个请求,获取到服务器返回的页面
            正则表达式提取我们需要的数据
'''
虎牙主页主播和人气
'''
<span class="txt">
    <span class="avatar fl">
        <img data-original="https://huyaimg.msstatic.com/avatar/1084/b7/896bc815db9560eabbcb4a227f62ba_180_135.jpg" src="https://huyaimg.msstatic.com/avatar/1084/b7/896bc815db9560eabbcb4a227f62ba_180_135.jpg" onerror="this.onerror=null; this.src='//a.msstatic.com/huya/main/assets/img/default/84x84.jpg';" alt="卡尔" title="卡尔">
         <i class="nick" title="卡尔">卡尔</i>---------数据1
    </span>
    <span class="num">
        <i class="num-icon"></i>
        <i class="js-num">107.3万</i> ------------数据2
    </span>
</span>
'''

代码实现

#encoding:utf-8
'''
Created on 2018年6月14日

@author: Administrator
'''
from urllib import request
import re

class Spider():
    """Scrape streamer names and popularity counts from a Huya category page.

    The page at ``url`` is downloaded, nicknames and popularity figures are
    extracted with regular expressions, and the resulting records are printed
    in descending order of popularity.
    """

    # Category page that is downloaded by go().
    url = 'https://www.huya.com/g/lol'

    # Greedy on purpose: captures everything from the first
    # <span class="txt"> up to the last </span> in the page, so a single
    # match contains every streamer entry.
    root_pattern = r'<span class="txt">([\s\S]*)</span>'
    # Nickname text inside <i class="nick" title="...">...</i>.
    name_pattern = r'<i class="nick" title="[\s\S]*?">([\s\S]*?)</i>'
    # Popularity text inside <i class="js-num">...</i>, e.g. "107.3万".
    num_pattern = r'<i class="js-num">([\s\S]*?)</i>'

    # Multipliers for the Chinese unit suffixes a popularity figure may end with.
    _UNIT_MULTIPLIERS = {'万': 10000, '亿': 100000000}

    def __fetch_content(self):
        """Download the page at ``url`` and return it decoded as UTF-8 text."""
        # The context manager closes the HTTP response even if read() fails.
        with request.urlopen(self.url) as response:
            raw = response.read()
        return raw.decode('utf-8')

    @staticmethod
    def __parse_num(text):
        """Convert popularity text such as "107.3万" or "9876" to a float.

        Raises:
            ValueError: if the numeric part cannot be parsed.
        """
        text = text.strip()
        multiplier = 1
        if text and text[-1] in Spider._UNIT_MULTIPLIERS:
            multiplier = Spider._UNIT_MULTIPLIERS[text[-1]]
            text = text[:-1]
        # Parse the whole numeric part (including decimals); round() removes
        # binary floating-point noise introduced by the scaling.
        return round(float(text) * multiplier, 2)

    def __anaysis(self, htmls):
        """Extract streamer records from the page source.

        Args:
            htmls: the page source as text.

        Returns:
            A list of ``{'name': str, 'num': float}`` dicts in page order.
            Names and numbers are paired by position; the list is empty when
            the page contains no matching markup.
        """
        names = []
        nums = []
        for html in re.findall(self.root_pattern, htmls):
            names.extend(re.findall(self.name_pattern, html))
            nums.extend(re.findall(self.num_pattern, html))

        # zip() stops at the shorter list, so an unpaired trailing name or
        # number is ignored.
        return [
            {'name': name, 'num': self.__parse_num(num)}
            for name, num in zip(names, nums)
        ]

    def __sortData(self, datas):
        """Sort the records in place by popularity, highest first, and return them."""
        datas.sort(key=lambda record: record['num'], reverse=True)
        return datas

    def __showData(self, datas):
        """Print each record on its own line."""
        for data in datas:
            print(data)

    def go(self):
        """Run the pipeline: fetch the page, parse it, sort and print the records."""
        htmls = self.__fetch_content()
        datas = self.__anaysis(htmls)
        sort_datas = self.__sortData(datas)
        self.__showData(sort_datas)
        
# Run the spider only when this file is executed as a script, so that
# importing the module does not trigger a network request.
if __name__ == '__main__':
    spider = Spider()
    spider.go()


猜你喜欢

转载自blog.csdn.net/qq_34819372/article/details/80701928
今日推荐