Python 爬虫:使用 urllib + 正则表达式

学习使用 urllib 和正则表达式爬取熊猫TV的直播列表,获取每个直播间的主播名称和直播人气,并按人气从高到低排序后输出。

from urllib import  request
import re
class Spider():
    """Scrape a listing page and print its rooms ordered by viewer count.

    The page at ``url`` is downloaded, every ``video-info`` block is
    extracted with ``root_pattern``, and the title and viewer-count text
    are pulled out of each block with ``title_pattern`` and
    ``number_pattern``. Results are printed as ``title---number``, highest
    viewer count first.
    """

    url = 'https://www.panda.tv/cate/dota2'
    # Non-greedy [\s\S]*? so each match stops at the first closing tag,
    # including across line breaks.
    root_pattern = r'<div class="video-info">([\s\S]*?)</div>'
    title_pattern = r'</i>([\s\S]*?)</span>'
    number_pattern = r'<span class="video-number">([\s\S]*?)</span>'

    def __fetch_content(self):
        """Download ``self.url`` and return the body decoded as UTF-8."""
        # The context manager closes the HTTP response even if read() fails.
        with request.urlopen(self.url) as response:
            raw = response.read()
        return str(raw, encoding='utf-8')

    def __analysis(self, htmls):
        """Return one ``{'title': [...], 'number': [...]}`` dict per block.

        Both values are the raw ``re.findall`` result lists, which may be
        empty when a block lacks the corresponding markup.
        """
        return [
            {
                'title': re.findall(self.title_pattern, block),
                'number': re.findall(self.number_pattern, block),
            }
            for block in re.findall(self.root_pattern, htmls)
        ]

    def __refine(self, total):
        """Reduce each entry to its first title/number, whitespace-stripped.

        Entries with no title match or no number match are skipped instead
        of raising ``IndexError``.
        """
        return [
            {
                'title': item['title'][0].strip(),
                'number': item['number'][0].strip(),
            }
            for item in total
            if item['title'] and item['number']
        ]

    def __sort(self, refine_total):
        """Return a new list sorted by viewer count, largest first."""
        return sorted(refine_total, key=self.__sort_seed, reverse=True)

    def __sort_seed(self, each_item):
        """Return the numeric viewer count encoded in ``each_item['number']``.

        The first integer or decimal number in the text is used, and it is
        multiplied by 10000 when the text contains '万'. Text without any
        digits sorts as ``0.0``.
        """
        text = each_item['number']
        # Match the fractional part too, so '1.5万' is 15000 rather than 10000.
        found = re.search(r'\d+(?:\.\d+)?', text)
        if found is None:
            return 0.0
        number = float(found.group())
        if '万' in text:
            number *= 10000
        return number

    def __show(self, refine_total):
        """Print every entry as ``title---number``, one per line."""
        for room in refine_total:
            print(f"{room['title']}---{room['number']}")

    def start(self):
        """Run the pipeline: fetch, parse, clean, sort, and print."""
        htmls = self.__fetch_content()
        total = self.__analysis(htmls)
        refine_total = list(self.__refine(total))
        refine_total = self.__sort(refine_total)
        self.__show(refine_total)

# Run the scraper only when this file is executed as a script, so importing
# the module does not trigger a network request.
if __name__ == '__main__':
    spider = Spider()
    spider.start()

猜你喜欢

转载自blog.csdn.net/xlelou/article/details/82026600