python3入门----一个实例

这个程序要实现的功能是：从一个网页爬取所需的信息（每个主播的名字和对应的数值），再按数值从大到小排序后输出。

获取HTML

    def __getHTML(self):
        """Fetch the page at ``Spider.__url`` and return its body as a string.

        Returns:
            The response body decoded as UTF-8.
        """
        # The with-block closes the HTTP response as soon as the body is read.
        with request.urlopen(Spider.__url) as r:
            # bytes
            html = r.read()
        # Convert the bytes to a string and hand it back to the caller.
        return str(html, encoding='utf-8')

用正则表达式从获取到的HTML中提取所需信息

    # Raw strings keep the backslashes intact for the regex engine; in a plain
    # literal "\s" is an invalid escape sequence that newer Python versions warn about.
    # Captures the HTML between the "mes" div and the following "impress-tag-list" div.
    __root_pattern = r'<div class="mes">([\s\S]*?)<div class="impress-tag-list">'
    # Captures the text inside the "dy-name" span.
    __name_pattern = r'<span class="dy-name ellipsis fl">([\s\S]*?)</span>'
    # Captures the text inside the "dy-num" span (note the two spaces before ">").
    __number_pattern = r'<span class="dy-num fr"  >([\s\S]*?)</span>'


#上面是所用到的正则表达式

 #正则表达式匹配
    def __analysis(self,html):
        """Extract the name and number of every card found in ``html``.

        Args:
            html: The page source as a string.

        Returns:
            A list of dicts ``{'name': [...], 'number': [...]}``, one per card,
            where each value is the list of regex matches inside that card.
            Cards in which the name or the number is missing are skipped.
        """
        root_html = re.findall(Spider.__root_pattern,html)
        anchors = []
        # A separate loop variable avoids shadowing the ``html`` parameter.
        for fragment in root_html:
            name = re.findall(Spider.__name_pattern,fragment)
            number = re.findall(Spider.__number_pattern,fragment)
            if not name or not number:
                # Later steps index [0] on both lists; an incomplete card
                # would raise IndexError there, so it is dropped here.
                continue
            anchors.append({'name' : name,'number':number})
        return anchors

数据排序

# 排序
    def __sort(self,anchors):
        """Return a new list of the anchors ordered from largest key to smallest.

        The key of each anchor is computed by ``__sort_seed``; the input
        sequence itself is left unmodified.
        """
        ranked = list(anchors)
        ranked.sort(key=self.__sort_seed, reverse=True)
        return ranked

    # 排序方法
    def __sort_seed(self,anchor):
        """Return the numeric sort key for one anchor.

        Args:
            anchor: A dict whose ``'number'`` entry is a non-empty list; the
                first element is the text to parse, e.g. ``'9000'`` or ``'1.5万'``.

        Returns:
            The first number in the text as a float, multiplied by 10000 when
            the text contains ``'万'``. Returns 0.0 when the text holds no digits.
        """
        text = anchor['number'][0]
        # Match an optional decimal part as well, so '1.5万' is read as 1.5, not 1.
        match = re.search(r'\d+(?:\.\d+)?', text)
        if match is None:
            # No digits at all: rank the entry last instead of raising ValueError.
            return 0.0
        number = float(match.group())
        if '万' in text:
            number *= 10000
        return number

完整程序

from urllib import request
import re

class Spider:
    """Fetch one web page, extract name/number pairs with regexes and print them ranked.

    ``run`` is the only public entry point. It downloads the page at ``__url``,
    pulls the name and number out of every card, sorts the cards by number in
    descending order and prints one line per card.
    """

    # Page to crawl.
    __url = 'https://www.douyu.com/g_How'
    # Raw strings keep the backslashes intact for the regex engine; in a plain
    # literal "\s" is an invalid escape sequence that newer Python versions warn about.
    # Captures the HTML between the "mes" div and the following "impress-tag-list" div.
    __root_pattern = r'<div class="mes">([\s\S]*?)<div class="impress-tag-list">'
    # Captures the text inside the "dy-name" span.
    __name_pattern = r'<span class="dy-name ellipsis fl">([\s\S]*?)</span>'
    # Captures the text inside the "dy-num" span (note the two spaces before ">").
    __number_pattern = r'<span class="dy-num fr"  >([\s\S]*?)</span>'

    def __getHTML(self):
        """Download the page at ``__url`` and return its body decoded as UTF-8."""
        # The with-block closes the HTTP response as soon as the body is read.
        with request.urlopen(Spider.__url) as response:
            raw = response.read()  # bytes
        return str(raw, encoding='utf-8')

    def __analysis(self, html):
        """Extract the name and number of every card found in ``html``.

        Args:
            html: The page source as a string.

        Returns:
            A list of dicts ``{'name': [...], 'number': [...]}``, one per card,
            where each value is the list of regex matches inside that card.
            Cards in which the name or the number is missing are skipped.
        """
        anchors = []
        # A separate loop variable avoids shadowing the ``html`` parameter.
        for fragment in re.findall(Spider.__root_pattern, html):
            name = re.findall(Spider.__name_pattern, fragment)
            number = re.findall(Spider.__number_pattern, fragment)
            if not name or not number:
                # __sort_seed and __print index [0] on both lists; an
                # incomplete card would raise IndexError there.
                continue
            anchors.append({'name': name, 'number': number})
        return anchors

    def __refine(self, anchors):
        """Placeholder for a data clean-up step; does nothing and returns None.

        It is not called by ``run``.
        """
        pass

    def __sort(self, anchors):
        """Return a new list of anchors ordered by ``__sort_seed``, largest first."""
        return sorted(anchors, key=self.__sort_seed, reverse=True)

    def __sort_seed(self, anchor):
        """Return the numeric sort key for one anchor.

        Args:
            anchor: A dict whose ``'number'`` entry is a non-empty list; the
                first element is the text to parse, e.g. ``'9000'`` or ``'1.5万'``.

        Returns:
            The first number in the text as a float, multiplied by 10000 when
            the text contains ``'万'``. Returns 0.0 when the text holds no digits.
        """
        text = anchor['number'][0]
        # Match an optional decimal part as well, so '1.5万' is read as 1.5, not 1.
        match = re.search(r'\d+(?:\.\d+)?', text)
        if match is None:
            # No digits at all: rank the entry last instead of raising ValueError.
            return 0.0
        number = float(match.group())
        if '万' in text:
            number *= 10000
        return number

    def __print(self, anchors):
        """Print one ``name----->number`` line per anchor, in the given order."""
        for anchor in anchors:
            print(anchor['name'][0] + '----->' + anchor['number'][0])

    def run(self):
        """Entry point: fetch the page, parse it, sort the results and print them."""
        html = self.__getHTML()
        anchors = self.__analysis(html)
        anchors = self.__sort(anchors)
        self.__print(anchors)


# Script entry: build a Spider and run the fetch -> parse -> sort -> print pipeline.
# NOTE(review): this runs on import as well; consider an
# `if __name__ == "__main__":` guard if the module is ever imported.
a = Spider()
a.run()

至此，我们通过这个小爬虫把 Python 的一些基础知识（类、正则表达式、排序、网络请求）都练习了一遍。

猜你喜欢

转载自blog.csdn.net/aaalswaaa1/article/details/81280588