python reptile notes (five) the Web crawler extracts - Example: Chinese university rankings reptiles

1. Chinese university rankings directional reptiles

Website: http://www.zuihaodaxue.com/zuihaodaxuepaiming2016.html

View the source code, find information directly written in HTML, that is the reptiles orientation can be achieved

2. The structural design of the program

2. Examples of writing

2.1 General Framework Code

# -*- coding: utf-8 -*-
"""
Created on Thu Jan 30 01:27:38 2020

@author: douzi
"""

import requests
from bs4 import BeautifulSoup

# Convert url into HTML 
DEF getHTMLText (url):
         return  ""

# Extract html information critical data, and extract it to the list 
DEF fillUnivList (Ulist, html):
     Pass    

def printUnivList(ulist, num):  # 打印num所大学信息
    print("Suc" + str(num))
    
def main():
    # 大学信息放到列表中
    uinfo = []       
    # 大学排名的url
    url = "http://www.zuihaodaxue.com/zuihaodaxuepaiming2016.html"
    # 将url转换成html
    html = getHTMLText(url)   
    fillUnivList(uinfo, html)
    printUnivList(uinfo, 20)    # 20 univs
    
if __name__ == '__main__':
    main()
    

2.2 将url转换成html

import requests
from bs4 import BeautifulSoup

def getHTMLText(url):
    try:
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"}
        r = requests.get(url, timeout=30, headers=headers)
        r.raise_for_status()              # 产生异常信息
        r.encoding = r.apparent_encoding  # 修改编码
        return r.text   # 返回网页信息
    except:
        return ""

2.3 提取html信息中关键的数据,并提取到列表中

(1)观察源代码

 

 

  • 所有大学信息被封装在表格中,这个表格标签叫 tbody
  • 在 tbody 中,每个大学信息又被封装在 tr 中,每个 tr 标签,包含所有当前大学的所有信息
  • 每个 tr 中信息,又被 td 所包围

(2)遍历tbody中类型为tr的标签,查询tr标签中的td标签,所对应的字符串信息

# 提取html信息中关键的数据,并提取到列表中
def fillUnivList(ulist, html):
    soup = BeautifulSoup(html, "html.parser")
    # 所有大学信息被封装在表格中,这个表格标签叫tbody
    # 在tbody中,每个大学信息又被封装在tr中,每个tr标签,包含所有当前大学的所有信息
    # 每个tr中信息,又被td所包围
    # 1. 遍历tbody,tr即每个大学的信息
    for tr in soup.find('tbody').children:
        # 过滤非标签类型的其他数据
        if isinstance(tr, bs4.element.Tag):
            tds = tr('td')    # 查询tr中的 td
            ulist.append([tds[0].string, tds[1].string, tds[3].string])
    

注意:用法 a = soup("xxx"), a[0].string

2.4 格式化输出

def printUnivList(ulist, num):
    print("{:^10}\t{:^6}\t{:^10}".format("排名", "学校名称", "总分"))
    for i in range(num):
        u = ulist[i]
        print("{:^10}\t{:^6}\t{:^10}".format(u[0], u[1], u[2]))    

3. 全部代码

# -*- coding: utf-8 -*-
"""
Created on Thu Jan 30 01:27:38 2020

@author: douzi
"""

import requests
from bs4 import BeautifulSoup
import bs4

def getHTMLText(url):
    try:
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"}
        r = requests.get(url, timeout=30, headers=headers)
        r.raise_for_status()              # 产生异常信息
        r.encoding = r.apparent_encoding  # 修改编码
        return r.text   # 返回网页信息
    except:
        return ""

# 提取html信息中关键的数据,并提取到列表中
def fillUnivList(ulist, html):
    soup = BeautifulSoup(html, "html.parser")
    # 所有大学信息被封装在表格中,这个表格标签叫tbody
    # 在tbody中,每个大学信息又被封装在tr中,每个tr标签,包含所有当前大学的所有信息
    # 每个tr中信息,又被td所包围
    # 1. 遍历tbody,tr即每个大学的信息
    for tr in soup.find('tbody').children:
        # 过滤非标签类型的其他数据
        if isinstance(tr, bs4.element.Tag):
            tds = tr('td')    # 查询tr中的 td
            ulist.append([tds[0].string, tds[1].string, tds[3].string])
            
    
def printUnivList(ulist, num):
    print("{:^10}\t{:^6}\t{:^10}".format("排名", "学校名称", "总分"))
    for i in range(num):
        u = ulist[i]
        print("{:^10}\t{:^6}\t{:^10}".format(u[0], u[1], u[2]))    
    
    
def main():
    # 大学信息放到列表中
    uinfo = []       
    # 大学排名的url
    url = "http://www.zuihaodaxue.com/zuihaodaxuepaiming2016.html"
    # 将url转换成html
    html = getHTMLText(url)   
    fillUnivList(uinfo, html)
    printUnivList(uinfo, 20)    # 20 univs
    
if __name__ == '__main__':
    main()
    

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

Guess you like

Origin www.cnblogs.com/douzujun/p/12241972.html