A University Ranking Web Crawler Based on the requests and BeautifulSoup Libraries

  • Use the requests and BeautifulSoup libraries to scrape the university ranking information from 最好大学网 (zuihaodaxue.cn, the "Best Chinese Universities" ranking site).
  • The code is as follows:
import requests
from bs4 import BeautifulSoup
import bs4


def getHTMLText(url):
    try:
        headers = {
            # Send a browser-like User-Agent so the site does not reject the request
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
        }
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()  # raise HTTPError for non-2xx responses
        r.encoding = r.apparent_encoding  # infer the encoding from the page content
        return r.text
    except requests.RequestException:
        return "产生异常"  # "an exception occurred"


def fillUnivList(univ_list, html):
    soup = BeautifulSoup(html, "html.parser")
    for tr in soup.find('tbody').children:
        # Skip children that are not bs4 Tag objects: the newlines between
        # <tr> tags are NavigableString nodes and must be filtered out
        if isinstance(tr, bs4.element.Tag):
            tds = tr('td')  # equivalent to tr.find_all('td')
            # keep rank, school name, province/city and total score
            univ_list.append([tds[0].string, tds[1].string, tds[2].string, tds[3].string])


def printUnivList(ulist, num):
    # Mixing full-width Chinese and half-width ASCII characters breaks the
    # alignment of field 1 (the school name), so that field uses a custom fill:
    # the extra format argument at index 4 supplies the fill character
    tplt = "{0:^9}\t{1:{4}^10}\t{2:^10}\t{3:^4}"
    # 排名 = rank, 学校名称 = school name, 省市 = province/city, 总分 = total score;
    # chr(12288) is the full-width (ideographic) space used for padding
    print(tplt.format("排名", "学校名称", "省市", "总分", chr(12288)))
    for i in range(num):
        u = ulist[i]
        print(tplt.format(u[0], u[1], u[2], u[3], chr(12288)))


def main():
    uinfo = []
    url = 'http://www.zuihaodaxue.cn/zuihaodaxuepaiming2016.html'
    html = getHTMLText(url)
    fillUnivList(uinfo, html)
    print()
    printUnivList(uinfo, 20)  # print the top 20 universities


if __name__ == '__main__':
    main()
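
To make the row-extraction logic in fillUnivList easier to follow, here is a minimal, self-contained sketch that runs the same loop over a hand-written HTML fragment. The fragment, the school names and the scores are made up for illustration and only assume the <tbody>/<tr>/<td> layout the crawler expects; the real page markup may differ.

import bs4
from bs4 import BeautifulSoup

# Illustrative fragment only: same <tbody><tr><td> structure as the target page,
# but the universities and scores are placeholders, not real data
sample_html = """
<table><tbody>
  <tr><td>1</td><td>某大学A</td><td>北京</td><td>95.0</td></tr>
  <tr><td>2</td><td>某大学B</td><td>上海</td><td>90.0</td></tr>
</tbody></table>
"""

rows = []
soup = BeautifulSoup(sample_html, "html.parser")
for tr in soup.find('tbody').children:
    if isinstance(tr, bs4.element.Tag):  # whitespace between rows is NavigableString, not Tag
        tds = tr('td')
        rows.append([tds[0].string, tds[1].string, tds[2].string, tds[3].string])
print(rows)  # -> [['1', '某大学A', '北京', '95.0'], ['2', '某大学B', '上海', '90.0']]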
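
The nested format spec {1:{4}^10} in printUnivList is what keeps the Chinese output aligned: the extra argument chr(12288), the full-width ideographic space, is used as the fill character for the school-name field, because an ordinary ASCII space is only half the width of a Chinese character. A small sketch of the same trick, with a placeholder data row:

# Sketch of the full-width padding trick; the data row is a placeholder
tplt = "{0:^9}\t{1:{4}^10}\t{2:^10}\t{3:^4}"
print(tplt.format("排名", "学校名称", "省市", "总分", chr(12288)))
print(tplt.format("1", "某某大学", "北京", "95.0", chr(12288)))
# Replacing chr(12288) with ' ' makes short school names pad with
# half-width spaces, and the columns no longer line up visually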


Recording this for learning purposes.
