- 用requests库和BeautifulSoup库爬取“最好大学网”中的大学排名信息。
- 代码如下:
import requests
from bs4 import BeautifulSoup
import bs4
def getHTMLText(url):
    """Download *url* and return the response body as text.

    A desktop-browser User-Agent header is sent with the request, and the
    request times out after 30 seconds.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text on success, or the sentinel string
        "产生异常" when the request fails (connection error, timeout,
        or a non-2xx status reported by ``raise_for_status``).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
    }
    try:
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
        # Decode with the encoding detected from the body rather than the
        # one declared in the HTTP headers.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only network/HTTP failures are converted to the sentinel; a bare
        # ``except`` would also hide KeyboardInterrupt and programming errors.
        return "产生异常"
def fillUnivList(univ_list, html):
    """Parse the first ``<tbody>`` in *html* and append one entry per row.

    For every child tag of the table body, the ``.string`` values of its
    first four ``<td>`` cells are appended to *univ_list* as a 4-item list.
    (Presumably rank, name, province and score, judging by the header the
    printing code uses -- confirm against the page.)

    Args:
        univ_list: List that is extended in place.
        html: HTML text to parse.

    Returns:
        None. If the document has no ``<tbody>`` nothing is appended.
    """
    soup = BeautifulSoup(html, "html.parser")
    tbody = soup.find('tbody')
    if tbody is None:
        # No table in the document (for example the download failed and
        # *html* is just an error message): nothing to extract.
        return
    for tr in tbody.children:
        # Children also include non-Tag nodes such as whitespace strings
        # between rows; skip anything that is not a bs4 Tag.
        if not isinstance(tr, bs4.element.Tag):
            continue
        tds = tr('td')  # shorthand for tr.find_all('td')
        if len(tds) < 4:
            # Not a complete data row; skip it instead of raising IndexError.
            continue
        univ_list.append([td.string for td in tds[:4]])
def printUnivList(ulist, num):
    """Print a header line followed by at most *num* rows of *ulist*.

    Each row is a sequence whose first four items are printed as
    tab-separated, centred columns under the headers 排名 (rank),
    学校名称 (school name), 省市 (province/city) and 总分 (total score).

    Args:
        ulist: List of rows; each row needs at least four items.
        num: Maximum number of rows to print. If *ulist* is shorter, only
            the available rows are printed; a non-positive value prints
            just the header.
    """
    # Mixing full-width (CJK) and half-width characters breaks alignment in
    # the school-name column (field 1), so that column is padded with the
    # full-width space chr(12288), passed to format() as argument 4.
    tplt = "{0:^9}\t{1:{4}^10}\t{2:^10}\t{3:^4}"
    pad = chr(12288)
    print(tplt.format("排名", "学校名称", "省市", "总分", pad))
    # Slicing (rather than indexing range(num)) avoids IndexError when fewer
    # than num rows are available.
    for u in ulist[:max(num, 0)]:
        # A None cell cannot be formatted with an alignment spec; show it
        # as an empty field instead of raising TypeError.
        rank, name, province, score = (
            "" if cell is None else cell for cell in u[:4]
        )
        print(tplt.format(rank, name, province, score, pad))
def main():
    """Fetch the 2016 ranking page, parse it, and print the first 20 rows."""
    ranking_url = 'http://www.zuihaodaxue.cn/zuihaodaxuepaiming2016.html'
    rows = []
    page_text = getHTMLText(ranking_url)
    fillUnivList(rows, page_text)
    print()  # blank line before the table
    printUnivList(rows, 20)  # show 20 universities
# Run the crawler as soon as this module is loaded (there is no
# ``if __name__ == "__main__"`` guard, so importing the file also runs it).
main()
记录学习