# Scrape the Kugou music ranking chart
import requests
from bs4 import BeautifulSoup
import re
import time
import lxml
class Spider():
    """Fetch one page of the Kugou ranking chart and print its rows.

    Constructing an instance performs the HTTP request and selects the
    rank, title and duration elements from the returned HTML; ``go()``
    then turns them into dictionaries and prints one line per entry.
    """

    # NOTE(review): the original URL contained '%7B%20%7D', i.e. a
    # percent-encoded '{ }' placeholder that was never filled in.
    # Presumably it stands for the page number; page 1 is assumed to be
    # the intended default -- confirm against the site.
    url_template = 'https://www.kugou.com/yy/rank/home/{page}-8888.html'
    # Default address fetched when no explicit page is requested.
    url = url_template.format(page=1)
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/79.0.3945.88 Safari/537.36'
        ),
    }

    def __init__(self, page=None, timeout=10):
        """Download the ranking page and collect the raw HTML elements.

        Args:
            page: Page number substituted into ``url_template``. When
                ``None`` (the default) ``Spider.url`` is requested.
            timeout: Seconds passed to ``requests.get`` so that a stalled
                connection cannot block forever.
        """
        if page is None:
            target = Spider.url
        else:
            target = Spider.url_template.format(page=page)
        response = requests.get(
            target, headers=Spider.headers, timeout=timeout
        )
        soup = BeautifulSoup(response.text, 'lxml')
        self.ranks = soup.select('span.pc_temp_num')
        self.titles = soup.select('div.pc_temp_songlist > ul > li > a')
        self.times = soup.select('span.pc_temp_tips_r > span')

    def __analyse(self):
        """Build, store in ``self.datas`` and return the list of rows.

        Each row is a dict with the keys ``'rank'``, ``'title'`` and
        ``'time'``. The three element lists are walked in lockstep, so
        the result is as long as the shortest of them.
        """
        self.datas = [
            {
                'rank': rank_el.get_text().strip(),
                # Keep only the text before the first '-'.
                # NOTE(review): presumably the link text is
                # "<artist> - <song>"; verify which part is wanted.
                'title': title_el.get_text().split('-')[0].strip(),
                'time': time_el.get_text().strip(),
            }
            # The loop names deliberately avoid shadowing the imported
            # ``time`` module.
            for rank_el, title_el, time_el in zip(
                self.ranks, self.titles, self.times
            )
        ]
        return self.datas

    def __refine(self, datas):
        """Print every row as ``rank~~~title~~~time``, one per line."""
        for data in datas:
            print(f"{data['rank']}~~~{data['title']}~~~{data['time']}")

    def go(self):
        """Analyse the elements collected by ``__init__`` and print them."""
        datas = self.__analyse()
        self.__refine(datas)
# Script entry: constructing Spider issues the HTTP request, and go()
# prints the parsed rows to standard output.
spider=Spider()
spider.go()