# Crawler: NetEase Cloud Music playlist downloader
'''NetEase Cloud Music hot playlists
1. https://music.163.com/discover/playlist — open the playlist index page and
   collect every playlist as datas = [('playlist name 1', 'playlist id'), ...]
2. 'https://music.163.com/playlist?id=%s' % data[1] — open each playlist page
3. Regex-match the id/title of each song in the playlist
4. Download each song with requests.get().content
5. Parallelize the downloads with multiprocessing.Pool
'''
import json
import os
import re
from multiprocessing import Pool

import requests
from fake_useragent import UserAgent
class WangyiSpider(object):
    """Crawler that downloads songs from NetEase Cloud Music hot playlists."""

    def __init__(self):
        """Crawl all songs of the first 2 playlists."""
        # Referer header is required by music.163.com to serve playlist pages.
        self.headers = {
            'Referer': 'https://music.163.com/discover/playlist',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/72.0.3626.119 Safari/537.36'
        }

    def get_song(self):
        """Fetch the playlist index page and download songs from the first 2 playlists."""
        songlist_url = 'https://music.163.com/discover/playlist'
        # NOTE(review): verify=False disables TLS certificate checks — kept to
        # preserve the original behavior, but consider removing it.
        songlist_res = requests.get(songlist_url, verify=False, headers=self.headers)
        # Each playlist is an <a title="..." href="/playlist?id=..." class="msk"> tag.
        # Raw string avoids invalid-escape warnings for \? and \d on Python 3.12+.
        a_data = re.findall(
            r'<a title="(.*?)" href="/playlist\?id=(\d+)" class="msk"></a>',
            songlist_res.text)
        # Process pool for parallel downloads; the context manager ensures the
        # pool is terminated and workers are reaped when map() returns.
        with Pool(processes=4) as pool:
            pool.map(self.get_song_content, a_data[:2])

    def get_song_content(self, data):
        """Download the first 3 songs of one playlist.

        data: tuple of (playlist title, playlist id), e.g.
              ('世界它太小,小到容不下爱人的心', '2885665791')
        """
        url = 'https://music.163.com/playlist?id=%s' % data[1]
        # Fetch the playlist page and collect (song id, song title) pairs.
        res = requests.get(url, headers=self.headers, verify=False)
        # Create the output directory up front; open() below fails otherwise.
        os.makedirs('music', exist_ok=True)
        for song_id, title in re.findall(r'<li><a href="/song\?id=(\d+)">(.*?)</a></li>', res.text)[:3]:
            song_url = 'https://music.163.com/song/media/outer/url?id=%s' % song_id
            song_content = requests.get(song_url, headers=self.headers, verify=False)
            # Strip characters that are illegal in file names (e.g. '/' in titles).
            safe_title = re.sub(r'[\\/:*?"<>|]', '_', title)
            try:
                with open('music/%s.mp3' % safe_title, 'wb') as f:
                    f.write(song_content.content)
                print(title + ' 下载完成!')
            except OSError as e:
                # Best-effort: report the failed file and keep downloading.
                print(e)
if __name__ == '__main__':
    # Entry point: crawl the hot-playlist index and download the songs.
    spider = WangyiSpider()
    spider.get_song()