爬取酷狗音乐

思路

获得榜单超链接列表
遍历榜单列表，拿到各个榜单的歌曲超链接
遍历歌曲超链接，播放歌曲
播放页面就可以将歌曲爬下来（有版权的依然爬不到，笔者还在想办法）

在播放界面获得歌曲名
在播放页面（播放时一般有flash）找到歌曲的超链接，下载就可以了。（QQ音乐无法使用同样的方法：QQ音乐进入播放页面时默认不播放，直接通过网址进入是空白页面。虾米音乐的下载链接在歌曲超链接那里，不在播放界面，而且虾米音乐经过凯撒加密。）

代码

import os
import re
import time

import requests
from lxml import etree
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait

# Shared Chrome WebDriver instance used by every function below.
# NOTE: this launches a browser as soon as the module is loaded.
browser = webdriver.Chrome()

# Explicit-wait helper bound to the shared browser with a 10-second timeout.
# NOTE(review): not referenced anywhere in the visible code.
wait = WebDriverWait(browser, 10)


def _safe_filename(text):
    """Return *text* stripped of whitespace and of characters unsafe in file names."""
    # Besides '|', '/', '\\' and '.', also drop the remaining characters
    # that Windows rejects in file names (: * ? " < >).
    return re.sub(r'[\\/:*?"<>|.]', '', text.strip())


def song_play(url):
    """Open a song's play page and save its audio and lyrics to disk.

    The page is loaded in the shared ``browser``. The first lyric text node
    is used as the base file name; the audio goes to ``./mp3/<name>.mp3``
    and the lyric text is appended to ``./song/<name>.txt``.

    This is best-effort: a song whose page lacks the expected elements, or
    whose download or file write fails, is reported on stdout and skipped.

    Args:
        url: Address of the song's play page.
    """
    # Open the song's play page.
    browser.get(url)
    etree_html = etree.HTML(browser.page_source)

    # Lyric text nodes; the first one doubles as the song's file name.
    song_lyric = etree_html.xpath('//p[@class="ie8FontColor"]/text()')
    # Link(s) to the audio file of the song.
    song_src = etree_html.xpath('//audio[@class="music"]/@src')
    if not song_lyric or not song_src:
        print('skip (no lyric or audio link found):', url)
        return

    # Build the file name once and reuse it for both output files.
    name = _safe_filename(song_lyric[0])
    if not name:
        print('skip (empty file name after sanitising):', url)
        return
    song_lyric_mp3 = './mp3/' + name + '.mp3'
    song_lyric_txt = './song/' + name + '.txt'

    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    }
    try:
        # A timeout keeps one stalled download from hanging the whole crawl;
        # raise_for_status() keeps HTTP error bodies from being saved as audio.
        response = requests.get(song_src[0], headers=headers, timeout=30)
        response.raise_for_status()

        # The output directories may not exist on a first run.
        os.makedirs('./mp3', exist_ok=True)
        os.makedirs('./song', exist_ok=True)

        # Write the audio to disk.
        with open(song_lyric_mp3, 'wb') as f:
            f.write(response.content)
        # Pause between downloads.
        time.sleep(2)

        print(song_lyric_txt)
        # Write the lyrics to disk.
        with open(song_lyric_txt, 'a', encoding='utf-8') as f:
            f.writelines(song_lyric)
    except (requests.RequestException, OSError) as exc:
        # Only expected network/file errors are skipped; anything else
        # (including KeyboardInterrupt) propagates to the caller.
        print('skip (download or write failed):', url, exc)



def index_page(url):
    """Load one chart page and hand every song link on it to ``song_play``.

    Args:
        url: Address of the chart (ranking list) page.
    """
    # Open the chart in the shared browser and parse the rendered HTML.
    browser.get(url)
    chart_tree = etree.HTML(browser.page_source)
    # Song links are the anchors directly inside <li class=" "> items.
    for song_link in chart_tree.xpath('//li[@class=" "]/a/@href'):
        song_play(song_link)


def parse_page(page_source):
    """Extract the chart links from the ranking page and crawl each chart.

    Args:
        page_source: HTML text of the ranking overview page.
    """
    # Parse the HTML and collect the chart links from the side panel.
    chart_links = etree.HTML(page_source).xpath(
        '//div[@class="pc_temp_side"]/div//li/a/@href'
    )
    # Crawl the charts one after another.
    for chart_url in chart_links:
        index_page(chart_url)


def main():
    """Crawl the Kugou ranking page and every chart linked from it.

    The shared ``browser`` is shut down when the crawl ends, whether it
    finished normally or was aborted by an error, so no Chrome/driver
    process is left running.
    """
    try:
        # Open the ranking overview page.
        browser.get('http://www.kugou.com/yy/html/rank.html')
        # Pass the rendered HTML on to the chart extraction.
        parse_page(browser.page_source)
    finally:
        # Always release the browser and its driver process.
        browser.quit()


# Start the crawl only when this file is executed as a script.
if __name__ == '__main__':
    main()

思路

代码

猜你喜欢