Python 爬虫:批量下载全民K歌音乐

网址示例: https://node.kg.qq.com/personal?uid=639e9983222a338a

直接上源码:

import requests
import time
import re
import json
import pprint
import math
import os
# Request headers carrying a desktop-browser User-Agent string.
header = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 '
        'SE 2.X MetaSr 1.0'
    ),
}

# Collected songs; each entry is a dict with the keys "name", "url", "type".
SongList = []

# Endpoint of a single work's play page (queried with the share id).
song_baseurl = "http://node.kg.qq.com/play"
# Endpoint that returns a user's works list as JSONP.
album_baseurl = "http://node.kg.qq.com/cgi/fcgi-bin/kg_ugc_get_homepage"

def Down(url_file, filePath, FileDir):
    """Download one file into ``FileDir`` and print progress to the console.

    Args:
        url_file: URL of the file to fetch.
        filePath: File name (relative to ``FileDir``) to save the download as.
        FileDir: Target directory; created when it does not exist yet.

    Returns:
        0 when the target file already exists and the download is skipped;
        otherwise None, both after a completed download and after a failure
        (failures are reported on stdout, not raised).
    """
    os.makedirs(FileDir, exist_ok=True)
    target = FileDir + "/" + filePath
    if os.path.isfile(target):
        print(filePath + "   --已存在")
        return 0
    try:
        # Validate the response before creating the file so a failed request
        # never leaves an empty file behind. The context manager releases the
        # connection on every exit path; the timeout keeps a stalled server
        # from hanging the whole batch.
        with requests.get(url_file, stream=True, timeout=30) as r:
            # Do not store an HTTP error page under a media file name.
            r.raise_for_status()
            # Content-Length is optional; 0 is treated as "size unknown" and
            # switches the progress display from a percentage to a byte count.
            size = int(r.headers.get('content-length', 0))
            if size > 0:
                # str() is required here: concatenating the int directly
                # raised TypeError and aborted every download.
                size_text = str(size) + "字节"
            else:
                size_text = "未知"
            title = "  当前下载-" + filePath + "  文件大小:" + size_text
            print('\033[0;31m' + title + "\033[0m")
            CurTotal = 0
            with open(target, "wb") as f:
                for chunk in r.iter_content(chunk_size=512 * 1024):
                    if not chunk:
                        continue
                    f.write(chunk)
                    CurTotal += len(chunk)
                    if size > 0:
                        progress = '%3s' % (str(CurTotal * 100 // size)) + "%"
                    else:
                        progress = str(CurTotal) + "字节"
                    # "\r" rewrites the same console line on each update.
                    print("\r" + filePath + "--下载进度:" + progress, end='')
            print()
    except Exception as e:
        # Best-effort downloader: report the failure and remove any partial
        # file so that a later run retries instead of skipping it as "exists".
        print(filePath + " 下载出错!" + " 错误信息" + str(e.args))
        if os.path.isfile(target):
            os.remove(target)

def GetData(data, url):
    """Issue a GET request and return the response body as text.

    Args:
        data: Mapping sent as the URL query parameters.
        url: Address to request.

    Returns:
        The raw response bytes decoded as UTF-8.
    """
    raw_body = requests.get(url, params=data, headers=header).content
    return raw_body.decode("utf-8")

def Parse_Song_Info(content):
    """Extract one song's name and media URL from a play page.

    Looks for the ``window.__DATA__ = ...; </script>`` assignment in
    ``content``, decodes it as JSON and appends a dict with the keys
    ``name``, ``url`` and ``type`` to the module-level ``SongList``.
    The audio URL (``playurl``) is preferred; when it is empty the video
    URL (``playurl_video``) is used instead.

    Args:
        content: Page text to search.

    Returns:
        None. Prints a notice when the data assignment is not found.
    """
    embedded = re.search(r'window.__DATA__ = (.*?); </script>', content)
    if embedded is None:
        print("没有爬取到")
        return

    detail = json.loads(embedded.group(1))['detail']
    song_name = detail['song_name']
    audio_url = detail['playurl']
    if audio_url:
        print(song_name + "  音乐:" + audio_url)
        entry = {"name": song_name, "url": audio_url, "type": ".mp3"}
    else:
        video_url = detail['playurl_video']
        print(song_name + "  视频:" + video_url)
        entry = {"name": song_name, "url": video_url, "type": ".mp4"}
    SongList.append(entry)

def GetSongsByIndex(uid, Is_Parse, page):
    """Fetch one page of a user's works list and optionally parse each work.

    Args:
        uid: Value sent as the ``share_uid`` query parameter.
        Is_Parse: When true, pause one second, then fetch and parse the play
            page of every entry in the returned list. When false, only print
            the total count.
        page: Value sent as the ``start`` query parameter.

    Returns:
        The ``ugc_total_count`` value from the response, or 0 when the JSONP
        wrapper is not found in the response body.
    """
    query = {
        'jsonpCallback': 'callback_0',
        'g_tk': '5381',
        'outCharset': 'utf-8',
        'format': 'jsonp',
        'type': 'get_ugc',
        'start': str(page),
        'num': '8',
        'touin': '',
        'share_uid': uid,
        'g_tk_openkey': '5381',
        # Current time in milliseconds.
        '_': str(int(time.time() * 1000)),
    }
    reply = requests.get(album_baseurl, params=query, headers=header)
    body_text = reply.content.decode("utf-8")

    # The JSON payload arrives wrapped as callback_N(...); unwrap it.
    wrapped = re.search(r'callback_\d\((.*)\)', body_text)
    if wrapped is None:
        print("没有爬取到")
        return 0

    payload = json.loads(wrapped.group(1))
    total = payload['data']['ugc_total_count']
    if not Is_Parse:
        pprint.pprint("共计:" + str(total))
        return total

    time.sleep(1)
    for work in payload['data']['ugclist']:
        print(work['title'] + "  -- " + work['shareid'])
        page_text = GetData({"s": work['shareid']}, song_baseurl)
        Parse_Song_Info(page_text)
    return total

def Run(uid):
    """Collect every song of the user identified by ``uid``.

    A first request reads the total number of works; the list is then
    walked page by page (8 entries per page) with parsing enabled.

    Args:
        uid: User id passed through to ``GetSongsByIndex``.
    """
    total = GetSongsByIndex(uid, False, 1)
    if total == 0:
        print("该用户没有歌曲")
        return

    last_page = math.ceil(total / 8)
    for page_no in range(1, last_page + 1):
        GetSongsByIndex(uid, True, page_no)
if __name__ == "__main__":
    # Gather the song list for the given user id, then download each entry
    # into the "小小" folder, named <song name><extension>.
    Run('639e9983222a338a')
    for song in SongList:
        source_url = song["url"]
        file_name = song["name"] + song["type"]
        Down(source_url, file_name, "小小")

使用方法:将 Run(...) 括号里的字符串替换为歌手主页链接中 uid= 后面的值(例如上面示例网址中的 639e9983222a338a)。

“小小”是保存下载文件的文件夹名称,可以自行修改。

猜你喜欢

转载自www.cnblogs.com/yuanzessrs/p/10247347.html