前言
上一篇博客使用requests库加selenium库加Xpath解析html的技术路线下载了单首歌曲,但是存在一个问题:webdriver打开网页十分慢。这一节解决这个问题并修改代码批量下载。
上一节:Python网络爬虫与信息提取(10)—— 代码搜索全网音乐爬取并下载
目标
将想要下载的歌曲名字存在列表中,用代码批量搜索并下载这些歌曲。
准备
因为webdriver打开网页缓慢的原因,我考虑使用selenium控制已经打开的chrome网页,毕竟直接用chrome搜索歌曲和打开网页还是很快的。
首先在pycharm中打开终端,输入以下命令切换到谷歌浏览器目录下:
cd /d "C:\Program Files (x86)\Google\Chrome\Application"
执行命令打开谷歌浏览器并保存配置在本地,这里路径可以选择和代码同级目录:
chrome.exe --remote-debugging-port=9222 --user-data-dir="e:\py_code\Reptile"
执行后会打开谷歌浏览器 ,在地址栏输入我们的网址:
http://www.gequdaquan.net/gqss/index.html
OK,准备工作完成!接下来交给脚本干活。
代码设计
网页驱动的配置:
# Path of the chromedriver executable handed to webdriver.Chrome below.
chrome_driver = "chromedriver.exe"

# Attach to the Chrome instance that was started with
# --remote-debugging-port=9222 instead of launching a new browser.
chrome_options = Options()
# chrome_options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
chrome_options.debugger_address = "127.0.0.1:9222"

driver = webdriver.Chrome(chrome_driver, chrome_options=chrome_options)
这没啥好讲的,固定格式和端口号,具体参照谷歌中自动控制说明。
网页静音
因为前一篇文章说了下载歌曲需要点击播放,所以静音好点:
# Best-effort mute: click the player's mute button if it can be found.
try:
    driver.find_element_by_xpath("//a[@class='player-btn btn-quiet']").click()
except Exception:
    # The lookup fails when the button no longer carries this class (for
    # example because the player is already muted); that is fine, so the
    # error is ignored. Catching Exception rather than using a bare except
    # keeps Ctrl+C (KeyboardInterrupt) working.
    pass
这里用try except框架,因为如果用户已经点击过静音,静音按钮的class属性会发生变化,再按原来的xpath就找不到该按钮,从而引发异常,所以出现异常时直接跳过即可。
初始化
这里初始化搜索的所有歌曲名字,本地保存路径和歌曲下载地址的列表:
# Song titles to search for; one file is downloaded per entry.
search_name = ["万有引力","苦笑"]
# Directory the downloaded .mp3 files are written to.
savaer_path = "D://music//"
# Audio URLs collected for the titles above, appended in the same order.
# NOTE(review): this name shadows the built-in `list`; it is kept because the
# later snippets refer to it by this name.
list = []
批量获取url
下面这段代码调用我们自己写的getMusicUrl函数获取每个搜索名字对应歌曲的URL:
# Resolve every requested title to its audio URL, keeping the same order as
# search_name so the two sequences can be paired up later.
print("开始获取url")
for name in search_name:
    list.append(getMusicUrl(driver, name))
函数具体实现和上一篇博客基本相同,不做赘述:
def getMusicUrl(driver, search_name):
    """Search the site for a song and return the audio URL of the first hit.

    The driver must already be showing the music site's page; this function
    does not navigate anywhere. It opens the search box, submits the query,
    starts playback of the first result so that the page's <audio> element
    receives its ``src``, reads that URL, then pauses playback again.

    Args:
        driver: Selenium WebDriver attached to the browser.
        search_name: Song title typed into the search box.

    Returns:
        The ``src`` attribute of the page's <audio> element, as a string.
        Note that ``str()`` turns a missing attribute into the text "None".
    """
    print("打开搜素框")
    driver.find_element_by_xpath("//span[@data-action = \"search\"]").click()

    print("搜索音乐")
    search_box = getXpath(driver, "//div[@class='search-group']/input[@id='search-wd']")
    search_box.clear()
    search_box.send_keys(search_name)
    getXpath(driver, "//div[@class='search-group']/button[@class='search-submit']").submit()
    time.sleep(5)

    print("播放音乐")
    first_item = "//div[@class='list-item'][1]"
    play_icon = (first_item
                 + "/span[@class='music-name']/div[@class='list-menu']"
                 + "/span[@class='list-icon icon-play']")
    # The search reloads the result list, so the hover/click can fail while
    # the page is still changing; retry until it goes through.
    while True:
        try:
            target = getXpath(driver, first_item)
            # The play icon only becomes clickable while the row is hovered.
            ActionChains(driver).move_to_element(target).perform()
            getXpath(driver, play_icon).click()
            break
        except Exception:
            # Catch Exception (not a bare except) so Ctrl+C can still stop the
            # loop, and pause briefly instead of spinning at full speed.
            time.sleep(0.5)

    print("获取地址")
    music_url = str(getXpath(driver, "//audio").get_attribute("src"))
    print(music_url)

    print("关闭音乐")
    getXpath(driver, "//a[@class='player-btn btn-play btn-state-paused']").click()
    return music_url
批量下载
下面这段代码遍历url列表,并批量下载:
# Download every collected URL; the URLs were appended in the same order as
# search_name, so zip() pairs each URL with the title it was found for.
print("开始下载")
for song_name, _url in zip(search_name, list):
    DownloadFile(_url, savaer_path, song_name + ".mp3")
下载代码,上一篇博客已经详细介绍,不做赘述:
def DownloadFile(mp3_url, save_url, file_name):
    """Download one file over HTTP and store it as save_url/file_name.

    Failures are reported on stdout instead of being raised, so a batch
    download carries on with the next song.

    Args:
        mp3_url: URL of the resource to download.
        save_url: Directory to write into; created if it does not exist.
        file_name: Name of the file created inside ``save_url``.

    Returns:
        None in every case.
    """
    if mp3_url is None or save_url is None or file_name is None:
        print('参数错误')
        return None

    try:
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(save_url, exist_ok=True)
        file_path = os.path.join(save_url, file_name)

        # Stream the body so the whole file is never held in memory; the
        # timeout keeps a stalled server from hanging the script forever.
        with requests.get(mp3_url, stream=True, timeout=30) as res:
            # Without this an HTTP error page would be saved as the "song".
            res.raise_for_status()
            print('开始写入文件:', file_path)
            with open(file_path, 'wb') as fd:
                # iter_content() defaults to 1-byte chunks; use 64 KiB blocks.
                for chunk in res.iter_content(chunk_size=64 * 1024):
                    fd.write(chunk)
        print(file_name + ' 成功下载!')
    except (requests.RequestException, OSError) as err:
        # Keep the best-effort behaviour, but say what actually went wrong.
        print("程序错误", err)
    return None
运行验证
完整代码
# coding=utf-8
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
import requests
import os
import time
def DownloadFile(mp3_url, save_url, file_name):
    """Download one file over HTTP and store it as save_url/file_name.

    Failures are reported on stdout instead of being raised, so a batch
    download carries on with the next song.

    Args:
        mp3_url: URL of the resource to download.
        save_url: Directory to write into; created if it does not exist.
        file_name: Name of the file created inside ``save_url``.

    Returns:
        None in every case.
    """
    if mp3_url is None or save_url is None or file_name is None:
        print('参数错误')
        return None

    try:
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(save_url, exist_ok=True)
        file_path = os.path.join(save_url, file_name)

        # Stream the body so the whole file is never held in memory; the
        # timeout keeps a stalled server from hanging the script forever.
        with requests.get(mp3_url, stream=True, timeout=30) as res:
            # Without this an HTTP error page would be saved as the "song".
            res.raise_for_status()
            print('开始写入文件:', file_path)
            with open(file_path, 'wb') as fd:
                # iter_content() defaults to 1-byte chunks; use 64 KiB blocks.
                for chunk in res.iter_content(chunk_size=64 * 1024):
                    fd.write(chunk)
        print(file_name + ' 成功下载!')
    except (requests.RequestException, OSError) as err:
        # Keep the best-effort behaviour, but say what actually went wrong.
        print("程序错误", err)
    return None
def getMusicUrl(driver, search_name):
    """Search the site for a song and return the audio URL of the first hit.

    The driver must already be showing the music site's page; this function
    does not navigate anywhere. It opens the search box, submits the query,
    starts playback of the first result so that the page's <audio> element
    receives its ``src``, reads that URL, then pauses playback again.

    Args:
        driver: Selenium WebDriver attached to the browser.
        search_name: Song title typed into the search box.

    Returns:
        The ``src`` attribute of the page's <audio> element, as a string.
        Note that ``str()`` turns a missing attribute into the text "None".
    """
    print("打开搜素框")
    driver.find_element_by_xpath("//span[@data-action = \"search\"]").click()

    print("搜索音乐")
    search_box = getXpath(driver, "//div[@class='search-group']/input[@id='search-wd']")
    search_box.clear()
    search_box.send_keys(search_name)
    getXpath(driver, "//div[@class='search-group']/button[@class='search-submit']").submit()
    time.sleep(5)

    print("播放音乐")
    first_item = "//div[@class='list-item'][1]"
    play_icon = (first_item
                 + "/span[@class='music-name']/div[@class='list-menu']"
                 + "/span[@class='list-icon icon-play']")
    # The search reloads the result list, so the hover/click can fail while
    # the page is still changing; retry until it goes through.
    while True:
        try:
            target = getXpath(driver, first_item)
            # The play icon only becomes clickable while the row is hovered.
            ActionChains(driver).move_to_element(target).perform()
            getXpath(driver, play_icon).click()
            break
        except Exception:
            # Catch Exception (not a bare except) so Ctrl+C can still stop the
            # loop, and pause briefly instead of spinning at full speed.
            time.sleep(0.5)

    print("获取地址")
    music_url = str(getXpath(driver, "//audio").get_attribute("src"))
    print(music_url)

    print("关闭音乐")
    getXpath(driver, "//a[@class='player-btn btn-play btn-state-paused']").click()
    return music_url
def getXpath(driver, path, poll_interval=0.2, timeout=None):
    """Wait until an element matching ``path`` can be found and return it.

    The lookup is retried whenever it raises, which covers the period in
    which the page is still (re)loading.

    Args:
        driver: Object providing ``find_element_by_xpath(path)``.
        path: XPath expression of the wanted element.
        poll_interval: Seconds to sleep between failed attempts.
        timeout: Maximum seconds to keep trying, or None (the default) to
            retry indefinitely, as the function always did.

    Returns:
        The element returned by the first successful lookup.

    Raises:
        TimeoutError: If ``timeout`` is set and expires before a lookup
            succeeds.
    """
    deadline = None if timeout is None else time.monotonic() + timeout
    while True:
        try:
            # Return the element from the successful lookup itself; looking
            # it up a second time could fail if the page changed in between.
            return driver.find_element_by_xpath(path)
        except Exception as err:
            # Exception (not a bare except) lets Ctrl+C interrupt the wait.
            if deadline is not None and time.monotonic() >= deadline:
                raise TimeoutError(
                    "element not found within %s s: %s" % (timeout, path)
                ) from err
            time.sleep(poll_interval)
if __name__ == '__main__':
    # Song titles to fetch and the directory the .mp3 files are saved in.
    search_name = ["万有引力", "苦笑"]
    savaer_path = "D://music//"
    # Named music_urls rather than `list` so the built-in is not shadowed.
    music_urls = []

    # Attach to the Chrome instance already listening on the remote-debugging
    # port instead of starting a fresh browser.
    chrome_options = Options()
    chrome_options.debugger_address = "127.0.0.1:9222"
    chrome_driver = "chromedriver.exe"
    driver = webdriver.Chrome(chrome_driver, chrome_options=chrome_options)

    try:
        # Best-effort mute: the lookup fails when the button does not carry
        # this class (e.g. the player is already muted), which is harmless.
        try:
            driver.find_element_by_xpath("//a[@class='player-btn btn-quiet']").click()
        except Exception:
            pass

        print("开始获取url")
        for name in search_name:
            music_urls.append(getMusicUrl(driver, name))
    finally:
        # Always release the driver session, even if a lookup above raised.
        driver.quit()

    print("开始下载")
    # URLs were collected in search_name order, so zip() pairs them up
    # without a hand-maintained index counter.
    for name, _url in zip(search_name, music_urls):
        DownloadFile(_url, savaer_path, name + ".mp3")