Selenium + Requests: crawling live audio data from a "fish" live-streaming site (Part 1)

Use the Cat Claw browser plugin to inspect the video page:

Cat Claw plugin address: https://download.csdn.net/download/qq_35944102/12751647

Insert picture description here
Look up the address of the .m3u8 file:

https://apd-d76221faa00f776e123126c247fccbdb.v.smtcdns.com/sportsts.tc.qq.com/AaypdBLtr9u-YamHt0g-wsYRosTc9RPqbUxzRLFWG6FA/uwMROfz2r5zAoaQXGdGnC2df644E7D3uP8M8pmtgwsRK9nEL/ctOoRPhfxNNS8pRHfUEALMA8J1OKmAMXbtZ2UosD57qe4fjKtkzuJZRj2z0nwDul_ZPfhQTiIallk1yQz73xiJxQmGzU8SnurlOhpPHrZLTXxnzZvWR9Krc-Wi0FcH851LWJ4K5TWn2iQRPwvyjaPhDXlvXjc8pDI5Emaf9-ux8/d0034d5pyst.321002.ts.m3u8?ver=4

1. Analyzing the page source shows that the m3u8 address is assembled from several parts (a CDN host plus a time-limited path).
Insert picture description here
To get the address, here is the code directly:

from selenium import webdriver
from fake_useragent import UserAgent
import requests
import time
from lxml import etree

# Script overview: open the video page in headless Chrome, click the play
# button until the player reports a non-zero duration, read the CDN host name
# from the player's console panel, then download the m3u8 playlist from it.

# Video page address
url = "https://lpl.qq.com/es/video_detail.shtml?nid=38220&bMatchId=6685"

ua = UserAgent()
# Build request headers with a random User-Agent
headers = {
    'User-Agent': ua.random}

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')  # run Chrome in headless mode
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--no-sandbox')

chromedriver = 'C:/Program Files (x86)/Google/Chrome/Application/chromedriver'
# `options=` replaces the deprecated `chrome_options=` keyword.
driver = webdriver.Chrome(executable_path=chromedriver, options=chrome_options)

# Upper bound on play-button clicks. The original author noted that a single
# click was not enough and that about 50 clicks were simulated; bounding the
# loop prevents hanging forever when playback never starts.
MAX_CLICKS = 50
PLAY_BUTTON_XPATH = "//txpdiv[@data-role='txp-ui-control-playbtn']"
DURATION_XPATH = "//txpdiv[@class='txp_left_controls']/txpdiv[@class='txp_time_display']/txpdiv[@class='txp_time_duration']"
CDN_HOST_XPATH = "//div[@id='liveCon']//txpdiv[@class='txp_console_inner']/txpdiv[@class='txp_line']/txpdiv[@data-role='txp-ui-console-cdn']"

try:
    driver.get(url=url)
    time.sleep(2)

    # Treat "duration element not found yet" the same as "not playing yet".
    host_name1 = '00:00'
    for _ in range(MAX_CLICKS):
        time.sleep(1)
        # The string "xpath" is the locator strategy (same value as By.XPATH).
        driver.find_element("xpath", PLAY_BUTTON_XPATH).click()
        html = driver.page_source
        html1 = etree.HTML(html)
        host2 = html1.xpath(DURATION_XPATH)
        # Only every other matched node is read, as in the original script
        # (NOTE(review): presumably the player markup is duplicated - confirm).
        for duration_node in host2[::2]:
            host_name1 = duration_node.text
            print(host_name1)
        # Stop clicking once the displayed duration is no longer 00:00.
        if host_name1 != '00:00':
            break
    else:
        raise RuntimeError(
            'Playback did not start after {} clicks on the play button'.format(MAX_CLICKS))

    time.sleep(3)
    # Re-read the page after the wait so the parsed DOM reflects the player
    # state after those 3 seconds instead of the snapshot taken before them.
    html = driver.page_source
finally:
    # Always shut the browser down so no headless Chrome process is leaked.
    driver.quit()

html1 = etree.HTML(html)

# The XPath query returns a list of elements; every other entry is read and
# the last one read is kept as the CDN host name.
host = html1.xpath(CDN_HOST_XPATH)
host_name = None
for cdn_node in host[::2]:
    host_name = cdn_node.text
    print(host_name)

if not host_name:
    raise RuntimeError('Could not find the CDN host name in the player console')

# Assemble the m3u8 address. The path after the host is time-limited, so this
# hard-coded value expires and must be refreshed.
m3u8_url ='https://{}/sportsts.tc.qq.com/AT1u-BHmgh8ggz92397MSFOZB7ayt13lws9bbPfMxJBU/uwMROfz2r5zAoaQXGdGnC2df644E7D3uP8M8pmtgwsRK9nEL/1fQq3doMj_NuoNJRW5xktbv7fngOTyifEJxpL6gLPhXyQaxmEvLqr1axSIM_nW9UHIk8ZIB7kCrfkouLxahhHJrPFt11oP5_91U4_neFpmXeM2R07r_U7cctF9Rl6f8GhhJwgBa9Es6kVwAYWUAZMtVxbpwySkrtjQhDpXIbksA/f0034cs60si.321002.ts.m3u8?ver=4'.format(host_name)
# A timeout keeps the script from blocking indefinitely on a dead CDN node.
m3u8 = requests.get(url=m3u8_url, headers=headers, timeout=10)
print(m3u8.text)

The content of the m3u8 file is successfully obtained:
Insert picture description here

Reference link: https://www.cnblogs.com/zxy/p/8260213.html

Guess you like

Origin blog.csdn.net/qq_35944102/article/details/108199599