Youku爬虫抓取视频

版权声明:本文为博主原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
本文链接: https://blog.csdn.net/net_wolf/article/details/102507820

尝试用 Python 抓取视频,并处理 title 中的非法字符,最后保存为 mp4。

已经处理了分页的问题,默认下载为360p格式。

#!/usr/bin/python
from bs4 import BeautifulSoup as bs
from requests.exceptions import ConnectTimeout,ConnectionError
import requests,time,sys,re,queue
import youtube_dl

# Pieces of the search URL: endpoint, query terms and the paging parameter.
base = "https://www.youku.com/results?search_query="
qstring = "cctv+空中剧院"
pagestring = "&page="
# Local proxy given as host:port; switch to the empty string below to disable it.
proxystr = '127.0.0.1:49705'
#proxystr = ''

# One shared HTTP session whose https traffic is routed through the proxy.
sess = requests.Session()
sess.proxies = dict(https=proxystr)

# Crawl the search-result pages one by one and queue every video link found.
# Each queue item is a [watch URL, title] pair consumed by the download stage.
video_urls = queue.Queue()
counter = 0  # page number; incremented before each request, so paging starts at 1
while True:
    counter += 1
    try:
        # The timeout keeps a dead proxy or stalled server from hanging the
        # crawl forever (requests waits indefinitely when none is given).
        response = sess.get(
            base + qstring + pagestring + str(counter),
            timeout=30,
        )
    except (ConnectTimeout, ConnectionError, requests.exceptions.Timeout):
        print("不能访问youku 检查是否已设置代理")
        sys.exit(1)  # non-zero status: the run failed
    soup = bs(response.text, 'html.parser')

    # The site signals "paged past the last result" with a message box.
    no_more_results = soup.find_all('div', attrs={'class': 'display-message'})
    if no_more_results and no_more_results[0].get_text(strip=True) == "No more results":
        break

    vids = soup.find_all('a', attrs={'class': 'yt-uix-tile-link'})
    if not vids:
        # A page without any video link is treated as the end of the results.
        break

    for v in vids:
        href = v.get('href')
        # Skip anchors without a target; hrefs longer than 20 characters are
        # presumably ads or other non-video links (heuristic from the original).
        if not href or len(href) > 20:
            continue
        # Fall back to the anchor text when the title attribute is missing.
        title = v.get('title') or v.get_text(strip=True)
        video_urls.put(['https://www.youku.com' + href, title])

    print("page:{} size:{}".format(counter, video_urls.qsize()))
    time.sleep(1)  # be polite: pause between page requests

# Download stage: fetch queued videos with youtube_dl, stopping after a small
# number of attempts because this is still a test run.
max_downloads = 3  # test limit on the number of videos attempted

# The options are the same for every video, so build them once.
ydl_opts = {
    'format': '[height=360]',        # 360p is good enough
    'outtmpl': '%(title)s.%(ext)s',  # output file name: <video title>.<extension>
    'proxy': proxystr,
}

counter = 0  # videos attempted so far (successful or not)
while not video_urls.empty():
    v_url, title = video_urls.get()
    print(v_url, title)
    # Optional manual title clean-up, currently disabled:
    # pattern = re.compile(r"\||CCTV戏曲| |来自")
    # file_name = re.sub(pattern, "", title).replace("/", "-")

    try:
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            ydl.download([v_url])
    except (TimeoutError, ConnectTimeout, ConnectionError):
        print("不能访问youku 检查是否已设置代理")
        sys.exit(1)  # non-zero status: the run failed
    except youtube_dl.utils.DownloadError as err:
        # youtube_dl reports failed downloads (network errors, unavailable
        # format, ...) as DownloadError; report it and move on to the next video.
        print("下载失败:", err)
    else:
        print('下载完成')

    counter += 1
    if counter >= max_downloads:
        break


猜你喜欢

转载自blog.csdn.net/net_wolf/article/details/102507820