# 第一次爬取得到的是每个视频页面的链接,第二次爬取得到的是视频下载链接,两次爬取分开进行。
# 本次为第二次爬取。
import os
import xlrd
import requests
import threading
import time
from lxml import etree
from requests.adapters import HTTPAdapter
# HTTP headers sent with every page request; the User-Agent string identifies
# the client as desktop Chrome 80 on Windows 10.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36",
}

# Module-level counter: mp4_down() resets it to 0 when it starts and adds 1
# for every link it resolves.
number = 0
def _append_line(path, text):
    """Append ``text`` plus a newline to the GBK-encoded file at ``path``."""
    with open(path, 'a', encoding='gbk') as f:
        f.write(text)
        f.write('\n')


def mp4_down(*, xls_path=r"C:\Users\Administrator\Desktop\视频文件_.xlsx",
             out_path=r'C:\Users\Administrator\Desktop\pachong0.txt'):
    """Second crawl pass: resolve each video page URL to its download link.

    Page URLs are read from column 0 of sheet "Sheet1" in the workbook at
    ``xls_path``.  Each page is fetched, the first ``href`` matched by
    ``//div[@class="rm_bq"]//li/a/@href`` is taken as the download link, and
    that link is appended to ``out_path``.  When a page cannot be resolved
    (request error, non-200 status, or no matching link) the page URL itself
    is appended instead, so every input row produces exactly one output line.

    The module-level counter ``number`` is reset to 0 on entry and incremented
    once per resolved link.

    Args:
        xls_path: Workbook holding the page URLs.
        out_path: Text file (GBK, opened in append mode) receiving one line
            per input row.

    Returns:
        The resolved download links, in input order.
    """
    global number
    number = 0

    # Read the spreadsheet.
    # NOTE(review): xlrd >= 2.0 only reads legacy .xls files; opening this
    # .xlsx path needs xlrd < 2.0 (or a switch to openpyxl) - confirm the
    # installed version.
    sheet = xlrd.open_workbook(xls_path).sheet_by_name("Sheet1")
    urls = sheet.col_values(0)
    links = []

    # The context manager closes the session even if an iteration raises.
    with requests.Session() as s:
        # Mount the retrying adapter once per scheme instead of once per URL,
        # which would grow the session's adapter table with every row.
        s.mount("http://", HTTPAdapter(max_retries=3))   # retry up to three times
        s.mount("https://", HTTPAdapter(max_retries=3))

        for i, url in enumerate(urls):
            print("第{0}个正在开始。。。-{1}-::{2}::".format(i, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), url), end="")
            try:
                res = s.get(url, headers=header, timeout=5)  # 5-second timeout
            except requests.RequestException as exc:
                # One unreachable page must not abort the whole run.
                print(repr(exc))
                _append_line(out_path, url)
                continue

            print(str(res.status_code), end="...")
            if res.status_code != 200:
                print()  # terminate the progress line
                _append_line(out_path, url)
                continue

            # Parse only successful responses; guard against a page that has
            # no matching link instead of failing with IndexError.
            page = etree.HTML(res.text)
            hrefs = page.xpath('//div[@class="rm_bq"]//li/a/@href') if page is not None else []
            if not hrefs:
                print()  # terminate the progress line
                _append_line(out_path, url)
                continue

            result = hrefs[0]
            links.append(result)
            print(result, end="...")
            print('当前线程数为{}'.format(threading.active_count()))
            _append_line(out_path, result)
            number += 1

    print(links)
    return links
if __name__ == "__main__":
    # Hand the function object itself to Timer.  Writing ``mp4_down()`` here
    # would run the crawl immediately and give the timer its return value as
    # the callback, which then fails when the timer fires.  mp4_down takes no
    # positional arguments, so no args list is passed.
    timer = threading.Timer(3.0, mp4_down)  # start the crawl after 3 seconds
    timer.start()