二次爬取--爬虫经典案例

第一次爬取得到的是每个视频页面的链接,第二次爬取的是视频下载链接,分开爬取的。
本次为第二次爬取

import os
import xlrd
import requests
import threading
import time
from lxml import etree
from requests.adapters import HTTPAdapter

# Browser-like User-Agent so the target site serves normal HTML pages
# instead of blocking the scraper.
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36"}
# Global progress counter; reset to 0 and advanced inside mp4_down().
# NOTE(review): since mp4_down() resets it, it never actually resumes a
# previous run — confirm whether resume-from-index was the intent.
number = 0


def mp4_down():
    """Second-pass crawl: fetch each video page and extract its download link.

    Reads the per-video page URLs from column 0 of Sheet1 in the xlsx
    workbook, requests each page, pulls the first
    //div[@class="rm_bq"]//li/a/@href link, and appends it to
    pachong0.txt (one per line). On a failed request, non-200 status, or
    missing link, the offending source URL is logged instead and the
    crawl continues with the next row.

    Side effects: writes pachong0.txt, mutates the global ``number``.
    Returns: None.
    """
    global number
    number = 0
    # NOTE(review): xlrd >= 2.0 dropped .xlsx support — this needs
    # xlrd==1.2.0 or a switch to openpyxl. Confirm the pinned version.
    xls = xlrd.open_workbook(r"C:\Users\Administrator\Desktop\视频文件_.xlsx")
    sheet = xls.sheet_by_name("Sheet1")
    results = []  # renamed: the original shadowed the `list` builtin
    urls = sheet.col_values(0)
    nrows = sheet.nrows
    log_path = r'C:\Users\Administrator\Desktop\pachong0.txt'
    # One session for the whole crawl; closed once in `finally` — the
    # original closed it inside the loop, defeating connection pooling.
    s = requests.Session()
    try:
        for i in range(number, nrows):
            print("第{0}个正在开始。。。-{1}-::{2}::".format(i, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), urls[i]), end="")
            s.mount(urls[i], HTTPAdapter(max_retries=3))        # retry up to 3 times
            try:
                res = s.get(urls[i], headers=header, timeout=5)  # 5 s timeout
            except requests.RequestException as exc:
                # A timeout/connection error used to kill the whole run;
                # log the URL and keep going instead.
                print("request failed: {}".format(exc))
                with open(log_path, 'a', encoding='gbk') as f:
                    f.write(urls[i])
                    f.write('\n')
                continue
            print(str(res.status_code), end="...")

            if res.status_code != 200:
                # Record the failing page URL so it can be retried later.
                with open(log_path, 'a', encoding='gbk') as f:
                    f.write(urls[i])
                    f.write('\n')
                continue

            # Parse only after the status check (no point parsing error pages).
            html = etree.HTML(res.text)
            links = html.xpath('//div[@class="rm_bq"]//li/a/@href')
            if not links:
                # Page layout changed / no download link — log the source URL
                # instead of crashing on links[0].
                with open(log_path, 'a', encoding='gbk') as f:
                    f.write(urls[i])
                    f.write('\n')
                continue

            result = links[0]
            results.append(result)
            print(result, end="...")
            print('当前线程数为{}'.format(threading.activeCount()))
            with open(log_path, 'a', encoding='gbk') as f:
                f.write(result)
                f.write('\n')
            # (removed the original's dead `i += 1` — the for loop controls i)
            number += 1
    finally:
        s.close()

    print(results)


if __name__ == "__main__":
    # Bug fix: the original passed `mp4_down()` — calling the function
    # immediately (blocking here) — and handed the Timer its return value
    # (None) plus a bogus 'hoho' argument, which raised a TypeError when
    # the timer fired. Pass the callable itself; mp4_down takes no args.
    # (Also dropped the meaningless module-level `global timer`.)
    timer = threading.Timer(3.0, mp4_down)  # start the crawl after a 3 s delay
    timer.start()

发布了37 篇原创文章 · 获赞 91 · 访问量 1万+

猜你喜欢

转载自blog.csdn.net/weixin_43386443/article/details/105255691