# Batch-download annual reports with Python (anti-scraping workaround edition)

from win32com.client import Dispatch
#pip install win32compat
#pip install pywin32
from fake_useragent import UserAgent
import os
import re
import openpyxl
import random
import requests
import urllib.request
import time
def download(url, downpath, filename, i, code, timeout=30):
    """Download ``url`` and save it as ``filename`` inside ``downpath``.

    The request is sent with a random User-Agent header and a randomly
    chosen entry of a small hard-coded proxy list. Nothing is fetched when
    the target file already exists. A failed request (connection error,
    timeout or HTTP error status) is reported and skipped, so that an error
    page is never stored under the target file name.

    Args:
        url: Address of the file to fetch.
        downpath: Directory the file is saved into.
        filename: Name of the file to create inside ``downpath``.
        i: Running number of the item; used only in progress messages.
        code: Code string of the item; used only in progress messages.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    # os.path.join instead of a hard-coded '\\' so the existence check and
    # the written file always refer to the same path.
    file_path = os.path.join(downpath, filename)
    if os.path.exists(file_path):
        print(str(i) + "th " + code + " already there")
        return

    user_agent = {"User-Agent": UserAgent().random}
    # Temporary free proxy IPs taken from the kuaidaili site.
    # NOTE(review): only an "http" entry is given and the addresses carry no
    # scheme or port; https:// URLs presumably bypass the proxy - confirm.
    proxies_list = [
        {"http": "106.52.172.214"},
        {"http": "202.55.5.209"},
        {"http": "106.54.128.253"},
    ]
    proxy = random.choice(proxies_list)
    print(proxy)  # shows which proxy was picked, to judge whether it works

    t1 = time.monotonic()
    try:
        # Without a timeout a dead proxy would block the batch forever.
        response = requests.get(
            url, headers=user_agent, proxies=proxy, timeout=timeout
        )
        # Reject 4xx/5xx answers instead of saving their body as the file.
        response.raise_for_status()
    except requests.RequestException as err:
        print(str(i) + "th " + code + " failed: " + str(err))
        return
    t2 = time.monotonic()

    # The context manager closes the file even if the write fails.
    with open(file_path, 'wb') as f:
        f.write(response.content)
    # Second argument reports the response time of the request.
    print(str(i) + 'th ' + code + ' is done' + " ", str(round(t2 - t1, 3)) + 's')
def code_revise(code_cell):
    """Return the cell's value as text, left-padded with zeros to 6 characters.

    Args:
        code_cell: Object exposing the raw cell content as ``.value``
            (the value is read rather than any display text).

    Returns:
        ``str(code_cell.value)`` with leading ``'0'`` characters added until
        it is six characters long; longer text is returned unchanged.
    """
    text = str(code_cell.value)
    return text.rjust(6, '0')
def url_revise(url):
    """Rewrite a disclosure detail URL into the matching download URL.

    Two regex substitutions are applied in order, each replacing every
    match in the text:

    1. ``disclosure/detail?stockCode=<digits>&announcementId`` becomes
       ``announcement/download?bulletinId``.
    2. ``orgId=<word chars ending in a digit>&announcementTime`` becomes
       ``announceTime``.

    Args:
        url: The URL text to rewrite.

    Returns:
        The rewritten URL; text matching neither pattern is returned as is.
    """
    rewrites = (
        (r'disclosure/detail\?stockCode=\d+&announcementId',
         'announcement/download?bulletinId'),
        (r'orgId=\w+\d+&announcementTime',
         'announceTime'),
    )
    revised = url
    for pattern, replacement in rewrites:
        revised = re.compile(pattern).sub(replacement, revised)
    return revised
# Driver: read the download list from the workbook and fetch one PDF per row.
# Alternative source directory: r'E:\huang\Documents'
# Named source_dir (not `input`) so the builtin input() is not shadowed.
source_dir = r'E:\huang\Documents\其他行业'
os.chdir(source_dir)
downpath = r'E:\Alark\Users\Desktop\年报\2020\其他'
downlist = '2020-2021深市其他行业.xlsx'
wb = openpyxl.load_workbook(downlist)
ws = wb.active

# Rows are handled top to bottom; the first row whose first cell is empty
# ends the run. `i` is the 1-based running number shown in progress output.
for i, row in enumerate(ws.rows, start=1):
    if row[0].value is None:
        break
    # Column 1 holds the code, column 5 the page URL; column 3 supplies the
    # text part of the file name (presumably the company name - confirm).
    code = code_revise(row[0])
    filename = code + '_' + row[2].value + '.pdf'
    url = url_revise(row[4].value)
    download(url, downpath, filename, i, code)

# Writes the workbook to "cache.xlsx" in the current working directory.
wb.save("cache.xlsx")


# (Blog page footer: "You may also like")

# Reposted from blog.csdn.net/qq_37639139/article/details/124238437