Python——下载湖北省招办swf文件

# Download the SWF files linked from the Hubei admissions-office page.
# NOTE: response.content returns the raw bytes of the reply, while
# response.text returns it decoded as text — use .text for HTML pages
# and .content for binary payloads such as images or SWF files.
import requests
import re
import os

response = requests.get("http://zsxx.e21.cn/e21html/zsarticles/gaozhao/2015_07_14/77339.html")
response.encoding = "GB2312"  # page is GB2312-encoded; set before reading .text
print(response.text)

# Example of a matching anchor inside the page:
# /sqlimg/file/2018/06/21/20180621121244_1870525573.swf" target="_blank">11北京市</a></p>

# Raw string so the regex engine sees the escapes directly; the dot before
# "swf" is escaped so it matches a literal "." (it previously matched any char).
# Group 1 = the file path fragment, group 2 = the link text (used as filename).
pat = r"/sqlimg/file/(.*?)\.swf.*?\>(.*?)\<"

urls = re.compile(pat).findall(response.text)
print(urls)

# Make sure the destination directory exists before writing into it.
out_dir = "D:/大四上/毕业论文/爬虫2/湖北省招办/投档线2/"
os.makedirs(out_dir, exist_ok=True)

for url in urls:
    download_address = "http://zsxx.e21.cn/sqlimg/file/" + url[0] + ".swf"
    resp = requests.get(download_address)
    if resp.status_code != 200:
        # Skip failed downloads instead of saving an HTML error page as .swf.
        continue
    with open(out_dir + url[1] + ".swf", "wb") as file:
        file.write(resp.content)

import pandas  as pds
import requests
import urllib.request
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

browser = webdriver.Chrome()  # launch a Chrome instance via chromedriver; shared by all helpers below

def enter(url, element):
    """Navigate the shared browser to *url* and wait for *element*.

    Waits up to 3 seconds for the element located by the XPath string
    *element* to appear in the DOM.

    Returns:
        True if the element appeared in time, False on timeout
        (backward-compatible: callers that ignore the return value,
        as main() does, are unaffected — the original returned None).
    """
    wait = WebDriverWait(browser, 3)
    try:
        browser.get(url)
        wait.until(EC.presence_of_element_located((By.XPATH, element)))
        return True
    except TimeoutException:
        # Report which page/element pair failed, then signal failure.
        result = "在"+url+'\n'+'未定位到'+element
        print(result)
        return False


def get_detail(element):
    """Return the visible text of the element located by XPath *element*.

    Uses find_element(By.XPATH, ...) — the find_element_by_xpath shorthand
    is deprecated in Selenium 3 and removed in Selenium 4, while this form
    works in both. Raises NoSuchElementException if nothing matches
    (caught by main()'s except clause).
    """
    node = browser.find_element(By.XPATH, element)
    return node.text


def get_element_attribute(element, attribute):
    """Return attribute *attribute* of the element located by XPath *element*.

    Uses find_element(By.XPATH, ...) instead of the deprecated
    find_element_by_xpath (removed in Selenium 4; both forms work in 3.x).
    Returns None if the element lacks the attribute, per Selenium semantics.
    """
    node = browser.find_element(By.XPATH, element)
    return node.get_attribute(attribute)

def down_swf(path, swf_url):
    """Download the SWF at *swf_url* and write its bytes to *path*.

    Raises requests.HTTPError on a non-2xx response so a 404 HTML error
    page is never saved under a .swf name (the caller's try/except in
    main() records the URL as a failure instead).
    """
    resp = requests.get(swf_url)
    resp.raise_for_status()
    with open(path, "wb") as file:
        file.write(resp.content)
     
def excel(fname):
    """Read the Excel workbook at *fname* and return it as a DataFrame."""
    return pds.read_excel(fname)

def main():
    """Drive the scrape: read page URLs from the spreadsheet, visit each
    page, extract the SWF title and source URL, download the file, and
    finally print every URL that failed so it can be retried by hand."""
    data = excel("C:/Users/Administrator/Desktop/投档线.xlsx")
    nrow = data.shape[0]
    # Echo the URL column so the operator can eyeball the work list.
    for i in range(nrow):
        print(data.values[i][0])
    fail_urls = []
    for i in range(nrow):
        print("-------------------------------正在爬取"+str(i)+"---------------------------------")
        print(data.values[i][0])
        try:
            enter(data.values[i][0], '//*[@id="Table1"]/tbody/tr[1]/td/table/tbody/tr/td')
            # The page title becomes the local filename; strip newlines
            # so the path stays valid.
            title = get_detail('//*[@id="Table1"]/tbody/tr[1]/td/table/tbody/tr/td')
            title = title.replace('\n', '')
            path = 'D:/大四上/毕业论文/爬虫2/湖北省招办/投档线/'+title+".swf"
            swf_url = get_element_attribute('/html/body/table/tbody/tr/td/table[6]/tbody/tr/td[1]/table[1]/tbody/tr[1]/td/p/object/embed', 'src')
            down_swf(path, swf_url)
        except Exception:
            # Narrowed from a bare except: so KeyboardInterrupt/SystemExit
            # still abort the run; any scrape/download error just marks
            # this URL as failed and moves on.
            fail_urls.append(data.values[i][0])
            print("                                               ERROR")
        print("------------------------------------结束---------------------------------------"+'\n')

    # Summarize the failures at the end for manual retry.
    for fail_url in fail_urls:
        print(fail_url)


if __name__ == "__main__":
    main()

发布了55 篇原创文章 · 获赞 17 · 访问量 1万+

猜你喜欢

转载自blog.csdn.net/weixin_43213658/article/details/90202590