# Download the SWF files linked from the admission-scores page.
# NOTE: response.content returns the raw body bytes, response.text the decoded
# text -- use .text for HTML pages and .content for binary payloads (images, SWF).
import requests
import re

response = requests.get("http://zsxx.e21.cn/e21html/zsarticles/gaozhao/2015_07_14/77339.html")
response.encoding = "GB2312"  # page is GB2312-encoded; set before reading .text
print(response.text)

# Example anchor in the page:
# /sqlimg/file/2018/06/21/20180621121244_1870525573.swf" target="_blank">11北京市</a></p>
# Group 1 captures the SWF path fragment, group 2 the link text (used as filename).
# Raw string with an escaped dot: the original non-raw "\\>" escapes only worked
# by accident, and the unescaped '.' before "swf" matched any character.
pat = r"/sqlimg/file/(.*?)\.swf.*?>(.*?)<"
urls = re.compile(pat).findall(response.text)
print(urls)

for url in urls:
    # Rebuild the absolute URL and save the file under the link text.
    Download_addres = "http://zsxx.e21.cn/sqlimg/file/" + url[0] + ".swf"
    f = requests.get(Download_addres)
    with open("D:/大四上/毕业论文/爬虫2/湖北省招办/投档线2/" + url[1] + ".swf", "wb") as file:
        file.write(f.content)
import pandas as pds
import requests
import urllib.request
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
browser = webdriver.Chrome() # launch Chrome via its Selenium driver; shared module-level browser used by all helpers below
def enter(url, element):
    """Open *url* in the shared browser and wait up to 3 seconds for the
    element located by XPath *element* to appear.

    On timeout a diagnostic message is printed instead of raising, so the
    caller can decide whether to proceed.
    """
    waiter = WebDriverWait(browser, 3)
    try:
        browser.get(url)
        waiter.until(EC.presence_of_element_located((By.XPATH, element)))
    except TimeoutException:
        print("在" + url + '\n' + '未定位到' + element)
def get_detail(element):
    """Return the visible text of the first element matching XPath *element*.

    Uses browser.find_element(By.XPATH, ...): the find_element_by_xpath
    helper is deprecated and was removed in Selenium 4.3.
    """
    return browser.find_element(By.XPATH, element).text
def get_element_attribute(element, attribute):
    """Return attribute *attribute* of the first element matching XPath *element*.

    Uses browser.find_element(By.XPATH, ...): the find_element_by_xpath
    helper is deprecated and was removed in Selenium 4.3.
    """
    return browser.find_element(By.XPATH, element).get_attribute(attribute)
def down_swf(path, swf_url):
    """Download the SWF at *swf_url* and write its bytes to *path*.

    Raises requests.HTTPError on a non-2xx response so the caller's
    try/except records the failing URL instead of saving an error page,
    and uses a timeout so a dead host cannot hang the whole run.
    """
    resp = requests.get(swf_url, timeout=30)
    resp.raise_for_status()
    with open(path, "wb") as file:
        file.write(resp.content)
def excel(fname):
    """Load the Excel workbook at *fname* and return it as a pandas DataFrame."""
    frame = pds.read_excel(fname)
    return frame
def main():
    """Drive the scrape: read target URLs from the Excel sheet, visit each
    page, extract its title and embedded SWF URL, download the SWF, and
    finally report every URL that failed.
    """
    data = excel("C:/Users/Administrator/Desktop/投档线.xlsx")
    nrow = data.shape[0]
    # Preview the URL column (first cell of each row) before scraping.
    for i in range(nrow):
        print(data.values[i][0])
    fail_urls = []
    for i in range(nrow):
        print("-------------------------------正在爬取" + str(i) + "---------------------------------")
        print(data.values[i][0])
        try:
            enter(data.values[i][0], '//*[@id="Table1"]/tbody/tr[1]/td/table/tbody/tr/td')
            title = get_detail('//*[@id="Table1"]/tbody/tr[1]/td/table/tbody/tr/td')
            title = title.replace('\n', '')  # newlines are illegal in file names
            path = 'D:/大四上/毕业论文/爬虫2/湖北省招办/投档线/' + title + ".swf"
            swf_url = get_element_attribute('/html/body/table/tbody/tr/td/table[6]/tbody/tr/td[1]/table[1]/tbody/tr[1]/td/p/object/embed', 'src')
            down_swf(path, swf_url)
        # Was a bare `except:` -- that also swallows SystemExit/KeyboardInterrupt,
        # making the run impossible to interrupt. Record the URL and continue.
        except Exception:
            fail_urls.append(data.values[i][0])
            print(" ERROR")
        print("------------------------------------结束---------------------------------------" + '\n')
    # Summarize the URLs that could not be scraped.
    for fail_url in fail_urls:
        print(fail_url)


if __name__ == "__main__":
    main()