Copyright notice: this is the blogger's original article; do not reproduce without the blogger's permission. https://blog.csdn.net/SunWuKong_Hadoop/article/details/82870385
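
This post contains two standalone scripts. The first reads second-level page URLs from 二级目录网址.csv, scrapes each detail page for its tabular text, and downloads the page images. The second crawls the service-item list on hbzwfw.gov.cn and writes each item's detail URL into that CSV, so it should be run first.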
# -*- coding: utf-8 -*-
import urllib.request
import http.cookiejar
from bs4 import BeautifulSoup
import requests
import csv
import time
import re
from urllib.parse import quote
import string

def get_url_2():
    # Read the second-level URLs collected by the second script below.
    with open('F:/python/二级目录网址.csv') as f:
        f_csv = csv.reader(f)
        link_list = []
        for link1 in f_csv:
            link_list.append(link1)
        return link_list

def get_url_weizhuang(head={
        'Connection': 'Keep-Alive',
        'Accept': 'text/html, application/xhtml+xml, */*',
        'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'
}):
    # "weizhuang" (伪装) = disguise: build a cookie-aware opener that sends
    # the browser-like headers above with every request.
    cj = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
    header = []
    for key, value in head.items():
        header.append((key, value))
    opener.addheaders = header
    return opener
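# A minimal usage sketch for the opener (`url` is a placeholder):
#   opener = get_url_weizhuang()
#   html = opener.open(url, timeout=1000).read().decode('utf-8')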

def get_html(link):
    # NOTE: the Cookie/Host/Referer values below target query.sse.com.cn and
    # look copied from a Shanghai Stock Exchange scraper; they may need
    # adjusting for hbzwfw.gov.cn.
    Cookie = "PHPStat_First_Time_10000011=1480428327337; PHPStat_Cookie_Global_User_Id=_ck16112922052713449617789740328; PHPStat_Return_Time_10000011=1480428327337; PHPStat_Main_Website_10000011=_ck16112922052713449617789740328%7C10000011%7C%7C%7C; VISITED_COMPANY_CODE=%5B%22600064%22%5D; VISITED_STOCK_CODE=%5B%22600064%22%5D; seecookie=%5B600064%5D%3A%u5357%u4EAC%u9AD8%u79D1; _trs_uv=ke6m_532_iw3ksw7h; VISITED_MENU=%5B%228451%22%2C%229055%22%2C%229062%22%2C%229729%22%2C%228528%22%5D"
    headers = {
        'User-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36',
        'Cookie': Cookie,
        'Connection': 'keep-alive',
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Host': 'query.sse.com.cn',
        'Referer': 'http://www.sse.com.cn/assortment/stock/list/share/'
    }
    r = requests.get(link, headers=headers, timeout=10)
    if r.status_code != 200:
        print('Unexpected status %s for %s' % (r.status_code, link))
    html = r.text
    return html

def get_data4(link1):
    # Open the detail page through the disguised opener and pull out the
    # interesting fragments with a set of HTML regexes.
    uop = get_url_weizhuang().open(link1, timeout=1000)
    content = uop.read().decode("utf-8")
    pattern1 = re.compile(r'style="padding-left:20px;">([\s\S]*?)</td>')
    pattern2 = re.compile(r'style="font-size: 15px; line-height: 45px;text-indent: 2em; padding: 0 10px;">([\s\S]*?)</p>')
    pattern3 = re.compile(r'<p style="font-size: 15px; line-height: 45px;">([\s\S]*?)</p>')
    pattern4 = re.compile(r'<p style="font-size: 15px; line-height: 45px;(.*?);float:left;">([\s\S]*?)</p>')
    pattern5 = re.compile(r'<div class="main_tab_item" id="con-one-5" style="display: none;">([\s\S]*?)</div>')
    pattern6 = re.compile(r'<div class="main_tab_item" id="con-one-6" style="display: none;">([\s\S]*?)</div>')  # compiled but never used below
    pattern7 = re.compile(r'<div class="main_tab_item" id="con-one-7" style="display: none;">([\s\S]*?)</div>')
    pattern8 = re.compile(r'<div class="main_tab_item" id="con-one-8" style="display: none;">([\s\S]*?)</div>')
    items1 = re.findall(pattern1, content)
    items2 = re.findall(pattern2, content)
    items3 = re.findall(pattern3, content)
    items4 = re.findall(pattern4, content)
    items5 = re.findall(pattern5, content)
    items7 = re.findall(pattern7, content)
    items8 = re.findall(pattern8, content)
    # Group the matches: basic info, description paragraphs, hidden tab panes.
    item_sum1 = [[items1, items8], [items2, items3, items4], [items5, items7]]
    for p1 in item_sum1:
        jiben_xinxi = []
        for p in p1:
            for item11 in p:
                jiben_xinxi.append(qingxi_data(item11))
        # Flatten to one string, strip markup leftovers, then split on commas.
        ui_string2 = str(jiben_xinxi).replace('\n', '').replace('\r', '').replace('\\n', '').replace('\\r', '') \
            .replace(' ', '').replace('\'', '').replace('>', '').replace('[', '').replace(']', '').replace('\\u3000', '') \
            .replace('(text-indent:2em;padding:010px', '').replace('(padding:010px;margin-top:0px;', '').replace('--', '') \
            .replace('"', '').replace(')"', '').split(',')
        ui_string2 = [x for x in ui_string2 if x != '']
        for n in range(len(ui_string2)):
            # Some cells carry inline JavaScript like "var d=...;if(...)" (the
            # spaces were stripped above); keep only the captured value.
            ui_string3 = re.findall('vard=(.*);if', ui_string2[n])
            if ui_string3 != []:
                ui_string2[n] = ui_string3[0]
        print(ui_string2)
        save_contents(ui_string2)

def qingxi_data(item11):
    # "qingxi" (清洗) = clean: replace every HTML tag with a comma and
    # strip whitespace and line breaks.
    dr = re.compile('<[^>]+>', re.S)
    item111 = dr.sub(',', str(item11))
    return item111.replace('\\r', '').replace('\\n', '').replace(' ', '').replace('\n', '').replace('\r', '')
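# For example (hypothetical input):
#   qingxi_data('<td>名称</td>')  ->  ',名称,'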

def dict_data5(jiben_xinxi):
    # Pair alternating label/value entries into a dict.
    # (Defined here but never called in this script.)
    dict1 = {}
    if len(jiben_xinxi) % 2 == 0:
        for index, item in enumerate(jiben_xinxi):
            if index % 2 == 0:
                dict1[item] = jiben_xinxi[index + 1]
    print(dict1)
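# A quick illustration with hypothetical values:
#   dict_data5(['名称', '南京高科', '代码', '600064'])
#   prints {'名称': '南京高科', '代码': '600064'}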

def save_contents(shuju):
    # Append each scraped item as its own row of 详细数据.csv.
    try:
        with open("详细数据.csv", 'a+', newline='') as f:
            writer = csv.writer(f)
            for item in shuju:
                writer.writerow([item])
    except Exception:
        pass  # ignore rows that fail to write

def check_link(url):
    # Fetch a page and return its text, or None if the request fails.
    try:
        r = requests.get(url)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        print('Unable to reach the server!')

def Schedule(a, b, c):
    # Progress hook for urllib.request.urlretrieve:
    # a = blocks transferred so far, b = block size, c = total file size.
    per = 100.0 * a * b / c
    if per > 100:
        per = 100
        print('Done!')
    print('%.2f%%' % per)

def get_contents(rurl):
    # Download every image on the page; each file is named after the page's
    # banner title plus the Chinese text found in the image URL.
    soup = BeautifulSoup(rurl, 'lxml')
    trs = soup.find_all('img')
    title_name = soup.find(attrs={'class': 'content_banner_list_up'}).string
    title_name2 = title_name.replace(' ', '').replace("\n", "").replace("\r", "")
    if trs:
        for src in trs:
            # Strip the <img> markup down to the bare URL.
            ui_string = str(src).replace('<img alt="" src="', '').replace('"/>', '')
            url = quote(ui_string, safe=string.printable)
            # Use the Chinese characters in the URL as the file name.
            results2 = re.findall("[\u4e00-\u9fa5]+", ui_string)
            if not results2:
                continue  # no Chinese text to name the file after
            filename = str(results2[0]) + '.jpg'
            try:
                urllib.request.urlretrieve(url, 'e:/test/%s_%s' % (title_name2, filename), Schedule)
            except Exception:
                pass  # skip images that fail to download
            time.sleep(1)
            print('Download finished!')

if __name__ == '__main__':
    links = get_url_2()  # read the CSV once rather than on every iteration
    for row in links:
        link1 = row[0]
        print(link1)
        get_data4(link1)
        rs = check_link(link1)
        if rs:
            get_contents(rs)
        time.sleep(3)
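
The second script, shown below, generates 二级目录网址.csv for the script above: it walks the department list on hbzwfw.gov.cn, collects the nine-digit department IDs, pages through each department's service items, and appends every item's detail URL to the CSV.
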
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import re
import csv
import time
link1 = 'http://www.hbzwfw.gov.cn/hbzw/sxcx/itemList/xz_index.do?webId=31&deptid='

def get_html(link):
    # NOTE: the Cookie/Host/Referer values below target query.sse.com.cn and
    # look copied from a Shanghai Stock Exchange scraper; they may need
    # adjusting for hbzwfw.gov.cn.
    Cookie = "PHPStat_First_Time_10000011=1480428327337; PHPStat_Cookie_Global_User_Id=_ck16112922052713449617789740328; PHPStat_Return_Time_10000011=1480428327337; PHPStat_Main_Website_10000011=_ck16112922052713449617789740328%7C10000011%7C%7C%7C; VISITED_COMPANY_CODE=%5B%22600064%22%5D; VISITED_STOCK_CODE=%5B%22600064%22%5D; seecookie=%5B600064%5D%3A%u5357%u4EAC%u9AD8%u79D1; _trs_uv=ke6m_532_iw3ksw7h; VISITED_MENU=%5B%228451%22%2C%229055%22%2C%229062%22%2C%229729%22%2C%228528%22%5D"
    headers = {
        'User-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36',
        'Cookie': Cookie,
        'Connection': 'keep-alive',
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Host': 'query.sse.com.cn',
        'Referer': 'http://www.sse.com.cn/assortment/stock/list/share/'
    }
    r = requests.get(link, headers=headers, timeout=10)
    if r.status_code != 200:
        print('Unexpected status %s for %s' % (r.status_code, link))
    html = r.text
    return html

def get_id(link11):
    # Collect the nine-digit department IDs from the changebm(...) links.
    movie_list = []
    soup = BeautifulSoup(get_html(link=link11), "lxml")
    # str(soup) renders the parsed document back to markup for the regex.
    div_list2 = re.findall(r'href="javascript:changebm(.*)" title=(.*)', str(soup))
    for i in range(len(div_list2)):
        list1 = str(div_list2[i])
        list3 = re.findall(r"\d+", list1)
        if len(list3[0]) == 9:
            movie_list.append(list3[0])
    return movie_list

def get_shuju_1():
    # For every department ID, walk up to 8 result pages and save each
    # item's detail URL.
    movie_list2 = get_id(link1)
    print(movie_list2)
    for n in range(len(movie_list2)):
        url_id = movie_list2[n]
        for p in range(1, 9):
            url3 = "http://www.hbzwfw.gov.cn/hbzw/sxcx/itemList/xz_list.do?webId=31&deptid=%s&isone=&isonline=&type=&word=&page_num=%s" % (url_id, p)
            soup3 = BeautifulSoup(get_html(link=url3), "lxml")
            div_list2 = soup3.select('a')
            if len(div_list2) != 0:
                print("Page exists: " + url3)
                div_list = soup3.select('div > div.r3_tit > a')
                for m in range(len(div_list)):
                    # Peel the anchor markup away until only the URL is left.
                    div_list_2 = str(div_list[m]).replace('<a href="', '').replace('" target="_blank" title="', '').replace('\r\n\t\t\t\t\t\t\t\t\t</a>', '')
                    div_list_3 = re.sub(r'">[\u4e00-\u9fa5]+', '', str(div_list_2))
                    div_list_4 = re.findall(r'^http(.*)html', div_list_3)
                    time.sleep(1)
                    movie_list3 = 'http' + str(div_list_4[0]) + 'html'
                    save_contents([movie_list3])

def save_contents(shuju):
    # Append every collected URL as its own row of 二级目录网址.csv.
    try:
        with open("二级目录网址.csv", 'a+', newline='') as f:
            writer = csv.writer(f)
            for item in shuju:
                writer.writerow([item])
    except Exception:
        pass  # ignore rows that fail to write

if __name__ == '__main__':
    get_shuju_1()  # fetches the department IDs itself and saves every item URL
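
To reproduce the pipeline, run the second listing first so that 二级目录网址.csv exists, then run the first listing to scrape each saved URL (the first script reads the CSV from F:/python/, so the file may need to be moved there). A minimal sketch of the run order, with hypothetical file names for the two listings:

python xz_list_spider.py   # stage 1: writes 二级目录网址.csv
python detail_spider.py    # stage 2: reads the CSV and scrapes each page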