需求
采集广东省人民政府网站公开发布的政策文件。
代码
使用 selenium 与 requests 爬取页面,结果存入 MongoDB。
# coding: utf-8
import requests
import re
from selenium import webdriver
from selenium.webdriver import DesiredCapabilities
from lxml import etree
import datetime
from pymongo import MongoClient
import time
# MongoDB connection settings: crawled documents are upserted into
# the research.policy collection (see save_result below).
DB_IP = '127.0.0.1'
DB_PORT = 27017
DB_NAME = 'research'
DB_COL = 'policy'
client = MongoClient(host=DB_IP, port=DB_PORT)
db = client[DB_NAME]
col = db[DB_COL]
'''
广东省人民政府-----------爬虫
'''
# Root "open government information" URLs, one per provincial department.
# Each entry is crawled independently by main().
# Fix: the original list contained "00694108X" twice, crawling that
# department twice per run; the duplicate has been removed.
base_urls = [
    "http://zwgk.gd.gov.cn/747050516/",
    "http://zwgk.gd.gov.cn/661546078/",
    "http://zwgk.gd.gov.cn/006941127/",
    "http://zwgk.gd.gov.cn/00694108X/",
    "http://zwgk.gd.gov.cn/754537285/",
    "http://zwgk.gd.gov.cn/006941186/",
    "http://zwgk.gd.gov.cn/759214127/",
    "http://zwgk.gd.gov.cn/006940263/",
    "http://zwgk.gd.gov.cn/786485539/",
    "http://zwgk.gd.gov.cn/006940212/",
    "http://zwgk.gd.gov.cn/006940247/",
    "http://zwgk.gd.gov.cn/006940335/",
    "http://zwgk.gd.gov.cn/00693981X/",
    "http://zwgk.gd.gov.cn/00694124X/",
    "http://zwgk.gd.gov.cn/758333079/",
    "http://zwgk.gd.gov.cn/006940124/",
    "http://zwgk.gd.gov.cn/006940095/",
    "http://zwgk.gd.gov.cn/006940298/",
    "http://zwgk.gd.gov.cn/00694001X/",
    "http://zwgk.gd.gov.cn/006941290/",
    "http://zwgk.gd.gov.cn/758336165/",
    "http://zwgk.gd.gov.cn/006940204/",
    "http://zwgk.gd.gov.cn/006940028/",
    "http://zwgk.gd.gov.cn/006940132/",
    "http://zwgk.gd.gov.cn/006940079/",
    "http://zwgk.gd.gov.cn/006939780/",
    "http://zwgk.gd.gov.cn/006941338/",
    "http://zwgk.gd.gov.cn/006939916/",
    "http://zwgk.gd.gov.cn/006939908/",
    "http://zwgk.gd.gov.cn/006941135/",
    "http://zwgk.gd.gov.cn/006939844/",
    "http://zwgk.gd.gov.cn/006939799/",
    "http://zwgk.gd.gov.cn/006940060/",
    "http://zwgk.gd.gov.cn/006939932/",
    "http://zwgk.gd.gov.cn/553612461/",
    "http://zwgk.gd.gov.cn/006939991/",
    "http://zwgk.gd.gov.cn/006940167/",
    "http://zwgk.gd.gov.cn/006940175/",
    "http://zwgk.gd.gov.cn/006940183/",
    "http://zwgk.gd.gov.cn/006940140/",
    "http://zwgk.gd.gov.cn/725107227/",
    "http://zwgk.gd.gov.cn/006939801/",
    "http://zwgk.gd.gov.cn/006940116/",
    "http://zwgk.gd.gov.cn/696453330/",
    "http://zwgk.gd.gov.cn/006939756/",
]
# Launch a Chrome instance through the system chromedriver; this browser
# is shared by parse_page() and main() for pages that need JS rendering.
# NOTE(review): executable_path/desired_capabilities are deprecated in
# Selenium 4 — migrate to Service/Options if the driver is upgraded.
browser = webdriver.Chrome(
    executable_path="/usr/lib/chromium-browser/chromedriver",
    desired_capabilities=DesiredCapabilities.CHROME)
# Browser-like User-Agent for the plain requests.get() calls below.
headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36"}
def save_result(result):
    """Upsert one crawled document into MongoDB, keyed by (date, title).

    result: dict produced by parse_body(); None is silently ignored so
    callers can pass through failed parses.
    """
    if result is None:
        return
    update_key = {
        'date': result['date'],
        'title': result['title']
    }
    # Collection.update() was deprecated in PyMongo 3 and removed in 4;
    # update_one() keeps the same single-document upsert semantics.
    col.update_one(update_key, {'$set': result}, upsert=True)
def format_date(date):
    """Parse a compact ``YYYYMMDD`` date string into a datetime object."""
    compact_format = '%Y%m%d'
    parsed = datetime.datetime.strptime(date, compact_format)
    return parsed
def parse_body(content_url, save):
    """Fetch one policy document page, extract its text body, and persist it.

    content_url: absolute URL of the document page.
    save: dict with title/url/date/categories gathered by parse_contents().
    Network or parse failures degrade to an empty body rather than aborting.
    """
    body = ''
    try:
        response = requests.get(content_url, headers=headers, timeout=30)
        html = etree.HTML(response.content.decode("utf-8"))
        # Join all text nodes under the main content div in one pass
        # (the original quadratic ``body += ...`` loop).  Text is kept as
        # unicode; MongoDB stores it natively, no utf-8 byte-encoding needed.
        body_list = html.xpath("//div[@class='main']//text()")
        body = ''.join(each.strip() for each in body_list)
    except Exception:
        # Best-effort: record the document with an empty body so the crawl
        # continues and the entry can be retried later.
        body = ''
    result = {"title": save["title"],
              "categories": save["categories"],
              "date": format_date(save["date"]),
              "url": save["url"],
              "body": body,
              "update_time": datetime.datetime.now(),
              "source": "广东省人民政府"
              }
    save_result(result)
def parse_contents(page_url1, base_url):
    """Parse one listing page and crawl every document row it contains.

    page_url1: absolute URL of the listing page.
    base_url:  department root URL used to rebuild absolute document URLs.
    """
    try:
        response = requests.get(page_url1, headers=headers, timeout=30)
    except requests.RequestException:
        # Unreachable page: skip it, the outer loop moves on.
        return
    html = etree.HTML(response.content.decode("utf-8"))
    categories = ["政务公开", "政府信息公开目录"]
    # The breadcrumb position span is optional on some department pages.
    position = html.xpath("//span[@id='currentPosition']/text()")
    if position:
        categories.append(position[0])
    content_list = html.xpath("//div[@id='documentContainer']//div[@class='row']")
    for each in content_list:
        try:
            content_title = each.xpath("./li[@class='mc']//a/text()")[0]
            href = each.xpath("./li[@class='mc']//a/@href")[0]
            content_url = base_url + re.findall(r'/(\d+/[\w/.]+)', href)[0]
            raw_date = each.xpath("./li[@class='fwrq']/text()")[0]
            # Normalize the date cell to bare digits (YYYYMMDD).
            content_date = ''.join(re.findall(r'\d+', raw_date))
        except IndexError:
            # Malformed row (missing link/date): skip it instead of
            # aborting the whole page as the original unguarded [0] did.
            continue
        save = {"title": content_title,
                "url": content_url,
                "date": content_date,
                "categories": categories}
        parse_body(content_url, save)
def parse_page(page_url, base_url):
    """Open a listing page in the browser and crawl all of its pages.

    Pages after the first follow the ``<stem>_<n>.htm`` naming convention.
    page_url: URL of the first listing page; base_url: department root.
    """
    try:
        browser.get(page_url)
    except Exception:
        return
    html = etree.HTML(browser.page_source)
    # Total listing-page count, read from the pager widget.  Default to a
    # single page when the widget is absent instead of crashing the run.
    try:
        total_page = int(html.xpath(
            "//div[@class='pageInfo list_navigator']/span[1]/span[1]/text()")[0])
    except (IndexError, ValueError):
        total_page = 1
    print(total_page)
    # Hoist the loop-invariant stem extraction out of the loop.
    if total_page > 1:
        stem = re.findall(r'([\w./:]+)\.htm', page_url)[0]
    for index in range(total_page):
        if index == 0:
            page_url1 = page_url
        else:
            page_url1 = stem + "_" + str(index) + '.htm'
        print(page_url1)
        parse_contents(page_url1, base_url)
def main():
    """Crawl every department root: resolve its iframe listing URL, then paginate.

    The actual document list lives in an <iframe id="DataList">; its src
    is joined onto the department root to form the first listing page.
    """
    try:
        for base_url in base_urls:
            browser.get(base_url)
            html = etree.HTML(browser.page_source)
            # Some roots may lack the iframe: skip instead of crashing.
            src_list = html.xpath("//iframe[@id='DataList']/@src")
            if not src_list:
                continue
            url = src_list[0]
            print(url)
            page_url = base_url + re.findall(r'/([\w./]+)', url)[0]
            parse_page(page_url, base_url)
    finally:
        # quit() ends the chromedriver process as well as the window;
        # close() alone leaves the driver running on errors.
        browser.quit()


if __name__ == '__main__':
    main()