国家烟草专卖局的网址是:http://www.tobacco.gov.cn/html/
要爬取的内容为各省级局的新闻。
大部分的省的新闻页url都是有规律的,比如贵州省的是
http://www.tobacco.gov.cn/html/36/3617/361704_i.html
这个i就是页数。
但有些省的新闻页url在翻页后是不变的,比如江西省,从第一页到最后一页一直都是http://jx.tobacco.com.cn/nportal/portal/_ns:YVAtMTQ2ZGMzYTk5YzQtMTAwODZ8YzB8ZDB8ZWNob2ljZUlkPTE9MTEwfGVwYWdlTnVtYmVyPTE9NQ__/zwxx/zxdt.psml
,这时就需要使用selenium处理JavaScript实现的翻页。
具体代码如下:
# -*- coding: utf-8 -*-
from selenium import webdriver
import pymysql
from pymysql.cursors import DictCursor
from lxml import etree
import requests
import random
import re
import time
# Headless browser used to drive the JS-based pagination.
# NOTE: the executable path is a raw string -- the original relied on
# Python keeping unknown escape sequences (\P, \p) literal, which only
# worked by accident and emits DeprecationWarnings on modern Python.
driver = webdriver.PhantomJS(r'G:\Python Extension Packages\phantomjs-2.1.1-windows\bin\phantomjs.exe')
# driver = webdriver.PhantomJS()

# Entry URL of the Jiangxi tobacco news portal. Pagination there is done
# in JavaScript, so the URL never changes between pages -- hence Selenium.
url = ['http://jx.tobacco.com.cn/nportal/portal/zwxx/zxdt.psml']
# Section tabs to crawl, and the known total page count of each section
# (kept index-aligned: cls[i] has page[i] pages).
cls = ["省局信息", "地市信息", "基层信息"]
page = [40, 133, 57]

# MySQL connection settings for the target database.
db_params = dict(
    host="localhost",
    db="chinatobacco",
    user="root",
    passwd="123456",
    charset="utf8",
    cursorclass=DictCursor,
    use_unicode=True
)
connect = pymysql.connect(**db_params)
cursor = connect.cursor()
# Pool of User-Agent strings; one is picked at random per request to make
# the crawler look less uniform. Fixed corrupt entries from the original:
# a missing closing paren, "App leWebKit" split by a space, a stray
# leading apostrophe, and "IntelMac"/"5.1Safari" missing spaces.
USER_AGENTS = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like Mac OS X) AppleWebKit/537.51.2 (KHTML, like Gecko) Version/7.0 Mobile/11D257 Safari/9537.53",
    "Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; QIHU 360EE)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; Maxthon/3.0)",
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0",
]
def parse_and_save(href, count, cls):
    """Fetch one article page, extract title/date/body, and insert it into MySQL.

    Parameters:
        href (str): absolute URL of the article detail page.
        count (int): running total of successfully stored articles.
        cls (str): section label (e.g. "省局信息") stored with the article.

    Returns:
        int: ``count + 1`` on success, the unchanged ``count`` on any failure
        (best-effort crawling: one bad article never aborts the run).
    """
    try:
        headers = {'user-agent': random.choice(USER_AGENTS)}
        resp = requests.get(href, headers=headers, timeout=30)
        resp.encoding = "utf-8"
        tree = etree.HTML(resp.text)

        # The title node may be split into several text chunks; keep only the
        # first whitespace-separated token of the first chunk. Passing the raw
        # list caused MySQL error 1241 "Operand should contain 1 column(s)".
        title = tree.xpath('//div[@class="article-content-t"]/text()')
        if title:
            title = title[0].split()[0]

        # Date text contains a "label:value" pair; keep the part after the colon.
        date = tree.xpath('//div[@class="article-content-ban"]/span/text()')
        date = date[0].split(":")[-1]

        ps = tree.xpath('//div[@class="content-text"]/p/text()')
        content = ''.join(ps)

        sql = "insert into jiangxi(title, link, cls, date, content) values (%s, %s, %s, %s, %s)"
        cursor.execute(sql, (title, href, cls, date, content))
        connect.commit()
        count += 1
    except (requests.RequestException, IndexError, pymysql.MySQLError) as exc:
        # The original used a bare `except: pass`, which silently hid every
        # failure (see the author's own note about connection resets). Log the
        # skipped URL instead so failed pages can be re-crawled later.
        print("skip %s: %s" % (href, exc))
    return count
def _collect_page_links():
    """Return all article hrefs on the list page currently shown by the driver."""
    hrefs = []
    for item in driver.find_elements_by_xpath('//div[@class="new-ul-li5"]//li'):
        for anchor in item.find_elements_by_xpath('p/a'):
            hrefs.append(anchor.get_attribute('href'))
    return hrefs


def _crawl_section(section, page_count, count):
    """Crawl one section tab end to end.

    Opens the portal, clicks the *section* tab, scrapes the first list page,
    then clicks "下一页" (next page) ``page_count - 1`` times, scraping each
    page. Returns the updated cumulative article *count*.
    """
    driver.get(url[0])
    driver.find_element_by_link_text(section).click()
    time.sleep(2)  # let the JS-rendered list settle before reading the DOM
    for page_no in range(1, page_count + 1):
        if page_no > 1:
            # Pagination is JavaScript-driven and does not change the URL,
            # so navigate by clicking the "next page" link.
            driver.find_element_by_link_text('下一页').click()
            time.sleep(2)
        for href in _collect_page_links():
            count = parse_and_save(href=href, count=count, cls=section)
        print(section, "第", page_no, "页")
    print("爬取%d篇文章" % count)
    return count


if __name__ == "__main__":
    # The original repeated the same crawl passage three times (one per
    # section); the per-section logic now lives in _crawl_section and is
    # driven by the index-aligned cls/page lists.
    content_count = 0
    for section, page_count in zip(cls, page):
        content_count = _crawl_section(section, page_count, content_count)
    driver.quit()
遇到的问题:
1 页面不能右键
解决:shift+f10
2 页面里的xpath和程序里爬的不一定对应
程序里要使用[@class]这种形式
3 单步调试 加断点 debug
4 异常处理
5 pymysql.err.InternalError: (1241, 'Operand should contain 1 column(s)')
原因是页面源代码中的标题分成了两部分,因此爬下来的title分为两部分 以列表的形式保存了,所以存不进去
6 requests.exceptions.ConnectionError: ('Connection aborted.', ConnectionResetError(10054, '远程主机强迫关闭了一个现有的连接。', None, 10054, None))
目前的解决方案是sleep时间长一点然后报错了就从上次爬的页数往后重新运行,不知道有没有什么好的方法
7 注意没有.html的网页不能乱加
8 对于http://www.hntobacco.gov.cn/export/sites/mysite/gongzuodongtai/yancaoyaowen/###
这种翻页其实没有翻页,2000多条内容全在一页里,会爬着爬着自动停掉,不知道为什么,每次停的地方还不一样。