Python crawler (using the China tobacco website news as an example)

The State Tobacco Monopoly Administration's website is http://www.tobacco.gov.cn/html/
The content to scrape is the news published by each provincial bureau.
Most provinces' news-list URLs follow a simple pattern. For example, Guizhou's is
http://www.tobacco.gov.cn/html/36/3617/361704_i.html, where i is the page number.
Some provinces' news URLs, however, do not change when you page through the list. Jiangxi, for instance, keeps the same URL from the first page to the last: http://jx.tobacco.com.cn/nportal/portal/_ns:YVAtMTQ2ZGMzYTk5YzQtMTAwODZ8YzB8ZDB8ZWNob2ljZUlkPTE9MTEwfGVwYWdlTnVtYmVyPTE9NQ__/zwxx/zxdt.psml, so Selenium is needed to drive the JavaScript-based pagination.
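For the provinces with the regular _i.html pattern, plain requests plus lxml is enough. A minimal sketch (the page count, the exact first-page URL form, and the link XPath below are assumptions for illustration):

# -*- coding: utf-8 -*-
# Sketch: crawl a province whose list URL contains the page number directly.
# The total page count (10) and the link XPath are assumptions; the exact
# numbering of the first page may differ per province.
import requests
from lxml import etree

for i in range(1, 11):
    list_url = "http://www.tobacco.gov.cn/html/36/3617/361704_%d.html" % i
    resp = requests.get(list_url)
    resp.encoding = "utf-8"
    tree = etree.HTML(resp.text)
    # Collect the article links on this list page.
    for href in tree.xpath('//li/p/a/@href'):
        print(href)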
The Jiangxi case, handled with Selenium, needs the full script below:

# -*- coding: utf-8 -*-
from selenium import webdriver
import pymysql
from pymysql.cursors import DictCursor
from lxml import etree
import requests
import random
import re
import time

# Path to the local PhantomJS executable (a raw string keeps the backslashes literal)
driver = webdriver.PhantomJS(r'G:\Python Extension Packages\phantomjs-2.1.1-windows\bin\phantomjs.exe')
# driver = webdriver.PhantomJS()
url = ['http://jx.tobacco.com.cn/nportal/portal/zwxx/zxdt.psml']
cls = ["省局信息", "地市信息", "基层信息"]   # the three news columns (tab link texts)
page = [40, 133, 57]                          # total page count of each column
db_params = dict(
    host="localhost",
    db="chinatobacco",
    user="root",
    passwd="123456",
    charset="utf8",
    cursorclass=DictCursor,
    use_unicode=True
)
connect = pymysql.connect(**db_params)
cursor = connect.cursor()
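
# The INSERT in parse_and_save() below assumes a table roughly like the following
# (column names come from that INSERT statement; the types are my assumptions):
#   CREATE TABLE IF NOT EXISTS jiangxi (
#       id      INT AUTO_INCREMENT PRIMARY KEY,
#       title   VARCHAR(255),
#       link    VARCHAR(512),
#       cls     VARCHAR(64),
#       date    VARCHAR(32),
#       content TEXT
#   ) DEFAULT CHARSET = utf8;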

USER_AGENTS = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like Mac OS X) App leWebKit/537.51.2 (KHTML, like Gecko) Version/7.0 Mobile/11D257 Safari/9537.53",
    "Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ;  QIHU 360EE)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; Maxthon/3.0)",
    "'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Macintosh; U; IntelMac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1Safari/534.50",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0",
    ]



def parse_and_save(href, count, cls):
    """Fetch one article page, extract title/date/content, and insert it into MySQL."""
    try:
        link = href
        headers = {'user-agent': random.choice(USER_AGENTS)}
        text = requests.get(href, headers=headers)
        text.encoding = "utf-8"
        tree = etree.HTML(text.text)
        title = tree.xpath('//div[@class="article-content-t"]/text()')
        if title:
            # The title node may be split into several text parts; keep the first one
            # (see problem 5 below).
            title = title[0].split()[0]
            date = tree.xpath('//div[@class="article-content-ban"]/span/text()')
            date = date[0].split(":")[-1]
            ps = tree.xpath('//div[@class="content-text"]/p/text()')
            content = ''.join(ps)
            sql = "insert into jiangxi(title, link, cls, date, content) values (%s, %s, %s, %s, %s)"
            params = (title, link, cls, date, content)
            cursor.execute(sql, params)
            connect.commit()
            count += 1
    except Exception:
        # Skip a single failed article so the whole crawl keeps running (problem 4 below).
        pass
    return count


if __name__ == "__main__":

    content_count = 0
    hrefs = []
    # for url in urls[:]:
    #     tab = tabs[n]
    #     n += 1
    #     driver.get(url)
    #     # 获取该栏目总页数
    #     num = driver.find_element_by_xpath('//*[@class="page_show"]/span/font[2]').text
    # Column 1: 省局信息 (provincial bureau news)
    driver.get(url[0])
    driver.find_element_by_link_text('省局信息').click()
    time.sleep(2)
    page_num = page[0]
    # text = requests.get(url[0])
    # time.sleep(0.5)
    # text.encoding = "GBK"
    # tree = etree.HTML(text.text)
    # divs = tree.xpath('//*[@id="div110"]/div/ul/form/div[1]/li/p/a')
    divs = driver.find_elements_by_xpath('//div[@class="new-ul-li5"]//li')
    # print(divs)
    for div in divs:
        lis = div.find_elements_by_xpath('p/a')
        # print(lis)
        for li in lis:
            href = li.get_attribute('href')
            # print(href)
            content_count = parse_and_save(href=href, count=content_count, cls=cls[0])
    print(cls[0], "第", 1, "页")
    for i in range(page_num - 1):
        # print(driver.page_source)
        driver.find_element_by_link_text('下一页').click()
        time.sleep(2)
        divs = driver.find_elements_by_xpath('//div[@class="new-ul-li5"]//li')
        for div in divs:
            lis = div.find_elements_by_xpath('p/a')
            for li in lis:
                href = li.get_attribute('href')
                content_count = parse_and_save(href=href, count=content_count, cls=cls[0])
        print(cls[0], "第", i + 2, "页")
    # lis = driver.find_elements_by_xpath('//*[@id="second_main_right"]/dl/dd/ul/li')
    #     content_count = parse_and_save(lis=lis, count=content_count)
    print("爬取%d篇文章" % content_count)


    # Column 2: 地市信息 (city-level news)
    driver.get(url[0])
    driver.find_element_by_link_text('地市信息').click()
    time.sleep(2)
    page_num = page[1]
    divs = driver.find_elements_by_xpath('//div[@class="new-ul-li5"]//li')
    for div in divs:
        lis = div.find_elements_by_xpath('p/a')
        for li in lis:
            href = li.get_attribute('href')
            content_count = parse_and_save(href=href, count=content_count, cls=cls[1])
    print(cls[1], "第", 1, "页")
    for i in range(page_num - 1):
        # print(driver.page_source)
        driver.find_element_by_link_text('下一页').click()
        time.sleep(2)
        divs = driver.find_elements_by_xpath('//div[@class="new-ul-li5"]//li')
        for div in divs:
            lis = div.find_elements_by_xpath('p/a')
            for li in lis:
                href = li.get_attribute('href')
                content_count = parse_and_save(href=href, count=content_count, cls=cls[1])
        print(cls[1], "第", i + 2, "页")
    # lis = driver.find_elements_by_xpath('//*[@id="second_main_right"]/dl/dd/ul/li')
    #     content_count = parse_and_save(lis=lis, count=content_count)
    print("爬取%d篇文章" % content_count)

    # Column 3: 基层信息 (grassroots news)
    driver.get(url[0])
    driver.find_element_by_link_text('基层信息').click()
    time.sleep(2)
    page_num = page[2]

    # lis = driver.find_elements_by_xpath('//*[@id="second_main_right"]/dl/dd/ul/li')
    divs = driver.find_elements_by_xpath('//div[@class="new-ul-li5"]//li')
    for div in divs:
        lis = div.find_elements_by_xpath('p/a')
        for li in lis:
            href = li.get_attribute('href')
            content_count = parse_and_save(href=href, count=content_count, cls=cls[2])
    print(cls[2], "第", 1, "页")
    for i in range(page_num-1):
        # print(driver.page_source)
        driver.find_element_by_link_text('下一页').click()
        time.sleep(2)
        divs = driver.find_elements_by_xpath('//div[@class="new-ul-li5"]//li')
        for div in divs:
            lis = div.find_elements_by_xpath('p/a')
            for li in lis:
                href = li.get_attribute('href')
                content_count = parse_and_save(href=href, count=content_count, cls=cls[2])
        print(cls[2], "第", i+2, "页")
    #     lis = driver.find_elements_by_xpath('//*[@id="second_main_right"]/dl/dd/ul/li')
    #     content_count = parse_and_save(lis=lis, count=content_count)
    print("爬取%d篇文章" % content_count)
    driver.quit()
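
The three per-column blocks above repeat the same crawl loop. A sketch of factoring that loop into a helper (the function name crawl_column is my own; it reuses driver, url, cls, page and parse_and_save from the script above):

def crawl_column(tab_text, page_num, count):
    # Open the Jiangxi portal, switch to the given column tab, then crawl every page.
    driver.get(url[0])
    driver.find_element_by_link_text(tab_text).click()
    time.sleep(2)
    for i in range(page_num):
        if i > 0:
            driver.find_element_by_link_text('下一页').click()
            time.sleep(2)
        divs = driver.find_elements_by_xpath('//div[@class="new-ul-li5"]//li')
        for div in divs:
            for li in div.find_elements_by_xpath('p/a'):
                count = parse_and_save(href=li.get_attribute('href'),
                                       count=count, cls=tab_text)
        print(tab_text, "第", i + 1, "页")
    return count

# Usage, replacing the three blocks in __main__ above:
# content_count = 0
# for tab_text, page_num in zip(cls, page):
#     content_count = crawl_column(tab_text, page_num, content_count)
# print("爬取%d篇文章" % content_count)
# driver.quit()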

Problems encountered:
1 The page blocks right-clicking.
Solution: Shift+F10 opens the context menu.
2 The XPath copied from the browser does not necessarily match what the program sees; in the program, use the [@class="..."] form instead of copied absolute paths.
3 Single-step debugging: set breakpoints and debug.
4 Exception handling: wrap each article in try/except (see parse_and_save) so one failure does not stop the crawl.
5 pymysql.err.InternalError: (1241, 'Operand should contain 1 column(s)')
The cause is that the title in the page source is split into two parts, so the scraped title is a two-element list and cannot be inserted as a single column.
6 requests.exceptions.ConnectionError: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host.', None, 10054, None))
The current workaround is to sleep a bit longer and, when the error occurs, rerun from the last page that was scraped; I don't know whether there is a better way. A retry sketch is shown after this list.
7 Note that you cannot simply append .html to URLs that do not end with it.
8 For pages like http://www.hntobacco.gov.cn/export/sites/mysite/gongzuodongtai/yancaoyaowen/### the pagination is not real pagination: all 2000-plus items sit on a single page. The crawl stops by itself partway through, I don't know why, and it stops at a different point each time.
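
For problem 6, a minimal retry sketch (the retry count, delay, and timeout values are my own choices, not something the original script uses):

import random
import time
import requests

def get_with_retry(href, user_agents, retries=3, delay=5):
    # Retry a request a few times with an increasing pause when the remote
    # host resets the connection (the 10054 error above), instead of
    # restarting the whole crawl.
    for attempt in range(retries):
        try:
            headers = {'user-agent': random.choice(user_agents)}
            resp = requests.get(href, headers=headers, timeout=10)
            resp.encoding = "utf-8"
            return resp
        except requests.exceptions.ConnectionError:
            time.sleep(delay * (attempt + 1))
    return None  # caller can skip the article if all retries failed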

Reposted from blog.csdn.net/vivian_ll/article/details/79005067