Scraping Baidu News with Selenium

# -*- coding: UTF-8 -*-
# Drive Chrome with Selenium: open the Baidu News homepage, type a keyword into
# the search box, submit, then keep clicking the "下一页>" (next page) link.
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from bs4 import BeautifulSoup
import urllib.request
import json
import os
def test_sel(keyword):
    driver = webdriver.Chrome()
    link = 'http://news.baidu.com/?tn=news'
    driver.get(link)
    try:
        # Wait up to 30 seconds for the search box (id="ww") to render.
        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.ID, "ww"))
        )
    except TimeoutException:
        print('Failed to load the page')
    try:
        element = driver.find_element(By.ID, 'ww')
        print('Found the search box')
        print('Entering keyword:', keyword)
        element.send_keys(keyword)
        element.send_keys(Keys.ENTER)
    except NoSuchElementException:
        print('Search box not found')
    print('Searching for the keyword')
    try:
        # Wait for the first results to appear before reading page_source.
        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.CLASS_NAME, "c-title"))
        )
    except TimeoutException:
        print('Results did not load in time')
    html = driver.page_source
    num1 = crawl_html(html, keyword, 0)
    has_next = True
    while has_next:
        try:
            old_page = driver.find_element(By.TAG_NAME, 'html')
            # "下一页>" is the literal text of Baidu's next-page link.
            driver.find_element(By.LINK_TEXT, '下一页>').click()
            print('Clicked the next-page link')
            # Wait for the old DOM to go stale so page_source is the new page.
            WebDriverWait(driver, 30).until(EC.staleness_of(old_page))
            html = driver.page_source
            num1 = crawl_html(html, keyword, num1)
        except NoSuchElementException:
            print('No next-page link; finished')
            has_next = False
    driver.quit()
    return html
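
If you don't need to watch the browser while it runs, Chrome can also be started headless. A minimal sketch, assuming Selenium 4 and a recent Chrome (older Chrome versions take the plain '--headless' flag instead of '--headless=new'):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

opts = Options()
opts.add_argument('--headless=new')           # run without a visible window
opts.add_argument('--window-size=1920,1080')  # give the page a normal viewport
driver = webdriver.Chrome(options=opts)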

def crawl_html(html, keyword, num):
    bf = BeautifulSoup(html, 'lxml')
    mkdir_path('D:/a-学习/b-前端/新闻1/%s' % keyword)
    # Each result pairs an <h3 class="c-title"> headline with a
    # <p class="c-author"> line holding the source and publish time.
    titles = bf.find_all('h3', class_='c-title')
    authors = bf.find_all('p', class_='c-author')
    for title_tag, author_tag in zip(titles, authors):
        link = title_tag.find('a')
        if link is None:
            continue
        url_news = link['href']
        text_time = author_tag.text
        try:
            page_req_news = urllib.request.urlopen(url_news, timeout=10).read()
            page_news = page_req_news.decode('utf-8')
            bf_news = BeautifulSoup(page_news, 'lxml')
            bf_news_title = bf_news.find('head').find('title')
            if bf_news_title is not None:
                record = {
                    "title": link.text.strip(),  # headline text
                    "source": text_time,         # source and publish time
                    "url": url_news,
                    "html": page_news
                }
                num += 1
                print('num:' + str(num))
                with open("D:/a-学习/b-前端/新闻1/%s/%s.json" % (keyword, num),
                          "w+", encoding='utf-8') as f:
                    json.dump(record, f, ensure_ascii=False, indent=4)
        except Exception:
            # Skip articles that fail to download or decode.
            continue
    return num
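
The hard-coded decode('utf-8') above silently drops any article served as GBK/GB2312, which many Chinese news sites still use; those pages all land in the except branch. A sketch of a more tolerant fetch using requests and its charset guess; fetch_article is just an illustrative name, not part of the original script:

import requests

def fetch_article(url_news):
    resp = requests.get(url_news, timeout=10)
    resp.raise_for_status()
    # Guess the charset from the page body instead of assuming UTF-8.
    resp.encoding = resp.apparent_encoding
    return resp.text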


def mkdir_path(path):
    # Strip surrounding whitespace and any trailing backslash.
    path = path.strip().rstrip("\\")
    # Create the directory only if it does not already exist.
    if not os.path.exists(path):
        os.makedirs(path)
        print(path + ' created')
        return True
    else:
        print(path + ' already exists')
        return False


if __name__ == "__main__":
    # keys = ['哈尔滨二环桥发生交通事故','山东昌乐一化工企业发生火灾事故','天津河西火灾事故','广西钦州一食品厂发生气体中毒事故','杭州富阳发生翻车事故','江苏盐城一司机醉驾致5人死亡4人受伤','大连一石化公司厂区发生工人中毒事件','北京大兴西红门镇新建村发生火灾','11·15滁新高速车辆相撞事故','11·11上海超市坍塌事故','河南新乡境内发生车祸12死11伤','广西柳州一村子发生火灾',
    #         '云南大关县青林村发生山体滑坡','四川芦山:5名幼林抚育人员失联4人遇难','陕西柞水一餐馆疑似发生食物中毒','云南曲靖发生交通事故','云南昆石高速发生4车相撞事故',
    #         '江苏宝应一食品厂发生毒气中毒事故 ','山东枣庄一水泥厂突发一氧化碳中毒','8·8九寨沟地震','8·11赤峰龙卷风灾害','西汉高速陕西段发生一起重大交通事故','甘肃文县暴洪灾害造成7人遇难2人失联','四川凉山州普格县发生泥石流','云南省巧家县发生山体滑坡','徐州一在建工地施工平台滑塌致4人死亡','徐州一在建工地施工平台滑塌致4人死亡','7·27榆林特大暴雨 ','7·21蔚县重大交通事故',
    #         '7.16江苏常熟民房火灾事故','甘肃平川区红会路街道遭受暴雨','江西洪涝致452万人受灾7人死亡2人失踪 转移55万余人','2017年湖南洪灾']
    keys = ['7.16江苏常熟民房火灾事故','甘肃平川区红会路街道遭受暴雨','江西洪涝致452万人受灾7人死亡2人失踪 转移55万余人','2017年湖南洪灾']
    for key in keys:
        test_sel(key)
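
To sanity-check a run, the saved records can be read back from disk; each JSON file holds one article. A quick sketch, assuming the directory layout used above:

import glob
import json

for path in glob.glob('D:/a-学习/b-前端/新闻1/2017年湖南洪灾/*.json'):
    with open(path, encoding='utf-8') as f:
        record = json.load(f)
    print(record['title'], record['url'])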


Reposted from blog.csdn.net/lizhaozhaozhaoxuan/article/details/80550645