python selenium简单使用

selenium的简单使用

安装 selenium
pip install selenium
安装pymongo
pip install pymongo

爬取起点完本小说排行榜数据,并保存到 MongoDB 数据库,代码如下:

import time

import pymongo
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

def _wait_for_new_window(browser, known_handle, timeout=10.0, poll=0.5):
    """Return the handle of a window other than *known_handle*, or None.

    Clicking a link that opens a new tab is asynchronous, so the new handle
    may not be listed yet right after the click. Poll ``window_handles``
    until another handle shows up or *timeout* seconds have elapsed.
    """
    deadline = time.monotonic() + timeout
    while True:
        others = [h for h in browser.window_handles if h != known_handle]
        if others:
            return others[-1]
        if time.monotonic() >= deadline:
            return None
        time.sleep(poll)


# Run Chrome without a visible window; use a plain webdriver.Chrome()
# (no options) instead to watch the browser while debugging.
options = webdriver.ChromeOptions()
options.add_argument('headless')
driver = webdriver.Chrome(options=options)

mongoclient = None
try:
    # Implicit wait: element lookups retry for up to 15 seconds.
    driver.implicitly_wait(15)

    # First page of the Qidian "finished novels" ranking.
    url = 'https://www.qidian.com/rank/fin?dateType=3&page=1'
    driver.get(url)

    # Handle of the main (listing) window, needed to switch back later.
    mainwindow = driver.current_window_handle

    # Open the database connection.
    mongoclient = pymongo.MongoClient(host='10.31.160.242', port=27017)
    mongodb = mongoclient['novel']
    mongocollection = mongodb['novel_collections']

    while True:
        # Scrape every book entry on the current listing page.
        list_container = driver.find_element(By.CLASS_NAME, 'book-img-text')
        booksList = list_container.find_elements(By.XPATH, './/li')
        for book in booksList:
            name = book.find_element(By.XPATH, './/h4').text
            author = book.find_element(By.XPATH, './/p/a[1]').text
            book_type = book.find_element(By.XPATH, './/p/a[2]').text

            # The "details" link opens the book page in a new window.
            book.find_element(By.PARTIAL_LINK_TEXT, '书籍详情').click()
            time.sleep(0.5)  # throttle so pages are not opened too quickly

            # Never take window_handles[-1] blindly: if the new window is
            # not registered yet it would be the main window, and closing
            # it below would kill the listing page.
            book_detail_window = _wait_for_new_window(driver, mainwindow)
            if book_detail_window is None:
                print(name, '未获取到详细内容')
                continue

            driver.switch_to.window(book_detail_window)
            try:
                bookinfo = driver.find_element(
                    By.XPATH, '//div[@class="book-intro"]'
                ).text.strip()
            except NoSuchElementException:
                # Only a missing intro element is a "no details" case;
                # database errors must not be hidden behind this message.
                print(name, '未获取到详细内容')
            else:
                # Save the record to MongoDB.
                mongocollection.insert_one({
                    'name': name,
                    'author': author,
                    'type': book_type,
                    'bookinfo': bookinfo,
                })
            finally:
                # Close the detail window and return to the main window.
                driver.close()
                print(name)
                driver.switch_to.window(mainwindow)

        try:
            next_page = driver.find_element(
                By.XPATH, '//a[contains(@class,"lbf-pagination-next")]'
            )
        except NoSuchElementException:
            print('爬取完毕')
            break

        # A disabled "next" button marks the last page; test the class
        # token rather than the exact attribute string.
        css_classes = (next_page.get_attribute('class') or '').split()
        if 'lbf-pagination-disabled' in css_classes:
            print('爬取完毕')
            break

        time.sleep(1)
        page_input = driver.find_element(By.CLASS_NAME, 'lbf-pagination-input')
        print('第{page}页爬取完成'.format(page=page_input.get_attribute('value')))
        next_page.click()
finally:
    # Always release the database connection and the browser process,
    # even when the crawl aborts with an error.
    if mongoclient is not None:
        mongoclient.close()
    driver.quit()

猜你喜欢

转载自blog.csdn.net/dandanfengyun/article/details/83590495