Selenium 的简单使用
安装 selenium
pip install selenium
安装 pymongo
pip install pymongo
爬取起点完本小说排行榜数据，并保存到 MongoDB 数据库
代码如下：
import time

import pymongo
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
# First page of Qidian's completed-novel ranking.
START_URL = 'https://www.qidian.com/rank/fin?dateType=3&page=1'
# MongoDB connection settings.
MONGO_HOST = '10.31.160.242'
MONGO_PORT = 27017
# Implicit wait (seconds) applied to every element lookup.
IMPLICIT_WAIT_SECONDS = 15
# Maximum time (seconds) to wait for a detail tab to appear after a click.
NEW_WINDOW_TIMEOUT_SECONDS = 10.0


def wait_for_new_window(driver, known_handles, timeout=NEW_WINDOW_TIMEOUT_SECONDS,
                        poll=0.2):
    """Return the first window handle of *driver* not in *known_handles*.

    Polls ``driver.window_handles`` every *poll* seconds until a new handle
    shows up or *timeout* seconds have elapsed.  Returns ``None`` on timeout.

    Picking ``window_handles[-1]`` right after a click is racy: if the new
    tab has not been created yet, the last handle is still the main window,
    and closing it would kill the ranking page.
    """
    deadline = time.monotonic() + timeout
    while True:
        for handle in driver.window_handles:
            if handle not in known_handles:
                return handle
        if time.monotonic() >= deadline:
            return None
        time.sleep(poll)


def is_pagination_disabled(class_attr):
    """Return True if a ``class`` attribute marks the "next" link as disabled.

    Checks for the ``lbf-pagination-disabled`` token instead of comparing the
    whole attribute string, so class order and extra classes do not matter.
    A missing attribute (``None``) counts as not disabled.
    """
    return 'lbf-pagination-disabled' in (class_attr or '').split()


def scrape_book(driver, book, main_window, collection):
    """Scrape one ranking entry and store it in *collection*.

    Reads name, author and category from the list item *book*, opens the
    detail page in a new window, reads the introduction text and inserts one
    document with the keys ``name``, ``author``, ``type`` and ``bookinfo``.
    The detail window is always closed and focus always returns to
    *main_window*, even when scraping or saving fails.
    """
    name = book.find_element(By.XPATH, './/h4').text
    author = book.find_element(By.XPATH, './/p/a[1]').text
    category = book.find_element(By.XPATH, './/p/a[2]').text

    # Snapshot the handles before clicking so the new tab can be identified.
    handles_before = driver.window_handles
    book.find_element(By.PARTIAL_LINK_TEXT, '书籍详情').click()
    detail_window = wait_for_new_window(driver, handles_before)
    if detail_window is None:
        # No detail tab appeared; there is nothing to switch to or close.
        print(name, '未获取到详细内容')
        print(name)
        return

    driver.switch_to.window(detail_window)
    try:
        intro = driver.find_element(
            By.XPATH, '//div[@class="book-intro"]').text.strip()
        collection.insert_one({
            'name': name,
            'author': author,
            'type': category,
            'bookinfo': intro,
        })
    except NoSuchElementException:
        print(name, '未获取到详细内容')
    except pymongo.errors.PyMongoError as exc:
        # Keep the crawl best-effort, but do not disguise a database failure
        # as a scraping failure.
        print(name, '保存到数据库失败', exc)
    finally:
        driver.close()
        print(name)
        # Return to the ranking page for the next entry.
        driver.switch_to.window(main_window)


def main():
    """Crawl every page of the ranking and save each book to MongoDB.

    Runs Chrome headless, walks the pagination until the "next" link is
    missing or disabled, and always shuts down the browser and the database
    connection on exit.
    """
    # Run Chrome without a visible window.
    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    driver = webdriver.Chrome(options=options)
    mongo_client = None
    try:
        driver.implicitly_wait(IMPLICIT_WAIT_SECONDS)
        driver.get(START_URL)
        main_window = driver.current_window_handle

        mongo_client = pymongo.MongoClient(host=MONGO_HOST, port=MONGO_PORT)
        collection = mongo_client['novel']['novel_collections']

        while True:
            # Scrape every entry on the current page.
            book_list = driver.find_element(By.CLASS_NAME, 'book-img-text')
            for book in book_list.find_elements(By.XPATH, './/li'):
                scrape_book(driver, book, main_window, collection)

            try:
                next_page = driver.find_element(
                    By.XPATH, '//a[contains(@class,"lbf-pagination-next")]')
            except NoSuchElementException:
                print('爬取完毕')
                break
            if is_pagination_disabled(next_page.get_attribute('class')):
                break

            time.sleep(1)
            page = driver.find_element(
                By.CLASS_NAME, 'lbf-pagination-input').get_attribute('value')
            print('第{page}页爬取完成'.format(page=page))
            next_page.click()
    finally:
        # Release the database connection and the browser even on errors.
        if mongo_client is not None:
            mongo_client.close()
        driver.quit()


if __name__ == '__main__':
    main()