爬虫之Selenium库

官方文档:https://selenium-python.readthedocs.io/

Selenium:自动化测试工具,支持多种浏览器。爬虫中主要用来解决JavaScript渲染的问题。

一、开始

基本使用

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# Launch Chrome, run a Baidu search for "Python" and dump the result page.
browser = webdriver.Chrome()
try:
    browser.get("https://www.baidu.com")
    # find_element(By.ID, ...) works on Selenium 3 and 4; the older
    # find_element_by_id helper was removed in Selenium 4.3. The variable is
    # not called `input` so the builtin input() is not shadowed.
    search_box = browser.find_element(By.ID, "kw")
    search_box.send_keys("Python")    # type "Python" into the search box
    search_box.send_keys(Keys.ENTER)  # press Enter to submit the search
    wait = WebDriverWait(browser, 10)
    # Block until the element with id "content_left" is present in the DOM
    wait.until(EC.presence_of_element_located((By.ID, "content_left")))
    print(browser.current_url)
    print(browser.get_cookies())
    print(browser.page_source)
finally:
    # quit() ends the whole WebDriver session (all windows and the driver
    # process); close() would only close the current window.
    browser.quit()

声明浏览器对象

from selenium import webdriver

# One constructor per browser. In a real script keep only the line you need:
# as written, every line rebinds `browser` to a newly created driver.
browser = webdriver.Chrome()
browser = webdriver.Firefox()
browser = webdriver.Edge()
# NOTE(review): PhantomJS support appears to have been dropped in Selenium 4 --
# confirm against the installed Selenium version before using this line.
browser = webdriver.PhantomJS()
browser = webdriver.Safari()

访问页面

from selenium import webdriver

# Open the Taobao home page and print its rendered HTML.
browser = webdriver.Chrome()
try:
    browser.get("https://www.taobao.com")
    print(browser.page_source)
finally:
    # Always end the WebDriver session, even if loading the page fails.
    # quit() shuts down the driver; close() only closes the current window.
    browser.quit()

二、查找元素

单个元素

from selenium import webdriver
from selenium.webdriver.common.by import By

# Locate the same element (id "q") in three different ways and print the
# resulting WebElement objects.
browser = webdriver.Chrome()
try:
    browser.get("https://www.taobao.com")
    # The find_element_by_* helpers were removed in Selenium 4.3;
    # find_element(By.<strategy>, value) works on Selenium 3 and 4.
    # Look up the element whose id is "q"
    input1 = browser.find_element(By.ID, "q")
    # Same element via a CSS selector
    input2 = browser.find_element(By.CSS_SELECTOR, "#q")
    # Same element via an XPath expression
    input3 = browser.find_element(By.XPATH, "//*[@id='q']")
    print(input1, input2, input3)
    # Sample output (session/element ids differ on every run):
    # <selenium.webdriver.remote.webelement.WebElement (session="7b4386265c07c8e860a4e57cf7f15e6a", element="0.2418348835793498-1")>
    # <selenium.webdriver.remote.webelement.WebElement (session="7b4386265c07c8e860a4e57cf7f15e6a", element="0.2418348835793498-1")>
    # <selenium.webdriver.remote.webelement.WebElement (session="7b4386265c07c8e860a4e57cf7f15e6a", element="0.2418348835793498-1")>
finally:
    # End the WebDriver session even if a lookup raises.
    browser.quit()

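查找方式(以下为 Selenium 3 的写法;find_element_by_* 系列方法自 Selenium 4.3 起已被移除,新版本请使用下文的通用查找方式 find_element(By.XXX, ...)):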

find_element_by_name
find_element_by_xpath
find_element_by_link_text
find_element_by_partial_link_text
find_element_by_tag_name
find_element_by_class_name
find_element_by_css_selector
View Code

通用查找方式:

from selenium import webdriver
from selenium.webdriver.common.by import By

# Locate the element with id "q" through the generic find_element API.
browser = webdriver.Chrome()
try:
    browser.get("https://www.taobao.com")
    # Named search_box rather than `input` so the builtin input() stays usable.
    search_box = browser.find_element(By.ID, "q")
    print(search_box)
finally:
    # End the WebDriver session even if the lookup raises.
    browser.quit()

多个元素

实际上就是复数的区别。

from selenium import webdriver
from selenium.webdriver.common.by import By

# Collect every <li> under ".service-bd" and print the list of WebElements.
browser = webdriver.Chrome()
try:
    browser.get("https://www.taobao.com")

    # The original snippet showed two spellings back to back, but closed the
    # browser between them, so the second lookup ran against a closed window
    # and raised. Only the generic form is kept:
    # find_elements(By.CSS_SELECTOR, ...) works on Selenium 3 and 4, whereas
    # find_elements_by_css_selector(...) was removed in Selenium 4.3.
    items = browser.find_elements(By.CSS_SELECTOR, ".service-bd li")
    print(items)
finally:
    # Shut the session down exactly once, after all lookups are finished.
    browser.quit()

查找方式(以下为 Selenium 3 的写法;find_elements_by_* 系列方法自 Selenium 4.3 起已被移除,新版本请使用 find_elements(By.XXX, ...)):

find_elements_by_name
find_elements_by_xpath
find_elements_by_link_text
find_elements_by_partial_link_text
find_elements_by_tag_name
find_elements_by_class_name
find_elements_by_css_selector
View Code

三、元素交互操作

对获取的元素调用交互方法。

import time

from selenium import webdriver
from selenium.webdriver.common.by import By

# Type into the Taobao search box, replace the text, then click "search".
browser = webdriver.Chrome()
browser.get("https://www.taobao.com")
# find_element(By.<strategy>, value) replaces the find_element_by_* helpers,
# which were removed in Selenium 4.3. The variable is not called `input`
# so the builtin input() is not shadowed.
search_box = browser.find_element(By.ID, "q")
search_box.send_keys("iPhone")
time.sleep(1)  # pause so the first keyword is visible before it is cleared
search_box.clear()
search_box.send_keys("iPad")
search_button = browser.find_element(By.CLASS_NAME, "btn-search")
search_button.click()
# NOTE(review): the browser is never closed in this snippet; call
# browser.quit() once you are done inspecting the result page.

更多操作:https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.remote.webelement

1

猜你喜欢

转载自www.cnblogs.com/believepd/p/10657953.html