爬虫--selenium

什么是selenium?

基本使用

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# Start a Chrome session through its driver.
browser = webdriver.Chrome()
try:
    browser.get("https://www.baidu.com")  # open the Baidu home page
    # Locate the search box (id="kw") with the generic By-based locator.
    # It is named search_box rather than `input` so the builtin input() is
    # not shadowed.
    search_box = browser.find_element(By.ID, "kw")
    search_box.send_keys("Python")  # type the query into the search box
    search_box.send_keys(Keys.ENTER)  # press Enter to submit the search
    # Wait for at most 10 seconds (a timeout, not a fixed pause) for the
    # element with id "content_left" to be present in the DOM.
    wait = WebDriverWait(browser, 10)
    wait.until(EC.presence_of_element_located((By.ID, "content_left")))
    print(browser.current_url)  # URL of the page now loaded
    print(browser.get_cookies())  # cookies of the current session
    # print(browser.page_source)  # uncomment to dump the page HTML
finally:
    # quit() ends the whole session (all windows and the driver process);
    # close() only closes the current window.
    browser.quit()
https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=0&rsv_idx=1&tn=baidu&wd=Python&rsv_pq=c618fa5900004b25&rsv_t=c25fWJbEN2wl13gOxRoocQDIAUMPaoguAnEu9Rg4KGX4uoRC0lynG5EjFGY&rqlang=cn&rsv_enter=1&rsv_sug3=6&rsv_sug2=0&inputT=162&rsv_sug4=162
[{'path': '/', 'value': '1468_21104_18559_26350_22075', 'secure': False, 'domain': '.baidu.com', 'httpOnly': False, 'name': 'H_PS_PSSID'}, {'path': '/', 'value': '3FFEC0A0709997465509BC1AFB51F757:FG=1', 'secure': False, 'domain': '.baidu.com', 'expiry': 3685018278.537968, 'httpOnly': False, 'name': 'BAIDUID'}, {'path': '/', 'value': '3FFEC0A0709997465509BC1AFB51F757', 'secure': False, 'domain': '.baidu.com', 'expiry': 3685018278.538044, 'httpOnly': False, 'name': 'BIDUPSID'}, {'path': '/', 'value': '1537534637', 'secure': False, 'domain': '.baidu.com', 'expiry': 3685018278.538084, 'httpOnly': False, 'name': 'PSTM'}, {'path': '/', 'value': '0', 'secure': False, 'domain': 'www.baidu.com', 'expiry': 2483614633.353963, 'httpOnly': False, 'name': 'delPer'}, {'path': '/', 'value': '0', 'secure': False, 'domain': 'www.baidu.com', 'httpOnly': False, 'name': 'BD_HOME'}, {'path': '/', 'value': 'B490B5EBF6F3CD402E515D22BCDA1598', 'secure': False, 'domain': '.baidu.com', 'expiry': 1537621032.332359, 'httpOnly': False, 'name': 'BDORZ'}, {'path': '/', 'value': '12314353', 'secure': False, 'domain': 'www.baidu.com', 'expiry': 1538398632, 'httpOnly': False, 'name': 'BD_UPN'}, {'path': '/', 'value': '1', 'secure': False, 'domain': 'www.baidu.com', 'httpOnly': False, 'name': 'BD_CK_SAM'}, {'path': '/', 'value': '1', 'secure': False, 'domain': '.baidu.com', 'httpOnly': False, 'name': 'PSINO'}, {'path': '/', 'value': 'bf67l36p%2FgarFggwpeficZXTG5zE%2FhBZEp2ev5JvDSo8venU134svju%2FJL4', 'secure': False, 'domain': 'www.baidu.com', 'expiry': 1537537225, 'httpOnly': False, 'name': 'H_PS_645EC'}]
打印后的结果为:

声明浏览器对象

from selenium import webdriver
# Create one WebDriver object per supported browser type. Each line is
# independent; a script normally needs only one of them.
browser_1 = webdriver.Chrome()
browser_2 = webdriver.Firefox()
browser_3 = webdriver.Edge()
# NOTE(review): PhantomJS support was dropped from newer Selenium releases —
# confirm the installed version still provides webdriver.PhantomJS.
browser_4 = webdriver.PhantomJS()
browser_5 = webdriver.Safari()

访问页面

from selenium import webdriver

browser = webdriver.Chrome()
try:
    # The host was misspelled as "tabao.com" in the original snippet.
    browser.get("http://www.taobao.com")
    # get_cookies() has to be called. Printing the bare attribute
    # `browser.get_cookie` only shows the bound-method repr, which is what the
    # sample output recorded after this snippet displays.
    print(browser.get_cookies())
finally:
    # quit() ends the session and the driver process, even if the load fails.
    browser.quit()
<bound method WebDriver.get_cookie of <selenium.webdriver.chrome.webdriver.WebDriver (session="65ad512a5c81e7d9f6f3bd81a4ba3495")>>
打印后的结果为:

查找元素

单个元素

from selenium import webdriver

browser = webdriver.Chrome()
browser.get("https://www.taobao.com")
# Three locator helpers aimed at the same search box (id="q"): by id, by
# XPath and by CSS selector.
# NOTE(review): these find_element_by_* helpers are the legacy API; newer
# Selenium releases only offer find_element(By.<KIND>, value) — confirm the
# installed version.
input_first = browser.find_element_by_id("q")
input_second = browser.find_element_by_xpath('//*[@id="q"]')
input_third = browser.find_element_by_css_selector("#q")
divider = "----------------------------------------------------------------"
# One print call, newline-separated: element, divider, element, divider, element.
print(input_first, divider, input_second, divider, input_third, sep="\n")
browser.close()
<selenium.webdriver.remote.webelement.WebElement (session="72af913bda52ce0848486b08ba93d3a1", element="0.3093750540466209-1")>
----------------------------------------------------------------
<selenium.webdriver.remote.webelement.WebElement (session="72af913bda52ce0848486b08ba93d3a1", element="0.3093750540466209-1")>
----------------------------------------------------------------
<selenium.webdriver.remote.webelement.WebElement (session="72af913bda52ce0848486b08ba93d3a1", element="0.3093750540466209-1")>
打印的结果为:

比较通用的查找方式

from selenium import webdriver
# By supplies the locator strategies (By.ID, By.CSS_SELECTOR, ...). It was used
# below without being imported, so the snippet failed with a NameError.
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()
browser.get("https://www.taobao.com")
# Generic lookup: find_element(strategy, value) replaces the per-strategy helpers.
input_first = browser.find_element(By.ID, 'q')
print(input_first)
browser.close()

运行的结果与上面一样!

多个元素

from selenium import webdriver

browser = webdriver.Chrome()
browser.get("https://www.taobao.com")
# The plural helper returns a list with every element matching the selector;
# print that list directly.
print(browser.find_elements_by_css_selector(".service-bd li"))
browser.close()
[<selenium.webdriver.remote.webelement.WebElement (session="d52b2ef4ead695decaf705a0c331eb06", element="0.7934950105071805-1")>, <selenium.webdriver.remote.webelement.WebElement (session="d52b2ef4ead695decaf705a0c331eb06", element="0.7934950105071805-2")>, <selenium.webdriver.remote.webelement.WebElement (session="d52b2ef4ead695decaf705a0c331eb06", element="0.7934950105071805-3")>, <selenium.webdriver.remote.webelement.WebElement (session="d52b2ef4ead695decaf705a0c331eb06", element="0.7934950105071805-4")>, <selenium.webdriver.remote.webelement.WebElement (session="d52b2ef4ead695decaf705a0c331eb06", element="0.7934950105071805-5")>, <selenium.webdriver.remote.webelement.WebElement (session="d52b2ef4ead695decaf705a0c331eb06", element="0.7934950105071805-6")>, <selenium.webdriver.remote.webelement.WebElement (session="d52b2ef4ead695decaf705a0c331eb06", element="0.7934950105071805-7")>, <selenium.webdriver.remote.webelement.WebElement (session="d52b2ef4ead695decaf705a0c331eb06", element="0.7934950105071805-8")>, <selenium.webdriver.remote.webelement.WebElement (session="d52b2ef4ead695decaf705a0c331eb06", element="0.7934950105071805-9")>, <selenium.webdriver.remote.webelement.WebElement (session="d52b2ef4ead695decaf705a0c331eb06", element="0.7934950105071805-10")>, <selenium.webdriver.remote.webelement.WebElement (session="d52b2ef4ead695decaf705a0c331eb06", element="0.7934950105071805-11")>, <selenium.webdriver.remote.webelement.WebElement (session="d52b2ef4ead695decaf705a0c331eb06", element="0.7934950105071805-12")>, <selenium.webdriver.remote.webelement.WebElement (session="d52b2ef4ead695decaf705a0c331eb06", element="0.7934950105071805-13")>, <selenium.webdriver.remote.webelement.WebElement (session="d52b2ef4ead695decaf705a0c331eb06", element="0.7934950105071805-14")>, <selenium.webdriver.remote.webelement.WebElement (session="d52b2ef4ead695decaf705a0c331eb06", element="0.7934950105071805-15")>, <selenium.webdriver.remote.webelement.WebElement 
(session="d52b2ef4ead695decaf705a0c331eb06", element="0.7934950105071805-16")>]
打印后的结果为:

元素交互操作

对获取的元素调用交互方法

from selenium import webdriver
import time

browser = webdriver.Chrome()
browser.get("https://www.taobao.com")
search_box = browser.find_element_by_id("q")
# Type a first query, pause for a second, then replace it with another one.
search_box.send_keys('iPhone')
time.sleep(1)
search_box.clear()
search_box.send_keys("iPad")
# Locate the search button and click it in one step.
browser.find_element_by_class_name('btn-search').click()
browser.close()

交互动作

将动作附加到动作链中串行执行

from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()
url = 'http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable'
browser.get(url)  # load the demo page
browser.switch_to.frame('iframeResult')  # the demo widgets live inside this frame
# drag_and_drop() works on single elements. The original used the plural
# find_elements_* helpers, which return lists, so the action could not run.
source = browser.find_element(By.CSS_SELECTOR, '#draggable')
target = browser.find_element(By.CSS_SELECTOR, '#droppable')
actions = ActionChains(browser)  # build an action chain bound to this browser
actions.drag_and_drop(source, target)  # queue the drag-and-drop
actions.perform()  # run the queued actions

执行JavaScript

from selenium import webdriver

browser = webdriver.Chrome()
browser.get('https://www.zhihu.com/explore')
# Scripts are run in order: scroll to the bottom of the page, then show an alert.
scroll_to_bottom_js = 'window.scrollTo(0,document.body.scrollHeight)'
show_alert_js = 'alert("To Bottom")'
for script in (scroll_to_bottom_js, show_alert_js):
    browser.execute_script(script)

执行后的结果为:

获取元素信息

获取属性

from selenium import webdriver
from selenium.webdriver import ActionChains

browser = webdriver.Chrome()
browser.get("https://www.zhihu.com/explore")
logo = browser.find_element_by_id('zh-top-link-logo')
# First line: the element object itself; second line: its "class" attribute.
print(logo, logo.get_attribute('class'), sep="\n")
<selenium.webdriver.remote.webelement.WebElement (session="ef2d80c82e37098c4c702fe5c0e2df31", element="0.9948931372437708-1")>
zu-top-link-logo
打印后的结果为:

获取文本值

from selenium import webdriver
from selenium.webdriver import ActionChains

browser = webdriver.Chrome()
browser.get("https://www.zhihu.com/explore")
# Print the text of the first element carrying the 'post-link' class.
print(browser.find_element_by_class_name('post-link').text)
《红色警戒》的世界:没有希特勒 二战死了1亿人
打印后的结果为:

获取ID、位置、标签名、大小

from selenium import webdriver
from selenium.webdriver import ActionChains

browser = webdriver.Chrome()
browser.get("https://www.zhihu.com/explore")
post_link = browser.find_element_by_class_name('post-link')
# Print, one per line and in this order: text, internal id, location,
# tag name and size of the element.
for attr_name in ("text", "id", "location", "tag_name", "size"):
    print(getattr(post_link, attr_name))
browser.close()
《红色警戒》的世界:没有希特勒 二战死了1亿人
0.22091173377675544-1
{'y': 304, 'x': 32}
a
{'height': 16, 'width': 306}
打印后的结果为:

Frame

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
# Demonstrates that element lookups are scoped to the currently selected frame.
browser = webdriver.Chrome()
url='http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable'
browser.get(url)
# Enter the frame named 'iframeResult'; the lookups below run inside it.
browser.switch_to.frame('iframeResult')
source =browser.find_element_by_css_selector("#draggable")
print(source)
# Look up the 'logo' class while still inside the frame. A miss is reported
# with a message instead of stopping the script (the sample output recorded
# after this snippet shows "NO LOGO" here).
try:
    logo=browser.find_element_by_class_name('logo')
except NoSuchElementException:
    print("NO LOGO")
# Step back out to the parent frame and repeat the same lookup there.
browser.switch_to.parent_frame()
logo = browser.find_element_by_class_name('logo')
print(logo)
print(logo.text)
<selenium.webdriver.remote.webelement.WebElement (session="ecfecc0e705df8976f5241726b66e273", element="0.27322378119978463-1")>
NO LOGO
<selenium.webdriver.remote.webelement.WebElement (session="ecfecc0e705df8976f5241726b66e273", element="0.8128333237150809-2")>
RUNOOB.COM
打印后的结果为:

等待

隐式等待

当使用了隐式等待执行测试的时候,如果WebDriver没有在DOM中找到元素,将继续等待,超出设定时间后则抛出找不到元素的异常,换句话说,当查找元素或元素并没有立即出现的时候,隐式等待将等待一段时间再查找DOM,默认的时间是0。

from selenium import webdriver

browser = webdriver.Chrome()
# Lookups that do not find their element immediately keep retrying for up to
# 10 seconds before failing.
browser.implicitly_wait(10)
browser.get("https://www.zhihu.com/explore")
print(browser.find_element_by_class_name('zu-top-add-question'))
<selenium.webdriver.remote.webelement.WebElement (session="87a2e958b9a3b58334e8c2ec76d0419e", element="0.014192877625801792-1")>
打印后的结果为:

显式等待

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

browser = webdriver.Chrome()
browser.get("http://www.taobao.com/")
# Each until() call below waits at most 10 seconds for its condition.
wait = WebDriverWait(browser, 10)
# A single search box is expected, so wait for one element rather than a list
# (presence_of_all_elements_located returns a list). It is named search_box so
# the builtin input() is not shadowed.
search_box = wait.until(EC.presence_of_element_located((By.ID, 'q')))
# '.btn-search' is a class selector. Without the leading dot, 'btn-search' is
# read as a tag name that matches nothing, so the wait always ended in the
# TimeoutException shown in the traceback recorded after this snippet.
button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.btn-search')))
print(search_box, button)
Traceback (most recent call last):
  File "C:/Users/Administrator/Desktop/正则表达式/正则表达式.py", line 10, in <module>
    button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,'btn-search')))
  File "C:\Users\Administrator\Desktop\正则表达式\venv\lib\site-packages\selenium\webdriver\support\wait.py", line 80, in until
    raise TimeoutException(message, screen, stacktrace)
selenium.common.exceptions.TimeoutException: Message: 
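超时的原因:CSS 选择器 'btn-search' 缺少开头的点号,被当作标签名去匹配 <btn-search> 元素,而页面上并不存在这样的元素,所以 element_to_be_clickable 的条件始终不成立,等待10秒后抛出 TimeoutException。把选择器改为 '.btn-search'(按 class 匹配)即可。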

·title_is 标题是某内容
·title_contains 标题包含某内容
·presence_of_element_located 元素加载出,传入定位元组,如(By.ID, 'p')
·visibility_of_element_located 元素可见,传入定位元组
·visibility_of 可见,传入元素对象
·presence_of_all_elements_located 所有元素加载出
·text_to_be_present_in_element 某个元素文本包含某文字
·text_to_be_present_in_element_value 某个元素值包含某文字
·frame_to_be_available_and_switch_to_it frame加载并切换
·invisibility_of_element_located 元素不可见
·element_to_be_clickable 元素可点击
·staleness_of 判断一个元素是否仍在DOM,可判断页面是否已经刷新
·element_to_be_selected 元素可选择,传元素对象
·element_located_to_be_selected 元素可选择,传入定位元组
·element_selection_state_to_be 传入元素对象以及状态,相等返回True,否则返回False
·element_located_selection_state_to_be 传入定位元组以及状态,相等返回True,否则返回False
·alert_is_present 是否出现Alert

前进后退

from selenium import webdriver
import time

browser = webdriver.Chrome()
# Visit three pages in a row to build up some browsing history.
for page_url in (
    "https://www.taobao.com/",
    "https://www.baidu.com/",
    "https://www.jingdong.com/",
):
    browser.get(page_url)
browser.back()  # step back one history entry
time.sleep(1)
browser.forward()  # and forward again
browser.close()

Cookies

from selenium import webdriver

browser = webdriver.Chrome()
browser.get("http://www.zhihu.com/explore")
print(browser.get_cookies())  # cookies present after the page load
# The domain was written "www,zhihu.com" (comma instead of dot), which is not a
# valid host name. In the sample output recorded after this snippet, the second
# cookie listing does not contain the added cookie.
browser.add_cookie({"name": "name", "domain": "www.zhihu.com", "value": "germey"})
print(browser.get_cookies())  # should now include the cookie named "name"
browser.delete_all_cookies()
print(browser.get_cookies())  # empty list once everything is deleted
[{'path': '/', 'httpOnly': False, 'name': 'l_n_c', 'domain': '.zhihu.com', 'secure': False, 'value': '1'}, {'path': '/', 'httpOnly': False, 'name': 'tgw_l7_route', 'domain': 'www.zhihu.com', 'expiry': 1537604591.643548, 'secure': False, 'value': '156dfd931a77f9586c0da07030f2df36'}, {'path': '/', 'httpOnly': False, 'name': 'd_c0', 'domain': '.zhihu.com', 'expiry': 1632211696.283527, 'secure': False, 'value': '"AFCk5k4oQA6PTrVJvEIdM1iDREt1Ez3H0lw=|1537603702"'}, {'path': '/', 'httpOnly': False, 'name': '_xsrf', 'domain': '.zhihu.com', 'expiry': 1615363691.643627, 'secure': False, 'value': 'ZnQElKxeWBcoeNFASCTcgdhk56NJ83hf'}, {'path': '/', 'httpOnly': False, 'name': '__utmb', 'domain': '.zhihu.com', 'expiry': 1537605497, 'secure': False, 'value': '51854390.0.10.1537603697'}, {'path': '/', 'httpOnly': False, 'name': 'q_c1', 'domain': '.zhihu.com', 'expiry': 1632211692.325099, 'secure': False, 'value': '104b20902a9f4159b0c1811e7dd3959c|1537603698000|1537603698000'}, {'path': '/', 'httpOnly': False, 'name': 'r_cap_id', 'domain': '.zhihu.com', 'expiry': 1540195692.325146, 'secure': False, 'value': '"MWIwYjU3YmI1OWVkNGEwYmJhZGM0MTY5ZDQzZWU3MmQ=|1537603698|85c9986946afde9f1823ce067dc29aa2ea19d5f3"'}, {'path': '/', 'httpOnly': False, 'name': 'cap_id', 'domain': '.zhihu.com', 'expiry': 1540195692.325191, 'secure': False, 'value': '"OTVhZmIwZDkzYzliNDAzNWI4ZTJiNWM2NzY0NWFmMjQ=|1537603698|67c5e766f10d39421a3d3afc84d45dfed316ae18"'}, {'path': '/', 'httpOnly': False, 'name': 'l_cap_id', 'domain': '.zhihu.com', 'expiry': 1540195692.325236, 'secure': False, 'value': '"MDUzODliMmYwN2VlNDU1YjkwNGU0MjEwZDU0OTdkMjI=|1537603698|e44d90334d2319d2934c4b5cccb4d8d5a549247d"'}, {'path': '/', 'httpOnly': False, 'name': 'n_c', 'domain': '.zhihu.com', 'secure': False, 'value': '1'}, {'path': '/', 'httpOnly': False, 'name': '_zap', 'domain': '.zhihu.com', 'expiry': 1600675697, 'secure': False, 'value': 'c7dbed92-9690-47b9-886e-d5539b1f74b8'}, {'path': '/', 'httpOnly': False, 'name': '__utma', 
'domain': '.zhihu.com', 'expiry': 1600675697, 'secure': False, 'value': '51854390.1486460487.1537603697.1537603697.1537603697.1'}, {'path': '/', 'httpOnly': False, 'name': '__utmc', 'domain': '.zhihu.com', 'secure': False, 'value': '51854390'}, {'path': '/', 'httpOnly': False, 'name': '__utmz', 'domain': '.zhihu.com', 'expiry': 1553371697, 'secure': False, 'value': '51854390.1537603697.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)'}, {'path': '/', 'httpOnly': False, 'name': '__utmv', 'domain': '.zhihu.com', 'expiry': 1600675697, 'secure': False, 'value': '51854390.000--|3=entry_date=20180922=1'}]
[{'path': '/', 'httpOnly': False, 'name': 'l_n_c', 'domain': '.zhihu.com', 'secure': False, 'value': '1'}, {'path': '/', 'httpOnly': False, 'name': 'tgw_l7_route', 'domain': 'www.zhihu.com', 'expiry': 1537604591.643548, 'secure': False, 'value': '156dfd931a77f9586c0da07030f2df36'}, {'path': '/', 'httpOnly': False, 'name': 'd_c0', 'domain': '.zhihu.com', 'expiry': 1632211696.283527, 'secure': False, 'value': '"AFCk5k4oQA6PTrVJvEIdM1iDREt1Ez3H0lw=|1537603702"'}, {'path': '/', 'httpOnly': False, 'name': '_xsrf', 'domain': '.zhihu.com', 'expiry': 1615363691.643627, 'secure': False, 'value': 'ZnQElKxeWBcoeNFASCTcgdhk56NJ83hf'}, {'path': '/', 'httpOnly': False, 'name': '__utmb', 'domain': '.zhihu.com', 'expiry': 1537605497, 'secure': False, 'value': '51854390.0.10.1537603697'}, {'path': '/', 'httpOnly': False, 'name': 'q_c1', 'domain': '.zhihu.com', 'expiry': 1632211692.325099, 'secure': False, 'value': '104b20902a9f4159b0c1811e7dd3959c|1537603698000|1537603698000'}, {'path': '/', 'httpOnly': False, 'name': 'r_cap_id', 'domain': '.zhihu.com', 'expiry': 1540195692.325146, 'secure': False, 'value': '"MWIwYjU3YmI1OWVkNGEwYmJhZGM0MTY5ZDQzZWU3MmQ=|1537603698|85c9986946afde9f1823ce067dc29aa2ea19d5f3"'}, {'path': '/', 'httpOnly': False, 'name': 'cap_id', 'domain': '.zhihu.com', 'expiry': 1540195692.325191, 'secure': False, 'value': '"OTVhZmIwZDkzYzliNDAzNWI4ZTJiNWM2NzY0NWFmMjQ=|1537603698|67c5e766f10d39421a3d3afc84d45dfed316ae18"'}, {'path': '/', 'httpOnly': False, 'name': 'l_cap_id', 'domain': '.zhihu.com', 'expiry': 1540195692.325236, 'secure': False, 'value': '"MDUzODliMmYwN2VlNDU1YjkwNGU0MjEwZDU0OTdkMjI=|1537603698|e44d90334d2319d2934c4b5cccb4d8d5a549247d"'}, {'path': '/', 'httpOnly': False, 'name': 'n_c', 'domain': '.zhihu.com', 'secure': False, 'value': '1'}, {'path': '/', 'httpOnly': False, 'name': '_zap', 'domain': '.zhihu.com', 'expiry': 1600675697, 'secure': False, 'value': 'c7dbed92-9690-47b9-886e-d5539b1f74b8'}, {'path': '/', 'httpOnly': False, 'name': '__utma', 
'domain': '.zhihu.com', 'expiry': 1600675697, 'secure': False, 'value': '51854390.1486460487.1537603697.1537603697.1537603697.1'}, {'path': '/', 'httpOnly': False, 'name': '__utmc', 'domain': '.zhihu.com', 'secure': False, 'value': '51854390'}, {'path': '/', 'httpOnly': False, 'name': '__utmz', 'domain': '.zhihu.com', 'expiry': 1553371697, 'secure': False, 'value': '51854390.1537603697.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)'}, {'path': '/', 'httpOnly': False, 'name': '__utmv', 'domain': '.zhihu.com', 'expiry': 1600675697, 'secure': False, 'value': '51854390.000--|3=entry_date=20180922=1'}]
[]
打印后的结果为:

选项卡管理

from selenium import webdriver
import time

browser = webdriver.Chrome()
browser.get("http://www.baidu.com")
browser.execute_script("window.open()")  # open a new, empty tab
print(browser.window_handles)  # handles of all open tabs, in opening order
# switch_to.window() is the supported spelling; the old switch_to_window()
# method no longer exists in current Selenium releases.
browser.switch_to.window(browser.window_handles[1])  # focus the second tab
browser.get("https://www.taobao.com")
time.sleep(1)
browser.switch_to.window(browser.window_handles[0])  # back to the first tab
browser.get("https://www.taobao.com")
# quit() closes every tab and ends the driver session; close() would only close
# the focused tab and leave the other one running.
browser.quit()
['CDwindow-65E31D9BF9FDC0B83D2821ABB85DB273', 'CDwindow-C2A8A67F87828D4AEA0A9D391203121E']
打印的结果为:

异常处理

from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()
# One outer try/finally so the browser is shut down whichever step fails.
# Previously only the element lookup was covered by the cleanup.
try:
    try:
        browser.get("https://www.baidu.com")
    except TimeoutException:
        print("TIME OUT")
    try:
        # Lookup that is expected to miss: the sample run recorded after this
        # snippet prints "NO ELEMENT".
        browser.find_element(By.ID, "name")
    except NoSuchElementException:
        print("NO ELEMENT")
finally:
    browser.quit()
NO ELEMENT
打印后的结果为:

猜你喜欢

转载自www.cnblogs.com/zhuifeng-mayi/p/9688588.html