# Python 3 web-scraping study notes 4.3: learning to use pyquery

#使用pyquery
#1、初始化
#他的初始化方式多种，比如直接传入字符串、URL，文件名等
#（1）字符串初始化
# text = """
# <html><head><title>The Dormouse's story</title></head>
# <body id="abs">
# <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
# <p class="story" id ="qwe">Once upon a time there were three little sisters; and their names were
# <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
# <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
# <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
# and they lived at the bottom of a well.</p>
# <p class="story">...</p>
# """
# from pyquery import PyQuery as pq
# doc = pq(text)
# # print(doc)
# print(doc("p")("a"))
#URL初始化
# from pyquery import PyQuery as pq
# doc = pq(url="https://cuiqingcai.com")
# print(doc("title"))

# File initialisation: passing the `filename` keyword makes PyQuery read and
# parse the HTML document stored at that path. The path is relative, so it is
# resolved against the current working directory.

from pyquery import PyQuery as pq

_SAMPLE_HTML = "./BStest.html"
doc = pq(filename=_SAMPLE_HTML)
# print(doc("head"))

#基本CSS选择器
# print(doc('#abs .title b'))   #意思是查询 id=abs的节点下class = 'title'节点下的b节点

#查找节点
#查找子孙节点   find(节点名)  它查找的是所有子孙节点中的节点
# a = doc("#abs")
# # print(a)
# b = a.find("b")
# print(b)
#只查找子节点     children（节点名）
# a = doc("#abs")
# # b = a.children('b')
# # print(b)  #找不到
# b = a.children("p")
# print(b)   #找的到
#要筛选子类中符合条件的数据
# a = doc("#abs")
# # b = a.children("#qwe")   #查b中Id = qwe 的节点，用法与前面相同
# b = a.children("#qwe .sister")
# print(b)
#父节点
#使用parent（）方法获取某个节点的父节点   不会去找祖先节点
# a = doc("#abs #qwe .sister")
# # print(a)
# b = a.parent()
# # print(b)
# c = b.parent()
# print(c)
#使用parents()方法获取某个节点的祖先节点
# a = doc("#abs .story .sister")
# b = a.parents()
# print(b)
#兄弟节点   使用siblings()方法
# a = doc("#abs .title")
# b = a.siblings()
# print(b)    #这里打印出了body节点下的两个同级的节点b
#遍历
#对于单个节点可以直接输出，当结果有多个节点能够匹配呢？
# a = doc("#abs .story .sister")
# b = a.items()
# print(next(b))
# print(next(b))
# print(next(b))

#6、获取属性
#提取到某个节点后，可以调用attr()方法来获取属性
# a = doc("#abs .title")
# # b = a.attr("name")
# b = a.attr.name  # same effect as the attr("name") call above
# print(b)
#当能获取多个匹配节点时，使用attr（）方法只能获取第一个匹配的值，如果想全都获取得遍历
# a = doc("#abs #qwe .sister")
# b = a.items()
# for x in b:
#     print(x.attr('href'))
#获取文本
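# After getting attributes, the other common operation is getting text, which the text() method provides.
# On a multi-node match, text() returns the text of all matched nodes together; iterate with items() to get each node's text on its own.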
# a = doc("#abs .story .sister")
# b = a.items()
# for x in b:
#     print(x.text())

#7、节点操作
#pyquery提供了一系列的方法对节点进行动态修改
#addClass 和removeClass
# a = doc("#abs .story")
# print(a.attr("class"))
# a.removeClass("story")
# print(a)
# a.add_class("story")
# print(a)
# a.add_class("1")   #这次添加的放在了一起，中间有个空格<p class="story 1" id="qwe">
# print(a)
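# attr, text and html: attr(name, value) sets an attribute; text(s) replaces the node's text content with s; html(s) replaces the node's inner HTML with s. Each call applies to every matched node.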
#a = doc("#abs .sister")
# print(a.attr("id","11")) #  <a class="sister" href="http://example.com/lacie" id="11">  所有匹配到的结果都被添加了
# print(a.text("111111111111111111111111"))#<a class="sister" href="http://example.com/elsie" id="link1">111111111111111111111111</a> 所有匹配到的都添加了
# print(a.html("<a1>123213</a1>"))   #所有文本内容中加入了网页
#remove（）移除节点
# a = doc("#abs")
# a.find("p").remove()   #得找到子节点进行删除操作
# print(a)
#8、伪选择器
#CSS选择器之所以强大，还有一种原因是因为它支持多种多样的伪类选择器，例如选择第一个节点、最后一个节点、奇数偶数节点、包含某一文本的节点等

# c = doc("#abs .story .sister:first-child")   #选择class = sister能匹配到的第一个节点
# print(c)
# c = doc("#abs .story .sister:last-child")  # select the class="sister" node that is the last child of its parent
# print(c)
# c = doc("#abs .story .sister:nth-child(2)")  #选择class = sister能匹配到的第二个节点（括号内写几就是匹配第几个）
# print(c)
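# c = doc("#abs .story .sister:gt(1)")  # :gt(n) keeps the matches whose zero-based index is greater than n: (0) gives the last two, (1) gives only the last one; a negative n matched everything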
# print(c)
# c = doc("#abs .story .sister:nth-child(3n)")  #选择class = sister能匹配到的n的3倍（能被3整除的）
# print(c)
# c = doc("#abs .story .sister:contains(i)")  # select class="sister" nodes whose text contains "i"
# print(c)
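# End of notes: Python 3 web-scraping study notes 4.3, learning to use pyquery

# (Footer left over from the web page export: "You may also like")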