# 版权声明:派森带你学python,欢迎加群:923414804与群主一起学习 https://blog.csdn.net/weixin_44369414/article/details/85916727
from lxml import etree
def etree_html():
    """Build the shared sample document and return it as a parsed tree.

    The markup is a ``<div>`` holding one ``<ul>`` with five ``<li>`` entries,
    each wrapping a single ``<a>`` link. The last ``<li>`` has no closing tag
    in the source string; it is passed to ``etree.HTML`` exactly as written.

    Returns:
        Whatever ``etree.HTML`` returns for the sample markup (the parsed
        document that the other demo functions query with XPath).
    """
    sample_markup = '''
<div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
'''
    return etree.HTML(sample_markup)
def read1():
    """Parse the shared sample document and print it re-serialized.

    The sample markup is obtained from ``etree_html()`` instead of being
    repeated here, so this demo always shows the same document the other
    functions query.

    Returns:
        None. The serialized document is written to standard output.
    """
    html = etree_html()
    # tostring() yields bytes here, so decode before printing readable text.
    serialized = etree.tostring(html)
    print(serialized.decode('utf-8'))
def all_node():
    """Print every element of the sample document, one per line.

    ``//*`` matches all element nodes anywhere in the tree.
    """
    document = etree_html()
    for element in document.xpath('//*'):
        print(element)
def all_li_node():
    """Print every ``<li>`` element of the sample document, one per line."""
    document = etree_html()
    for list_item in document.xpath('//li'):
        print(list_item)
def child_node():
    """Print the ``<a>`` elements that are direct children of an ``<li>``.

    In the expression, ``//`` selects matching nodes at any depth below the
    context node, while ``/`` steps only to direct children.
    """
    expression = '//li/a'
    links = etree_html().xpath(expression)
    print(links)
def father_node1():
    """Print the ``class`` of the parent of the link to ``link4.html``.

    Uses the ``..`` abbreviation to step from the ``<a>`` to its parent.
    """
    document = etree_html()
    parent_classes = document.xpath('//a[@href="link4.html"]/../@class')
    print(parent_classes)
def father_node2():
    """Print the ``class`` of the parent of the link to ``link4.html``.

    Same lookup as ``father_node1`` but spelled with the explicit
    ``parent::*`` axis instead of the ``..`` abbreviation.
    """
    document = etree_html()
    parent_classes = document.xpath('//a[@href="link4.html"]/parent::*/@class')
    print(parent_classes)
def match_att():
    """Print the ``<li>`` elements whose ``class`` is exactly ``item-0``.

    Demonstrates attribute filtering with the ``@`` syntax inside a predicate.
    """
    expression = '//li[@class="item-0"]'
    matching_items = etree_html().xpath(expression)
    print(matching_items)
def get_text1():
    """Print the link text inside each ``<li class="item-0">``.

    ``text()`` is applied to the ``<a>`` child, so only text directly inside
    the links is selected.
    """
    document = etree_html()
    link_texts = document.xpath('//li[@class="item-0"]/a/text()')
    print(link_texts)
def get_text2():
    """Print every text node found anywhere below each ``<li class="item-0">``.

    Unlike ``get_text1``, ``//text()`` collects text at any depth under the
    matched ``<li>`` rather than only the text of its ``<a>`` child.
    """
    expression = '//li[@class="item-0"]//text()'
    all_texts = etree_html().xpath(expression)
    print(all_texts)
def get_att():
    """Print the ``href`` attribute of every ``<a>`` directly under an ``<li>``."""
    document = etree_html()
    hrefs = document.xpath('//li/a/@href')
    print(hrefs)
def get_more_att():
    """Print the link text of an ``<li>`` whose ``class`` holds several values.

    An exact ``@class="li"`` comparison would not match ``"li li-first"``, so
    the predicate uses ``contains()``. Note that ``contains()`` is a plain
    substring test on the attribute value, not a whole-token match.
    """
    snippet = '''
<li class="li li-first"><a href="link.html">first item</a></li>
'''
    document = etree.HTML(snippet)
    link_texts = document.xpath('//li[contains(@class, "li")]/a/text()')
    print(link_texts)
def get_more_att2():
    """Print the link text of an ``<li>`` matched on two attributes at once.

    The predicate joins a ``contains()`` test on ``class`` and an exact
    comparison on ``name`` with the XPath ``and`` operator.
    """
    snippet = '''
<li class="li li-first" name="item"><a href="link.html">first item</a></li>
'''
    expression = '//li[contains(@class, "li") and @name="item"]/a/text()'
    document = etree.HTML(snippet)
    print(document.xpath(expression))
def according_order_select():
    """Demonstrate selecting ``<li>`` elements by position.

    Runs four positional queries against the shared sample document from
    ``etree_html()`` and prints the link text each one selects, in order.

    Returns:
        None. Each query result is written to standard output.
    """
    html = etree_html()
    # XPath positions are 1-based; last() is the position of the final match.
    expressions = (
        '//li[1]/a/text()',             # first <li>
        '//li[last()]/a/text()',        # last <li>
        '//li[position()<3]/a/text()',  # <li> at positions 1 and 2
        '//li[last()-2]/a/text()',      # two positions before the last <li>
    )
    for expression in expressions:
        print(html.xpath(expression))
def select_by_axis():
    """Demonstrate XPath axes, all starting from the first ``<li>``.

    Uses its own markup (the first link wraps its text in a ``<span>``) and
    prints the result of each axis query in turn.
    """
    markup = '''
<div>
<ul>
<li class="item-0"><a href="link1.html"><span>first item</span></a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
'''
    document = etree.HTML(markup)
    axis_queries = (
        '//li[1]/ancestor::*',                     # every ancestor element
        '//li[1]/ancestor::div',                   # only <div> ancestors
        '//li[1]/attribute::*',                    # all attribute values
        '//li[1]/child::a[@href="link1.html"]',    # direct <a> child with that href
        '//li[1]/descendant::span',                # <span> at any depth below
        '//li[1]/following::*[2]',                 # second element after it in document order
        '//li[1]/following-sibling::*',            # all later siblings
    )
    for query in axis_queries:
        print(document.xpath(query))
# Script entry point: when run directly, execute the multi-attribute demo.
if __name__ == '__main__':
    get_more_att2()