XPath 常用方法整理

版权声明:派森带你学python,欢迎加群:923414804与群主一起学习 https://blog.csdn.net/weixin_44369414/article/details/85916727
from lxml import etree

def etree_html():
    """Parse the shared sample HTML fragment and return the result of ``etree.HTML``.

    The fragment is a ``div > ul`` holding five ``li`` items, each wrapping
    one link. The last ``li`` has no closing tag in the literal; the text is
    handed to the parser exactly as written.
    """
    markup = '''
    <div>
        <ul>
            <li class="item-0"><a href="link1.html">first item</a></li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-inactive"><a href="link3.html">third item</a></li>
            <li class="item-1"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a>
        </ul>
    </div>
    '''
    parsed = etree.HTML(markup)
    return parsed

def read1():
    """Print the sample document serialized back to markup text.

    The document comes from ``etree_html()`` instead of a second copy of the
    same HTML literal, so the fixture is defined in one place only.
    ``etree.tostring`` returns bytes, which are decoded as UTF-8 before
    printing.
    """
    html = etree_html()
    result = etree.tostring(html)
    print(result.decode('utf-8'))


def all_node():
    """Print every node matched by ``//*`` in the sample document, one per line."""
    for node in etree_html().xpath('//*'):
        print(node)

def all_li_node():
    """Print each ``li`` node of the sample document, one per line."""
    li_nodes = etree_html().xpath('//li')
    for li in li_nodes:
        print(li)

def child_node():
    """Print the ``a`` nodes that are direct children of any ``li`` node.

    In the expression, ``//`` selects matching nodes anywhere below the
    current node, while ``/`` steps only to direct children.
    """
    tree = etree_html()
    print(tree.xpath('//li/a'))

def father_node1():
    """Print the ``class`` of the parent of the link to ``link4.html``.

    The parent is reached with the ``..`` step.
    """
    expression = '//a[@href="link4.html"]/../@class'
    print(etree_html().xpath(expression))

def father_node2():
    """Print the ``class`` of the parent of the link to ``link4.html``.

    The parent is reached with the explicit ``parent::*`` axis.
    """
    expression = '//a[@href="link4.html"]/parent::*/@class'
    print(etree_html().xpath(expression))

def match_att():
    """Print the ``li`` nodes whose ``class`` attribute equals ``item-0``.

    The ``[@attr="value"]`` predicate filters nodes by attribute value.
    """
    tree = etree_html()
    matches = tree.xpath('//li[@class="item-0"]')
    print(matches)

def get_text1():
    """Print the text of ``a`` nodes directly under ``li`` nodes of class ``item-0``."""
    tree = etree_html()
    print(tree.xpath('//li[@class="item-0"]/a/text()'))

def get_text2():
    """Print every text node found anywhere inside ``li`` nodes of class ``item-0``.

    Uses ``//text()`` rather than ``/a/text()``, so text at any depth below
    the ``li`` is selected.
    """
    expression = '//li[@class="item-0"]//text()'
    texts = etree_html().xpath(expression)
    print(texts)

def get_att():
    """Print the ``href`` attribute of every ``a`` node directly under an ``li`` node."""
    hrefs = etree_html().xpath('//li/a/@href')
    print(hrefs)

def get_more_att():
    """Print link text under an ``li`` whose ``class`` attribute holds several values.

    When an attribute carries more than one value, an equality test on a
    single value does not match; ``contains()`` is used instead.
    """
    snippet = '''
    <li class="li li-first"><a href="link.html">first item</a></li>
    '''
    tree = etree.HTML(snippet)
    print(tree.xpath('//li[contains(@class, "li")]/a/text()'))

def get_more_att2():
    """Print link text under an ``li`` matched on two attributes at once.

    The predicate joins a ``contains()`` test on ``class`` with an equality
    test on ``name`` using ``and``.
    """
    snippet = '''
    <li class="li li-first" name="item"><a href="link.html">first item</a></li>
    '''
    expression = '//li[contains(@class, "li") and @name="item"]/a/text()'
    tree = etree.HTML(snippet)
    print(tree.xpath(expression))

def according_order_select():
    """Print link texts selected by position among the ``li`` nodes.

    The sample document comes from ``etree_html()`` instead of another copy
    of the same HTML literal. Four queries are run in order, each result
    printed on its own line:

    * ``li[1]``            - the first item (XPath positions start at 1)
    * ``li[last()]``       - the last item
    * ``li[position()<3]`` - the first two items
    * ``li[last()-2]``     - the third item from the end
    """
    html = etree_html()
    expressions = (
        '//li[1]/a/text()',
        '//li[last()]/a/text()',
        '//li[position()<3]/a/text()',
        '//li[last()-2]/a/text()',
    )
    for expression in expressions:
        print(html.xpath(expression))

def select_by_axis():
    """Print the results of several XPath axis queries anchored at ``//li[1]``.

    The fragment used here wraps the first link's text in a ``span`` so the
    ``descendant::span`` query has something to match. The axes are
    exercised in this order: ``ancestor``, ``attribute``, ``child``,
    ``descendant``, ``following`` and ``following-sibling``.
    """
    markup = '''
    <div>
        <ul>
            <li class="item-0"><a href="link1.html"><span>first item</span></a></li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-inactive"><a href="link3.html">third item</a></li>
            <li class="item-1"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a>
        </ul>
    </div>
    '''
    tree = etree.HTML(markup)
    axis_queries = (
        '//li[1]/ancestor::*',
        '//li[1]/ancestor::div',
        '//li[1]/attribute::*',
        '//li[1]/child::a[@href="link1.html"]',
        '//li[1]/descendant::span',
        '//li[1]/following::*[2]',
        '//li[1]/following-sibling::*',
    )
    # Run each query in turn and print its result list on its own line.
    for query in axis_queries:
        print(tree.xpath(query))
    
# Script entry point: when run directly, execute only the multi-attribute demo.
if __name__ == '__main__':
    get_more_att2()

猜你喜欢

转载自blog.csdn.net/weixin_44369414/article/details/85916727