lxml和xpath结合使用主要有以下5个方面内容:
# 1.获取所有的tr标签
# 2.获取第2个tr标签
# 3.获取所有class 等于event的tr标签
# 4.获取所有a标签下的href属性
# 5.获取所有的职位信息(纯文本)
实例代码如下:
# -*- coding: utf-8 -*-
# PEP 263 encoding declaration: the original "# -- coding:utf-8 --" form is
# malformed and has no effect, so Python 2 would reject the non-ASCII
# comments in this file. The canonical "-*-" form fixes that.
from lxml import etree

# Parse the local HTML file with an explicit UTF-8 parser so non-ASCII
# text is decoded correctly.
parser = etree.HTMLParser(encoding="utf-8")
# NOTE(review): "baudu.html" looks like a typo for "baidu.html" — confirm
# the actual file name on disk before changing the string.
html = etree.parse("baudu.html", parser=parser)
# 1. Get all <tr> tags.
def return_trs():
    """Print every <tr> element in the parsed document.

    xpath() returns a list (here, of Element objects).
    """
    trs = html.xpath("//tr")
    for tr in trs:
        # print(x) with a single argument behaves identically in
        # Python 2 and 3, unlike the Py2-only "print x" statement.
        print(tr)
# 2. Get the second <tr> tag.
def return_tr():
    """Print the second <tr> element: its type, its repr, and its HTML.

    Note: XPath indices are 1-based, so //tr[2] is the second row.
    """
    tr = html.xpath("//tr[2]")[0]
    print(type(tr))
    print(tr)
    # tostring() returns bytes; decode to get a printable unicode string.
    print(etree.tostring(tr, encoding="utf-8").decode("utf-8"))
# 3. Get all <tr> tags whose class equals "event".
def class_tr():
    """Print the list of <tr> elements with class="event"."""
    trs = html.xpath("//tr[@class='event']")
    print(trs)
# 4. Get the href attribute of every <a> tag.
def a_href():
    """Print each href value, raw and prefixed with a local path."""
    a_list = html.xpath("//a/@href")
    for a in a_list:
        print(a)
        print("D:\\Python2.7\\" + a)
# 5. Get all job postings (plain text).
positions = []


def position_text():
    """Collect every job row (all <tr> after the header) into `positions`.

    Each entry is a dict with the job's url, title, category, headcount,
    address and publish time. The td-text fields are lists as returned by
    //text() — they are stored unjoined, as in the original.
    """
    trs = html.xpath("//tr[position()>1]")
    for tr in trs:
        # To run xpath relative to this element, prefix the expression
        # with "." — a bare "//" would search the whole document again.
        hrefs = tr.xpath(".//a/@href")
        if not hrefs:
            # Guard: a row without a link would have raised IndexError
            # on the original unconditional [0]; skip it instead.
            continue
        fullurl = "http://" + hrefs[0]
        title = tr.xpath("./td[1]//text()")
        category = tr.xpath("./td[2]//text()")
        nums = tr.xpath("./td[3]//text()")
        address = tr.xpath("./td[4]//text()")
        pubtime = tr.xpath("./td[5]//text()")
        position = {
            "url": fullurl,
            "title": title,
            "category": category,
            "nums": nums,
            "address": address,
            "pubtime": pubtime
        }
        positions.append(position)
def main():
    """Run the job-listing extraction demo and print the result.

    The other examples are left commented out so they can be enabled
    one at a time while following the tutorial.
    """
    position_text()
    # print(x) with one argument works identically in Python 2 and 3.
    print(positions)
    # a_href()
    # class_tr()
    # return_trs()
    # return_tr()


if __name__ == '__main__':
    main()