1. 引入的东西不同:正则(re)是 Python 标准库自带的模块,xpath(lxml)和 bs4 是第三方库。bs4 和 xpath 的作用一样,都是用来解析 HTML 数据的;相比之下,xpath 的速度会快一点,因为 xpath 底层是用 C 来实现的。 2. 基本语法不同:正则使用元字符进行匹配;xpath 和 bs4 则先将获取到的源码转化成一个对象,再用对象的方法和属性获取想要的内容。 3. 正则无层级结构,只有先后顺序。
下面通过代码来区分它们:
import re
from lxml import etree
from bs4 import BeautifulSoup
from urllib.request import urlopen,Request
# Page that the regex, XPath and BeautifulSoup demos all parse.
url = 'https://www.meishij.net/chufang/diy/?&page=1'

# Request headers carrying a desktop Firefox User-Agent string.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0',
}
def get_code(url):
    """Download *url* and return the response body as text.

    The request is sent with the module-level ``headers`` dict.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded response body as a ``str``.
    """
    request = Request(url, headers=headers)
    # The context manager closes the connection even if read()/decode() fails.
    with urlopen(request) as response:
        # Honour the charset declared in Content-Type; fall back to UTF-8,
        # which is what a bare ``decode()`` would have used.
        charset = response.headers.get_content_charset() or 'utf-8'
        return response.read().decode(charset)
def regular(code):
    """Print the compiled pattern, then every (alt, src) pair matched in *code*.

    Args:
        code: HTML source of the page as a string.
    """
    # A single regex can pull values out of several places in one pass;
    # findall() then yields a list of tuples: [(alt, src), ...].
    img_pattern = re.compile(
        r'<img class="img".*?alt="(.*?)".*?src="(.*?)">',
        flags=re.DOTALL,
    )
    print(img_pattern)
    print(img_pattern.findall(code))
def xpath(code):
    """Print the ``alt`` and ``src`` of every ``<img class="img">`` via lxml XPath.

    Args:
        code: HTML source of the page as a string.
    """
    # etree.HTML() yields the root element, e.g. <Element html at 0x2e7dcb0>.
    root = etree.HTML(code)
    # The query yields a list of elements such as <Element img at 0x2e95120>.
    img_list = root.xpath('//img[@class="img"]')
    for value in img_list:
        # .get() gives None for a missing attribute; indexing the result of
        # xpath('@alt') raised IndexError on images that have no alt.
        name = value.get('alt')
        img = value.get('src')
        print(name, img)
def beautifulsoup(code):
    """Print the ``alt`` and ``src`` of every ``img.img`` tag via BeautifulSoup.

    Args:
        code: HTML source of the page as a string.
    """
    # Parse with the lxml backend; the result is a bs4.BeautifulSoup object.
    document = BeautifulSoup(code, 'lxml')
    # CSS selector for <img> tags whose class is "img"; select() gives a list
    # of the matching tags.
    matched_tags = document.select('img.img')
    print(matched_tags)
    for tag in matched_tags:
        print(tag.get('alt'), tag.get('src'))
# Download the page once, then hand the same HTML to each of the three parsers.
code = get_code(url)
for parse in (regular, xpath, beautifulsoup):
    parse(code)