python爬虫_PyQuery库基础

1.初始化

字符串初始化
from pyquery import PyQuery as pq
html = """
<html><head><title>Demo</title></head>
<body>
 <div>
   <p name="test" class='test test2'>Hello Python
    <span id='span' class='test4'>123</span>
   </p>
   <a href="http://www.baidu.com">
     <span class='test3'>456</span>
   </a>
 </div>
</body>
</html>
"""
# Initialise PyQuery from an HTML string: `html` is the multi-line literal defined just above.
from pyquery import PyQuery as pq
doc = pq(html)
# Calling the document with a CSS selector returns the matching elements;
# printing the selection serialises them back to markup.
print(doc('head'))
<head><title>Demo</title></head>
URL初始化
from pyquery import PyQuery as pq
doc = pq(url='http://www.baidu.com')
# Initialise PyQuery from a URL: with the `url` keyword PyQuery downloads the
# page itself, so this snippet needs network access.
from pyquery import PyQuery as pq
doc = pq(url='http://www.baidu.com')
# Print the <head> element of the downloaded page.
print(doc('head'))
文件初始化
# Initialise PyQuery from a local file: `filename` is the path of the HTML file to parse.
# NOTE(review): demo.html is not shown in this article; it presumably contains <li> elements.
from pyquery import PyQuery as pq
doc = pq(filename='demo.html')
# Select every <li> element in the parsed file and print the selection.
print(doc('li'))

2.基础CSS选择器

from pyquery import PyQuery as pq
doc = pq(html)
html = """
<html><head><title>Demo</title></head>
<body>
 <div>
   <p name="test" class='test test2'>Hello Python
    <span id='span' class='test4'>123</span>
   </p>
   <a href="http://www.baidu.com">
     <span class='test3'>456</span>
   </a>
 </div>
</body>
</html>
"""
# Basic CSS selectors work as in a stylesheet.
from pyquery import PyQuery as pq
doc = pq(html)
# '.test.test2' (no space) = one element carrying both classes;
# ' #span' (after a space) = a descendant of it whose id is "span".
print(doc('.test.test2 #span'))
<span id="span" class="test4">123</span>

查找元素

子元素
items = doc('#div')
html = """
<html><head><title>Demo</title></head>
<body>
 <div id='div'>
   <p name="test" class='test test2'>Hello Python
    <span id='span' class='test4'>123</span>
   </p>
   <a href="http://www.baidu.com">
     <span class='test3'>456</span>
   </a>
 </div>
</body>
</html>
"""
# Look up elements nested inside an already selected element.
from pyquery import PyQuery as pq
doc = pq(html)
# Start from the element whose id is "div" ...
items = doc('#div')
# ... and search inside it for <p> elements with find().
p = items.find('p')
print(p)
父元素
html = """
<html><head><title>Demo</title></head>
<body>
 <div id='div'>
   <p name="test" class='test test2'>Hello Python
    <span id='span' class='test4'>123</span>
   </p>
   <a href="http://www.baidu.com">
     <span class='test3'>456</span>
   </a>
 </div>
</body>
</html>
"""
# Walk upwards from an element to its enclosing elements.
from pyquery import PyQuery as pq
doc = pq(html)
# Start from the <span id='span'> nested inside the first <p>.
items = doc('#span')
# parent() yields the direct parent; parents() yields the chain of ancestors.
container = items.parent()
containers = items.parents()
print(container)
# Visual separator between the two results.
print('-'*50)
print(containers)
兄弟元素
p = doc('.test.test2')
html = """
<html><head><title>Demo</title></head>
<body>
 <div id='div'>
   <p name="test" class='test test2'>Hello Python
    <span id='span' class='test4'>123</span>
   </p>
   <a href="http://www.baidu.com">
     <span class='test3'>456</span>
   </a>
 </div>
</body>
</html>
"""
# Select the elements that share a parent with the current one.
from pyquery import PyQuery as pq
doc = pq(html)
# The <p> that carries both the "test" and "test2" classes.
p = doc('.test.test2')
# siblings() returns the other children of the same parent; in this sample
# HTML the only sibling of the <p> is the <a> element.
print(p.siblings())
<a href="http://www.baidu.com">
     <span class="test3">456</span>
   </a>

遍历

单个元素
	from pyquery import PyQuery as pq
	doc = pq(html)
	html = """
	<html><head><title>Demo</title></head>
	<body>
	 <div id='div'>
	   <p name="test" class='test test2'>Hello Python
	    <span id='span' class='test4'>123</span>
	   </p>
	   <a href="http://www.baidu.com">
	     <span class='test3'>456</span>
	   </a>
	   <p>
	       Hello World!
	   </p>
	 </div>
	</body>
	</html>
	"""
	# Iterate over a multi-element selection one element at a time.
	from pyquery import PyQuery as pq
	doc = pq(html)
	p = doc('p').items()  # items() turns the selection into an iterable of per-element PyQuery objects
	for i in p:  
	    print(i)

3.获取信息

获取属性、文本、HTML
html = """
<html><head><title>Demo</title></head>
<body>
 <div id='div'>
   <p name="test" class='test test2'>Hello Python
    <span id='span' class='test4'>123</span>
   </p>
   <a href="http://www.baidu.com">
     <span class='test3'>456</span>
   </a>
   <p>
       Hello World!
   </p>
 </div>
</body>
</html>
"""
# Read an attribute from the selected element.
from pyquery import PyQuery as pq
doc = pq(html)
# Select by class name "test" (the first <p> in the sample HTML).
p = doc('.test')
print(p.attr('name')) # attribute value; the sample <p> has name="test"

	html = """
	<html><head><title>Demo</title></head>
	<body>
	 <div id='div'>
	   <p name="test" class='test test2'>Hello Python
	    <span id='span' class='test4'>123</span>
	   </p>
	   <a href="http://www.baidu.com">
	     <span class='test3'>456</span>
	   </a>
	   <p>
	       Hello World!
	   </p>
	 </div>
	</body>
	</html>
	"""
	from pyquery import PyQuery as pq
	doc = pq(html) 
	p = doc('.test')  
html = """
<html><head><title>Demo</title></head>
<body>
 <div id='div'>
   <p name="test" class='test test2'>Hello Python
    <span id='span' class='test4'>123</span>
   </p>
   <a href="http://www.baidu.com">
     <span class='test3'>456</span>
   </a>
   <p>
       Hello World!
   </p>
 </div>
</body>
</html>
"""
# Read the text content of the selected element.
from pyquery import PyQuery as pq
doc = pq(html)
p = doc('.test')
print(p.text())  # text content, including the text of the nested <span>
Hello Python 123
from pyquery import PyQuery as pq
doc = pq(html)
p = doc('.test')
html = """
<html><head><title>Demo</title></head>
<body>
 <div id='div'>
   <p name="test" class='test test2'>Hello Python
    <span id='span' class='test4'>123</span>
   </p>
   <a href="http://www.baidu.com">
     <span class='test3'>456</span>
   </a>
   <p>
       Hello World!
   </p>
 </div>
</body>
</html>
"""

# Read the inner markup of the selected element.
from pyquery import PyQuery as pq
doc = pq(html)
p = doc('.test')
print(p.html())   # inner HTML: the element's content with child tags kept, without the <p> tag itself
Hello Python
    <span id="span" class="test4">123</span>

4.DOM操作

addClass removeClass
html = """
<html><head><title>Demo</title></head>
<body>
 <div id='div'>
   <p name="test" class='test test2'>Hello Python
    <span id='span' class='test4'>123</span>
   </p>
   <a href="http://www.baidu.com">
     <span class='test3'>456</span>
   </a>
   <p>
       Hello World!
   </p>
 </div>
</body>
</html>
"""
from pyquery import PyQuery as pq
doc = pq(html)
p = doc('.test')

html = """
<html><head><title>Demo</title></head>
<body>
 <div id='div'>
   <p name="test" class='test test2'>Hello Python
    <span id='span' class='test4'>123</span>
   </p>
   <a href="http://www.baidu.com">
     <span class='test3'>456</span>
   </a>
   <p>
       Hello World!
   </p>
 </div>
</body>
</html>
"""
# Modify the class attribute of a selected element in place.
from pyquery import PyQuery as pq
doc = pq(html)
# The sample <p> starts with class='test test2'.
p = doc('.test')
# Drop the "test2" class, then print to show the changed markup.
p.removeClass('test2')
print(p)
# Add a new "active" class, then print again to show the change.
p.addClass('active')
print(p)
attr、css
html = """
<html><head><title>Demo</title></head>
<body>
 <div id='div'>
   <p name="test" class='test test2'>Hello Python
    <span id='span' class='test4'>123</span>
   </p>
   <a href="http://www.baidu.com">
     <span class='test3'>456</span>
   </a>
   <p>
       Hello World!
   </p>
 </div>
</body>
</html>
"""
# Set attributes and inline styles on a selected element.
from pyquery import PyQuery as pq
doc = pq(html)
p = doc('.test')
# attr() with two arguments sets the attribute (here age="18") instead of reading it.
p.attr('age','18')
print(p)
# css() with two arguments sets a CSS property on the element's style attribute.
p.css('font-size','14px')
print(p)
remove
html = """
<html><head><title>Demo</title></head>
<body>
 <div id='div'>
   <p name="test" class='test test2'>Hello Python
    <span id='span' class='test4'>123</span>
   </p>
   <a href="http://www.baidu.com">
     <span class='test3'>456</span>
   </a>
   <p id="remove">
       Hello World!
   </p>
 </div>
</body>
</html>
"""
from pyquery import PyQuery as pq
doc = pq(html)
div = doc('#div')
html = """
<html><head><title>Demo</title></head>
<body>
 <div id='div'>
   <p name="test" class='test test2'>Hello Python
    <span id='span' class='test4'>123</span>
   </p>
   <a href="http://www.baidu.com">
     <span class='test3'>456</span>
   </a>
   <p id="remove">
       Hello World!
   </p>
 </div>
</body>
</html>
"""
# Remove an element from the document tree.
from pyquery import PyQuery as pq
doc = pq(html)
div = doc('#div')
# Find the <p id="remove"> inside the div and remove it from the tree;
# the print shows whatever remove() returns (the removed selection, per the pyquery docs).
print(div.find('#remove').remove())
# Print the div again to show it no longer contains the removed <p>.
print(div)
伪类选择器
html = """
<html><head><title>Demo</title></head>
<body>
 <div id='div'>
   <p name="test" class='test test2'>Hello Python
    <span id='span' class='test4'>123</span>
   </p>
   <a href="http://www.baidu.com">
     <span class='test3'>456</span>
   </a>
   <p id="remove">
       Hello World!1
   </p>
   <p id="remove">
       Hello World!2
   </p>
 </div>
</body>
</html>
"""
# Structural pseudo-class selectors. The position is counted among ALL children
# of the parent, not only among the <p> elements. In the sample HTML the div's
# children are, in order: <p>, <a>, <p id="remove">, <p id="remove">.
# NOTE(review): the sample HTML reuses id="remove" on two elements.
from pyquery import PyQuery as pq
doc = pq(html)
print(doc('p:first-child'))  # <p> elements that are the first child of their parent
print(doc('p:last-child'))   # <p> elements that are the last child of their parent
print(doc('p:nth-child(2)'))  # <p> elements that are the 2nd child; here the 2nd child is <a>, so nothing matches
print(doc('p:nth-child(2n)')) # <p> elements at even child positions (2nd, 4th, ...)
发布了54 篇原创文章 · 获赞 24 · 访问量 3万+

猜你喜欢

转载自blog.csdn.net/weixin_43388615/article/details/105089899