# 典型应用一: (Typical application 1 — basic usage of requests)
# 1. Send a GET request to the Baidu home page with requests and print the response.
# Import the requests library
import requests
# Request URL
url = "http://www.baidu.com"
# Request headers: a browser User-Agent so the server treats the request as a normal browser
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"
}
# Send the GET request and get a Response object.
# Fix: actually pass `headers` — the original defined the dict but never used it,
# even though section 4 below is about "requests with headers".
response = requests.get(url, headers=headers)
# Print the body of the response
print(response.text)
# 2. Query Baidu for "测试数据" and fetch the result page.
# # Import the requests library
# import requests
# # Request URL and query parameters
# url = "http://www.baidu.com/s"
# param = { "wd":"测试数据"}
# # Request headers
# headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"
# }
# # Send the GET request and get a Response object
# response = requests.get(url, params=param, headers=headers)
# # Print the body of the response
# print(response.text)
# 3. Save the www.baidu.com logo image to a local file.
# import requests
# # URL of the image
# url="https://www.baidu.com/img/bd_logo1.png"
# # The response body is an image, i.e. binary data
# response=requests.get(url)
# # print(response.content)
# # Open the file in binary write mode
# with open('baidu.png','wb') as f:
#     # Write response.content (bytes)
#     f.write(response.content)
# 4. Sending a request with custom headers via requests
# 典型应用二: (Typical application 2 — lxml / XPath usage)
from lxml import etree
# 1. Basic usage of the lxml etree library
print("1.xml库使用")
text= '''<div>
<ul>
<li class="item-0"><a href="link1.html">第一个</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0"><a href="link5.html">a属性</a></li>
</ul>
</div>'''
# (1) etree.fromstring() parses text into an Element object.
# Fix: the original messages said "formstring" — the real API name is fromstring().
print("(1)fromstring()函数读取文本解析节点")
print("fromstring()函数将文本解析成为Element对象:")
fromstr=etree.fromstring(text)
print("fromstring()函数将文本解析成为Element对象后:",fromstr)
print("将Element对象解析成为文本:")
# tostring() serializes the Element back to bytes
resultfromstr=etree.tostring(fromstr,encoding='utf-8')
print("将Element对象解析成为文本后:",resultfromstr)
# (2) etree.HTML() parses text as HTML (adding html/body wrapper elements)
print("(2)etree.HTML()函数读取文本解析节点")
print("etree.HTML()函数将文本解析成为Element对象:")
html=etree.HTML(text)
print("etree.HTML()函数将文本解析成为Element对象后:",html)
resulthtml=etree.tostring(html,encoding='utf-8')
print("将Element对象解析成为文本后:",resulthtml)
# 2. The find(), findall() and iterfind() methods.
# Fix: the original message said "interfind" — the real API name is iterfind().
print(" 2.lxml库find()方法,findall()方法,iterfind()方法使用")
print("find查找html节点:",html.find("."))
print("find查找body节点:",html.find("./body"))
print("find查找body/div节点:",html.find("./body/div"))
print("find查找body/div/ul节点:",html.find("./body/div/ul"))
print("find查找body/div/ul/li节点:",html.find("./body/div/ul/li"))
print("find查找body/div/ul/li/a节点:",html.find("./body/div/ul/li/a"))
print("findall查找body/div/ul节点结果是一个列表:",html.findall("./body/div/ul"))
print("迭代器查询的使用:")
# iterfind() returns a lazy iterator over the matching elements
liList=html.iterfind("./body/div/ul/li")
print("迭代器查询后输出:",end=" ")
for li in liList:
    print(li.xpath("./a/text()")[0],end=" ")
print("\n")
# 3. XPath usage
print("3.xpath用法")
print("(1).xpath用法选取节点")
print("xpath用法选取html节点:",html.xpath("."))
print("xpath用法选取body节点:",html.xpath("./body"))
print("xpath用法选取body/div节点:",html.xpath("./body/div"))
print("xpath用法选取body/div/ul/li节点:",html.xpath("./body/div/ul/li"))
print("xpath用法'//'不考虑位置选取/div节点:",html.xpath("//div"))
print("xpath用法'..'选取li 父节点:",html.xpath("//li/.."))
print("(2).xpath用法选取@属性")
print("xpath用法'@属性'选取//a/@href 属性:",html.xpath("//a/@href"))
print("(3).xpath用法选取谓语")
print("xpath用法'@属性=值'选取//li[@class='item-0']选谓语 :",html.xpath("//li[@class='item-0']"))
print("(4).xpath用法选取未知节点")
print("xpath用法'ul/*'选取ul元素下所有元素 :",html.xpath("//ul/*"))
print("xpath用法所有li带属性的元素 :",html.xpath("//li[@*]"))
print("xpath用法根元素下所有节点 :",html.xpath("//node()"))
print("(5).xpath用法选取若干路径")
print("xpath用法'|'选取若干路径 :",html.xpath("//li[@class='item-0']|//li[@class='item-1']"))
# 抓取163即时新闻 (Scrape 163.com instant news)
# Scrape "today's picks" and "instant news" from the 163.com domestic-news page.
import requests
from lxml import etree
url="https://news.163.com/domestic/"
response=requests.get(url)
# NOTE(review): forces gb2312 decoding — confirm the page is not actually UTF-8
response.encoding="gb2312"
html=etree.HTML(response.text)
# Today's recommendations
liList=html.xpath("//div[@class='today_news']/ul/li")
print("163今日推荐------------")
for li in liList:
    print( li.xpath("./a/text()")[0],"\n")
    print( li.xpath("./a/@href")[0],"\n")
print("163即时新闻------------")
liList2=html.xpath("//div[@class='mt23 mod_jsxw']/ul/li")
# Fix: iterate liList2 (the instant-news items) — the original looped over
# liList again and re-printed today's recommendations.
for li in liList2:
    print( li.xpath("./a/text()")[0],"\n")
    print( li.xpath("./a/@href")[0],"\n")
# 4. Scrape the news list of the "学院新闻" column from the Guangzhou City
#    Construction College site and append each item to gzccc.txt.
# Fix: the printed text contained a garbled character 亠州 for 广州.
print("4.抓取广州城建职业学院网站上学院新闻栏目下的新闻列表")
import requests
from lxml import etree
response =requests.get("http://www.gzccc.edu.cn/xwzx/cjyw.htm")
response.encoding="utf-8"
html=etree.HTML(response.text)
newList=html.xpath("//a[@class='c18915']")
# Open the output file once for the whole run — the original reopened it
# in append mode for every single news item.
with open("gzccc.txt",'a',encoding="utf-8") as f:
    for li in newList:
        # xpath() returns a list; [0] takes the first match
        title=li.xpath("./text()")[0]
        href=li.xpath("./@href")[0]
        # publication date sits in the 3rd cell of the same table row
        pub_time=li.xpath("../../td[3]/span/text()")[0]
        # Write one text line per item (title + link + date)
        f.write(title+href+pub_time+"\n")
        print(title,href,pub_time)
# 典型应用三: (Typical application 3 — paginated news scraping)
import requests
from lxml import etree
def get_data(url):
    """Fetch *url*, decode the body as UTF-8 and return the parsed lxml HTML tree."""
    page = requests.get(url)
    page.encoding = "utf-8"
    tree = etree.HTML(page.text)
    return tree
def printContent(pagCnt,content):
    """Print every news item of one list page and append it to 1.txt.

    pagCnt: zero-based page index (20 items per page) used to number items.
    content: parsed lxml HTML tree of the list page.
    """
    num=1
    li_list=content.xpath("//div[@class='artic_t_1 ny_news_lb']/ul/li")
    # Fix: open the file once per call and close it via `with` — the original
    # opened a new handle inside the loop and never closed any of them.
    with open("1.txt","a",encoding="utf-8") as f:
        for li in li_list:
            title=li.xpath("./a/text()")
            href=li.xpath("./a/@href")
            time=li.xpath("./span/text()")
            seq = pagCnt*20 + num
            print( seq,title,time,href)
            # Fix: the original incremented num before writing, so the number
            # stored in the file was one ahead of the printed one.
            f.write(str(seq)+ str(title)+ str(time)+ str(href)+"\n")
            num=num+1
pagCnt = 0
str_url= "http://www.hnjmxy.cn/xwdt/xyxw.htm"
content= get_data(str_url)
# Fix: print the first page too — the original only ever printed pages
# reached via the "Next" link, silently skipping page 1.
printContent(pagCnt,content)
# Follow the "Next" link page by page until it disappears.
while True:
    nextpage=content.xpath("//a[@class='Next']")
    pagCnt=pagCnt+1
    print("--------这是nextpage--",nextpage)
    if len(nextpage) == 0:
        break
    href=nextpage[0].xpath("./@href")[0]
    text=nextpage[0].xpath("./text()")[0]  # link caption ("next page" text)
    # Build the absolute URL: href is either "xyxw/2.htm" or a bare "2.htm"
    if str(href).find("/") > 0:
        str_url="http://www.hnjmxy.cn/xwdt/"+href
    else:
        str_url="http://www.hnjmxy.cn/xwdt/xyxw/"+href
    print(str_url)
    content= get_data(str_url)
    printContent(pagCnt,content)
#-------------------------------------------------------------
#爬取广州城建职业学院新闻
import requests
from lxml import etree
def get_data(url):
    """Download *url* and return its body parsed as an lxml HTML tree."""
    response = requests.get(url)
    response.encoding = "utf-8"
    html_text = response.text
    return etree.HTML(html_text)
# NOTE(review): this url variable is never read below — the loop uses str_url
url="http://www.gzccc.edu.cn/xwzx.htm"
def printContent(pagCnt,content):
    """Print each news row (title, date, link) of one list page.

    pagCnt: zero-based page index (12 rows per page) used to number rows.
    content: parsed lxml HTML tree of the list page.
    """
    rows = content.xpath("//table[@class='winstyle18915']/tr")
    for offset, row in enumerate(rows, start=1):
        title = row.xpath("./td[2]/a/text()")
        href = row.xpath("./td[2]/a/@href")
        stamp = row.xpath("./td[3]/span/text()")
        print(pagCnt * 12 + offset, title, stamp, href)
pagCnt=0
str_url="http://www.gzccc.edu.cn/xwzx.htm"
content=get_data(str_url)
# Fix: print the first page too — the original only ever printed pages
# reached via the "Next" link, silently skipping page 1.
printContent(pagCnt, content)
# Follow the "Next" link page by page until it disappears.
while True:
    nextpage=content.xpath("//a[@class='Next']")
    pagCnt=pagCnt+1
    print("--------这是nextpage--", nextpage)
    if len(nextpage) == 0:
        break
    href=nextpage[0].xpath("./@href")[0]
    text=nextpage[0].xpath("./text()")[0]  # link caption ("next page" text)
    # Build the absolute URL: href is either "xwzx/2.htm" or a bare "2.htm"
    if str(href).find("/") > 0:
        str_url = "http://www.gzccc.edu.cn/" + href
    else:
        str_url = "http://www.gzccc.edu.cn/xwzx/" + href
    print(str_url)
    content = get_data(str_url)
    printContent(pagCnt, content)