Python web-scraping code:

Typical application 1:

# 1. Use requests to send a request to the Baidu homepage and fetch its data.

# Import the requests library
import requests

# Request URL
url = "http://www.baidu.com"

# Request headers
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"}

# Send a GET request (passing the headers); returns a response object
response = requests.get(url, headers=headers)

# Inspect the response body
print(response.text)
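
Besides .text, the response object carries a few attributes worth checking before parsing; a quick look, continuing the example above:

print(response.status_code)    # 200 means the request succeeded
print(response.encoding)       # encoding requests guessed from the response headers
print(response.headers)        # response headers sent back by the server
print(type(response.content))  # raw body as bytes, useful for binary content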

 

 

# 2. Use requests to query Baidu for "测试数据" (test data) and fetch the results page.

# # Import the requests library
# import requests
# # Request URL and query parameters
# url = "http://www.baidu.com/s"
# param = {"wd": "测试数据"}
# # Request headers
# headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"}
# # Send a GET request with params and headers; returns a response object
# response = requests.get(url, params=param, headers=headers)
# # Inspect the response body
# print(response.text)

 

# 3. Save an image from www.baidu.com to the local disk.

# import requests
# # URL of the image
# url = "https://www.baidu.com/img/bd_logo1.png"
# # The response body is an image, i.e. binary data
# response = requests.get(url)
# # print(response.content)
# # Open the file in binary write mode
# with open('baidu.png', 'wb') as f:
#     # Write response.content (bytes)
#     f.write(response.content)
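
If the image is large, the same download can be streamed instead of held in memory all at once; a minimal sketch of that variant, using the same logo URL as above:

import requests

url = "https://www.baidu.com/img/bd_logo1.png"
# stream=True defers downloading the body until it is iterated
response = requests.get(url, stream=True)
with open('baidu_stream.png', 'wb') as f:
    # Write the body in 8 KB chunks instead of all at once
    for chunk in response.iter_content(chunk_size=8192):
        f.write(chunk)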

 

# 4. Send a request with custom headers via requests.
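
A minimal sketch for item 4; httpbin.org/headers is just an echo service used here for illustration, not part of the original post:

import requests

# httpbin.org echoes back the request headers, handy for verifying them
url = "http://httpbin.org/headers"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Accept-Language": "zh-CN,zh;q=0.9",
}
response = requests.get(url, headers=headers)
print(response.text)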

Typical application 2:

from lxml import etree

# 1. Basic use of the lxml library
print("1. Basic use of the lxml library")
text = '''<div>
    <ul>
         <li class="item-0"><a href="link1.html">first item</a></li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0"><a href="link5.html">a attribute</a></li>
     </ul>
 </div>'''

# (1) etree.fromstring() parses text into nodes
print("(1) etree.fromstring() parses text into nodes")
print("etree.fromstring() parses the text into an Element object:")
fromstr = etree.fromstring(text)
print("After parsing into an Element object:", fromstr)
print("Serializing the Element object back to text:")
resultfromstr = etree.tostring(fromstr, encoding='utf-8')
print("After serializing back to text:", resultfromstr)

# (2) etree.HTML() parses text into nodes
print("(2) etree.HTML() parses text into nodes")
print("etree.HTML() parses the text into an Element object:")
html = etree.HTML(text)
print("After parsing into an Element object:", html)
resulthtml = etree.tostring(html, encoding='utf-8')
print("After serializing back to text:", resulthtml)

 

# 2. The lxml find(), findall(), and iterfind() methods
print("2. Using the lxml find(), findall(), and iterfind() methods")
print("find the html node:", html.find("."))
print("find the body node:", html.find("./body"))
print("find the body/div node:", html.find("./body/div"))
print("find the body/div/ul node:", html.find("./body/div/ul"))
print("find the body/div/ul/li node:", html.find("./body/div/ul/li"))
print("find the body/div/ul/li/a node:", html.find("./body/div/ul/li/a"))
print("findall on body/div/ul returns a list:", html.findall("./body/div/ul"))
print("Using the iterator query:")
liList = html.iterfind("./body/div/ul/li")
print("Iterator query output:", end=" ")
for li in liList:
    print(li.xpath("./a/text()")[0], end="  ")
print("\n")

# 3. XPath usage
print("3. XPath usage")
print("(1) Selecting nodes with XPath")
print("Select the html node:", html.xpath("."))
print("Select the body node:", html.xpath("./body"))
print("Select the body/div node:", html.xpath("./body/div"))
print("Select the body/div/ul/li nodes:", html.xpath("./body/div/ul/li"))
print("'//' selects div nodes regardless of position:", html.xpath("//div"))
print("'..' selects the parent of li nodes:", html.xpath("//li/.."))

print("(2) Selecting attributes with @")
print("'@attribute' selects the //a/@href attributes:", html.xpath("//a/@href"))

print("(3) Selecting with predicates")
print("'[@attribute=value]' selects //li[@class='item-0']:", html.xpath("//li[@class='item-0']"))
print("(4) Selecting unknown nodes")
print("'ul/*' selects all child elements of ul:", html.xpath("//ul/*"))
print("All li elements that carry any attribute:", html.xpath("//li[@*]"))
print("All nodes under the root:", html.xpath("//node()"))
print("(5) Selecting several paths at once")
print("'|' selects several paths:", html.xpath("//li[@class='item-0']|//li[@class='item-1']"))

 

Scraping 163 (NetEase) breaking news

import requests
from lxml import etree

url = "https://news.163.com/domestic/"
response = requests.get(url)
response.encoding = "gb2312"
# txt = response.text
html = etree.HTML(response.text)
liList = html.xpath("//div[@class='today_news']/ul/li")
print("163 today's picks ------------")
for li in liList:
    print(li.xpath("./a/text()")[0], "\n")
    print(li.xpath("./a/@href")[0], "\n")
print("163 breaking news ------------")
liList2 = html.xpath("//div[@class='mt23 mod_jsxw']/ul/li")
for li in liList2:
    print(li.xpath("./a/text()")[0], "\n")
    print(li.xpath("./a/@href")[0], "\n")
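
Hard-coding gb2312 is fragile if the site changes its charset; requests can instead guess the encoding from the body itself, which is usually safer. A small sketch of that alternative:

import requests

response = requests.get("https://news.163.com/domestic/")
# apparent_encoding is detected from the body bytes via charset detection,
# typically more reliable than the header-derived default
response.encoding = response.apparent_encoding
print(response.encoding)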

# 4. Scrape the news list under the "College News" section of the
#    Guangzhou City Construction College website.
print("4. Scraping the news list from the College News section of gzccc.edu.cn")
import requests
from lxml import etree

response = requests.get("http://www.gzccc.edu.cn/xwzx/cjyw.htm")
response.encoding = "utf-8"
html = etree.HTML(response.text)
newList = html.xpath("//a[@class='c18915']")
# print(newList)
for li in newList:
    title = li.xpath("./text()")[0]  # xpath returns a list; [0] takes the first element
    href = li.xpath("./@href")[0]
    time = li.xpath("../../td[3]/span/text()")[0]
    with open("gzccc.txt", 'a', encoding="utf-8") as f:
        # Append the title, link, and date as one line
        f.write(title + href + time + "\n")
    print(title, href, time)
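
The href values scraped this way are typically relative; urllib.parse.urljoin resolves them against the page URL so the saved links are clickable. A minimal sketch (the "../info/1234.htm" link is a hypothetical example, not taken from the site):

from urllib.parse import urljoin

page_url = "http://www.gzccc.edu.cn/xwzx/cjyw.htm"
# A relative link such as "../info/1234.htm" becomes an absolute URL
absolute = urljoin(page_url, "../info/1234.htm")
print(absolute)  # http://www.gzccc.edu.cn/info/1234.htm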

 

Typical application 3:

import requests
from lxml import etree

def get_data(url):
    resp = requests.get(url)
    resp.encoding = "utf-8"
    return etree.HTML(resp.text)

def printContent(pagCnt, content):
    num = 1
    li_list = content.xpath("//div[@class='artic_t_1 ny_news_lb']/ul/li")
    for li in li_list:
        title = li.xpath("./a/text()")
        href = li.xpath("./a/@href")
        time = li.xpath("./span/text()")
        print(pagCnt * 20 + num, title, time, href)
        with open("1.txt", "a", encoding="utf-8") as f:
            f.write(str(pagCnt * 20 + num) + str(title) + str(time) + str(href) + "\n")
        num = num + 1

 

pagCnt = 0
str_url = "http://www.hnjmxy.cn/xwdt/xyxw.htm"
content = get_data(str_url)
printContent(pagCnt, content)  # print the first page before paginating
while True:
    nextpage = content.xpath("//a[@class='Next']")
    pagCnt = pagCnt + 1
    print("-------- this is nextpage --", nextpage)
    if len(nextpage) != 0:
        href = nextpage[0].xpath("./@href")[0]
        text = nextpage[0].xpath("./text()")[0]
        # print(href)
        # print(text)  # the "next page" link text
        if str(href).find("/") > 0:
            str_url = "http://www.hnjmxy.cn/xwdt/" + href    # href looks like xyxw/2.htm
        else:
            str_url = "http://www.hnjmxy.cn/xwdt/xyxw/" + href   # href looks like 2.htm
        print(str_url)
        content = get_data(str_url)
        printContent(pagCnt, content)
    else:
        break
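
The branch on whether href contains a "/" can be replaced by urljoin, which resolves either form against the URL of the page the link was found on (assuming the site's relative links follow this pattern):

from urllib.parse import urljoin

# Both forms resolve correctly against the current page's URL
print(urljoin("http://www.hnjmxy.cn/xwdt/xyxw.htm", "xyxw/2.htm"))
# -> http://www.hnjmxy.cn/xwdt/xyxw/2.htm
print(urljoin("http://www.hnjmxy.cn/xwdt/xyxw/2.htm", "3.htm"))
# -> http://www.hnjmxy.cn/xwdt/xyxw/3.htm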

 

#-------------------------------------------------------------

# Scrape Guangzhou City Construction College news

import requests
from lxml import etree

def get_data(url):
    resp = requests.get(url)
    resp.encoding = "utf-8"
    return etree.HTML(resp.text)

def printContent(pagCnt, content):
    num = 1
    li_list = content.xpath("//table[@class='winstyle18915']/tr")
    for li in li_list:
        title = li.xpath("./td[2]/a/text()")
        href = li.xpath("./td[2]/a/@href")
        time = li.xpath("./td[3]/span/text()")
        print(pagCnt * 12 + num, title, time, href)
        num = num + 1
        # f = open("e:\\shiye\\gzccc.txt", "a", encoding="utf-8")
        # f.write(str(pagCnt*12+num) + str(title[0]) + str(time[0]) + str(href[0]) + "\n")

pagCnt = 0
str_url = "http://www.gzccc.edu.cn/xwzx.htm"
content = get_data(str_url)
printContent(pagCnt, content)  # print the first page before paginating

while True:
    nextpage = content.xpath("//a[@class='Next']")
    pagCnt = pagCnt + 1
    print("-------- this is nextpage --", nextpage)
    if len(nextpage) != 0:
        href = nextpage[0].xpath("./@href")[0]
        text = nextpage[0].xpath("./text()")[0]
        if str(href).find("/") > 0:
            str_url = "http://www.gzccc.edu.cn/" + href  # href looks like xyxw/2.htm
        else:
            str_url = "http://www.gzccc.edu.cn/xwzx/" + href  # href looks like 2.htm
        print(str_url)
        content = get_data(str_url)
        printContent(pagCnt, content)
    else:
        break

 


Reposted from www.cnblogs.com/soft2408/p/10962386.html