整理自北京理工大学嵩天老师的mooc
1 京东商品页面爬取
# Example 1: fetch a JD product page — the basic requests "fetch framework" template.
import requests

url = "https://item.jd.com/7652143.html"
try:
    # timeout keeps the script from hanging forever on a dead connection
    r = requests.get(url, timeout=30)
    r.raise_for_status()  # raises HTTPError when the status code is not 200
    r.encoding = r.apparent_encoding  # use the encoding detected from the body
    print(r.text[:1000])  # print only a manageable slice of the page
except requests.RequestException:
    # catch only requests' own errors instead of a bare except that hides bugs
    print("爬取失败")
这个例子是为了熟悉网页获取框架
2 亚马逊商品信息爬取
# Example 2: fetch an Amazon product page — same template as Example 1,
# plus a custom User-Agent header so the site serves the page to the script.
import requests

header = {"user-agent": "Mozilla/5.0"}  # browser-like User-Agent header
url = "https://www.amazon.cn/dp/B078FFX8B6/ref=cngwdyfloorv2_recs_0/457-6454630-3233915?pf_rd_m=A1AJ19PSB66TGU&pf_rd_s=desktop-2&pf_rd_r=NVMTA9FE1WQ36ZXDJQCH&pf_rd_r=NVMTA9FE1WQ36ZXDJQCH&pf_rd_t=36701&pf_rd_p=538883d8-3c6e-4f6f-89c9-fedbbcdc164f&pf_rd_p=538883d8-3c6e-4f6f-89c9-fedbbcdc164f&pf_rd_i=desktop"
try:
    # timeout keeps the script from hanging forever on a dead connection
    r = requests.get(url, headers=header, timeout=30)
    r.raise_for_status()  # raises HTTPError when the status code is not 200
    r.encoding = r.apparent_encoding  # use the encoding detected from the body
    print(r.text)
except requests.RequestException:
    # catch only requests' own errors instead of a bare except that hides bugs
    print("爬取失败")
这个例子在熟悉网页获取框架的基础上增加了访问的headers
3 百度360搜索关键词提交
这两个实例基于两个搜索接口:
百度:http://www.baidu.com/s?wd=keyword
360:http://www.so.com/s?q=keyword
# Example 3: submit a search keyword to Baidu through the interface
#   http://www.baidu.com/s?wd=keyword
# The original text here was a copy-paste of the Amazon example and never
# submitted a keyword; this version uses the `params` argument as intended.
import requests

keyword = "Python"
try:
    kv = {"wd": keyword}  # requests encodes this onto the URL as ?wd=keyword
    r = requests.get("http://www.baidu.com/s", params=kv, timeout=30)
    print(r.request.url)  # show the URL that was actually requested
    r.raise_for_status()  # raises HTTPError when the status code is not 200
    # a result page is large, so print its length instead of the full body
    print(len(r.text))
except requests.RequestException:
    # catch only requests' own errors instead of a bare except that hides bugs
    print("爬取失败")
这个例子主要学习了 requests 的 params 参数:
>>> payload = {'key1': 'value1', 'key2': 'value2'}
>>> r = requests.get('http://httpbin.org/get', params=payload)
>>> print(r.url)
http://httpbin.org/get?key2=value2&key1=value1
>>> payload = {'key1': 'value1', 'key2': ['value2', 'value3']}
>>> r = requests.get('http://httpbin.org/get', params=payload)
>>> print(r.url)
http://httpbin.org/get?key1=value1&key2=value2&key2=value3
4 网络图片的爬取与存储
# Example 4: download an image from the web and save it to a local file.
import requests
import os

# fixed: the original header was "Mozilla、5.0" (garbled full-width 、 instead
# of "/"), which is not a browser-like User-Agent and defeats its purpose
header = {"user-agent": "Mozilla/5.0"}
url = "https://www.natgeo.com.cn/Files/pic/3486t.jpg"
root = "//home//tomblack//下载的图片//"
path = root + url.split('/')[-1]  # file name = last segment of the URL path
try:
    if not os.path.exists(root):  # create the target directory if missing
        os.makedirs(root)  # makedirs also creates missing parent directories
    if not os.path.exists(path):  # skip the download if the file exists
        r = requests.get(url, headers=header, timeout=30)
        r.raise_for_status()  # fail early instead of saving an error page
        # `with` closes the file automatically; no explicit close() needed
        with open(path, 'wb') as f:
            f.write(r.content)  # r.content holds the raw image bytes
        print("文件保存成功")
    else:
        print("文件已存在")
except (requests.RequestException, OSError):
    # network errors or filesystem errors — narrower than a bare except
    print("爬取失败")