下载源文件与数据存储
下载源文件 urlretrieve
urlretrieve 多用于下载非结构化的数据(图片等)。下面以下载 http://www.pythonscraping.com 上的 logo.jpg 为例:
'''urlretrieve'''
from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup
# Fetch the home page. The context manager closes the HTTP response as soon
# as BeautifulSoup has consumed it, instead of leaving the connection open.
with urlopen("http://www.pythonscraping.com") as html:
    bsObj = BeautifulSoup(html, "html.parser")

# The logo is the <img> nested inside <a id="logo">; its "src" attribute is
# the URL that gets downloaded.
imageLocation = bsObj.find("a", id="logo").find("img").attrs["src"]

# Save the image as logo.jpg (a relative path, so it lands in the current
# working directory).
urlretrieve(imageLocation, "logo.jpg")
'''显示logo.jpg'''
# Display the downloaded logo.jpg with matplotlib.
import matplotlib.pyplot as plt # plt is used to display the image
import matplotlib.image as mpimg # mpimg is used to read the image
logo = mpimg.imread('logo.jpg') # read logo.jpg via a relative path (resolved against the current working directory)
plt.imshow(logo) # draw the image
plt.axis('off') # hide the axes
plt.show()
下载多个文件到指定路径下
- 根据图片文件在服务器上的存储路径,在本机上建立相同的目录结构(需要引入 os 库)
- 如果不是该服务器上的图形文件,则不下载
# from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup
import os
baseUrl="http://pythonscraping.com"
baseDirectory="downloaded"
def getAbsoluteUrl(baseUrl,source):
    """Normalize an image ``src`` value to an absolute URL on ``baseUrl``.

    Args:
        baseUrl: Site root without a trailing slash, e.g.
            ``"http://pythonscraping.com"``.
        source: The raw ``src`` attribute taken from the page.

    Returns:
        The absolute URL when it contains ``baseUrl``; ``None`` when the
        resource lives elsewhere and should not be downloaded.
    """
    if source.startswith("http://www."):
        # Drop the "www." so the result is comparable with baseUrl.
        absoluteUrl = "http://" + source[len("http://www."):]
    elif source.startswith("http://"):
        absoluteUrl = source
    elif source.startswith("www."):
        # Scheme-less host: strip "www." and add the scheme.
        absoluteUrl = "http://" + source[len("www."):]
    elif source.startswith(("https://", "//")):
        # Already absolute (other scheme or protocol-relative). Leave it
        # untouched so the baseUrl check below decides, instead of gluing it
        # onto baseUrl and producing a bogus same-site URL.
        absoluteUrl = source
    else:
        # Relative path; strip leading slashes to avoid "host//path".
        absoluteUrl = baseUrl + "/" + source.lstrip("/")

    if baseUrl in absoluteUrl:
        return absoluteUrl
    return None
def getDownloadPath(baseUrl,absoluteUrl,baseDirectory):
    """Map a URL on ``baseUrl`` to a local path under ``baseDirectory``.

    The part of ``absoluteUrl`` that follows ``baseUrl`` is appended to
    ``baseDirectory``, and the parent directory of the resulting path is
    created if it does not exist yet.

    Args:
        baseUrl: Site root that is removed from ``absoluteUrl``.
        absoluteUrl: Full URL of the file to be downloaded.
        baseDirectory: Local directory that mirrors the site root.

    Returns:
        The local file path the download should be written to.
    """
    downloadPath = baseDirectory + absoluteUrl.replace(baseUrl, "")
    directory = os.path.dirname(downloadPath)
    # dirname() is "" when the path has no directory component, and
    # os.makedirs("") raises. exist_ok=True also removes the race between
    # checking for the directory and creating it.
    if directory:
        os.makedirs(directory, exist_ok=True)
    return downloadPath
# Download every <img> on the home page that is hosted on baseUrl, mirroring
# the server's directory layout under baseDirectory.
# The context manager closes the HTTP response once the page is parsed.
with urlopen("http://www.pythonscraping.com") as html:
    bsObj = BeautifulSoup(html, "html.parser")

# Only <img> tags that actually carry a src attribute.
imageList = bsObj.findAll("img", src=True)
for image in imageList:
    absoluteUrl = getAbsoluteUrl(baseUrl, image.attrs["src"])
    if absoluteUrl is None:
        # Not hosted on baseUrl: skip it.
        continue
    print(absoluteUrl)
    downloadPath = getDownloadPath(baseUrl, absoluteUrl, baseDirectory)
    print(downloadPath)
    urlretrieve(absoluteUrl, downloadPath)
http://pythonscraping.com/sites/default/files/lrg_0.jpg
downloaded/sites/default/files/lrg_0.jpg
http://pythonscraping.com/img/lrg%20(1).jpg
downloaded/img/lrg%20(1).jpg
将结果写入文件(csv)
适用于结构化的数据(表格形式的)。
import csv
# Write a small table to test.csv. The with-block closes the file even if a
# write fails; newline="" lets the csv module control line endings itself.
with open("test.csv", "w+", newline="", encoding="utf-8-sig") as csvFile:
    writer = csv.writer(csvFile)
    writer.writerow(('number', 'number plus 2', 'number times 2'))
    for i in range(10):
        writer.writerow((i, i + 2, i * 2))

# Read the file back and print each row (every field comes back as a string).
with open("test.csv", "r", newline="", encoding="utf-8-sig") as csvFile:
    reader = csv.reader(csvFile)
    for row in reader:
        print(str(row))
['number', 'number plus 2', 'number times 2']
['0', '2', '0']
['1', '3', '2']
['2', '4', '4']
['3', '5', '6']
['4', '6', '8']
['5', '7', '10']
['6', '8', '12']
['7', '9', '14']
['8', '10', '16']
['9', '11', '18']