Python爬虫学习之下载源文件与数据存储

下载源文件与数据存储

下载源文件 urlretrieve

多用于非结构化的数据(图片等),以下载http://www.pythonscraping.com 上的logo.jpg为例:

'''urlretrieve'''
from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup
# Fetch the home page. The `with` block closes the HTTP response as soon as
# BeautifulSoup has consumed it, instead of leaving the connection open.
with urlopen("http://www.pythonscraping.com") as html:
    bsObj = BeautifulSoup(html, "html.parser")
# The logo is the <img> nested inside the <a id="logo"> element; its src
# attribute is passed to urlretrieve as the download URL.
imageLocation = bsObj.find("a", id="logo").find("img").attrs["src"]
# Save the image as logo.jpg (relative path, i.e. the current working directory).
urlretrieve(imageLocation, "logo.jpg")
'''显示logo.jpg'''
# Display the downloaded logo.jpg with matplotlib.
import matplotlib.pyplot as plt # plt is used to display the image
import matplotlib.image as mpimg # mpimg is used to read the image

logo = mpimg.imread('logo.jpg') # read logo.jpg (relative path; expected in the same directory as the code)
plt.imshow(logo) # draw the image
plt.axis('off') # hide the axes
plt.show()

(原文此处为一张图片:上述代码运行后显示的 logo.jpg)

下载多个文件到指定路径下

  • 根据图片文件在服务器上的存储路径,在本机上建立相同的目录结构(需要导入 os 库)
  • 如果图片文件不在该服务器(baseUrl)上,则不下载
# from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup
import os
# Site root: only images whose absolute URL contains this prefix are downloaded.
baseUrl = "http://pythonscraping.com"
# Local root folder under which the server's directory layout is mirrored.
baseDirectory = "downloaded"
def getAbsoluteUrl(baseUrl,source):
    """Turn an image ``src`` value into an absolute URL under ``baseUrl``.

    Args:
        baseUrl: Site root without "www." and without a trailing slash,
            e.g. "http://pythonscraping.com".
        source: The raw ``src`` attribute value; may be absolute, start
            with "www.", or be relative to the site root.

    Returns:
        The normalised absolute URL if it contains ``baseUrl``; otherwise
        ``None`` (the resource lives on another server).
    """
    if source.startswith("http://www."):
        # Drop the "www." so the result can be matched against baseUrl.
        absoluteUrl = "http://" + source[len("http://www."):]
    elif source.startswith("http://"):
        absoluteUrl = source
    elif source.startswith("www."):
        # Fixed prefix typo ("wwww."): scheme-less www URLs used to fall
        # through to the relative branch and were glued onto baseUrl.
        absoluteUrl = "http://" + source[len("www."):]
    else:
        # Anything else is treated as a path relative to the site root.
        absoluteUrl = baseUrl + "/" + source
    if baseUrl in absoluteUrl:
        return absoluteUrl
    return None

def getDownloadPath(baseUrl,absoluteUrl,baseDirectory):
    """Map a remote URL to a local file path and create its parent folder.

    The ``baseUrl`` prefix is removed from ``absoluteUrl`` and the remainder
    is appended to ``baseDirectory``, so the local tree mirrors the server's.

    Args:
        baseUrl: Site root to strip from the URL.
        absoluteUrl: Absolute URL of the file to download.
        baseDirectory: Local root folder for downloads.

    Returns:
        The local path (a string) the file should be saved to.
    """
    downloadPath = baseDirectory + absoluteUrl.replace(baseUrl, "")
    directory = os.path.dirname(downloadPath)
    # dirname is "" when the path has no folder component; os.makedirs("")
    # would raise, so only create a directory when there is one to create.
    if directory:
        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs(directory, exist_ok=True)
    return downloadPath

# urlretrieve is called below, but its import is commented out in this
# snippet's import list; import it here so the snippet runs on its own.
from urllib.request import urlretrieve

# Fetch the home page; the `with` block closes the HTTP response once parsed.
with urlopen("http://www.pythonscraping.com") as html:
    bsObj = BeautifulSoup(html, "html.parser")
# Every <img> tag that actually has a src attribute
# (find_all is the current name of the deprecated findAll alias).
imageList = bsObj.find_all("img", src=True)
for image in imageList:
    absoluteUrl = getAbsoluteUrl(baseUrl, image.attrs["src"])
    # getAbsoluteUrl yields None for images hosted on another server: skip them.
    if absoluteUrl is None:
        continue
    print(absoluteUrl)
    # Mirror the server path under baseDirectory, creating folders as needed.
    downloadPath = getDownloadPath(baseUrl, absoluteUrl, baseDirectory)
    print(downloadPath)
    urlretrieve(absoluteUrl, downloadPath)
http://pythonscraping.com/sites/default/files/lrg_0.jpg
downloaded/sites/default/files/lrg_0.jpg
http://pythonscraping.com/img/lrg%20(1).jpg
downloaded/img/lrg%20(1).jpg

将结果写入文件(csv)

适用于结构化的数据(表格形式的)。

import csv
# Write a small table to test.csv. `with` guarantees the file is closed even
# if a write fails. newline="" leaves line-ending handling to the csv module;
# the "utf-8-sig" codec writes a UTF-8 BOM at the start of the file.
with open("test.csv", "w+", newline="", encoding="utf-8-sig") as csvFile:
    writer = csv.writer(csvFile)
    writer.writerow(('number', 'number plus 2', 'number times 2'))
    writer.writerows((i, i + 2, i * 2) for i in range(10))

# Read the file back and print each row (csv.reader yields lists of strings).
# "utf-8-sig" skips the BOM on reading; newline="" is what the csv docs
# recommend for files handed to csv.reader.
with open("test.csv", "r", newline="", encoding="utf-8-sig") as csvFile:
    reader = csv.reader(csvFile)
    for row in reader:
        print(str(row))
['number', 'number plus 2', 'number times 2']
['0', '2', '0']
['1', '3', '2']
['2', '4', '4']
['3', '5', '6']
['4', '6', '8']
['5', '7', '10']
['6', '8', '12']
['7', '9', '14']
['8', '10', '16']
['9', '11', '18']
发布了4 篇原创文章 · 获赞 4 · 访问量 223

猜你喜欢

转载自blog.csdn.net/m0_37544963/article/details/103132240