Python | 基于WebHDFS REST API操作HDFS

记录下基于WebHDFS REST API操作HDFS的基本功能,具体更多请参照官网介绍:

http://hadoop.apache.org/docs/r3.2.1/hadoop-project-dist/hadoop-hdfs/WebHDFS.html

# 获取客户端连接
client = Client(url='http://192.168.0.1:50070', root=None, proxy=None, timeout=None, session=None)
# 或者使用 InsecureClient:基于 InsecureClient 时可以指定登录用户;而 Client() 中指定 proxy 参数会抛出异常,原因待查
client = InsecureClient("http://192.168.0.1:50070", user='hadoop')

# 创建目录
client.makedirs(hdfs_path)
 
# 删除hdfs文件
client.delete(hdfs_path)
 
# 上传文件到hdfs
client.upload(hdfs_path, local_path, cleanup=True)
 
# 从hdfs获取文件到本地
client.download(hdfs_path, local_path, overwrite=False)
 
# 追加数据到hdfs文件
client.write(hdfs_path, data, overwrite=False, append=True, encoding='utf-8')
 
# 覆盖数据写到hdfs文件
client.write(hdfs_path, data, overwrite=True, append=False, encoding='utf-8')
 
# 移动或者修改文件
client.rename(hdfs_src_path, hdfs_dst_path)
 
# 列举指定目录下的文件
client.list(hdfs_path, status=False)

网上也有大佬将常见的基本方法中的参数做了些详细介绍,右转地址:

https://blog.csdn.net/gamer_gyt/article/details/52446757

记录下工作中基本实例:

#!/usr/bin/python
# -*- coding: UTF-8 -*-
# NOTE(review): the shebang must be the FIRST line of the file to take effect;
# the original had the coding cookie first, which silently disabled it.
"""Upload local SDK log directories for a date range to HDFS via the WebHDFS REST API."""

#import codecs
import os
import shutil
import json
import sys
import datetime
import logging
from logging import handlers

from hdfs.client import Client
from hdfs import InsecureClient

# --- Logging setup: root logger at INFO, one daily-rotating file handler. ---
logger = logging.getLogger()
logger.setLevel(logging.INFO)

logFile = './sdkup.log'
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# NOTE(review): the original also created a plain FileHandler on the same file
# but never attached it to the logger — dead code that leaked an open file
# handle. Only the rotating handler is kept.
timedRotatingFileHandler = handlers.TimedRotatingFileHandler(filename=logFile, when='D')
timedRotatingFileHandler.setLevel(logging.INFO)
timedRotatingFileHandler.setFormatter(formatter)
logger.addHandler(timedRotatingFileHandler)

# Command-line arguments: inclusive date range, format yyyy-MM-dd.
beginDate = sys.argv[1]
endDate = sys.argv[2]

rootDir = '/datalog/t/python_test_webhdfs/'  # HDFS destination root
localDir = '/data3/hdfs/sdklog/'             # local source root

logger.info('Note the date format : yyyy-MM-dd')

# Client() with root/proxy raised an exception in testing; InsecureClient lets
# us set the login user directly.
#client = Client("http://192.168.0.1:50070", root='/tables/', proxy='supergroup')
client = InsecureClient("http://192.168.0.1:50070", user='berg')

def dateRangeList(beginDate, endDate):
    """Return every date from beginDate to endDate, inclusive, as 'YYYY-MM-DD' strings.

    Args:
        beginDate: start date string in '%Y-%m-%d' format.
        endDate: end date string in '%Y-%m-%d' format.

    Returns:
        List of date strings; empty if beginDate is after endDate.

    Raises:
        ValueError: if either argument does not match '%Y-%m-%d'.
    """
    # NOTE(review): the original body mixed tabs and spaces inside the while
    # loop, which is a TabError under Python 3 — re-indented consistently.
    dateList = []
    begin = datetime.datetime.strptime(beginDate, "%Y-%m-%d")
    end = datetime.datetime.strptime(endDate, "%Y-%m-%d")
    while begin <= end:
        dateList.append(begin.strftime("%Y-%m-%d"))
        begin += datetime.timedelta(days=1)
    return dateList
	
# For each date/city, create the matching HDFS directory and upload the local files.
def uploadFileToHdfs():
    """Mirror local per-city/per-day log directories into HDFS for [beginDate, endDate].

    For each date in the range and each city code, builds matching local and HDFS
    paths of the form <root>/<city>/<year>/<month>/<day>/. If the local path
    exists, the HDFS directory is created and the local directory is uploaded
    with overwrite; otherwise the miss is logged and the path is skipped.

    Relies on module-level names: ``client``, ``logger``, ``rootDir``,
    ``localDir``, ``beginDate``, ``endDate`` and ``dateRangeList``.
    """
    # NOTE(review): the original client.upload line had broken mixed-space
    # indentation (an IndentationError); also fixed the 'Taget' -> 'Target'
    # typo in the variable name and its log line, and hoisted the city list
    # (the original rebuilt it — misnamed 'cityTuple' — every iteration).
    cityList = ['gz', 'sz', 'wh', 'km', 'qd']
    for date in dateRangeList(beginDate, endDate):
        year, month, day = date.split('-')
        for city in cityList:
            hdfsTargetFilePath = '{}{}/{}/{}/{}/'.format(rootDir, city, year, month, day)
            localTargetFilePath = '{}{}/{}/{}/{}/'.format(localDir, city, year, month, day)
            logger.info('localTargetFilePath:{}'.format(localTargetFilePath))
            if os.path.exists(localTargetFilePath):
                logger.info('The Local Target File Is Exists , Start Make HDFS Target File Path And Upload File !')
                client.makedirs(hdfsTargetFilePath)
                client.upload(hdfsTargetFilePath, localTargetFilePath, overwrite=True)
                logger.info('Execute Ok!')
            else:
                logger.info('The Local Target File Not-Exists !')
				
# Script entry point: only run the upload when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    uploadFileToHdfs()
发布了44 篇原创文章 · 获赞 11 · 访问量 5438

猜你喜欢

转载自blog.csdn.net/Sampson_Hugo/article/details/103630259