Python | 基於WebHDFS REST API操作HDFS

記錄下基於WebHDFS REST API操作HDFS的基本功能,具體更多請參照官網介紹:

http://hadoop.apache.org/docs/r3.2.1/hadoop-project-dist/hadoop-hdfs/WebHDFS.html

# 獲取客戶端連接
client = Client(url='http://192.168.0.1:50070', root=None, proxy=None, timeout=None, session=None)
# 或者使用InsecureClient,基於InsecureClient時可以指定登錄用戶,而Client()中的proxy會報異常,還沒整明白
client = InsecureClient("http://192.168.0.1:50070", user='hadoop')

# 創建目錄
client.makedirs(hdfs_path)
 
# 刪除hdfs文件
client.delete(hdfs_path)
 
# 上傳文件到hdfs
client.upload(hdfs_path, local_path, cleanup=True)
 
# 從hdfs獲取文件到本地
client.download(hdfs_path, local_path, overwrite=False)
 
# 追加數據到hdfs文件
client.write(hdfs_path, data, overwrite=False, append=True, encoding='utf-8')
 
# 覆蓋數據寫到hdfs文件
client.write(hdfs_path, data, overwrite=True, append=False, encoding='utf-8')
 
# 移動或者修改文件
client.rename(hdfs_src_path, hdfs_dst_path)
 
# 列舉指定目錄下的文件
client.list(hdfs_path, status=False)

網上也有大佬將常見的基本方法中的參數做了些詳細介紹,右轉地址:

https://blog.csdn.net/gamer_gyt/article/details/52446757

記錄下工作中基本實例:

#!/usr/bin/python
# -*- coding: UTF-8 -*-

#import codecs
import os
import shutil
import json 
import sys 
import datetime
from hdfs.client import Client 
from hdfs import InsecureClient

import logging 
from logging import handlers

# --- Logging setup ---------------------------------------------------------
# Root logger writes INFO+ records to ./sdkup.log, rotated daily.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

logFile = './sdkup.log'

formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Daily-rotating handler ('when="D"' rolls the file once per day).
# The original also built a plain FileHandler on the same file but never
# attached it to the logger: it only kept a needless open handle on the
# log file (and would have double-logged every record had it been added),
# so it has been removed.
timedRotatingFileHandler = handlers.TimedRotatingFileHandler(filename=logFile, when='D')
timedRotatingFileHandler.setLevel(logging.INFO)
timedRotatingFileHandler.setFormatter(formatter)
logger.addHandler(timedRotatingFileHandler)

# --- Job parameters --------------------------------------------------------
# Expected CLI usage: script.py <beginDate> <endDate>, both as yyyy-MM-dd.
beginDate = sys.argv[1]
endDate = sys.argv[2]

rootDir = '/datalog/t/python_test_webhdfs/'  # HDFS destination root
localDir = '/data3/hdfs/sdklog/'             # local source root

logger.info('Note the date format : yyyy-MM-dd')

# WebHDFS client; InsecureClient lets us log in as a named HDFS user,
# whereas Client(..., proxy=...) raised an exception per the author's note.
#client = Client("http://192.168.0.1:50070", root='/tables/', proxy='supergroup')
client = InsecureClient("http://192.168.0.1:50070", user='berg')

# 獲取指定日期範圍類別
# 獲取指定日期範圍類別 -> list every date in an inclusive range.
def dateRangeList(beginDate, endDate):
    """Return every date from beginDate to endDate, inclusive.

    Args:
        beginDate: start date as a 'yyyy-MM-dd' string.
        endDate: end date as a 'yyyy-MM-dd' string.

    Returns:
        list[str]: consecutive dates in 'yyyy-MM-dd' format; empty when
        beginDate is after endDate.

    Raises:
        ValueError: if either argument is not in 'yyyy-MM-dd' format.
    """
    # The original body mixed tab and space indentation, which raises
    # TabError under Python 3; normalized to 4-space indents.
    dateList = []
    begin = datetime.datetime.strptime(beginDate, "%Y-%m-%d")
    end = datetime.datetime.strptime(endDate, "%Y-%m-%d")
    while begin <= end:
        dateList.append(begin.strftime("%Y-%m-%d"))
        begin += datetime.timedelta(days=1)
    return dateList
	
# 先在HDFS創建基於本地文件的目錄,然後開始上傳文件
# 先在HDFS創建基於本地文件的目錄,然後開始上傳文件
def uploadFileToHdfs():
    """Mirror local per-city/per-date log directories up to HDFS.

    For every date in [beginDate, endDate] and every city code, builds
    <root>/<city>/<yyyy>/<mm>/<dd>/ paths on both sides; when the local
    directory exists, creates the matching HDFS directory and uploads the
    whole local directory into it, overwriting any existing copy.

    Reads module-level globals: client, rootDir, localDir, beginDate,
    endDate, logger, dateRangeList. Returns None.
    """
    # City list is invariant across dates, so hoist it out of the loop.
    # (Kept the original name even though it held a list, not a tuple.)
    cityTuple = ['gz', 'sz', 'wh', 'km', 'qd']
    for date in dateRangeList(beginDate, endDate):
        year, month, day = date.split('-')
        for city in cityTuple:
            hdfsTargetFilePath = '{}{}/{}/{}/{}/'.format(rootDir, city, year, month, day)
            localTagetFilePath = '{}{}/{}/{}/{}/'.format(localDir, city, year, month, day)
            logger.info('localTagetFilePath:{}'.format(localTagetFilePath))
            if os.path.exists(localTagetFilePath):
                logger.info('The Local Target File Is Exists , Start Make HDFS Target File Path And Upload File !')
                client.makedirs(hdfsTargetFilePath)
                # Original line mixed spaces and tabs here — a TabError
                # under Python 3; indentation normalized to 4 spaces.
                client.upload(hdfsTargetFilePath, localTagetFilePath, overwrite=True)
                logger.info('Execute Ok!')
            else:
                logger.info('The Local Target File Not-Exists !')
				
# Script entry point: run the upload job when executed directly.
if __name__ == '__main__':
    uploadFileToHdfs()

 

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章