記錄下基於WebHDFS REST API操作HDFS的基本功能,具體更多請參照官網介紹:
http://hadoop.apache.org/docs/r3.2.1/hadoop-project-dist/hadoop-hdfs/WebHDFS.html
# 獲取客戶端連接
client = Client(url='http://192.168.0.1:50070', root=None, proxy=None, timeout=None, session=None)
# 或者使用InsecureClient,基於InsecureClient時可以指定登錄用戶;而在Client()中指定proxy參數會拋出異常,具體原因尚未查明
client = InsecureClient("http://192.168.0.1:50070", user='hadoop');
# 創建目錄
client.makedirs(hdfs_path)
# 刪除hdfs文件
client.delete(hdfs_path)
# 上傳文件到hdfs
client.upload(hdfs_path, local_path, cleanup=True)
# 從hdfs獲取文件到本地
client.download(hdfs_path, local_path, overwrite=False)
# 追加數據到hdfs文件
client.write(hdfs_path, data, overwrite=False, append=True, encoding='utf-8')
# 覆蓋數據寫到hdfs文件
client.write(hdfs_path, data, overwrite=True, append=False, encoding='utf-8')
# 移動或者修改文件
client.rename(hdfs_src_path, hdfs_dst_path)
# 列舉指定目錄下的文件
client.list(hdfs_path, status=False)
網上也有人對常見基本方法的參數做了較詳細的介紹,可參見:
https://blog.csdn.net/gamer_gyt/article/details/52446757
記錄下工作中基本實例:
# -*- coding: UTF-8 -*-
#!/usr/bin/python
#import codecs
import os
import shutil
import json
import sys
import datetime
from hdfs.client import Client
from hdfs import InsecureClient
import logging
from logging import handlers
# --- Logging setup: INFO+ records go to ./sdkup.log, rotated daily ---
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logFile = './sdkup.log'
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# Daily-rotating file handler.  NOTE: the previous version also built a plain
# FileHandler for the same path but never attached it to the logger, so that
# dead code has been removed to avoid confusion (and double-logging if someone
# later added it).
timedRotatingFileHandler = handlers.TimedRotatingFileHandler(filename=logFile, when='D')
timedRotatingFileHandler.setLevel(logging.INFO)
timedRotatingFileHandler.setFormatter(formatter)
logger.addHandler(timedRotatingFileHandler)

# --- Command-line arguments: inclusive begin/end dates, format yyyy-MM-dd ---
# Fail with a usage message instead of a raw IndexError when arguments are missing.
if len(sys.argv) < 3:
    logger.error('Usage: %s <beginDate yyyy-MM-dd> <endDate yyyy-MM-dd>', sys.argv[0])
    sys.exit(1)
beginDate = sys.argv[1]
endDate = sys.argv[2]

# HDFS target root and local source root for the upload.
rootDir = '/datalog/t/python_test_webhdfs/'
localDir = '/data3/hdfs/sdklog/'
logger.info('Note the date format : yyyy-MM-dd')
#client = Client("http://192.168.0.1:50070", root='/tables/', proxy='supergroup')
client = InsecureClient("http://192.168.0.1:50070", user='berg')
# 獲取指定日期範圍內的日期列表
def dateRangeList(beginDate, endDate):
    """Return every date from beginDate to endDate inclusive, as a list of
    'YYYY-MM-DD' strings.

    Both arguments must be 'YYYY-MM-DD' strings; an empty list is returned
    when endDate precedes beginDate (ValueError propagates on bad format).
    """
    start = datetime.datetime.strptime(beginDate, "%Y-%m-%d")
    stop = datetime.datetime.strptime(endDate, "%Y-%m-%d")
    # range() is empty for a negative span, matching the while-loop behavior.
    span = (stop - start).days
    return [
        (start + datetime.timedelta(days=offset)).strftime("%Y-%m-%d")
        for offset in range(span + 1)
    ]
# 先在HDFS創建基於本地文件的目錄,然後開始上傳文件
def uploadFileToHdfs():
    """Create per-city/per-date directories on HDFS and upload the matching
    local directories for every date in the inclusive [beginDate, endDate]
    range.

    Reads the module-level ``client``, ``logger``, ``rootDir``, ``localDir``,
    ``beginDate`` and ``endDate``.  Dates/cities whose local directory does
    not exist are logged and skipped.
    """
    # City list is loop-invariant — hoisted out of the date loop (it used to
    # be rebuilt on every iteration) and made an actual tuple.
    cities = ('gz', 'sz', 'wh', 'km', 'qd')
    for date in dateRangeList(beginDate, endDate):
        year, month, day = date.split('-')
        for city in cities:
            # Layout on both sides: <root>/<city>/<year>/<month>/<day>/
            hdfsTargetFilePath = '{}{}/{}/{}/{}/'.format(rootDir, city, year, month, day)
            localTargetFilePath = '{}{}/{}/{}/{}/'.format(localDir, city, year, month, day)
            # Fixed the "Taget" typo in the variable name and log message.
            logger.info('localTargetFilePath:{}'.format(localTargetFilePath))
            if os.path.exists(localTargetFilePath):
                logger.info('The Local Target File Exists , Start Make HDFS Target File Path And Upload File !')
                client.makedirs(hdfsTargetFilePath)
                client.upload(hdfsTargetFilePath, localTargetFilePath, overwrite=True)
                logger.info('Execute Ok!')
            else:
                logger.info('The Local Target File Not-Exists !')


if __name__ == '__main__':
    uploadFileToHdfs()