Python | Migrating HDFS Data to Local Storage and Compressing It with Python WebHDFS

First, a quick look back at the earlier Python WebHDFS post: operating on HDFS via the WebHDFS REST API.

Below are the scripts I wrote for this at work:

1. dateUtil.py: selects the date range of the data to be migrated.

import datetime
import sys
import os  

# Return every date string ("YYYY-MM-DD") from beginDate to endDate, inclusive
def dateRange(beginDate, endDate):
    dates = []
    dt = datetime.datetime.strptime(beginDate, "%Y-%m-%d")
    date = beginDate[:]
    while date <= endDate:
        dates.append(date)
        dt = dt + datetime.timedelta(1)
        date = dt.strftime("%Y-%m-%d")
    return dates

# Migrate the data from 5 weeks ago up to 3 weeks ago to the NAS
def getDateRegion():
    begin_max_days = 35
    end_min_days = 21

    current_date = datetime.date.today()

    delta_max = datetime.timedelta(days=begin_max_days)
    delta_min = datetime.timedelta(days=end_min_days)

    beginDate = (current_date - delta_max).strftime("%Y-%m-%d")
    endDate = (current_date - delta_min).strftime("%Y-%m-%d")

    return dateRange(beginDate, endDate)
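
For a quick sanity check, here is a minimal usage sketch of getDateRegion() (the dates shown are illustrative, assuming the script runs on 2019-07-06):

import dateUtil

dates = dateUtil.getDateRegion()
# Run on 2019-07-06 this would cover 35 days ago through 21 days ago, inclusive:
# ['2019-06-01', '2019-06-02', ..., '2019-06-15']
print(dates[0], dates[-1], len(dates))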

2. iterate_migrate_wzmetro.py: builds the concrete HDFS paths of the data to be migrated from that date range.

import datetime
import sys
import os  
import dateUtil

# Full Path : /datalog/wzmetro/20190601
sourceFilePath = "/datalog/wzmetro/"
destFilePath = "/data4/hdfs/wzmetro/"

if __name__ == '__main__':

    dates = dateUtil.getDateRegion()
    for date in dates:
        # date looks like "2019-06-01"
        year = date[0:4]
        month = date[5:7]
        day = date[8:10]

        # HDFS source, e.g. /datalog/wzmetro/20190601
        datePath = year + month + day
        # Local destination, e.g. /data4/hdfs/wzmetro/2019/06/01/
        destDatePath = year + '/' + month + '/' + day + '/'
        source = sourceFilePath + datePath
        dest = destFilePath + destDatePath

        os.system("python migrate_nas_combine_gz.py " + source + " " + dest)

3. migrate_nas_combine_gz.py: migrates the HDFS data under the given path to local storage and compresses it.

import os
import shutil
import json 
import sys 
import gzip
import zipfile
from hdfs.client import Client 
from hdfs import InsecureClient

import logging
from logging import handlers

logger = logging.getLogger()
logger.setLevel(logging.INFO) 
 
 
# Write INFO-level logs to a file that rotates daily
logFile = './logs/migrate_hdfs_to_nas.log'

formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

timedRotatingFileHandler = handlers.TimedRotatingFileHandler(filename=logFile, when='D')
timedRotatingFileHandler.setLevel(logging.INFO)
timedRotatingFileHandler.setFormatter(formatter)

logger.addHandler(timedRotatingFileHandler)


# Command-line arguments: <HDFS source path> <local destination path>
hdfsPath = sys.argv[1]
localPath = sys.argv[2]

#client = Client("http://192.168.0.1:50070")
client = InsecureClient("http://192.168.0.1:50070", user='hadoop')

# status() returns None (instead of raising) when strict=False and the path is missing
check_path_status = client.status(hdfsPath, strict=False)
if check_path_status is not None:
    dirList = client.list(hdfsPath, status=False)

    for i in dirList:
        hdfsLeafPath = hdfsPath + '/' + i

        # Make sure the local destination directory exists
        if not os.path.exists(localPath):
            os.makedirs(localPath)

        # Remove any leftover local copy from a previous run
        localLeafPath = localPath + i
        if os.path.exists(localLeafPath):
            os.remove(localLeafPath)
            logger.info('The file already exists, removed it.')

        # Download the HDFS file into the local destination directory
        client.download(hdfsLeafPath, localPath, overwrite=True)

        #parent = os.path.dirname(os.path.realpath(localLeafPath))
        #print("parent name = " + parent)
        #e_file_name = parent.split(os.path.sep)
        #zipName = e_file_name[len(e_file_name) - 1]
        #print('zipName==' + zipName)
        #os.system('zip -r %s.zip  %s' % (zipName, localPath))

        # Compress the downloaded file, then delete the uncompressed copy
        f = zipfile.ZipFile(localLeafPath + ".gz", 'w', zipfile.ZIP_DEFLATED)
        f.write(localLeafPath, arcname=i)
        # close() must be called to make sure the archive is fully written
        f.close()
        os.remove(localLeafPath)

    logger.info('=====>>>>>The current local path folder: ' + localPath + ', every file downloaded and compressed.')
    # Delete the source path on HDFS
    client.delete(hdfsPath, recursive=True)
    logger.info("===>>>Download HDFS To Local File ===>> Migrate Local File ===> NAS Server ===> GZip NAS File ===> Remove NAS Source File !!!")
else:
    logger.info("The HDFS source path: " + hdfsPath + " does not exist, status==" + str(check_path_status))

 
