Assume the number and total size of the files can be handled by a single local machine.
from __future__ import print_function
import os
import time
import shutil
import pysftp

ftp_username = 'xxx'
ftp_password = 'xxx'
ftp_host = 'xxx'

year = time.strftime("%Y")
month = time.strftime("%m")
day = time.strftime("%d")

ftp_dir = 'data/' + year + '/' + month + '/' + day
hdfs_dir = 'data/' + year + '/' + month + '/' + day

# download the whole remote folder to the local working directory
with pysftp.Connection(ftp_host, username=ftp_username, password=ftp_password) as sftp:
    sftp.get_r(ftp_dir, os.getcwd(), preserve_mtime=True)

local_dir = os.path.join(os.getcwd(), ftp_dir)

# decrypt the .gpg files in every subfolder, then delete the encrypted originals
for dirpath, dirnames, files in os.walk(local_dir):
    for f_name in files:
        if f_name.endswith(".gpg"):
            encrypted = os.path.join(dirpath, f_name)
            decrypted = os.path.join(dirpath, f_name[:-len(".gpg")])
            os.system("gpg --output {0} --decrypt {1}".format(decrypted, encrypted))
            os.remove(encrypted)

# upload the decrypted files to HDFS, creating the target directory if it does not exist
if os.system("hadoop fs -test -d " + hdfs_dir):
    os.system("hadoop fs -mkdir -p " + hdfs_dir)
os.system("hadoop fs -put " + local_dir + "/* " + hdfs_dir)

# remove the local copy once it is in HDFS
shutil.rmtree(local_dir)
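One weakness of the script is that os.system ignores non-zero exit codes, so a failed decryption or a failed HDFS upload would still be followed by shutil.rmtree deleting the local copy. Below is a minimal sketch of the same shell steps using subprocess.check_call, which raises an exception when a command fails; the file paths are hypothetical placeholders, not paths from the script above.

import subprocess

# decrypt one file; raises CalledProcessError if gpg exits non-zero
subprocess.check_call(["gpg", "--output", "/tmp/report.csv",
                       "--decrypt", "/tmp/report.csv.gpg"])

# create the HDFS directory (with parents) and upload; raises if hadoop fails
subprocess.check_call(["hadoop", "fs", "-mkdir", "-p", "data/2024/01/01"])
subprocess.check_call(["hadoop", "fs", "-put", "/tmp/report.csv", "data/2024/01/01"])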