Assume the number and total size of the files can be handled by a single local machine.
from __future__ import print_function
import os
import time
import pysftp
import shutil
ftp_username='xxx'
ftp_password='xxx'
ftp_host='xxx'
year = time.strftime("%Y")
month = time.strftime("%m")
day = time.strftime("%d")
ftp_dir = 'data/'+year+'/'+month+'/'+day
hdfs_dir = 'data/'+year+'/'+month+'/'+day
# download the whole remote folder to the local machine
with pysftp.Connection(ftp_host, username=ftp_username, password=ftp_password) as sftp:
    sftp.get_r(ftp_dir, os.getcwd(), preserve_mtime=True)
local_dir = os.getcwd()+'/'+ftp_dir
# decrypt the .gpg files in every folder, then remove the encrypted originals
for dirpath, dirnames, files in os.walk(local_dir):
    for f_name in files:
        if f_name.endswith(".gpg"):
            encrypted_path = os.path.join(dirpath, f_name)
            decrypted_path = os.path.join(dirpath, os.path.splitext(f_name)[0])
            os.system("gpg --output {0} --decrypt {1}".format(decrypted_path, encrypted_path))
            os.remove(encrypted_path)
# upload the local folder to HDFS, creating the target directory first if it is missing
exec_test = "hadoop fs -test -d " + hdfs_dir
if os.system(exec_test):
    exec_mkdir = "hadoop fs -mkdir -p " + hdfs_dir
    os.system(exec_mkdir)
exec_str = "hadoop fs -put " + local_dir + " " + hdfs_dir
os.system(exec_str)
shutil.rmtree(local_dir)
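The os.system calls above swallow errors: if gpg or a hadoop command fails, the script carries on and still deletes the local copy. A minimal sketch of how the HDFS upload step could be rewritten with the subprocess module, assuming the hadoop client is on the PATH, so that a failed command raises an exception before shutil.rmtree runs (the commands themselves are unchanged):

import subprocess

# same three HDFS commands as above, but check_call raises CalledProcessError
# on a non-zero exit status instead of silently continuing
if subprocess.call(["hadoop", "fs", "-test", "-d", hdfs_dir]) != 0:
    subprocess.check_call(["hadoop", "fs", "-mkdir", "-p", hdfs_dir])
subprocess.check_call(["hadoop", "fs", "-put", local_dir, hdfs_dir])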