Sunday, 31 August 2014

Move Files from FTP to HDFS

Download files from an FTP folder, then upload them to an HDFS folder.
This assumes the number and total size of the files can be handled by a single local machine.


from __future__ import print_function
import os
import time
import pysftp
import shutil

ftp_username='xxx'
ftp_password='xxx'
ftp_host='xxx'

year = time.strftime("%Y")
month = time.strftime("%m")
day = time.strftime("%d")

ftp_dir = 'data/'+year+'/'+month+'/'+day
hdfs_dir = 'data/'+year+'/'+month+'/'+day
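
As an aside, strftime passes literal text through unchanged, so the dated path can also be built in one call:

path = time.strftime("data/%Y/%m/%d")  #e.g. 'data/2014/08/31', same value as ftp_dir above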

#download the whole remote folder to the local working directory
with pysftp.Connection(ftp_host, username=ftp_username, password=ftp_password) as sftp:
    sftp.get_r(ftp_dir, os.getcwd(), preserve_mtime=True)
#the with block closes the connection, so no explicit sftp.close() is needed
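
If the day's folder is not guaranteed to exist on the server yet, a small guard keeps get_r from failing. A minimal sketch using pysftp's exists() check:

with pysftp.Connection(ftp_host, username=ftp_username, password=ftp_password) as sftp:
    if sftp.exists(ftp_dir):  #skip the copy when the remote day folder is missing
        sftp.get_r(ftp_dir, os.getcwd(), preserve_mtime=True)
    else:
        print("remote folder not found:", ftp_dir)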

local_dir = os.getcwd()+'/'+ftp_dir
#decrypt gpg files in every folder
for dirpath, dirnames, files in os.walk(local_dir):
    for f_name in files:
        if f_name.endswith(".gpg"):
            os.system("gpg -r  --output {0} --decrypt {1}".format(os.path.join(dirpath, f_name.rstrip(".gpg")), os.path.join(dirpath, f_name)))
            os.remove(os.path.join(dirpath, f_name))
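
Filenames with spaces or shell metacharacters would break the os.system string above. A safer variant, sketched here with the standard-library subprocess module, passes an argument list so no shell parsing happens (decrypt_gpg is a hypothetical helper; it assumes the private key is already imported and needs no passphrase prompt):

import subprocess

def decrypt_gpg(dirpath, f_name):
    src = os.path.join(dirpath, f_name)
    dst = os.path.join(dirpath, f_name[:-len(".gpg")])
    #--batch and --yes suppress interactive prompts
    rc = subprocess.call(["gpg", "--batch", "--yes", "--output", dst, "--decrypt", src])
    if rc == 0:
        os.remove(src)  #only delete the encrypted file after a successful decrypt
    return rc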


#upload local folder to hdfs
exec_test = "hadoop fs -test -d " + hdfs_dir
if os.system(exec_test):
    #non-zero exit status means the directory does not exist yet
    exec_mkdir = "hadoop fs -mkdir -p " + hdfs_dir
    os.system(exec_mkdir)


exec_str = "hadoop fs -put " + local_dir + "/* " + hdfs_dir  #/* puts the files themselves, not a nested day folder
os.system(exec_str)

shutil.rmtree(local_dir)
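
One caveat: os.system ignores failures, so a failed -put would still be followed by rmtree. A sketch of the same upload with exit-status checks, removing the local copy only on success:

import subprocess

if subprocess.call(["hadoop", "fs", "-test", "-d", hdfs_dir]) != 0:
    subprocess.call(["hadoop", "fs", "-mkdir", "-p", hdfs_dir])

#shell=True so the /* glob expands
if subprocess.call("hadoop fs -put " + local_dir + "/* " + hdfs_dir, shell=True) == 0:
    shutil.rmtree(local_dir)
else:
    print("hadoop fs -put failed, keeping local copy at", local_dir)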
