Sometimes, we have to join multiple hive tables on the same key. Since there might be many column names, we don't want to type them one by one. Select * doesn't work in this case, due to duplicate join key in the column list.
Below python script can help.
It takes table names, join key, and the columns excluded in the results.
Below python script can help.
It takes table names, join key, and the columns excluded in the results.
import subprocess as sub import sys from monitor import os_error_exit import ConfigParser def getColumns(table_name, columns): p = sub.Popen(["hive","-e", "describe "+table_name],stdout=sub.PIPE,stderr=sub.PIPE) results, errors = p.communicate() schema = results.split('\t') for i in range(0, len(schema)-1, 2): if "# Partition Information" in schema[i] or schema[i].strip()=='': break columns.add(schema[i].strip()) def getJoinConditions(table_list, column_key): alias = ['t'+str(i) for i in range(100)] base_table = table_list[0] join_condition = base_table+" " +alias[0] for i in range(1, len(table_list)): current_table = table_list[i] join_condition += " left outer join " + current_table +" " +alias[i]+" on "+alias[0]+'.'+column_key+"="+alias[i]+'.'+column_key return join_condition def main(): columns = set() for tablename in tables: getColumns(tablename, columns) selected_columns = columns.difference(discard_columns) column_list = ', '.join(["COALESCE("+ c +",0.0) as "+c for c in selected_columns]) full_columns = "t0."+column_key+", "+ column_list join_condition_str = getJoinConditions(tables, column_key) cmd="hive -hiveconf COLUMNS='" + full_columns + "' -hiveconf JOIN_CONDITION='" + join_condition_str +"' -f " + hive_sql_path if __name__ == "__main__": main()
No comments:
Post a Comment