Sometimes, we have to join multiple hive tables on the same key. Since there might be many column names, we don't want to type them one by one. Select * doesn't work in this case, due to duplicate join key in the column list.
Below python script can help.
It takes table names, join key, and the columns excluded in the results.
Below python script can help.
It takes table names, join key, and the columns excluded in the results.
import subprocess as sub
import sys
from monitor import os_error_exit
import ConfigParser
def getColumns(table_name, columns):
p = sub.Popen(["hive","-e", "describe "+table_name],stdout=sub.PIPE,stderr=sub.PIPE)
results, errors = p.communicate()
schema = results.split('\t')
for i in range(0, len(schema)-1, 2):
if "# Partition Information" in schema[i] or schema[i].strip()=='':
break
columns.add(schema[i].strip())
def getJoinConditions(table_list, column_key):
alias = ['t'+str(i) for i in range(100)]
base_table = table_list[0]
join_condition = base_table+" " +alias[0]
for i in range(1, len(table_list)):
current_table = table_list[i]
join_condition += " left outer join " + current_table +" " +alias[i]+" on "+alias[0]+'.'+column_key+"="+alias[i]+'.'+column_key
return join_condition
def main():
columns = set()
for tablename in tables:
getColumns(tablename, columns)
selected_columns = columns.difference(discard_columns)
column_list = ', '.join(["COALESCE("+ c +",0.0) as "+c for c in selected_columns])
full_columns = "t0."+column_key+", "+ column_list
join_condition_str = getJoinConditions(tables, column_key)
cmd="hive -hiveconf COLUMNS='" + full_columns + "' -hiveconf JOIN_CONDITION='" + join_condition_str +"' -f " + hive_sql_path
if __name__ == "__main__":
main()
No comments:
Post a Comment