Wednesday, 2 September 2015

Join Multiple Hive Tables without manually typing column names

Sometimes, we have to join multiple hive tables on the same key. Since there might be many column names, we don't want to type them one by one. Select * doesn't work in this case, due to duplicate join key in the column list.

Below python script can help.
It takes table names, join key, and the columns excluded in the results. 

import subprocess as sub
import sys
from monitor import os_error_exit
import ConfigParser

def getColumns(table_name, columns):
    p = sub.Popen(["hive","-e", "describe "+table_name],stdout=sub.PIPE,stderr=sub.PIPE)
    results, errors = p.communicate()
    schema = results.split('\t')
    for i in range(0, len(schema)-1, 2):
        if "# Partition Information" in schema[i] or schema[i].strip()=='':
            break
        columns.add(schema[i].strip())

def getJoinConditions(table_list, column_key):
    alias = ['t'+str(i) for i in range(100)]
    base_table = table_list[0]
    join_condition = base_table+" " +alias[0]
    for i in range(1, len(table_list)):
        current_table = table_list[i]
        join_condition += " left outer join " + current_table +" " +alias[i]+" on "+alias[0]+'.'+column_key+"="+alias[i]+'.'+column_key
    return join_condition


def main():
    
    columns = set()

    for tablename in tables:
        getColumns(tablename, columns)

    selected_columns = columns.difference(discard_columns)
    column_list =  ', '.join(["COALESCE("+ c +",0.0) as "+c for c in selected_columns])
    full_columns = "t0."+column_key+", "+ column_list
    join_condition_str = getJoinConditions(tables, column_key)

    cmd="hive -hiveconf COLUMNS='" + full_columns + "' -hiveconf JOIN_CONDITION='" + join_condition_str +"' -f " + hive_sql_path

if __name__ == "__main__":
    main()


No comments:

Post a Comment