Below are the Hive script and the Python UDF used to calculate the Herfindahl index (HHI) of each visitor's IP addresses.
-- Build ip_hhi: one row per user_id, produced by streaming each user's
-- collected list through herfindahl_index_udf.py.
-- CONDITION, VALUE and Table are placeholders to be replaced with the real
-- filter predicate, the IP column/expression and the source table.
-- NOTE(review): the script must be reachable by the Hive tasks (typically
-- registered beforehand with ADD FILE) -- confirm for your deployment.
-- NOTE(review): the script prints three tab-separated fields (id, hhi,
-- distinct count) while only two columns are declared here -- confirm
-- that ignoring the third field is intended.
CREATE TABLE ip_hhi AS
SELECT TRANSFORM(user_id, ip_list)
    USING "python herfindahl_index_udf.py"
    AS (user_id STRING, hhi DOUBLE)
FROM (
    -- One list per user; rows failing CONDITION contribute NULL.
    SELECT
        user_id,
        COLLECT_LIST(CASE WHEN CONDITION THEN VALUE ELSE NULL END) AS ip_list
    FROM Table
    GROUP BY user_id
) ips;
#!/usr/bin/python
"""Streaming script that computes the Herfindahl index (HHI) per input row.

Reads tab-separated lines from stdin: an id followed by a bracketed,
comma-separated list of values. Writes one tab-separated line per input row
to stdout: id, HHI, number of distinct values.
"""
import sys
from collections import Counter


def compute_hhi(bin_list):
    """Calculate the Herfindahl index (HHI) of a list of values.

    The HHI is the sum of squared shares, where a value's share is the number
    of times it occurs divided by the total number of items.

    Args:
        bin_list: list of hashable items (e.g. IP strings).

    Returns:
        Tuple ``(hhi, distinct_count)``. An empty list yields ``(0.0, 0)``.
    """
    total = len(bin_list)
    if total == 0:
        # No observations: there are no shares to square.
        return (0.0, 0)
    unique_bin_amounts = Counter(bin_list).values()
    # float() keeps the division exact under Python 2 as well.
    hhi = sum((value / float(total)) ** 2 for value in unique_bin_amounts)
    return (hhi, len(unique_bin_amounts))


def parse_bin_list(raw):
    """Split a bracketed, comma-separated list string into its items.

    Square brackets are removed, items are stripped of surrounding
    whitespace, and empty items are dropped so that ``"[]"`` or ``""``
    parses to an empty list instead of ``['']``.
    """
    cleaned = raw.replace('[', '').replace(']', '')
    return [item.strip() for item in cleaned.split(",") if item.strip()]


def process_line(line):
    """Turn one input line into the tab-separated output row.

    Args:
        line: a raw stdin line of the form ``id<TAB>list``.

    Returns:
        The output string ``id<TAB>hhi<TAB>distinct_count``, or ``None`` for
        a blank line.
    """
    # Only strip the line terminator: a generic strip() would also remove a
    # trailing tab and hide an empty second column.
    tokens = line.rstrip("\r\n").split("\t")
    cust_id = tokens[0]
    if not cust_id and len(tokens) == 1:
        return None
    # A missing second column is treated as an empty list.
    raw_list = tokens[1] if len(tokens) > 1 else ""
    bin_list = parse_bin_list(raw_list)
    hhi, dist_bin = compute_hhi(bin_list)
    return "\t".join([cust_id, str(hhi), str(dist_bin)])


def main():
    """Process every line from stdin and print the resulting rows."""
    for line in sys.stdin:
        row = process_line(line)
        if row is not None:
            print(row)


if __name__ == "__main__":
    main()
No comments:
Post a Comment