Below is the Hive script and python udf to calculate the HHI of ip of visitors.
create table ip_hhi as select TRANSFORM(*) using "python herfindahl_index_udf.py" as (user_id STRING, hhi DOUBLE) from ( select user_id, collect_list(case when CONDITION then VALUE else null end) as ip_list from Table group by user_id ) ips;
#!/usr/bin/python
import sys
from collections import Counter
#Calculate Herfindahl index (HHI)
def compute_hhi(bin_list):
total = len(bin_list)
unique_bin_amounts = Counter(bin_list).values()
hhi_elements = [(value/float(total))**2 for value in unique_bin_amounts]
hhi = sum(hhi_elements)
return (hhi, len(unique_bin_amounts))
for line in sys.stdin:
tokens = line.strip().split("\t")
cust_id = tokens[0]
bin_list = tokens[1].replace('[','').replace(']','').split(",")
hhi, dist_bin = compute_hhi(bin_1d_list)
print("\t".join([cust_id, str(hhi), str(dist_bin)]))
No comments:
Post a Comment