Thursday, 11 June 2015

Calculate Herfindahl index by Collection_list() in Hive


Below is the Hive script and python udf to calculate the HHI of ip of visitors.



create table ip_hhi as
select TRANSFORM(*)
using "python herfindahl_index_udf.py"
as (user_id STRING, hhi DOUBLE)
from 
(
select user_id, 
collect_list(case when CONDITION then VALUE else null end) as ip_list
from Table
group by user_id
) ips;

herfindahl_index_udf.py

#!/usr/bin/python
import sys
from collections import Counter

#Calculate Herfindahl index (HHI)

def compute_hhi(bin_list):
    total = len(bin_list)
    unique_bin_amounts = Counter(bin_list).values()
    hhi_elements = [(value/float(total))**2 for value in unique_bin_amounts]
    hhi = sum(hhi_elements)
    return (hhi, len(unique_bin_amounts))

for line in sys.stdin:
    tokens = line.strip().split("\t")
    cust_id = tokens[0]
    bin_list = tokens[1].replace('[','').replace(']','').split(",")
    hhi, dist_bin = compute_hhi(bin_1d_list)
    print("\t".join([cust_id, str(hhi), str(dist_bin)]))



No comments:

Post a Comment