add jar /tmp/hivemall-with-dependencies.jar; -- hivemall v0.3 or later
source /tmp/define-all.hive;
2. Use HiveMall to run logistic regression (logress) on Avazu
---- create tables ----
create database mytest;
use mytest;

CREATE EXTERNAL TABLE Avazu (
  id string, click int, hour bigint, C1 int, banner_pos int,
  site_id string, site_domain string, site_category string,
  app_id string, app_domain string, app_category string,
  device_id string, device_ip string, device_os string, device_make string,
  device_model string, device_type int, device_conn_type int, device_geo_country string,
  C17 int, C18 int, C19 int, C20 int, C21 int, C22 int, C23 int, C24 int)
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY ','
  LINES TERMINATED BY '\n'
STORED AS TEXTFILE
LOCATION '/user/xxx/Avazu';

CREATE TABLE Avazu_train (
  id string, click int, hour bigint, banner_pos int,
  site_id string, site_domain string, site_category string,
  app_id string, app_domain string, app_category string,
  device_id string, device_os string, device_type int, device_geo_country string)
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY '\t'
  LINES TERMINATED BY '\n'
STORED AS TEXTFILE;
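Before the train/test splits below are built, the comma-delimited Avazu file has to be present under the external table's location (/user/xxx/Avazu). One way to stage it from a local copy, assuming an illustrative local path and that any header row has already been stripped:

LOAD DATA LOCAL INPATH '/tmp/avazu-train.csv' INTO TABLE Avazu;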
Insert into table Avazu_train
select id, click, hour, banner_pos, site_id, site_domain, site_category,
       app_id, app_domain, app_category, device_id, device_os, device_type, device_geo_country
from Avazu
where hour < 14101000 and hour is not null;

CREATE TABLE Avazu_solution (id string, click float)
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY ','
  LINES TERMINATED BY '\n'
STORED AS TEXTFILE;

Insert into table Avazu_solution
select id, cast(click as float)
from Avazu
where hour >= 14101000 and hour is not null
order by id ASC;

---- prepare ----
add jar hivemall-with-dependencies.jar;
drop temporary function mhash;
create temporary function mhash as 'hivemall.ftvec.hashing.MurmurHash3UDF';
use mytest;

-- hash each field into an int feature id; the "N:" prefix keeps values
-- from different columns in distinct hash spaces
create table training_feature as
select
  id, click,
  mhash(concat("1:", hour))                as hour,
  mhash(concat("2:", banner_pos))          as banner_pos,
  mhash(concat("3:", site_id))             as site_id,
  mhash(concat("4:", site_domain))         as site_domain,
  mhash(concat("5:", site_category))       as site_category,
  mhash(concat("6:", app_id))              as app_id,
  mhash(concat("7:", app_domain))          as app_domain,
  mhash(concat("8:", app_category))        as app_category,
  mhash(concat("9:", device_id))           as device_id,
  mhash(concat("10:", device_os))          as device_os,
  mhash(concat("11:", device_type))        as device_type,
  mhash(concat("12:", device_geo_country)) as device_geo_country,
  -1 as bias
from avazu_train;

create table training_orcfile (rowid bigint, label float, features array<int>)
STORED AS orc tblproperties ("orc.compress"="SNAPPY");

-- kddconv.awk folds the hashed columns into a single features array
-- (a sketch of the script is given at the end of this section);
-- CLUSTER BY rand() shuffles the rows, which helps SGD converge
add file kddconv.awk;

INSERT OVERWRITE TABLE training_orcfile
select transform(*)
  ROW FORMAT DELIMITED FIELDS TERMINATED BY "\t" LINES TERMINATED BY "\n"
  using 'gawk -f kddconv.awk'
  as (rowid STRING, label FLOAT, features ARRAY<INT>)
  ROW FORMAT DELIMITED FIELDS TERMINATED BY "\t" COLLECTION ITEMS TERMINATED BY "," LINES TERMINATED BY "\n"
from training_feature
CLUSTER BY rand();

---- training ----
add jar hivemall-with-dependencies.jar;
source define-all.hive;
set hivevar:total_steps=700000;

-- each mapper trains a logistic regression model with SGD;
-- averaging the per-feature weights merges the partial models
drop table lr_model;
create table lr_model as
select feature, cast(avg(weight) as float) as weight
from (
  select logress(features, label, "-total_steps ${total_steps}") as (feature, weight)
  from training_orcfile
) t
group by feature;

---- test/prediction ----
-- avazu_test holds the held-out rows (hour >= 14101000) with the same
-- columns as avazu_train minus click; a sketch of how to build it follows below
create table testing_feature as
select id,
       array(hour, banner_pos, site_id, site_domain, site_category,
             app_id, app_domain, app_category, device_id, device_os,
             device_type, device_geo_country, bias) as features
from (
  select
    id,
    mhash(concat("1:", hour))                as hour,
    mhash(concat("2:", banner_pos))          as banner_pos,
    mhash(concat("3:", site_id))             as site_id,
    mhash(concat("4:", site_domain))         as site_domain,
    mhash(concat("5:", site_category))       as site_category,
    mhash(concat("6:", app_id))              as app_id,
    mhash(concat("7:", app_domain))          as app_domain,
    mhash(concat("8:", app_category))        as app_category,
    mhash(concat("9:", device_id))           as device_id,
    mhash(concat("10:", device_os))          as device_os,
    mhash(concat("11:", device_type))        as device_type,
    mhash(concat("12:", device_geo_country)) as device_geo_country,
    -1 as bias
  from avazu_test
) t;

create table testing_exploded as
select id, feature
from testing_feature
LATERAL VIEW explode(features) t AS feature;

-- summing the matched weights and applying sigmoid gives the predicted CTR
drop table lr_predict;
create table lr_predict
  ROW FORMAT DELIMITED FIELDS TERMINATED BY "," LINES TERMINATED BY "\n"
  STORED AS TEXTFILE
as
select t.id, sigmoid(sum(m.weight)) as prob
from testing_exploded t
LEFT OUTER JOIN lr_model m ON (t.feature = m.feature)
group by t.id
order by id ASC;
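The kddconv.awk script itself is not shown in this post. Below is a minimal sketch of what it has to do given the layout produced by training_feature (tab-separated: id, click, twelve hashed features, bias): emit rowid, label, and the features joined by commas. The original script on the HiveMall wiki may differ in details.

#!/usr/bin/gawk -f
# Input : id \t click \t f1 \t ... \t f12 \t bias  (one training_feature row)
# Output: rowid \t label \t f1,f2,...,f13          (one training_orcfile row)
BEGIN { FS = "\t"; OFS = "\t" }
{
  rowid = $1
  label = $2
  features = $3
  for (i = 4; i <= NF; i++) {
    features = features "," $i
  }
  print rowid, label, features
}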
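avazu_test is queried in the prediction step but never created above. A plausible definition, assuming it mirrors the holdout split used for Avazu_solution (hour >= 14101000) and carries the same columns as avazu_train minus the click label:

create table avazu_test as
select id, hour, banner_pos, site_id, site_domain, site_category,
       app_id, app_domain, app_category, device_id, device_os,
       device_type, device_geo_country
from Avazu
where hour >= 14101000 and hour is not null;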
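Since Avazu_solution holds the true labels for the held-out rows, the predictions can be scored with logarithmic loss, the metric of the Avazu competition. A minimal sketch in plain HiveQL (least/greatest need a reasonably recent Hive; the probability is clamped away from 0 and 1 so ln() stays finite):

select -avg(s.click * ln(p.prob) + (1.0 - s.click) * ln(1.0 - p.prob)) as logloss
from Avazu_solution s
join (
  select id, least(greatest(prob, 1e-15), 1.0 - 1e-15) as prob
  from lr_predict
) p on (s.id = p.id);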
References:
http://www.slideshare.net/myui/hivemall-hadoop-summit-2014-san-jose
https://github.com/myui/hivemall/wiki/KDDCup-2012-track-2-CTR-prediction-(regression)
https://github.com/myui/hivemall/wiki