add jar /tmp/hivemall-with-dependencies.jar; -- Hivemall v0.3 or later
source /tmp/define-all.hive; -- registers the Hivemall functions
2. Use Hivemall to run logistic regression (logress) on Avazu
----create tables----
create database mytest;
USE mytest;

CREATE EXTERNAL TABLE Avazu(
id string, click int, hour bigint, C1 int, banner_pos int,
site_id string, site_domain string, site_category string,
app_id string, app_domain string, app_category string,
device_id string, device_ip string, device_os string, device_make string,
device_model string, device_type int, device_conn_type int,
device_geo_country string,
C17 int, C18 int, C19 int, C20 int, C21 int, C22 int, C23 int, C24 int)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS TEXTFILE
LOCATION '/user/xxx/Avazu';

CREATE TABLE Avazu_train(
id string, click int, hour bigint, banner_pos int,
site_id string, site_domain string, site_category string,
app_id string, app_domain string, app_category string,
device_id string, device_os string, device_type int,
device_geo_country string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;
-- hour looks like YYMMDDHH, so this keeps the earlier days for training
Insert into table Avazu_train
select id, click, hour, banner_pos,site_id,site_domain,site_category,app_id,
app_domain,app_category,device_id,device_os,device_type,device_geo_country from Avazu
where hour < 14101000 and hour is not null;
CREATE TABLE Avazu_solution(
id string, click float)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;
-- held-out labels (the later hours) for scoring the predictions at the end
Insert into table Avazu_solution
select id, cast(click as float)
from Avazu
where hour>=14101000 and hour is not null
order by id ASC;
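One gap worth noting: the prediction step later reads from an avazu_test table that is never created in the post. Presumably it mirrors Avazu_train, taking the held-out rows (the same period as Avazu_solution) without the click label; a sketch under that assumption:
CREATE TABLE Avazu_test(
id string, hour bigint, banner_pos int,
site_id string, site_domain string, site_category string,
app_id string, app_domain string, app_category string,
device_id string, device_os string, device_type int,
device_geo_country string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;
Insert into table Avazu_test
select id, hour, banner_pos,site_id,site_domain,site_category,app_id,
app_domain,app_category,device_id,device_os,device_type,device_geo_country
from Avazu
where hour >= 14101000 and hour is not null;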
----prepare----
add jar hivemall-with-dependencies.jar;
drop temporary function mhash;
create temporary function mhash as 'hivemall.ftvec.hashing.MurmurHash3UDF';
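mhash hashes an arbitrary string to a compact integer feature ID with MurmurHash3 (by default Hivemall hashes into roughly 2^24 buckets), which is what lets high-cardinality columns like device_id become fixed-size numeric features. The "N:" prefixes used below namespace each column, so the same raw value appearing in two different columns still hashes to different feature IDs. A quick illustrative check (the exact value depends on the Hivemall version):
select mhash(concat("3:", "example.com"));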
use mytest;
create table training_feature as
select
id,
click,
mhash(concat("1:", hour)) as hour,
mhash(concat("2:", banner_pos)) as banner_pos,
mhash(concat("3:", site_id)) as site_id,
mhash(concat("4:", site_domain)) as site_domain,
mhash(concat("5:", site_category)) as site_category,
mhash(concat("6:", app_id)) as app_id,
mhash(concat("7:", app_domain)) as app_domain,
mhash(concat("8:", app_category)) as app_category,
mhash(concat("9:", device_id)) as device_id,
mhash(concat("10:", device_os)) as device_os,
mhash(concat("11:", device_type)) as device_type,
mhash(concat("12:", device_geo_country)) as device_geo_country,
-1 as bias
from avazu_train;
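Feature hashing trades a little accuracy for bounded dimensionality: distinct raw values can collide in the hash space. To gauge how much collapsing happens, compare cardinalities before and after hashing (illustrative):
select count(distinct site_id) from avazu_train;
select count(distinct site_id) from training_feature;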
create table training_orcfile (
rowid bigint,
label float,
features array<string>
) STORED AS orc tblproperties ("orc.compress"="SNAPPY");
add file kddconv.awk;
INSERT OVERWRITE TABLE training_orcfile
select transform(*)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY "\t"
LINES TERMINATED BY "\n"
using 'gawk -f kddconv.awk'
as (rowid STRING, label FLOAT, features ARRAY<STRING>)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY "\t"
COLLECTION ITEMS TERMINATED BY ","
LINES TERMINATED BY "\n"
from training_feature
CLUSTER BY rand(); -- shuffle the rows randomly, which helps the SGD-based training below
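The post never shows kddconv.awk. From the transform clause above it must read the tab-separated training_feature columns (id, click, then the 13 hashed/bias values) and emit rowid, label, and a comma-joined feature list, again tab-separated. A minimal reconstruction under that assumption:
BEGIN { FS = "\t"; OFS = "\t" }
{
    # $1 = id (rowid), $2 = click (label), $3..$NF = 12 hashed features + bias
    features = $3
    for (i = 4; i <= NF; i++) {
        features = features "," $i
    }
    print $1, $2, features
}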
----training----
add jar hivemall-with-dependencies.jar;
source define-all.hive;
set hivevar:total_steps=700000; -- substituted into the logress options below
drop table lr_model;
create table lr_model
as
select
feature,
cast(avg(weight) as float) as weight
from
(select
logress(features,label, "-total_steps ${total_steps}") as (feature,weight)
from
training_orcfile
) t
group by feature;
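logress is a UDTF that trains a logistic-regression model with SGD inside each map task and emits (feature, weight) pairs, so the same feature can appear once per task; the avg(weight) above merges those per-task models into a single model. To eyeball the result (illustrative):
select feature, weight from lr_model order by weight desc limit 10;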
----test/prediction-----
create table testing_feature as
select
id,
array(hour, banner_pos,site_id,site_domain,site_category,app_id,
app_domain,app_category,device_id,device_os,device_type,device_geo_country, bias) as features
from (
select
id,
mhash(concat("1:", hour)) as hour,
mhash(concat("2:", banner_pos)) as banner_pos,
mhash(concat("3:", site_id)) as site_id,
mhash(concat("4:", site_domain)) as site_domain,
mhash(concat("5:", site_category)) as site_category,
mhash(concat("6:", app_id)) as app_id,
mhash(concat("7:", app_domain)) as app_domain,
mhash(concat("8:", app_category)) as app_category,
mhash(concat("9:", device_id)) as device_id,
mhash(concat("10:", device_os)) as device_os,
mhash(concat("11:", device_type)) as device_type,
mhash(concat("12:", device_geo_country)) as device_geo_country,
-1 as bias
from avazu_test
) t;
create table testing_exploded as
select
id,
feature
from
testing_feature
LATERAL VIEW explode(features) t AS feature;
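Each test row is exploded into one row per feature ID (12 hashed columns plus the bias, i.e. 13 rows per id), which turns prediction into a plain join against the model table. A quick illustrative check:
select id, count(1) as num_features from testing_exploded group by id limit 5;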
drop table lr_predict;
create table lr_predict
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ","
LINES TERMINATED BY "\n"
STORED AS TEXTFILE
as
select
t.id,
sigmoid(sum(m.weight)) as prob
from
testing_exploded t LEFT OUTER JOIN
lr_model m ON (t.feature = m.feature)
group by t.id
order by id ASC;
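The Avazu_solution table built earlier holds the true labels for the held-out hours, so the predictions can now be scored. A sketch of a logloss query (the 1e-15 clipping constant is arbitrary, just to keep ln() finite; ids missing from lr_predict are dropped by the inner join):
select
-avg(s.click * ln(if(p.prob < 1e-15, 1e-15, p.prob))
     + (1.0 - s.click) * ln(if(p.prob > 1.0 - 1e-15, 1e-15, 1.0 - p.prob)))
as logloss
from lr_predict p
JOIN Avazu_solution s ON (p.id = s.id);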
Reference:
http://www.slideshare.net/myui/hivemall-hadoop-summit-2014-san-jose
https://github.com/myui/hivemall/wiki/KDDCup-2012-track-2-CTR-prediction-(regression)
https://github.com/myui/hivemall/wiki