add jar /tmp/hivemall-with-dependencies.jar; -- Hivemall v0.3 or later
source /tmp/define-all.hive; -- registers the Hivemall functions
2. Use Hivemall to run logistic regression (logress) on Avazu
----create tables----
create database mytest;
USE mytest;

CREATE EXTERNAL TABLE Avazu(
id string, click int, hour bigint, C1 int, banner_pos int,
site_id string, site_domain string, site_category string,
app_id string, app_domain string, app_category string,
device_id string, device_ip string, device_os string, device_make string,
device_model string, device_type int, device_conn_type int,
device_geo_country string,
C17 int, C18 int, C19 int, C20 int, C21 int, C22 int, C23 int, C24 int)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS TEXTFILE
LOCATION '/user/xxx/Avazu';

CREATE TABLE Avazu_train(
id string, click int, hour bigint, banner_pos int,
site_id string, site_domain string, site_category string,
app_id string, app_domain string, app_category string,
device_id string, device_os string, device_type int,
device_geo_country string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;
-- hour looks like YYMMDDHH, so this keeps the earlier days for training
Insert into table Avazu_train
select id, click, hour, banner_pos,site_id,site_domain,site_category,app_id,
app_domain,app_category,device_id,device_os,device_type,device_geo_country from Avazu
where hour < 14101000 and hour is not null;
CREATE TABLE Avazu_solution(
id string, click float)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;
-- held-out labels (the later hours) for scoring the predictions at the end
Insert into table Avazu_solution
select id, cast(click as float)
from Avazu
where hour>=14101000 and hour is not null
order by id ASC;
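One gap worth noting: the prediction step later reads from an avazu_test table that is never created in the post. Presumably it mirrors Avazu_train, taking the held-out rows (the same period as Avazu_solution) without the click label; a sketch under that assumption:
CREATE TABLE Avazu_test(
id string, hour bigint, banner_pos int,
site_id string, site_domain string, site_category string,
app_id string, app_domain string, app_category string,
device_id string, device_os string, device_type int,
device_geo_country string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;
Insert into table Avazu_test
select id, hour, banner_pos,site_id,site_domain,site_category,app_id,
app_domain,app_category,device_id,device_os,device_type,device_geo_country
from Avazu
where hour >= 14101000 and hour is not null;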
----prepare----
add jar hivemall-with-dependencies.jar;
drop temporary function mhash;
create temporary function mhash as 'hivemall.ftvec.hashing.MurmurHash3UDF';
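mhash hashes an arbitrary string to a compact integer feature ID with MurmurHash3 (by default Hivemall hashes into roughly 2^24 buckets), which is what lets high-cardinality columns like device_id become fixed-size numeric features. The "N:" prefixes used below namespace each column, so the same raw value appearing in two different columns still hashes to different feature IDs. A quick illustrative check (the exact value depends on the Hivemall version):
select mhash(concat("3:", "example.com"));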
use mytest;
create table training_feature as
select
id,
click,
mhash(concat("1:", hour)) as hour,
mhash(concat("2:", banner_pos)) as banner_pos,
mhash(concat("3:", site_id)) as site_id,
mhash(concat("4:", site_domain)) as site_domain,
mhash(concat("5:", site_category)) as site_category,
mhash(concat("6:", app_id)) as app_id,
mhash(concat("7:", app_domain)) as app_domain,
mhash(concat("8:", app_category)) as app_category,
mhash(concat("9:", device_id)) as device_id,
mhash(concat("10:", device_os)) as device_os,
mhash(concat("11:", device_type)) as device_type,
mhash(concat("12:", device_geo_country)) as device_geo_country,
-1 as bias
from avazu_train;
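Feature hashing trades a little accuracy for bounded dimensionality: distinct raw values can collide in the hash space. To gauge how much collapsing happens, compare cardinalities before and after hashing (illustrative):
select count(distinct site_id) from avazu_train;
select count(distinct site_id) from training_feature;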
create table training_orcfile (
rowid bigint,
label float,
features array<string>
) STORED AS orc tblproperties ("orc.compress"="SNAPPY");
add file kddconv.awk;
INSERT OVERWRITE TABLE training_orcfile
select transform(*)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY "\t"
LINES TERMINATED BY "\n"
using 'gawk -f kddconv.awk'
as (rowid STRING, label FLOAT, features ARRAY<STRING>)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY "\t"
COLLECTION ITEMS TERMINATED BY ","
LINES TERMINATED BY "\n"
from training_feature
CLUSTER BY rand(); -- shuffle the rows randomly, which helps the SGD-based training below
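The post never shows kddconv.awk. From the transform clause above it must read the tab-separated training_feature columns (id, click, then the 13 hashed/bias values) and emit rowid, label, and a comma-joined feature list, again tab-separated. A minimal reconstruction under that assumption:
BEGIN { FS = "\t"; OFS = "\t" }
{
    # $1 = id (rowid), $2 = click (label), $3..$NF = 12 hashed features + bias
    features = $3
    for (i = 4; i <= NF; i++) {
        features = features "," $i
    }
    print $1, $2, features
}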
----training----
add jar hivemall-with-dependencies.jar;
source define-all.hive;
set hivevar:total_steps=700000; -- substituted into the logress options below
drop table lr_model;
create table lr_model
as
select
feature,
cast(avg(weight) as float) as weight
from
(select
logress(features,label, "-total_steps ${total_steps}") as (feature,weight)
from
training_orcfile
) t
group by feature;
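logress is a UDTF that trains a logistic-regression model with SGD inside each map task and emits (feature, weight) pairs, so the same feature can appear once per task; the avg(weight) above merges those per-task models into a single model. To eyeball the result (illustrative):
select feature, weight from lr_model order by weight desc limit 10;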
----test/prediction-----
create table testing_feature as
select
id,
array(hour, banner_pos,site_id,site_domain,site_category,app_id,
app_domain,app_category,device_id,device_os,device_type,device_geo_country, bias) as features
from (
select
id,
mhash(concat("1:", hour)) as hour,
mhash(concat("2:", banner_pos)) as banner_pos,
mhash(concat("3:", site_id)) as site_id,
mhash(concat("4:", site_domain)) as site_domain,
mhash(concat("5:", site_category)) as site_category,
mhash(concat("6:", app_id)) as app_id,
mhash(concat("7:", app_domain)) as app_domain,
mhash(concat("8:", app_category)) as app_category,
mhash(concat("9:", device_id)) as device_id,
mhash(concat("10:", device_os)) as device_os,
mhash(concat("11:", device_type)) as device_type,
mhash(concat("12:", device_geo_country)) as device_geo_country,
-1 as bias
from avazu_test
) t;
create table testing_exploded as
select
id,
feature
from
testing_feature
LATERAL VIEW explode(features) t AS feature;
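Each test row is exploded into one row per feature ID (12 hashed columns plus the bias, i.e. 13 rows per id), which turns prediction into a plain join against the model table. A quick illustrative check:
select id, count(1) as num_features from testing_exploded group by id limit 5;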
drop table lr_predict;
create table lr_predict
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ","
LINES TERMINATED BY "\n"
STORED AS TEXTFILE
as
select
t.id,
sigmoid(sum(m.weight)) as prob
from
testing_exploded t LEFT OUTER JOIN
lr_model m ON (t.feature = m.feature)
group by t.id
order by id ASC;
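The Avazu_solution table built earlier holds the true labels for the held-out hours, so the predictions can now be scored. A sketch of a logloss query (the 1e-15 clipping constant is arbitrary, just to keep ln() finite; ids missing from lr_predict are dropped by the inner join):
select
-avg(s.click * ln(if(p.prob < 1e-15, 1e-15, p.prob))
     + (1.0 - s.click) * ln(if(p.prob > 1.0 - 1e-15, 1e-15, 1.0 - p.prob)))
as logloss
from lr_predict p
JOIN Avazu_solution s ON (p.id = s.id);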
Reference:
http://www.slideshare.net/myui/hivemall-hadoop-summit-2014-san-jose
https://github.com/myui/hivemall/wiki/KDDCup-2012-track-2-CTR-prediction-(regression)
https://github.com/myui/hivemall/wiki