hahadsg's note


Syntax

  • Select Syntax

    https://cwiki.apache.org/confluence/display/Hive/LanguageManual+Select

  • Transform/Map-Reduce Syntax (see the sketch after this list)

    https://cwiki.apache.org/confluence/display/Hive/LanguageManual+Transform

  • regexp reference (Java Pattern syntax)

    https://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html
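
A rough sketch of the last two items (the logs table, the wc.py script, and its output columns are assumptions for illustration, not from the linked docs):

-- TRANSFORM: stream rows through an external script
add file /path/to/wc.py;
select transform(line)
    using 'python wc.py'
    as (word string, cnt int)
from logs;

-- Hive regexp functions take Java Pattern syntax
select regexp_extract('20181231', '^(\\d{4})', 1); -- returns '2018'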

create table and load data

create table {table} (
    col1 string
)
partitioned by (group_id string)
row format delimited
fields terminated by '\t'
stored as textfile;

load data local inpath '{path}' into table {table} partition(group_id='1');

dynamic partition

Automatically create partitions based on the values of the day column.

-- enable dynamic partitioning; nonstrict mode lets every partition column be determined dynamically
SET hive.exec.dynamic.partition=true;
SET hive.exec.dynamic.partition.mode=nonstrict;
-- raise the caps on how many dynamic partitions one job may create
SET hive.exec.max.dynamic.partitions.pernode=1000;
SET hive.exec.max.dynamic.partitions=1000;

-- the dynamic partition column (day) must come last in the select list
insert overwrite table t_par partition (day)
select v1, v2, day from t_unpar;

export data

  • export with header
hive -e "set hive.cli.print.header=true; select * from table" > f.csv
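
Note the CLI redirect above actually produces tab-separated output. An alternative sketch (assuming Hive 0.11+; the directory path is a placeholder, and no header row is written):

insert overwrite local directory '/tmp/export_dir'
row format delimited fields terminated by ','
select * from {table};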

time type

  • pattern reference

https://docs.oracle.com/javase/tutorial/i18n/format/simpleDateFormat.html

u: day of week (see the sketch after the convert examples below)

  • convert
-- string to string
select from_unixtime(unix_timestamp('20181231', 'yyyyMMdd'), 'yyyy-MM-dd');

-- string to date
select to_date(from_unixtime(unix_timestamp('20181231', 'yyyyMMdd')));
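
A small sketch of the 'u' pattern mentioned above (1 = Monday ... 7 = Sunday); 2018-12-31 is a Monday:

-- string to day of week
select from_unixtime(unix_timestamp('20181231', 'yyyyMMdd'), 'u'); -- '1'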

grouping sets

http://lxw1234.com/archives/2015/04/193.htm
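
A minimal sketch of the syntax (test.orders and its columns are assumed for illustration); one pass yields per-(day, city) totals, per-day totals, and a grand total, with NULL in the rolled-up columns:

select day, city, sum(amt) as total_amt
from test.orders
group by day, city
grouping sets ((day, city), day, ());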

insert

-- insert overwrite partition
insert overwrite table test.test partition (day='20190114')
select col1, col2
from test.test2;

Advanced Hive

https://www.qubole.com/blog/hive-best-practices/

Performance

Efficient sampling

-- suppose we want to sample 1 million rows out of 100 million
select *
from <table>
where rand() <= 0.05 -- first discard most of the data cheaply (roughly 5 million rows remain)
distribute by rand() -- spread the surviving rows across reducers by rand()
sort by rand() -- sort within each reducer
limit 1000000;

Trick

Assigning sequence numbers to a large dataset

-- test.table_origin holds a large amount of data that we want to assign sequence numbers to:
-- sort by proba in descending order, then number the rows
-- a single global row_number() would funnel all rows through one reducer, so instead
-- split proba into 1000 buckets, rank within each bucket, then merge via cumulative offsets
drop table if exists test.table_group;
create table test.table_group as
select id, proba, ceil(proba * 1000) as group_id -- assumes proba in (0, 1], i.e. 1000 buckets
from test.table_origin
;
drop table if exists test.table_group_cumcount;
create table test.table_group_cumcount as
select group_id, cnt
    , sum(cnt) over(order by group_id desc rows between unbounded preceding and current row) as cum_cnt -- desc: accumulate from the highest-proba bucket down
from (
    select group_id, count(*) as cnt
    from test.table_group
    group by group_id
) t
;
drop table if exists test.table_temp_rank;
create table test.table_temp_rank as
select id, proba, group_id
    , row_number() over(partition by group_id order by proba desc) as group_rk
from test.table_group
;
drop table if exists test.table_rank;
create table test.table_rank as
-- global rank = within-bucket rank + total rows in higher-proba buckets
select t1.id, t1.proba, t1.group_rk, t1.group_rk + t2.cum_cnt - t2.cnt as rk
from test.table_temp_rank t1
join test.table_group_cumcount t2
    on t2.group_id = t1.group_id
;