一、ODS Layer
1.1 Creating the order table (ods_order_info)
hive (gmall)>
drop table if exists ods_order_info;
create external table ods_order_info (
`id` string COMMENT 'order id',
`total_amount` decimal(10,2) COMMENT 'order amount',
`order_status` string COMMENT 'order status',
`user_id` string COMMENT 'user id',
`payment_way` string COMMENT 'payment method',
`out_trade_no` string COMMENT 'payment transaction number',
`create_time` string COMMENT 'creation time',
`operate_time` string COMMENT 'operation time'
) COMMENT 'order table'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ods/ods_order_info/';
1.2 Creating the order detail table (ods_order_detail)
hive (gmall)>
drop table if exists ods_order_detail;
create external table ods_order_detail(
`id` string COMMENT 'order detail id',
`order_id` string COMMENT 'order id',
`user_id` string COMMENT 'user id',
`sku_id` string COMMENT 'sku id',
`sku_name` string COMMENT 'sku name',
`order_price` string COMMENT 'unit price',
`sku_num` string COMMENT 'quantity',
`create_time` string COMMENT 'creation time'
) COMMENT 'order detail table'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ods/ods_order_detail/';
1.3 Creating the product table (ods_sku_info)
hive (gmall)>
drop table if exists ods_sku_info;
create external table ods_sku_info(
`id` string COMMENT 'sku id',
`spu_id` string COMMENT 'spu id',
`price` decimal(10,2) COMMENT 'price',
`sku_name` string COMMENT 'sku name',
`sku_desc` string COMMENT 'sku description',
`weight` string COMMENT 'weight',
`tm_id` string COMMENT 'brand id',
`category3_id` string COMMENT 'level-3 category id',
`create_time` string COMMENT 'creation time'
) COMMENT 'product table'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ods/ods_sku_info/';
1.4 Creating the user table (ods_user_info)
hive (gmall)>
drop table if exists ods_user_info;
create external table ods_user_info(
`id` string COMMENT 'user id',
`name` string COMMENT 'name',
`birthday` string COMMENT 'birthday',
`gender` string COMMENT 'gender',
`email` string COMMENT 'email',
`user_level` string COMMENT 'user level',
`create_time` string COMMENT 'creation time'
) COMMENT 'user info table'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ods/ods_user_info/';
1.5 Creating the level-1 category table (ods_base_category1)
hive (gmall)>
drop table if exists ods_base_category1;
create external table ods_base_category1(
`id` string COMMENT 'id',
`name` string COMMENT 'name'
) COMMENT 'level-1 product category'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ods/ods_base_category1/';
1.6 Creating the level-2 category table (ods_base_category2)
hive (gmall)>
drop table if exists ods_base_category2;
create external table ods_base_category2(
`id` string COMMENT 'id',
`name` string COMMENT 'name',
`category1_id` string COMMENT 'level-1 category id'
) COMMENT 'level-2 product category'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ods/ods_base_category2/';
1.7 Creating the level-3 category table (ods_base_category3)
hive (gmall)>
drop table if exists ods_base_category3;
create external table ods_base_category3(
`id` string COMMENT 'id',
`name` string COMMENT 'name',
`category2_id` string COMMENT 'level-2 category id'
) COMMENT 'level-3 product category'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ods/ods_base_category3/';
1.8 Creating the payment table (ods_payment_info)
hive (gmall)>
drop table if exists ods_payment_info;
create external table ods_payment_info(
`id` bigint COMMENT 'id',
`out_trade_no` string COMMENT 'external trade number',
`order_id` string COMMENT 'order id',
`user_id` string COMMENT 'user id',
`alipay_trade_no` string COMMENT 'Alipay transaction number',
`total_amount` decimal(16,2) COMMENT 'payment amount',
`subject` string COMMENT 'transaction subject',
`payment_type` string COMMENT 'payment type',
`payment_time` string COMMENT 'payment time'
) COMMENT 'payment table'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ods/ods_payment_info/';
1.9 ODS data load script (ods_db.sh)
#!/bin/bash
APP=gmall
hive=/opt/module/hive/bin/hive
# If a date argument is given, use it; otherwise default to yesterday
if [ -n "$1" ] ;then
do_date=$1
else
do_date=`date -d "-1 day" +%F`
fi
sql="
load data inpath '/origin_data/$APP/db/order_info/$do_date' OVERWRITE into table "$APP".ods_order_info partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/order_detail/$do_date' OVERWRITE into table "$APP".ods_order_detail partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/sku_info/$do_date' OVERWRITE into table "$APP".ods_sku_info partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/user_info/$do_date' OVERWRITE into table "$APP".ods_user_info partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/payment_info/$do_date' OVERWRITE into table "$APP".ods_payment_info partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/base_category1/$do_date' OVERWRITE into table "$APP".ods_base_category1 partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/base_category2/$do_date' OVERWRITE into table "$APP".ods_base_category2 partition(dt='$do_date');
load data inpath '/origin_data/$APP/db/base_category3/$do_date' OVERWRITE into table "$APP".ods_base_category3 partition(dt='$do_date');
"
$hive -e "$sql"
二、DWD Layer
2.1 Creating the order table (dwd_order_info)
hive (gmall)>
drop table if exists dwd_order_info;
create external table dwd_order_info (
`id` string COMMENT 'order id',
`total_amount` decimal(10,2) COMMENT 'order amount',
`order_status` string COMMENT 'order status (1-5)',
`user_id` string COMMENT 'user id',
`payment_way` string COMMENT 'payment method',
`out_trade_no` string COMMENT 'payment transaction number',
`create_time` string COMMENT 'creation time',
`operate_time` string COMMENT 'operation time'
)
PARTITIONED BY (`dt` string)
stored as parquet
location '/warehouse/gmall/dwd/dwd_order_info/'
tblproperties ("parquet.compression"="snappy");
2.2 Creating the order detail table (dwd_order_detail)
hive (gmall)>
drop table if exists dwd_order_detail;
create external table dwd_order_detail(
`id` string COMMENT 'order detail id',
`order_id` string COMMENT 'order id',
`user_id` string COMMENT 'user id',
`sku_id` string COMMENT 'sku id',
`sku_name` string COMMENT 'sku name',
`order_price` string COMMENT 'unit price',
`sku_num` string COMMENT 'quantity',
`create_time` string COMMENT 'creation time'
)
PARTITIONED BY (`dt` string)
stored as parquet
location '/warehouse/gmall/dwd/dwd_order_detail/'
tblproperties ("parquet.compression"="snappy");
2.3 Creating the user table (dwd_user_info)
hive (gmall)>
drop table if exists dwd_user_info;
create external table dwd_user_info(
`id` string COMMENT 'user id',
`name` string COMMENT 'name',
`birthday` string COMMENT 'birthday',
`gender` string COMMENT 'gender',
`email` string COMMENT 'email',
`user_level` string COMMENT 'user level',
`create_time` string COMMENT 'creation time'
)
PARTITIONED BY (`dt` string)
stored as parquet
location '/warehouse/gmall/dwd/dwd_user_info/'
tblproperties ("parquet.compression"="snappy");
2.4 Creating the payment table (dwd_payment_info)
hive (gmall)>
drop table if exists dwd_payment_info;
create external table dwd_payment_info(
`id` bigint COMMENT 'id',
`out_trade_no` string COMMENT 'external trade number',
`order_id` string COMMENT 'order id',
`user_id` string COMMENT 'user id',
`alipay_trade_no` string COMMENT 'Alipay transaction number',
`total_amount` decimal(16,2) COMMENT 'payment amount',
`subject` string COMMENT 'transaction subject',
`payment_type` string COMMENT 'payment type',
`payment_time` string COMMENT 'payment time'
)
PARTITIONED BY (`dt` string)
stored as parquet
location '/warehouse/gmall/dwd/dwd_payment_info/'
tblproperties ("parquet.compression"="snappy");
2.5 Creating the product table with category info (dwd_sku_info)
hive (gmall)>
drop table if exists dwd_sku_info;
create external table dwd_sku_info(
`id` string COMMENT 'sku id',
`spu_id` string COMMENT 'spu id',
`price` decimal(10,2) COMMENT 'price',
`sku_name` string COMMENT 'sku name',
`sku_desc` string COMMENT 'sku description',
`weight` string COMMENT 'weight',
`tm_id` string COMMENT 'brand id',
`category3_id` string COMMENT 'level-3 category id',
`category2_id` string COMMENT 'level-2 category id',
`category1_id` string COMMENT 'level-1 category id',
`category3_name` string COMMENT 'level-3 category name',
`category2_name` string COMMENT 'level-2 category name',
`category1_name` string COMMENT 'level-1 category name',
`create_time` string COMMENT 'creation time'
)
PARTITIONED BY (`dt` string)
stored as parquet
location '/warehouse/gmall/dwd/dwd_sku_info/'
tblproperties ("parquet.compression"="snappy");
2.6 DWD data load script (dwd_db.sh)
#!/bin/bash
# Define variables for easy modification
APP=gmall
hive=/opt/module/hive/bin/hive
# If a date argument is given, use it; otherwise default to yesterday
if [ -n "$1" ] ;then
do_date=$1
else
do_date=`date -d "-1 day" +%F`
fi
sql="
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table "$APP".dwd_order_info partition(dt)
select * from "$APP".ods_order_info
where dt='$do_date' and id is not null;
insert overwrite table "$APP".dwd_order_detail partition(dt)
select * from "$APP".ods_order_detail
where dt='$do_date' and id is not null;
insert overwrite table "$APP".dwd_user_info partition(dt)
select * from "$APP".ods_user_info
where dt='$do_date' and id is not null;
insert overwrite table "$APP".dwd_payment_info partition(dt)
select * from "$APP".ods_payment_info
where dt='$do_date' and id is not null;
insert overwrite table "$APP".dwd_sku_info partition(dt)
select
sku.id,
sku.spu_id,
sku.price,
sku.sku_name,
sku.sku_desc,
sku.weight,
sku.tm_id,
sku.category3_id,
c2.id category2_id,
c1.id category1_id,
c3.name category3_name,
c2.name category2_name,
c1.name category1_name,
sku.create_time,
sku.dt
from
"$APP".ods_sku_info sku
join "$APP".ods_base_category3 c3 on sku.category3_id=c3.id
join "$APP".ods_base_category2 c2 on c3.category2_id=c2.id
join "$APP".ods_base_category1 c1 on c2.category1_id=c1.id
where sku.dt='$do_date' and c2.dt='$do_date'
and c3.dt='$do_date' and c1.dt='$do_date'
and sku.id is not null;
"
$hive -e "$sql"
三、DWS Layer
3.1 Creating the user behavior wide table (dws_user_action)
hive (gmall)>
drop table if exists dws_user_action;
create external table dws_user_action
(
user_id string comment 'user id',
order_count bigint comment 'order count',
order_amount decimal(16,2) comment 'order amount',
payment_count bigint comment 'payment count',
payment_amount decimal(16,2) comment 'payment amount',
comment_count bigint comment 'comment count'
) COMMENT 'daily user behavior wide table'
PARTITIONED BY (`dt` string)
stored as parquet
location '/warehouse/gmall/dws/dws_user_action/';
3.2 Loading data into the user behavior wide table
Each subquery computes its own metric and zero-fills the others; the union all plus the final group by then collapses the rows to one per user. Note that dwd_comment_log comes from the companion user behavior (log) warehouse.
hive (gmall)>
with
tmp_order as
(
select
user_id,
count(*) order_count,
sum(oi.total_amount) order_amount
from dwd_order_info oi
where date_format(oi.create_time,'yyyy-MM-dd')='2019-02-10'
group by user_id
) ,
tmp_payment as
(
select
user_id,
sum(pi.total_amount) payment_amount,
count(*) payment_count
from dwd_payment_info pi
where date_format(pi.payment_time,'yyyy-MM-dd')='2019-02-10'
group by user_id
),
tmp_comment as
(
select
user_id,
count(*) comment_count
from dwd_comment_log c
where date_format(c.dt,'yyyy-MM-dd')='2019-02-10'
group by user_id
)
insert overwrite table dws_user_action partition(dt='2019-02-10')
select
user_actions.user_id,
sum(user_actions.order_count),
sum(user_actions.order_amount),
sum(user_actions.payment_count),
sum(user_actions.payment_amount),
sum(user_actions.comment_count)
from
(
select
user_id,
order_count,
order_amount,
0 payment_count,
0 payment_amount,
0 comment_count
from tmp_order
union all
select
user_id,
0,
0,
payment_count,
payment_amount,
0
from tmp_payment
union all
select
user_id,
0,
0,
0,
0,
comment_count
from tmp_comment
) user_actions
group by user_id;
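To spot-check the loaded partition (a sketch):
hive (gmall)> select * from dws_user_action where dt='2019-02-10' limit 10;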
3.3 User behavior wide table load script (dws_db_wide.sh)
#!/bin/bash
# Define variables for easy modification
APP=gmall
hive=/opt/module/hive/bin/hive
# If a date argument is given, use it; otherwise default to yesterday
if [ -n "$1" ] ;then
do_date=$1
else
do_date=`date -d "-1 day" +%F`
fi
sql="
with
tmp_order as
(
select
user_id,
sum(oi.total_amount) order_amount,
count(*) order_count
from "$APP".dwd_order_info oi
where date_format(oi.create_time,'yyyy-MM-dd')='$do_date'
group by user_id
) ,
tmp_payment as
(
select
user_id,
sum(pi.total_amount) payment_amount,
count(*) payment_count
from "$APP".dwd_payment_info pi
where date_format(pi.payment_time,'yyyy-MM-dd')='$do_date'
group by user_id
),
tmp_comment as
(
select
user_id,
count(*) comment_count
from "$APP".dwd_comment_log c
where date_format(c.dt,'yyyy-MM-dd')='$do_date'
group by user_id
)
insert overwrite table "$APP".dws_user_action partition(dt='$do_date')
select
user_actions.user_id,
sum(user_actions.order_count),
sum(user_actions.order_amount),
sum(user_actions.payment_count),
sum(user_actions.payment_amount),
sum(user_actions.comment_count)
from
(
select
user_id,
order_count,
order_amount,
0 payment_count,
0 payment_amount,
0 comment_count
from tmp_order
union all
select
user_id,
0,
0,
payment_count,
payment_amount,
0
from tmp_payment
union all
select
user_id,
0,
0,
0,
0,
comment_count
from tmp_comment
) user_actions
group by user_id;
"
$hive -e "$sql"
3.4 User purchase detail wide table (dws_sale_detail_daycount)
3.4.1 Create the table
hive (gmall)>
drop table if exists dws_sale_detail_daycount;
create external table dws_sale_detail_daycount
(
user_id string comment 'user id',
sku_id string comment 'sku id',
user_gender string comment 'user gender',
user_age string comment 'user age',
user_level string comment 'user level',
order_price decimal(10,2) comment 'sku price',
sku_name string comment 'sku name',
sku_tm_id string comment 'brand id',
sku_category3_id string comment 'level-3 category id',
sku_category2_id string comment 'level-2 category id',
sku_category1_id string comment 'level-1 category id',
sku_category3_name string comment 'level-3 category name',
sku_category2_name string comment 'level-2 category name',
sku_category1_name string comment 'level-1 category name',
spu_id string comment 'spu id',
sku_num int comment 'quantity purchased',
order_count string comment 'orders placed that day',
order_amount string comment 'order amount that day'
) COMMENT 'user purchase detail table'
PARTITIONED BY (`dt` string)
stored as parquet
location '/warehouse/gmall/dws/dws_sale_detail_daycount/'
tblproperties ("parquet.compression"="snappy");
3.4.2 Load the data
hive (gmall)>
with
tmp_detail as
(
select
user_id,
sku_id,
sum(sku_num) sku_num,
count(*) order_count,
sum(od.order_price*sku_num) order_amount
from dwd_order_detail od
where od.dt='2019-02-10'
group by user_id, sku_id
)
insert overwrite table dws_sale_detail_daycount partition(dt='2019-02-10')
select
tmp_detail.user_id,
tmp_detail.sku_id,
u.gender,
months_between('2019-02-10', u.birthday)/12 age,
u.user_level,
price,
sku_name,
tm_id,
category3_id,
category2_id,
category1_id,
category3_name,
category2_name,
category1_name,
spu_id,
tmp_detail.sku_num,
tmp_detail.order_count,
tmp_detail.order_amount
from tmp_detail
left join dwd_user_info u on tmp_detail.user_id =u.id and u.dt='2019-02-10'
left join dwd_sku_info s on tmp_detail.sku_id =s.id and s.dt='2019-02-10';
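A quick check of the joined result (a sketch):
hive (gmall)> select user_id, sku_id, user_gender, order_count, order_amount from dws_sale_detail_daycount where dt='2019-02-10' limit 10;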
3.4.3 Script (dws_sale.sh)
#!/bin/bash
# Define variables for easy modification
APP=gmall
hive=/opt/module/hive/bin/hive
# If a date argument is given, use it; otherwise default to yesterday
if [ -n "$1" ] ;then
do_date=$1
else
do_date=`date -d "-1 day" +%F`
fi
sql="
set hive.exec.dynamic.partition.mode=nonstrict;
with
tmp_detail as
(
select
user_id,
sku_id,
sum(sku_num) sku_num,
count(*) order_count,
sum(od.order_price*sku_num) order_amount
from "$APP".dwd_order_detail od
where od.dt='$do_date'
group by user_id, sku_id
)
insert overwrite table "$APP".dws_sale_detail_daycount partition(dt='$do_date')
select
tmp_detail.user_id,
tmp_detail.sku_id,
u.gender,
months_between('$do_date', u.birthday)/12 age,
u.user_level,
price,
sku_name,
tm_id,
category3_id,
category2_id,
category1_id,
category3_name,
category2_name,
category1_name,
spu_id,
tmp_detail.sku_num,
tmp_detail.order_count,
tmp_detail.order_amount
from tmp_detail
left join "$APP".dwd_user_info u
on tmp_detail.user_id=u.id and u.dt='$do_date'
left join "$APP".dwd_sku_info s on tmp_detail.sku_id =s.id and s.dt='$do_date';
"
$hive -e "$sql"
3.5 New paying users (dws_pay_user_detail)
3.5.1 Create the table
drop table if exists dws_pay_user_detail;
create external table dws_pay_user_detail(
`user_id` string comment 'paying user id',
`name` string comment 'paying user name',
`birthday` string COMMENT 'birthday',
`gender` string COMMENT 'gender',
`email` string COMMENT 'email',
`user_level` string COMMENT 'user level'
) COMMENT 'paying user table'
PARTITIONED BY (`dt` string)
stored as parquet
location '/warehouse/gmall/dws/dws_pay_user_detail/';
3.5.2 Load the data
insert overwrite table dws_pay_user_detail partition(dt='2019-10-03')
select
ua.user_id,
ui.name,
ui.birthday,
ui.gender,
ui.email,
ui.user_level
from (
select user_id from dws_user_action where dt='2019-10-03'
) ua join(
select * from dwd_user_info where dt='2019-10-03' -- the user table is a full daily load
) ui on ua.user_id=ui.id
left join dws_pay_user_detail ud on ua.user_id=ud.user_id
where ud.user_id is null;
3.5.3 Script (dws_pay_user_detail.sh)
#!/bin/bash
db=gmall
hive=/opt/module/hive/bin/hive
hadoop=/opt/module/hadoop-2.7.2/bin/hadoop
if [[ -n $1 ]]; then
do_date=$1
else
do_date=`date -d '-1 day' +%F`
fi
sql="
use gmall;
insert overwrite table dws_pay_user_detail partition(dt='$do_date')
select
ua.user_id,
ui.name,
ui.birthday,
ui.gender,
ui.email,
ui.user_level
from (
select user_id from dws_user_action where dt='$do_date'
) ua join(
select * from dwd_user_info where dt='$do_date'
) ui on ua.user_id=ui.id
left join dws_pay_user_detail ud on ua.user_id=ud.user_id
where ud.user_id is null;
"
$hive -e "$sql"
四、ADS Layer
4.1 Daily gross merchandise volume (ads_gmv_sum_day)
4.1.1 Create the table
hive (gmall)>
drop table if exists ads_gmv_sum_day;
create external table ads_gmv_sum_day(
`dt` string COMMENT 'statistics date',
`gmv_count` bigint COMMENT 'GMV order count for the day',
`gmv_amount` decimal(16,2) COMMENT 'GMV order amount for the day',
`gmv_payment` decimal(16,2) COMMENT 'payment amount for the day'
) COMMENT 'GMV'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_gmv_sum_day/';
4.1.2 Load the data
hive (gmall)>
insert into table ads_gmv_sum_day
select
'2019-02-10' dt,
sum(order_count) gmv_count,
sum(order_amount) gmv_amount,
sum(payment_amount) gmv_payment
from dws_user_action
where dt ='2019-02-10'
group by dt;
4.1.3 Script (ads_db_gmv.sh)
#!/bin/bash
# Define variables for easy modification
APP=gmall
hive=/opt/module/hive/bin/hive
# If a date argument is given, use it; otherwise default to yesterday
if [ -n "$1" ] ;then
do_date=$1
else
do_date=`date -d "-1 day" +%F`
fi
sql="
insert into table "$APP".ads_gmv_sum_day
select
'$do_date' dt,
sum(order_count) gmv_count,
sum(order_amount) gmv_amount,
sum(payment_amount) gmv_payment
from "$APP".dws_user_action
where dt ='$do_date'
group by dt;
"
$hive -e "$sql"
4.2 New users as a share of daily active users (ads_user_convert_day)
4.2.1 Create the table
hive (gmall)>
drop table if exists ads_user_convert_day;
create external table ads_user_convert_day(
`dt` string COMMENT 'statistics date',
`uv_m_count` bigint COMMENT 'active devices that day',
`new_m_count` bigint COMMENT 'new devices that day',
`new_m_ratio` decimal(10,2) COMMENT 'new devices as a percentage of active devices'
) COMMENT 'conversion rate'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_user_convert_day/';
4.2.2 Load the data (this query reads ads_uv_count and ads_new_mid_count, which are built in the companion user behavior warehouse)
hive (gmall)>
insert into table ads_user_convert_day
select
'2019-02-10',
sum(uc.dc) sum_dc,
sum(uc.nmc) sum_nmc,
sum(uc.nmc)/sum(uc.dc)*100 new_m_ratio
from
(
select
day_count dc,
0 nmc
from ads_uv_count
where dt='2019-02-10'
union all
select
0 dc,
new_mid_count nmc
from ads_new_mid_count
where create_date='2019-02-10'
)uc;
4.3 User behavior funnel analysis (ads_user_action_convert_day)
4.3.1 Create the table
hive (gmall)>
drop table if exists ads_user_action_convert_day;
create external table ads_user_action_convert_day(
`dt` string COMMENT 'statistics date',
`total_visitor_m_count` bigint COMMENT 'total visitors',
`order_u_count` bigint COMMENT 'users who placed orders',
`visitor2order_convert_ratio` decimal(10,2) COMMENT 'visit-to-order conversion rate',
`payment_u_count` bigint COMMENT 'users who paid',
`order2payment_convert_ratio` decimal(10,2) COMMENT 'order-to-payment conversion rate'
) COMMENT 'user behavior funnel analysis'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_user_action_convert_day/';
4.3.2 Load the data
hive (gmall)>
insert into table ads_user_action_convert_day
select
'2021-06-09',
uv.day_count,
ua.order_count,
cast(ua.order_count/uv.day_count as decimal(10,2)) visitor2order_convert_ratio,
ua.payment_count,
cast(ua.payment_count/ua.order_count as decimal(10,2)) order2payment_convert_ratio
from
(
select
dt,
sum(if(order_count>0,1,0)) order_count,
sum(if(payment_count>0,1,0)) payment_count
from dws_user_action
where dt='2021-06-09'
group by dt
)ua join ads_uv_count uv on uv.dt=ua.dt;
4.4 Brand repurchase rate
4.4.1 Create the table
hive (gmall)>
drop table if exists ads_sale_tm_category1_stat_mn;
create external table ads_sale_tm_category1_stat_mn
(
tm_id string comment 'brand id',
category1_id string comment 'level-1 category id',
category1_name string comment 'level-1 category name',
buycount bigint comment 'number of buyers',
buy_twice_last bigint comment 'buyers with two or more purchases',
buy_twice_last_ratio decimal(10,2) comment 'repurchase rate (2+ purchases)',
buy_3times_last bigint comment 'buyers with three or more purchases',
buy_3times_last_ratio decimal(10,2) comment 'repurchase rate (3+ purchases)',
stat_mn string comment 'statistics month',
stat_date string comment 'statistics date'
) COMMENT 'repurchase rate statistics'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_sale_tm_category1_stat_mn/';
4.4.2 Load the data
hive (gmall)>
insert into table ads_sale_tm_category1_stat_mn
select
mn.sku_tm_id,
mn.sku_category1_id,
mn.sku_category1_name,
sum(if(mn.order_count>=1,1,0)) buycount,
sum(if(mn.order_count>=2,1,0)) buyTwiceLast,
sum(if(mn.order_count>=2,1,0))/sum(if(mn.order_count>=1,1,0)) buyTwiceLastRatio,
sum(if(mn.order_count>=3,1,0)) buy3timeLast,
sum(if(mn.order_count>=3,1,0))/sum(if(mn.order_count>=1,1,0)) buy3timeLastRatio,
date_format('2019-02-10','yyyy-MM') stat_mn,
'2019-02-10' stat_date
from
(
select
user_id,
sd.sku_tm_id,
sd.sku_category1_id,
sd.sku_category1_name,
sum(order_count) order_count
from dws_sale_detail_daycount sd
where date_format(dt,'yyyy-MM')=date_format('2019-02-10' ,'yyyy-MM')
group by user_id, sd.sku_tm_id, sd.sku_category1_id, sd.sku_category1_name
) mn
group by mn.sku_tm_id, mn.sku_category1_id, mn.sku_category1_name;
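To inspect the computed rates (a sketch):
hive (gmall)> select tm_id, category1_name, buy_twice_last_ratio, buy_3times_last_ratio from ads_sale_tm_category1_stat_mn;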
4.4.3 Script (ads_sale.sh)
#!/bin/bash
# Define variables for easy modification
APP=gmall
hive=/opt/module/hive/bin/hive
# If a date argument is given, use it; otherwise default to yesterday
if [ -n "$1" ] ;then
do_date=$1
else
do_date=`date -d "-1 day" +%F`
fi
sql="
set hive.exec.dynamic.partition.mode=nonstrict;
insert into table "$APP".ads_sale_tm_category1_stat_mn
select
mn.sku_tm_id,
mn.sku_category1_id,
mn.sku_category1_name,
sum(if(mn.order_count>=1,1,0)) buycount,
sum(if(mn.order_count>=2,1,0)) buyTwiceLast,
sum(if(mn.order_count>=2,1,0))/sum(if(mn.order_count>=1,1,0)) buyTwiceLastRatio,
sum(if(mn.order_count>=3,1,0)) buy3timeLast,
sum(if(mn.order_count>=3,1,0))/sum(if(mn.order_count>=1,1,0)) buy3timeLastRatio,
date_format('$do_date','yyyy-MM') stat_mn,
'$do_date' stat_date
from
(
select
user_id,
od.sku_tm_id,
od.sku_category1_id,
od.sku_category1_name,
sum(order_count) order_count
from "$APP".dws_sale_detail_daycount od
where date_format(dt,'yyyy-MM')=date_format('$do_date' ,'yyyy-MM')
group by user_id, od.sku_tm_id, od.sku_category1_id, od.sku_category1_name
) mn
group by mn.sku_tm_id, mn.sku_category1_id, mn.sku_category1_name;
"
$hive -e "$sql"
4.5 Top 10 products by repurchase rate for each user level (ads_ul_rep_ratio)
4.5.1 Create the table
drop table if exists ads_ul_rep_ratio;
create table ads_ul_rep_ratio(
user_level string comment 'user level',
sku_id string comment 'sku id',
buy_count bigint comment 'total buyers',
buy_twice_count bigint comment 'buyers with two or more purchases',
buy_twice_rate decimal(10,2) comment 'repurchase rate (2+ purchases)',
rank string comment 'rank',
state_date string comment 'statistics date'
) COMMENT 'repurchase rate statistics'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_ul_rep_ratio/';
4.5.2 Load the data
with
tmp_count as(
select -- order count per user per sku within each user level
user_level,
user_id,
sku_id,
sum(order_count) order_count
from dws_sale_detail_daycount
where dt<='2019-10-05'
group by user_level, user_id, sku_id
)
insert overwrite table ads_ul_rep_ratio
select
*
from(
select
user_level,
sku_id,
sum(if(order_count >=1, 1, 0)) buy_count,
sum(if(order_count >=2, 1, 0)) buy_twice_count,
sum(if(order_count >=2, 1, 0)) / sum(if(order_count >=1, 1, 0)) * 100 buy_twice_rate,
row_number() over(partition by user_level order by sum(if(order_count >=2, 1, 0)) / sum(if(order_count >=1, 1, 0)) desc) rn,
'2019-10-05'
from tmp_count
group by user_level, sku_id
) t1
where rn<=10;
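To inspect the top-ranked products for one level (a sketch; the user level values depend on the source data):
hive (gmall)> select * from ads_ul_rep_ratio where user_level='1';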
4.5.3 Script (ads_ul_rep_ratio.sh)
#!/bin/bash
db=gmall
hive=/opt/module/hive-1.2.1/bin/hive
hadoop=/opt/module/hadoop-2.7.2/bin/hadoop
if [[ -n $1 ]]; then
do_date=$1
else
do_date=`date -d '-1 day' +%F`
fi
sql="
use gmall;
with
tmp_count as(
select -- order count per user per sku within each user level (matches 4.5.2)
user_level,
user_id,
sku_id,
sum(order_count) order_count
from dws_sale_detail_daycount
where dt<='$do_date'
group by user_level, user_id, sku_id
)
insert overwrite table ads_ul_rep_ratio
select
*
from(
select
user_level,
sku_id,
sum(if(order_count >=1, 1, 0)) buy_count,
sum(if(order_count >=2, 1, 0)) buy_twice_count,
sum(if(order_count >=2, 1, 0)) / sum(if(order_count >=1, 1, 0)) * 100 buy_twice_rate,
row_number() over(partition by user_level order by sum(if(order_count >=2, 1, 0)) / sum(if(order_count >=1, 1, 0)) desc) rn,
'$do_date'
from tmp_count
group by user_level, sku_id
) t1
where rn<=10;
"
$hive -e "$sql"
4.6 New paying user count (ads_pay_user_count)
4.6.1 Create the table
drop table if exists ads_pay_user_count;
create external table ads_pay_user_count(
dt string COMMENT 'statistics date',
pay_count bigint COMMENT 'number of paying users'
) COMMENT 'paying user count table'
stored as parquet
location '/warehouse/gmall/ads/ads_pay_user_count/';
4.6.2 Load the data
insert into table ads_pay_user_count
select
'2019-02-10',
count(*) pay_count
from
dws_pay_user_detail
where
dt='2019-02-10';
4.6.3 Script (ads_pay_user_count.sh)
#!/bin/bash
# Define variables for easy modification
APP=gmall
hive=/opt/module/hive/bin/hive
hadoop=/opt/module/hadoop-2.7.2/bin/hadoop
# If a date argument is given, use it; otherwise default to yesterday
if [ -n "$1" ] ;then
do_date=$1
else
do_date=`date -d "-1 day" +%F`
fi
echo "===日志日期为 $do_date==="
sql="
insert into table "$APP".ads_pay_user_count
select
'$do_date',
count(*) pay_count
from
"$APP".dws_pay_user_detail
where
dt='$do_date';
"
$hive -e "$sql"
五、Azkaban
5.1 Installing Azkaban
- Create an azkaban directory under /opt/module/
mkdir azkaban
- Extract azkaban-web-server-2.5.0.tar.gz, azkaban-executor-server-2.5.0.tar.gz, and azkaban-sql-script-2.5.0.tar.gz into /opt/module/azkaban
tar -zxvf azkaban-web-server-2.5.0.tar.gz -C /opt/module/azkaban/
tar -zxvf azkaban-executor-server-2.5.0.tar.gz -C /opt/module/azkaban/
tar -zxvf azkaban-sql-script-2.5.0.tar.gz -C /opt/module/azkaban/
- Rename the extracted directories
mv azkaban-web-2.5.0/ server
mv azkaban-executor-2.5.0/ executor
- Import the Azkaban SQL scripts
Enter MySQL, create the azkaban database, and import the extracted script into it.
mysql> create database azkaban;
mysql> use azkaban;
mysql> source /opt/module/azkaban/azkaban-2.5.0/create-all-sql-2.5.0.sql
Note: source followed by a .sql file executes the SQL statements in that file in batch.
5.2 Generating the keystore
keytool is Java's certificate management tool; it lets users manage their own public/private key pairs and the associated certificates.
-keystore: specifies the name and location of the keystore (the generated data then no longer goes into the default .keystore file)
-genkey: creates a default ".keystore" file in the user's home directory
-alias: alias for the generated keystore entry; defaults to mykey if omitted
-keyalg: specifies the key algorithm, RSA or DSA; the default is DSA
1) Generate the keystore with its password and certificate details
[ azkaban]$ keytool -keystore keystore -alias jetty -genkey -keyalg RSA
5.3 Configuration files
Web server configuration
1) Enter the conf directory under the Azkaban web server installation and open azkaban.properties
pwd
/opt/module/azkaban/server/conf
vim azkaban.properties
2) Modify azkaban.properties as follows.
#Azkaban Personalization Settings
# Server UI name, shown at the top of the web UI
azkaban.name=Test
# Description
azkaban.label=My Local Azkaban
# UI color
azkaban.color=#FF3601
azkaban.default.servlet.path=/index
# Directory where the web server stores its web files
web.resource.dir=/opt/module/azkaban/server/web/
# Default time zone, set to Asia/Shanghai (the default is a US time zone)
default.timezone.id=Asia/Shanghai
#Azkaban UserManager class
user.manager.class=azkaban.user.XmlUserManager
# User permission management file (absolute path)
user.manager.xml.file=/opt/module/azkaban/server/conf/azkaban-users.xml
#Loader for projects
# Location of the global properties file (absolute path)
executor.global.properties=/opt/module/azkaban/executor/conf/global.properties
azkaban.project.dir=projects
# Database type
database.type=mysql
# Port
mysql.port=3306
# Database host
mysql.host=hadoop102
# Database name
mysql.database=azkaban
# Database user
mysql.user=root
# Database password
mysql.password=123456
# Maximum connections
mysql.numconnections=100
# Velocity dev mode
velocity.dev.mode=false
# Azkaban Jetty server properties.
# Maximum threads
jetty.maxThreads=25
# Jetty SSL port
jetty.ssl.port=8443
# Jetty port
jetty.port=8081
# Keystore file (absolute path)
jetty.keystore=/opt/module/azkaban/server/keystore
# Keystore password
jetty.password=000000
# Jetty key password, same as the keystore password
jetty.keypassword=000000
# Truststore file (absolute path)
jetty.truststore=/opt/module/azkaban/server/keystore
# Truststore password
jetty.trustpassword=000000
# Azkaban Executor settings
executor.port=12321
# mail settings
mail.sender=
mail.host=
job.failure.email=
job.success.email=
lockdown.create.projects=false
cache.directory=cache
5.4 Starting the executor server
Run the start command from the executor server directory
pwd
/opt/module/azkaban/executor
bin/azkaban-executor-start.sh
5.5 Starting the web server
Run the start command from the Azkaban web server directory
[atguigu@hadoop102 server]$ pwd
/opt/module/azkaban/server
bin/azkaban-web-start.sh
Note:
Start the executor first, then the web server; otherwise the web server may fail to start because it cannot find an executor.
Check the processes with jps
[atguigu@hadoop102 server]$ jps
3601 AzkabanExecutorServer
5880 Jps
3661 AzkabanWebServer
Once startup completes, open https://<server IP>:8443 in a browser (Chrome recommended) to access the Azkaban service.
At the login screen, enter the username and password that were just added in azkaban-users.xml, then click Login.
六、Installing Presto
[root@hadoop103 soft]# tar -zxvf presto-server-0.196.tar.gz -C /opt/module/
[root@hadoop103 soft]# mv presto-cli-0.196-executable.jar presto-cli
[root@hadoop103 soft]# mv presto-cli /opt/module/
[root@hadoop103 module]# mv presto-server-0.196 presto
Enter /opt/module/presto and create the data and configuration directories
[root@hadoop103 presto]# mkdir data
[root@hadoop103 presto]# mkdir etc
In /opt/module/presto/etc, add a jvm.config file with the following contents
-server
-Xmx16G
-XX:+UseG1GC
-XX:G1HeapRegionSize=32M
-XX:+UseGCOverheadLimit
-XX:+ExplicitGCInvokesConcurrent
-XX:+HeapDumpOnOutOfMemoryError
-XX:+ExitOnOutOfMemoryError
Presto supports multiple data sources, which it calls catalogs. Here we configure a catalog for the Hive data source
[root@hadoop103 etc]# mkdir catalog
[root@hadoop103 catalog]# vim hive.properties
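The notes do not show the file's contents; a minimal hive.properties sketch for the hive-hadoop2 connector, assuming the Hive metastore runs on hadoop103 at the default port 9083 (it is started there later in this section):
connector.name=hive-hadoop2
hive.metastore.uri=thrift://hadoop103:9083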
Distribute the presto directory to the other nodes
xsync presto
After distribution, go to /opt/module/presto/etc on each of hadoop102, hadoop103, and hadoop104 and configure the node properties; node.id must differ on every node.
[root@hadoop102 etc]#vim node.properties
node.environment=production
node.id=ffffffff-ffff-ffff-ffff-fffffffffffe
node.data-dir=/opt/module/presto/data
[root@hadoop103 etc]#vim node.properties
node.environment=production
node.id=ffffffff-ffff-ffff-ffff-ffffffffffff
node.data-dir=/opt/module/presto/data
[root@hadoop104 etc]#vim node.properties
node.environment=production
node.id=ffffffff-ffff-ffff-ffff-fffffffffffd
node.data-dir=/opt/module/presto/data
A Presto cluster consists of one coordinator node and multiple worker nodes. Here hadoop103 is configured as the coordinator, and hadoop102 and hadoop104 as workers (matching the discovery.uri below).
(1) Configure the coordinator node on hadoop103
[root@hadoop103 etc]$ vim config.properties
Add the following
coordinator=true
node-scheduler.include-coordinator=false
http-server.http.port=8881
query.max-memory=50GB
discovery-server.enabled=true
discovery.uri=http://hadoop103:8881
(2) Configure the worker nodes on hadoop102 and hadoop104
[root@hadoop102 etc]$ vim config.properties
Add the following
coordinator=false
http-server.http.port=8881
query.max-memory=50GB
discovery.uri=http://hadoop103:8881
[root@hadoop104 etc]$ vim config.properties
Add the following
coordinator=false
http-server.http.port=8881
query.max-memory=50GB
discovery.uri=http://hadoop103:8881
On hadoop103, start the Hive metastore from the /opt/module/hive directory as the root user
[root@hadoop103 hive]$ nohup bin/hive --service metastore >/dev/null 2>&1 &
Start Presto in the foreground, with logs printed to the console
[root@hadoop102 presto]$ bin/launcher run
[root@hadoop103 presto]$ bin/launcher run
[root@hadoop104 presto]$ bin/launcher run
Or start Presto in the background
[root@hadoop102 presto]$ bin/launcher start
[root@hadoop103 presto]$ bin/launcher start
[root@hadoop104 presto]$ bin/launcher start
Logs are written to /opt/module/presto/data/var/log
Enter /opt/module/presto/plugin/hive-hadoop2 and copy in the Hadoop LZO jar so Presto can read LZO-compressed data
[root@hadoop103 hive-hadoop2]# cp /opt/module/hadoop-2.7.2/share/hadoop/common/hadoop-lzo-0.4.20.jar ./
xsync hadoop-lzo-0.4.20.jar
Connect with the Presto CLI
java -jar presto-cli --server hadoop103:8881 --catalog hive --schema default
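Once connected, plain SQL works against the Hive tables, for example (a sketch against the gmall warehouse built above):
presto:default> show schemas;
presto:default> select * from hive.gmall.ads_gmv_sum_day;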
七、Installing Druid (Imply)
Extract the archive, then enter /opt/module/imply-2.7.10/conf/druid/_common
vim common.runtime.properties
Set druid.zk.service.host=hadoop102:2181,hadoop103:2181,hadoop104:2181
vim /opt/module/imply/conf/supervise/quickstart.conf
Modify it as follows
:verify bin/verify-java
#:verify bin/verify-default-ports
#:verify bin/verify-version-check
:kill-timeout 10
#!p10 zk bin/run-zk conf-quickstart
(1) Start Zookeeper
zk start
(2) Start Kafka
kf start
(3) Start Imply
bin/supervise -c conf/supervise/quickstart.conf
Web UI
hadoop103:9095
八、Installing HBase
tar -zxvf hbase-1.3.1-bin.tar.gz -C /opt/module/
Rename the directory
mv hbase-1.3.1 hbase
Configure the environment variable
vim /etc/profile
HBASE_HOME=/opt/module/hbase
8.1 Configuration files
Enter /opt/module/hbase/conf
vim hbase-env.sh
Comment out the following lines
export HBASE_MASTER_OPTS="$HBASE_MASTER_OPTS -XX:PermSize=128m -XX:MaxPermSize=128m"
export HBASE_REGIONSERVER_OPTS="$HBASE_REGIONSERVER_OPTS -XX:PermSize=128m -XX:MaxPermSize=128m"
Uncomment the following line and change true to false, so HBase uses the external Zookeeper
export HBASE_MANAGES_ZK=false
vim hbase-site.xml
Add the following
<configuration>
<property>
<name>hbase.rootdir</name>
<value>hdfs://hadoop102:9000/hbase</value>
</property>
<property>
<name>hbase.cluster.distributed</name>
<value>true</value>
</property>
<!-- New since 0.98: earlier versions had no .port property and the default port was 60000 -->
<property>
<name>hbase.master.port</name>
<value>16000</value>
</property>
<property>
<name>hbase.zookeeper.quorum</name>
<value>hadoop102,hadoop103,hadoop104</value>
</property>
<property>
<name>hbase.zookeeper.property.dataDir</name>
<value>/opt/module/zookeeper-3.4.10/datas</value>
</property>
</configuration>
vim regionservers
Add
hadoop102
hadoop103
hadoop104
Start HBase
[hbase]$ bin/start-hbase.sh
To stop the service:
[hbase]$ bin/stop-hbase.sh
Once started, the HBase management page is reachable at "host:port", for example:
http://hadoop103:16010
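A smoke test from the HBase shell, using a throwaway table (a sketch):
[hbase]$ bin/hbase shell
hbase(main)> create 'smoke_test','cf'
hbase(main)> put 'smoke_test','row1','cf:a','value1'
hbase(main)> scan 'smoke_test'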
九、Installing Kylin
tar -zxvf apache-kylin-2.5.1-bin-hbase1x.tar.gz -C /opt/module/
mv apache-kylin-2.5.1 kylin
Start Kylin
bin/kylin.sh start
Kylin web UI
http://hadoop103:7070/kylin
9.1 Using Kylin
1、Work from the snowflake or star-constellation model built earlier
2、Build the needed cubes on top of that model
3、Query with plain SQL
Detailed steps
1、Create a project
2、Select the project and load tables into it (from Hive)
- Load one fact table: dwd_payment_info
- Load two dimension tables: dwd_order_info, dwd_user_info
3、Model the data (star or snowflake schema)
4、Create the cube
- Note: the fact and dimension tables must not contain duplicate primary keys when joined
- For dimension tables synced in full every day, join against only the latest day's data
- Option 1: query the latest partition into a temporary table and use that temporary table as the join dimension
- Option 2 (recommended): create a view that holds the latest partition's data and join against the view
CREATE VIEW dwd_user_info_view as SELECT * from dwd_user_info
WHERE dt='2021-06-10';
CREATE VIEW dwd_order_info_view as SELECT * from dwd_order_info
WHERE dt='2021-06-10';
5、Query
- Only select statements are supported, with aggregate functions and group by (a sample query follows this list)
- In joins, the fact table must be on the left
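For example, an aggregate query over the dwd_user_info_view defined above works once the cube is built (a sketch):
select user_level, count(*) from dwd_user_info_view group by user_level;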
6、Notes on using a zipper (history) table as a dimension table
- dwd_order_info with full daily sync: dimension ids must not repeat, so only the latest day's partition can be used
- With incremental daily sync, the latest partition holds only that day's data, not the earlier history
- dwd_order_info_his syncs every state change, so it contains the complete information