-- Employee table partitioned by the string column dt,
-- stored as comma-delimited text.
create table emp_partition (
    empno    bigint,
    ename    string,
    job      string,
    mgr      bigint,
    hiredate string,
    sal      bigint,
    comm     bigint,
    deptno   bigint
)
partitioned by (dt string)
row format delimited fields terminated by ',';

若提示权限问题，向相关文件夹用chmod赋权即可

2、插入一个分区数据

-- Load employees earning more than 3000 into the dt='20220401' partition.
-- Columns are listed explicitly because INSERT ... SELECT maps columns purely
-- by position; SELECT * would silently misalign if emp's layout changed.
insert into table emp_partition partition (dt = '20220401')
select
    empno,
    ename,
    job,
    mgr,
    hiredate,
    sal,
    comm,
    deptno
from emp
where sal > 3000;

3、查看表数据和HDFS

表中数据如下：

HDFS上如下：

4、再插入另一个分区的数据

-- Load the same rows (sal > 3000) into a second partition, dt='20220402'.
-- Columns are listed explicitly because INSERT ... SELECT maps columns purely
-- by position; SELECT * would silently misalign if emp's layout changed.
insert into table emp_partition partition (dt = '20220402')
select
    empno,
    ename,
    job,
    mgr,
    hiredate,
    sal,
    comm,
    deptno
from emp
where sal > 3000;

5、查看表数据和HDFS

表数据如下：

HDFS如下：

6、分区表的好处

查询时指定分区，避免全表扫描

二、hive动态分区

1、定义

可以按照原表中某一个字段的值自动划分分区，但是这个分区字段不能再出现在分区表的普通列中

2、创建一个动态分区表

-- Employee table partitioned by deptno. The partition column is declared only
-- in PARTITIONED BY, not in the regular column list.
create table emp_dynamic_partition (
    empno    bigint,
    ename    string,
    job      string,
    mgr      bigint,
    hiredate string,
    sal      bigint,
    comm     bigint
)
partitioned by (deptno bigint)
row format delimited fields terminated by ',';

3、插入数据

需要开启动态分区，并把分区模式设为 nonstrict（默认的 strict 模式要求至少指定一个静态分区；较老版本的 hive 默认关闭动态分区），这些参数也可以在 hive-site.xml 中设置。

-- Session settings for the dynamic-partition insert below.
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;

-- Copy every emp row into emp_dynamic_partition; the partition value is taken
-- from deptno, which is placed last in the select list to line up with the
-- partition (deptno) clause.
insert into table emp_dynamic_partition partition (deptno)
select
    empno,
    ename,
    job,
    mgr,
    hiredate,
    sal,
    comm,
    deptno
from emp;

4、查看表中数据和HDFS

表中：

HDFS中

三、函数 function

1、函数分类

（1）内置函数 built-in

（2）UDF（user defined function）用户自定义函数

UDF：一进一出（abs，upper，lower）

UDAF：多进一出（sum，count，avg）

UDTF：一进多出（explode，json_tuple）

所有的内置函数均可以通过

desc function xxx;

来查看其用法，UDF函数经过一些配置后也可以

2、常用的一些函数整理

（1）时间相关

-- Current date, without a time-of-day part.
select current_date();

-- Current date including the time of day.
select current_timestamp();

-- Unix timestamp: number of seconds since 1970-01-01.
select unix_timestamp();

-- Format a timestamp as a date string.
-- The pattern uses lowercase 'yyyy' (calendar year). Uppercase 'YYYY' means
-- the week-based year in Java date patterns and returns the wrong year for
-- dates around New Year.
select from_unixtime(unix_timestamp(current_timestamp()), 'yyyy-MM-dd');

-- Convert a string to a date.
select to_date('2022-01-01');

-- Extract the individual fields of a timestamp.
select
    year(current_timestamp())       -- year
    ,month(current_timestamp())     -- month
    ,day(current_timestamp())       -- day
    ,hour(current_timestamp())      -- hour
    ,minute(current_timestamp())    -- minute
    ,second(current_timestamp());   -- second

select
    dayofmonth(current_timestamp())     -- day of the month
    ,dayofweek(current_timestamp())     -- day of the week; Sunday counts as day 1
    ,weekofyear(current_timestamp());   -- week number within the year

-- Difference between two dates in months (fractional).
select months_between(current_timestamp(), '2020-01-01');

-- Add months.
select add_months(current_timestamp(), 2);

-- Difference between two dates in days.
select datediff(current_timestamp(), '2020-01-01');

-- Add days.
select date_add(current_timestamp(), 10);

-- Subtract days.
select date_sub(current_timestamp(), 10);

-- Last day of the month the date falls in.
select last_day(current_timestamp());

-- Truncate to the first day of the year / of the month.
select
    trunc(current_timestamp(), 'YYYY')   -- first day of the year
    ,trunc(current_timestamp(), 'MM');   -- first day of the month

3、JSON

（1）创建一个存放json数据格式的表

-- Single-column table; each row's `line` holds one raw JSON document.
create table json_test (
    `line` string
);

（2）插入几条json格式数据

-- Sample JSON documents. Only the first one carries a "phone" key.
-- All rows are loaded with one multi-row INSERT so a single job runs instead
-- of one job per row.
insert into json_test values
    ('{"ceo":"马化腾","company":"腾讯","city":"深圳","phone":"666666"}'),
    ('{"ceo":"马云","company":"阿里","city":"杭州"}'),
    ('{"ceo":"蒙古上单","company":"b站","city":"上海"}'),
    ('{"ceo":"雷军","company":"小米","city":"北京"}');

（3）查看数据

（4）使用 json_tuple 函数

-- Unpack four JSON keys of each row into separate columns.
-- The value of the "city" key is exposed under the alias address.
select
    json_tuple(line, 'ceo', 'company', 'city', 'phone')
        as (ceo, company, address, phone)
from json_test;

（5）lateral view + json_tuple

-- Extract the phone number of each JSON row through a lateral view,
-- substituting '0000' when the extracted value is NULL.
select
    coalesce(t.phone, '0000') as phone
from json_test
lateral view json_tuple(line, 'phone') t as phone;

（6）get_json_object 函数

-- Extract a single field from each JSON row using a JSON path expression.
select
    get_json_object(line, '$.ceo') as ceo
from json_test;

四、hive实现wc（word count）案例

1、创建一个wc表如下

-- Word-count input table: each row's word column holds one line of text.
create table hive_wc (
    word string
)
row format delimited fields terminated by ',';

2、插入一些数据

-- Sample lines of space-separated words, loaded with one multi-row INSERT so
-- a single job runs instead of one job per row.
insert into hive_wc values
    ('kafka hadoop spark hive scala'),
    ('hadoop hive flume kafka flume'),
    ('spark hive');

3、实现wc

-- Word count: split every line on spaces, emit one row per token, then count
-- the occurrences of each token case-insensitively.
select
    lower(tokens.token) as word
    ,count(*) as num
from
(
    -- One output row per space-separated token of each input line.
    select
        explode(split(word, ' ')) as token
    from hive_wc
) tokens
group by lower(tokens.token);

五、常见的分区表逻辑

df 表的含义：每天的分区都保存全量数据。

比如有个 dwd_order_df 表，它的每个日分区都保存着截至当天的全量数据（全部）。

它的上一层有一个 ods_order_di 表，每天的分区中只保存昨天一天的数据（一天）。

如果想用 di 表中的数据更新 df 表，可以使用 full join，如下：

-- Rebuild the '${yesterday}' partition of the full-snapshot table by merging
-- the previous full snapshot with the latest daily increment.
--   * INSERT OVERWRITE TABLE replaces the target partition, so the job can be
--     re-run for the same day.
--   * Each side is restricted to its partition in a subquery BEFORE the join:
--     partition predicates written in the ON clause of a FULL JOIN do not
--     filter rows, they only decide which rows match.
--   * The increment comes first in COALESCE so that newer values win.
-- NOTE(review): assumes both tables are partitioned by dt in 'yyyy-MM-dd'
-- format (date_sub requires it) and that the increment for '${yesterday}' is
-- stored in ods_order_di's dt = '${yesterday}' partition. Confirm both.
insert overwrite table dwd_order_df partition (dt = '${yesterday}')
select
    coalesce(new_inc.order_id, old_full.order_id) as order_id
    ,coalesce(new_inc.xx, old_full.xx) as xx
from
(
    select
        order_id
        ,xx
    from dwd_order_df
    where dt = cast(date_sub('${yesterday}', 1) as string)
) old_full
full join
(
    select
        order_id
        ,xx
    from ods_order_di
    where dt = '${yesterday}'
) new_inc
on old_full.order_id = new_inc.order_id;

六、思考题

1、多级分区，HDFS目录变化

（1）建立一个两级分区表

-- Employee table with two partition levels: dt first, then hour.
-- Stored as comma-delimited text.
create table emp_partition2 (
    empno    bigint,
    ename    string,
    job      string,
    mgr      bigint,
    hiredate string,
    sal      bigint,
    comm     bigint,
    deptno   bigint
)
partitioned by (dt string, hour string)
row format delimited fields terminated by ',';

（2）插入两条数据---两个分区都一致

-- Both inserts target the same (dt, hour) partition.
-- Columns are listed explicitly because INSERT ... SELECT maps columns purely
-- by position; SELECT * would silently misalign if emp's layout changed.
insert into emp_partition2 partition (dt = '2022-04-05', hour = '11')
select
    empno, ename, job, mgr, hiredate, sal, comm, deptno
from emp
where ename = 'KING';

insert into emp_partition2 partition (dt = '2022-04-05', hour = '11')
select
    empno, ename, job, mgr, hiredate, sal, comm, deptno
from emp
where ename = 'KING';

HDFS 上两条数据都落在同一个两级分区目录（dt=2022-04-05/hour=11）下

（3）插入两条数据---第一个分区一致，第二个分区不一致

-- Same first-level partition (dt), different second-level partition (hour).
-- Columns are listed explicitly because INSERT ... SELECT maps columns purely
-- by position; SELECT * would silently misalign if emp's layout changed.
insert into emp_partition2 partition (dt = '2022-04-06', hour = '11')
select
    empno, ename, job, mgr, hiredate, sal, comm, deptno
from emp
where ename = 'KING';

insert into emp_partition2 partition (dt = '2022-04-06', hour = '12')
select
    empno, ename, job, mgr, hiredate, sal, comm, deptno
from emp
where ename = 'KING';

HDFS 上两条数据位于同一个一级分区目录下，再按二级分区分开存储

（4）插入两条数据---分区均不一致

-- Both partition levels differ between the two inserts.
-- Columns are listed explicitly because INSERT ... SELECT maps columns purely
-- by position; SELECT * would silently misalign if emp's layout changed.
insert into emp_partition2 partition (dt = '2022-04-07', hour = '11')
select
    empno, ename, job, mgr, hiredate, sal, comm, deptno
from emp
where ename = 'KING';

insert into emp_partition2 partition (dt = '2022-04-08', hour = '12')
select
    empno, ename, job, mgr, hiredate, sal, comm, deptno
from emp
where ename = 'KING';

HDFS 上先按不同的一级分区分成不同目录，每个一级分区目录下再有各自的二级分区目录

2、创建一个表，数据随意，分隔符为 $%# ，并查询出来

（1）准备数据

数据文件：/home/peizk/emp2.txt

7369$%#SMITH$%#CLERK$%#7902$%#1980-12-17$%#800$%#$%#20
7499$%#ALLEN$%#SALESMAN$%#7698$%#1981-02-20$%#1600$%#300$%#30
7521$%#WARD$%#SALESMAN$%#7698$%#1981-02-22$%#1250$%#500$%#30
7566$%#JONES$%#MANAGER$%#7839$%#1981-04-02$%#2975$%#$%#20
7654$%#MARTIN$%#SALESMAN$%#7698$%#1981-09-28$%#1250$%#1400$%#30
7698$%#BLAKE$%#MANAGER$%#7839$%#1981-05-01$%#2850$%#$%#30
7782$%#CLARK$%#MANAGER$%#7839$%#1981-06-09$%#2450$%#$%#10
7788$%#SCOTT$%#ANALYST$%#7566$%#1987-04-19$%#3000$%#$%#20
7839$%#KING$%#PRESIDENT$%#$%#1981-11-17$%#5000$%#$%#10
7844$%#TURNER$%#SALESMAN$%#7698$%#1981-09-08$%#1500$%#0$%#30
7876$%#ADAMS$%#CLERK$%#7788$%#1987-05-23$%#1100$%#$%#20
7900$%#JAMES$%#CLERK$%#7698$%#1981-12-03$%#950$%#$%#30
7902$%#FORD$%#ANALYST$%#7566$%#1981-12-03$%#3000$%#$%#20
7934$%#MILLER$%#CLERK$%#7782$%#1982-01-23$%#1300$%#$%#10

（2）建表并指定分隔符，然后导入数据

-- First attempt: declare the three-character string '$%#' as the field
-- delimiter with ROW FORMAT DELIMITED.
-- NOTE: with this syntax the delimiter can only be a single character, so the
-- loaded file is not split into the intended columns.
create table emp_fengefu (
    empno    bigint,
    ename    string,
    job      string,
    mgr      bigint,
    hiredate string,
    sal      bigint,
    comm     bigint,
    deptno   bigint
)
row format delimited fields terminated by '$%#';

-- Replace the table contents with the local sample file.
load data local inpath '/home/peizk/emp2.txt'
overwrite into table emp_fengefu;

（3）查询表信息

查询得知：row format delimited fields terminated by 这种方式的分隔符只能是单个字符

（4）解决办法：使用 MultiDelimitSerDe

-- Recreate emp_fengefu with a SerDe that accepts a multi-character field
-- delimiter. A table of this name was already created earlier in the file, so
-- it is dropped first; otherwise CREATE TABLE fails with "table already exists".
-- NOTE(review): the SerDe class lives in the hive-contrib jar under this
-- package name; newer Hive releases may ship it under a different package.
-- Verify against the installed version.
drop table if exists emp_fengefu;

create table emp_fengefu (
    empno    bigint,
    ename    string,
    job      string,
    mgr      bigint,
    hiredate string,
    sal      bigint,
    comm     bigint,
    deptno   bigint
)
row format serde 'org.apache.hadoop.hive.contrib.serde2.MultiDelimitSerDe'
with serdeproperties ("field.delim"="$%#");

导入数据后查询有