执行计划code

explain
--your sql code
select
	*
from
	xxx.xxxx as x
left join
	yyy.yyyy as y
on
	x.a=y.b
group by 
	c,d
having 
	e
order by 
	f
limit 
	1000

举例

拿一个实际生产中的复杂sql为例，涉及多表join，开窗函数，子查询，多维分组聚合grouping sets的执行计划为例，可以看出：

stage之间的依赖关系
各个stage的执行计划
sql语句的执行顺序，from(TableScan)→join on(Reduce Output Operator)→where(Filter Operator)→select(Select Operator)→group by(Group By Operator)→having→order by→limit
hive sql 转换为map reduce的过程，及其中间的key-value值

Explain
STAGE DEPENDENCIES: # 各个stage之间的依赖性
  Stage-1 is a root stage
  Stage-2 depends on stages: Stage-1, Stage-3, Stage-5
  Stage-3 is a root stage
  Stage-4 is a root stage
  Stage-5 depends on stages: Stage-4
  Stage-0 depends on stages: Stage-2

STAGE PLANS: # 各个stage的执行计划
  Stage: Stage-1 # 阶段一执行计划
    Map Reduce # Map Reduce中的内容即为抽象语法树AST
      Map Operator Tree: # MAP端的执行计划树
          TableScan # map端第一个操作肯定是加载表，所以就是 TableScan 表扫描操作
            alias: t_ads_indicator_wxvideoaccountevaluate_detailfeature_d_i # 表名称
            Statistics: Num rows: 1486973 Data size: 32713406 Basic stats: COMPLETE Column stats: NONE # 表统计信息，包含表中数据条数，数据大小等
            Filter Operator # 过滤操作
              predicate: ((operator is not null and (operator <> '')) and (operator <> 'sys')) (type: boolean) # 过滤条件
              Statistics: Num rows: 743487 Data size: 16356714 Basic stats: COMPLETE Column stats: NONE
              Select Operator # 选取操作
                # expressions：需要的字段名称及字段类型
                expressions: day (type: bigint), hour (type: bigint), half_hour (type: bigint), operator (type: string), scene_id (type: bigint), scene_layer (type: bigint), action_type (type: bigint), action_result (type: bigint), evaluate_id (type: string)
                outputColumnNames: day, hour, half_hour, operator, scene_id, scene_layer, action_type, action_result, evaluate_id # 输出的列名称
                Statistics: Num rows: 743487 Data size: 16356714 Basic stats: COMPLETE Column stats: NONE
                Group By Operator # 分组聚合操作
                  # aggregations: 显示聚合函数信息
                  aggregations: count(DISTINCT CASE WHEN (((action_type = 3) and ((action_result = 0) or (action_result = 1)))) THEN (evaluate_id) ELSE (null) END), count(DISTINCT CASE WHEN (((action_type = 4) and (action_result = 0))) THEN (evaluate_id) ELSE (null) END), count(DISTINCT CASE WHEN (((action_type = 3) and (action_result = 0))) THEN (evaluate_id) ELSE (null) END), count(DISTINCT CASE WHEN (((action_type = 3) and (action_result = 1))) THEN (evaluate_id) ELSE (null) END)
                  # keys: 分组的字段，如果没有分组，则没有此字段
                  keys: day (type: bigint), hour (type: bigint), half_hour (type: bigint), operator (type: string), scene_id (type: bigint), scene_layer (type: bigint), '0' (type: string), CASE WHEN (((action_type = 3) and ((action_result = 0) or (action_result = 1)))) THEN (evaluate_id) ELSE (null) END (type: string), CASE WHEN (((action_type = 4) and (action_result = 0))) THEN (evaluate_id) ELSE (null) END (type: string), CASE WHEN (((action_type = 3) and (action_result = 0))) THEN (evaluate_id) ELSE (null) END (type: string), CASE WHEN (((action_type = 3) and (action_result = 1))) THEN (evaluate_id) ELSE (null) END (type: string)
                  mode: hash # 聚合模式，值有 hash：随机聚合，就是hash partition；partial：局部聚合；final：最终聚合
                  outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14 # 聚合之后输出列名
                  Statistics: Num rows: 2230461 Data size: 49070142 Basic stats: COMPLETE Column stats: NONE
                  Reduce Output Operator # 输出到reduce操作
                    # key expressions: 依据什么进行排序，即用于排序的列，即map阶段的输出字段（临时字段），也就是reduce的key
                    key expressions: _col0 (type: bigint), _col1 (type: bigint), _col2 (type: bigint), _col3 (type: string), _col4 (type: bigint), _col5 (type: bigint), _col6 (type: string), _col7 (type: string), _col8 (type: string), _col9 (type: string), _col10 (type: string)
                    sort order: +++++++++++  # 表示键表达式中用于排序的列数。每个“ +”代表一列以升序排序，每个“ -”代表一列以降序排序。
                    # Map-reduce partition columns: 保证具有相同此列的值的行被分到同一个reduce中去，即Map阶段输出到Reduce阶段的分区列
                    Map-reduce partition columns: _col0 (type: bigint), _col1 (type: bigint), _col2 (type: bigint), _col3 (type: string), _col4 (type: bigint), _col5 (type: bigint), _col6 (type: string)
                    Statistics: Num rows: 2230461 Data size: 49070142 Basic stats: COMPLETE Column stats: NONE
      Reduce Operator Tree: # Reduce端的执行计划树
        Group By Operator
          aggregations: count(DISTINCT KEY._col7:0._col0), count(DISTINCT KEY._col7:1._col0), count(DISTINCT KEY._col7:2._col0), count(DISTINCT KEY._col7:3._col0)
          keys: KEY._col0 (type: bigint), KEY._col1 (type: bigint), KEY._col2 (type: bigint), KEY._col3 (type: string), KEY._col4 (type: bigint), KEY._col5 (type: bigint), KEY._col6 (type: string)
          mode: mergepartial
          outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col7, _col8, _col9, _col10
          Statistics: Num rows: 1115230 Data size: 24535060 Basic stats: COMPLETE Column stats: NONE
          pruneGroupingSetId: true
          Select Operator
            expressions: _col0 (type: bigint), COALESCE(_col1,-10000) (type: bigint), COALESCE(_col2,-10000) (type: bigint), _col4 (type: bigint), _col5 (type: bigint), _col3 (type: string), _col7 (type: bigint), _col8 (type: bigint), _col9 (type: bigint), _col10 (type: bigint)
            outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9
            Statistics: Num rows: 1115230 Data size: 24535060 Basic stats: COMPLETE Column stats: NONE
            File Output Operator # 文件输出操作
              compressed: false # 是否压缩
              table: # 表的信息，包含输入输出文件格式化方式，序列化方式等
                  input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                  output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
                  serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe

  Stage: Stage-2
    Map Reduce
      Map Operator Tree:
          TableScan
            Reduce Output Operator
              key expressions: _col0 (type: bigint), _col1 (type: bigint), _col2 (type: bigint), _col3 (type: bigint), _col4 (type: bigint), _col5 (type: string)
              sort order: ++++++
              Map-reduce partition columns: _col0 (type: bigint), _col1 (type: bigint), _col2 (type: bigint), _col3 (type: bigint), _col4 (type: bigint), _col5 (type: string)
              Statistics: Num rows: 1115230 Data size: 24535060 Basic stats: COMPLETE Column stats: NONE
              value expressions: _col6 (type: bigint), _col7 (type: bigint), _col8 (type: bigint), _col9 (type: bigint)
          TableScan
            Reduce Output Operator
              key expressions: _col0 (type: bigint), _col1 (type: bigint), _col2 (type: bigint), _col3 (type: bigint), _col4 (type: bigint), _col5 (type: string)
              sort order: ++++++
              Map-reduce partition columns: _col0 (type: bigint), _col1 (type: bigint), _col2 (type: bigint), _col3 (type: bigint), _col4 (type: bigint), _col5 (type: string)
              Statistics: Num rows: 278808 Data size: 6133774 Basic stats: COMPLETE Column stats: NONE
              value expressions: _col6 (type: bigint)
          TableScan
            Reduce Output Operator
              key expressions: _col0 (type: bigint), _col1 (type: bigint), _col2 (type: bigint), _col3 (type: bigint), _col4 (type: bigint), _col5 (type: string)
              sort order: ++++++
              Map-reduce partition columns: _col0 (type: bigint), _col1 (type: bigint), _col2 (type: bigint), _col3 (type: bigint), _col4 (type: bigint), _col5 (type: string)
              Statistics: Num rows: 557614 Data size: 12267508 Basic stats: COMPLETE Column stats: NONE
              value expressions: _col6 (type: bigint)
      Reduce Operator Tree:
        Join Operator
          condition map:
               Left Outer Join0 to 1
               Left Outer Join0 to 2
          keys:
            0 _col0 (type: bigint), _col1 (type: bigint), _col2 (type: bigint), _col3 (type: bigint), _col4 (type: bigint), _col5 (type: string)
            1 _col0 (type: bigint), _col1 (type: bigint), _col2 (type: bigint), _col3 (type: bigint), _col4 (type: bigint), _col5 (type: string)
            2 _col0 (type: bigint), _col1 (type: bigint), _col2 (type: bigint), _col3 (type: bigint), _col4 (type: bigint), _col5 (type: string)
          outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col16, _col23
          Statistics: Num rows: 2453506 Data size: 53977133 Basic stats: COMPLETE Column stats: NONE
          Select Operator
            expressions: _col0 (type: bigint), _col1 (type: bigint), _col2 (type: bigint), _col5 (type: string), COALESCE(_col16,0) (type: bigint), _col6 (type: bigint), _col7 (type: bigint), _col8 (type: bigint), _col9 (type: bigint), COALESCE((_col9 / _col6),0) (type: double), COALESCE(_col23,0) (type: bigint), _col3 (type: bigint), _col4 (type: bigint)
            outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12
            Statistics: Num rows: 2453506 Data size: 53977133 Basic stats: COMPLETE Column stats: NONE
            Limit
              Number of rows: 1000000
              Statistics: Num rows: 1000000 Data size: 22000000 Basic stats: COMPLETE Column stats: NONE
              File Output Operator
                compressed: true
                Statistics: Num rows: 1000000 Data size: 22000000 Basic stats: COMPLETE Column stats: NONE
                table:
                    input format: org.apache.hadoop.mapred.TextInputFormat
                    output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                    serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe

  Stage: Stage-3
    Map Reduce
      Map Operator Tree:
          TableScan
            alias: t_ads_indicator_wxvideoaccountevaluate_detailfeature_d_i
            Statistics: Num rows: 1486973 Data size: 32713406 Basic stats: COMPLETE Column stats: NONE
            Filter Operator
              predicate: ((((action_type = 2) and (action_result = 0)) and receiptor is not null) and (receiptor <> '')) (type: boolean)
              Statistics: Num rows: 185872 Data size: 4089183 Basic stats: COMPLETE Column stats: NONE
              Select Operator
                expressions: day (type: bigint), hour (type: bigint), half_hour (type: bigint), receiptor (type: string), scene_id (type: bigint), scene_layer (type: bigint), 2 (type: bigint), 0 (type: bigint), evaluate_id (type: string)
                outputColumnNames: day, hour, half_hour, receiptor, scene_id, scene_layer, action_type, action_result, evaluate_id
                Statistics: Num rows: 185872 Data size: 4089183 Basic stats: COMPLETE Column stats: NONE
                Group By Operator
                  aggregations: count(DISTINCT CASE WHEN (((action_type = 2) and (action_result = 0))) THEN (evaluate_id) ELSE (null) END)
                  keys: day (type: bigint), hour (type: bigint), half_hour (type: bigint), receiptor (type: string), scene_id (type: bigint), scene_layer (type: bigint), '0' (type: string), CASE WHEN (((action_type = 2) and (action_result = 0))) THEN (evaluate_id) ELSE (null) END (type: string)
                  mode: hash
                  outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8
                  Statistics: Num rows: 557616 Data size: 12267549 Basic stats: COMPLETE Column stats: NONE
                  Reduce Output Operator
                    key expressions: _col0 (type: bigint), _col1 (type: bigint), _col2 (type: bigint), _col3 (type: string), _col4 (type: bigint), _col5 (type: bigint), _col6 (type: string), _col7 (type: string)
                    sort order: ++++++++
                    Map-reduce partition columns: _col0 (type: bigint), _col1 (type: bigint), _col2 (type: bigint), _col3 (type: string), _col4 (type: bigint), _col5 (type: bigint), _col6 (type: string)
                    Statistics: Num rows: 557616 Data size: 12267549 Basic stats: COMPLETE Column stats: NONE
      Reduce Operator Tree:
        Group By Operator
          aggregations: count(DISTINCT KEY._col7:0._col0)
          keys: KEY._col0 (type: bigint), KEY._col1 (type: bigint), KEY._col2 (type: bigint), KEY._col3 (type: string), KEY._col4 (type: bigint), KEY._col5 (type: bigint), KEY._col6 (type: string)
          mode: mergepartial
          outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col7
          Statistics: Num rows: 278808 Data size: 6133774 Basic stats: COMPLETE Column stats: NONE
          pruneGroupingSetId: true
          Select Operator
            expressions: _col0 (type: bigint), COALESCE(_col1,-10000) (type: bigint), COALESCE(_col2,-10000) (type: bigint), _col4 (type: bigint), _col5 (type: bigint), _col3 (type: string), _col7 (type: bigint)
            outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6
            Statistics: Num rows: 278808 Data size: 6133774 Basic stats: COMPLETE Column stats: NONE
            File Output Operator
              compressed: false
              table:
                  input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                  output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
                  serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe

  Stage: Stage-4
    Map Reduce
      Map Operator Tree:
          TableScan
            alias: t_ads_indicator_wxvideoaccountevaluate_detailfeature_d_i
            Statistics: Num rows: 1486973 Data size: 32713406 Basic stats: COMPLETE Column stats: NONE
            Filter Operator
              predicate: (((((action_type = 3) and ((action_result = 0) or (action_result = 1))) and operator is not null) and (operator <> '')) and (operator <> 'sys')) (type: boolean)
              Statistics: Num rows: 371743 Data size: 8178346 Basic stats: COMPLETE Column stats: NONE
              Reduce Output Operator
                key expressions: operator (type: string), pull_time (type: bigint), evaluate_time (type: bigint)
                sort order: ++-
                Map-reduce partition columns: operator (type: string), pull_time (type: bigint)
                Statistics: Num rows: 371743 Data size: 8178346 Basic stats: COMPLETE Column stats: NONE
                value expressions: day (type: bigint), hour (type: bigint), half_hour (type: bigint), scene_id (type: bigint), scene_layer (type: bigint)
      Reduce Operator Tree:
        Select Operator
          expressions: KEY.reducesinkkey1 (type: bigint), KEY.reducesinkkey2 (type: bigint), KEY.reducesinkkey0 (type: string), VALUE._col12 (type: bigint), VALUE._col13 (type: bigint), VALUE._col14 (type: bigint), VALUE._col17 (type: bigint), VALUE._col18 (type: bigint)
          outputColumnNames: _col10, _col12, _col14, _col15, _col16, _col17, _col20, _col21
          Statistics: Num rows: 371743 Data size: 8178346 Basic stats: COMPLETE Column stats: NONE
          PTF Operator
            Function definitions:
                Input definition
                  input alias: ptf_0
                  output shape: _col10: bigint, _col12: bigint, _col14: string, _col15: bigint, _col16: bigint, _col17: bigint, _col20: bigint, _col21: bigint
                  type: WINDOWING
                Windowing table definition
                  input alias: ptf_1
                  name: windowingtablefunction
                  order by: _col12(DESC)
                  partition by: _col14, _col10
                  raw input shape:
                  window functions:
                      window function definition
                        alias: row_number_window_0
                        name: row_number
                        window function: GenericUDAFRowNumberEvaluator
                        window frame: PRECEDING(MAX)~FOLLOWING(MAX)
                        isPivotResult: true
            Statistics: Num rows: 371743 Data size: 8178346 Basic stats: COMPLETE Column stats: NONE
            Select Operator
              expressions: _col15 (type: bigint), _col16 (type: bigint), _col17 (type: bigint), _col20 (type: bigint), _col21 (type: bigint), _col14 (type: string), _col12 (type: bigint), _col10 (type: bigint), row_number_window_0 (type: int)
              outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8
              Statistics: Num rows: 371743 Data size: 8178346 Basic stats: COMPLETE Column stats: NONE
              Group By Operator
                aggregations: sum(CASE WHEN ((((_col8 = 1) and (_col6 <> (- 1))) and (_col7 <> (- 1)))) THEN ((_col6 - _col7)) ELSE (null) END)
                keys: _col0 (type: bigint), _col1 (type: bigint), _col2 (type: bigint), _col5 (type: string), _col3 (type: bigint), _col4 (type: bigint), '0' (type: string)
                mode: hash
                outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7
                Statistics: Num rows: 1115229 Data size: 24535038 Basic stats: COMPLETE Column stats: NONE
                File Output Operator
                  compressed: false
                  table:
                      input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                      output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
                      serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe

  Stage: Stage-5
    Map Reduce
      Map Operator Tree:
          TableScan
            Reduce Output Operator
              key expressions: _col0 (type: bigint), _col1 (type: bigint), _col2 (type: bigint), _col3 (type: string), _col4 (type: bigint), _col5 (type: bigint), _col6 (type: string)
              sort order: +++++++
              Map-reduce partition columns: _col0 (type: bigint), _col1 (type: bigint), _col2 (type: bigint), _col3 (type: string), _col4 (type: bigint), _col5 (type: bigint), _col6 (type: string)
              Statistics: Num rows: 1115229 Data size: 24535038 Basic stats: COMPLETE Column stats: NONE
              value expressions: _col7 (type: bigint)
      Reduce Operator Tree:
        Group By Operator
          aggregations: sum(VALUE._col0)
          keys: KEY._col0 (type: bigint), KEY._col1 (type: bigint), KEY._col2 (type: bigint), KEY._col3 (type: string), KEY._col4 (type: bigint), KEY._col5 (type: bigint), KEY._col6 (type: string)
          mode: mergepartial
          outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col7
          Statistics: Num rows: 557614 Data size: 12267508 Basic stats: COMPLETE Column stats: NONE
          pruneGroupingSetId: true
          Select Operator
            expressions: _col0 (type: bigint), COALESCE(_col1,-10000) (type: bigint), COALESCE(_col2,-10000) (type: bigint), _col4 (type: bigint), _col5 (type: bigint), _col3 (type: string), _col7 (type: bigint)
            outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6
            Statistics: Num rows: 557614 Data size: 12267508 Basic stats: COMPLETE Column stats: NONE
            File Output Operator
              compressed: false
              table:
                  input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                  output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
                  serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe

  Stage: Stage-0
    Fetch Operator # 客户端获取数据操作
      limit: 1000000 # 值为 -1 表示不限制条数，其他值为限制的条数
      Processor Tree:
        ListSink