山重水复疑无路,柳暗花明又一村。 ——陆游《游山西村》
前言
JOIN连接是SQL常用的关联方式,但他们之前连用时可能会出现数据缺失的情况,本文分享生产中的bug案例,目前已有解决方案,具体原因为个人理解,如有错误,请各位小伙伴解答。
一、具体场景
Hive建表时,需要用到left outer join加上inner join,当他们连用时,发现数据缺失严重。
SELECT
t1.arrange_date
,t2.fmaterialnumber AS new_material_code
,t1.fbillno AS bill_code
,MAX(t2.ftaxprice) AS price_unit
,SUM(t2.frealqty) AS sale_out_qty
,SUM(t2.fallamount_lc) AS sale_out_price_total_rmb
,IF((t1.arrange_date <= '2021-05-31' AND t1.fbillno IN ('OUSO20210319321696', 'OUSO20210322323127'))
,'Lumidept00165', t1.fsaledeptnumber) AS dept_code
,t1.salesman_code
,t2.fmainid AS main_id
,t1.system_source
,t1.fcustnumber
,t2.warehouse_code
,t1.fbilltypenumber
,t2.delivery_order_id
,t1.f_aqa_recaddr
,t1.fsaleorgid AS sales_organization
,SUM(t2.fcostamount_lc) AS cost_rmb
,t2.fsrcbillno
,t1.fbilltypename
,t2.fentryid
,t1.fbasecurrnumber AS currency_code
,t2.funitnumber AS unit
,SUM(t2.famount) AS sale_out_amt_total
,SUM(t2.famount_lc) AS sale_out_amt_total_rmb
,t1.fsettleorgid
FROM
(
SELECT
t3.fid
,t3.fbillno
,t3.system_source
,t3.arrange_date
,t3.fsaledeptnumber
,t3.salesman_code
,t3.fcustnumber
,t3.fbilltypenumber
,t3.fbilltypename
,t3.f_aqa_recaddr
,t3.fsaleorgid
,t3.fbasecurrnumber
,t4.fsettleorgid
FROM
(
SELECT
fid
,fbillno
,system_source
,arrange_date
,fsaledeptnumber
,salesman_code
,fcustnumber
,fbilltypenumber
,fbilltypename
,f_aqa_recaddr
,fsaleorgid
,fbasecurrnumber
FROM tmp.xxx
WHERE rn = 1
AND is_delete IS NOT TRUE
) t3
LEFT OUTER JOIN
(
SELECT
fmainid
,fsettleorgid
FROM tmp.yyy
WHERE rn = 1
AND is_delete IS NOT TRUE
) t4
ON t3.fid = t4.fmainid
) t1
INNER JOIN
(
SELECT
fmainid
,ftaxprice
,frealqty
,fallamount_lc
,fmaterialnumber
,delivery_order_id
,warehouse_code
,fcostamount_lc
,fsrcbillno
,fentryid
,funitnumber
,famount
,famount_lc
FROM tmp.zzz
WHERE rn = 1
AND is_delete IS NOT TRUE
) t2
ON t1.fid = t2.fmainid
GROUP BY
t1.arrange_date
,t2.fmaterialnumber
,t1.fbillno
,t1.fsaledeptnumber
,t1.salesman_code
,t2.fmainid
,t1.system_source
,t1.fcustnumber
,t1.fbilltypenumber
,t2.delivery_order_id
,t1.f_aqa_recaddr
,t1.fsaleorgid
,t2.warehouse_code
,t2.fsrcbillno
,t1.fbilltypename
,t2.fentryid
,t1.fbasecurrnumber
,t2.funitnumber
,t1.fsettleorgid
;
二、问题分析思路
- 将逻辑分步跑,发现在inner join后数据开始不正确。
- 将inner join换为left outer join结果也依然是数据缺失。
- 网上博文启发,Inner join和Outer join一起使用的注意点,如下图所示。
- 查各关联条件发现为同一关联条件,但各表中关联条件的数据量并不一致,就像下图案例一样,因为inner join的关联条件宠物类型是需要有宠物才会有相应宠物类型,有一个人没有养宠物,那他实际上就不会有相应的宠物类型,这时关联会出现数据缺失。
select
t1.personname
,t2.petname
from
(
select
personid
,personname
from people
) t1
left outer join
(
select
ownerid
,petname
from pets
) t2
on t1.personid = t2.ownerid
;
select
t3.personname
,t3.petname
,t4.pettype
from
(
select
t1.personname
,t2.petname
,t2.pettypeid
from
(
select
personid
,personname
from people
) t1
left outer join
(
select
ownerid
,petname
,pettypeid
from pets
) t2
on t1.personid = t2.ownerid
) t3
inner join
(
select
pettypeid
,pettype
from pettypes
) t4
on t3.pettypeid = t4.pettypeid
;
三、解决方案
- 将left outer join提到外层
SELECT
t3.arrange_date
,t3.new_material_code
,t3.bill_code
,t3.price_unit
,t3.sale_out_qty
,t3.sale_out_price_total_rmb
,t3.dept_code
,t3.salesman_code
,t3.main_id
,t3.system_source
,t3.fcustnumber
,t3.warehouse_code
,t3.fbilltypenumber
,t3.delivery_order_id
,t3.f_aqa_recaddr
,t3.sales_organization
,t3.cost_rmb
,t3.fsrcbillno
,t3.fbilltypename
,t3.fentryid
,t3.currency_code
,t3.unit
,t3.sale_out_amt_total
,t3.sale_out_amt_total_rmb
,t4.fsettleorgid
FROM
(
SELECT
t1.arrange_date
,t2.fmaterialnumber AS new_material_code
,t1.fbillno AS bill_code
,MAX(t2.ftaxprice) AS price_unit
,SUM(t2.frealqty) AS sale_out_qty
,SUM(t2.fallamount_lc) AS sale_out_price_total_rmb
,IF((t1.arrange_date <= '2021-05-31' AND t1.fbillno IN ('OUSO20210319321696', 'OUSO20210322323127'))
,'Lumidept00165', t1.fsaledeptnumber) AS dept_code
,t1.salesman_code
,t2.fmainid AS main_id
,t1.system_source
,t1.fcustnumber
,t2.warehouse_code
,t1.fbilltypenumber
,t2.delivery_order_id
,t1.f_aqa_recaddr
,t1.fsaleorgid AS sales_organization
,SUM(t2.fcostamount_lc) AS cost_rmb
,t2.fsrcbillno
,t1.fbilltypename
,t2.fentryid
,t1.fbasecurrnumber AS currency_code
,t2.funitnumber AS unit
,SUM(t2.famount) AS sale_out_amt_total
,SUM(t2.famount_lc) AS sale_out_amt_total_rmb
FROM
(
SELECT
fid
,fbillno
,system_source
,arrange_date
,fsaledeptnumber
,salesman_code
,fcustnumber
,fbilltypenumber
,fbilltypename
,f_aqa_recaddr
,fsaleorgid
,fbasecurrnumber
FROM tmp.xxx
WHERE rn = 1
AND is_delete IS NOT TRUE
) t1
INNER JOIN
(
SELECT
fmainid
,ftaxprice
,frealqty
,fallamount_lc
,fmaterialnumber
,delivery_order_id
,warehouse_code
,fcostamount_lc
,fsrcbillno
,fentryid
,funitnumber
,famount
,famount_lc
FROM tmp.yyy
WHERE rn = 1
AND is_delete IS NOT TRUE
) t2
ON t1.fid = t2.fmainid
GROUP BY
t1.arrange_date
,t2.fmaterialnumber
,t1.fbillno
,t1.fsaledeptnumber
,t1.salesman_code
,t2.fmainid
,t1.system_source
,t1.fcustnumber
,t1.fbilltypenumber
,t2.delivery_order_id
,t1.f_aqa_recaddr
,t1.fsaleorgid
,t2.warehouse_code
,t2.fsrcbillno
,t1.fbilltypename
,t2.fentryid
,t1.fbasecurrnumber
,t2.funitnumber
) t3
LEFT OUTER JOIN
(
SELECT
fmainid
,fsettleorgid
FROM tmp.zzz
WHERE rn = 1
AND is_delete IS NOT TRUE
) t4
ON t3.main_id = t4.fmainid
;
- 先形成临时表再inner join的话结果是正确的。
CREATE TABLE IF NOT EXISTS tmp.aaa
AS
SELECT
t3.fid
,t3.fbillno
,t3.system_source
,t3.arrange_date
,t3.fsaledeptnumber
,t3.salesman_code
,t3.fcustnumber
,t3.fbilltypenumber
,t3.fbilltypename
,t3.f_aqa_recaddr
,t3.fsaleorgid
,t3.fbasecurrnumber
,t4.fsettleorgid
FROM
(
SELECT
fid
,fbillno
,system_source
,arrange_date
,fsaledeptnumber
,salesman_code
,fcustnumber
,fbilltypenumber
,fbilltypename
,f_aqa_recaddr
,fsaleorgid
,fbasecurrnumber
FROM tmp.xxx
WHERE rn = 1
AND is_delete IS NOT TRUE
) t3
LEFT OUTER JOIN
(
SELECT
fmainid
,fsettleorgid
FROM tmp.yyy
WHERE rn = 1
AND is_delete IS NOT TRUE
) t4
ON t3.fid = t4.fmainid
;
SELECT
t1.arrange_date
,t2.fmaterialnumber AS new_material_code
,t1.fbillno AS bill_code
,MAX(t2.ftaxprice) AS price_unit
,SUM(t2.frealqty) AS sale_out_qty
,SUM(t2.fallamount_lc) AS sale_out_price_total_rmb
,IF((t1.arrange_date <= '2021-05-31' AND t1.fbillno IN ('OUSO20210319321696', 'OUSO20210322323127'))
,'Lumidept00165', t1.fsaledeptnumber) AS dept_code
,t1.salesman_code
,t2.fmainid AS main_id
,t1.system_source
,t1.fcustnumber
,t2.warehouse_code
,t1.fbilltypenumber
,t2.delivery_order_id
,t1.f_aqa_recaddr
,t1.fsaleorgid AS sales_organization
,SUM(t2.fcostamount_lc) AS cost_rmb
,t2.fsrcbillno
,t1.fbilltypename
,t2.fentryid
,t1.fbasecurrnumber AS currency_code
,t2.funitnumber AS unit
,SUM(t2.famount) AS sale_out_amt_total
,SUM(t2.famount_lc) AS sale_out_amt_total_rmb
,t1.fsettleorgid
FROM
(
SELECT
fid
,fbillno
,system_source
,arrange_date
,fsaledeptnumber
,salesman_code
,fcustnumber
,fbilltypenumber
,fbilltypename
,f_aqa_recaddr
,fsaleorgid
,fbasecurrnumber
,fsettleorgid
from tmp.aaa
) t1
INNER JOIN
(
SELECT
fmainid
,ftaxprice
,frealqty
,fallamount_lc
,fmaterialnumber
,delivery_order_id
,warehouse_code
,fcostamount_lc
,fsrcbillno
,fentryid
,funitnumber
,famount
,famount_lc
FROM tmp.zzz
WHERE rn = 1
AND is_delete IS NOT TRUE
) t2
ON t1.fid = t2.fmainid
GROUP BY
t1.arrange_date
,t2.fmaterialnumber
,t1.fbillno
,t1.fsaledeptnumber
,t1.salesman_code
,t2.fmainid
,t1.system_source
,t1.fcustnumber
,t1.fbilltypenumber
,t2.delivery_order_id
,t1.f_aqa_recaddr
,t1.fsaleorgid
,t2.warehouse_code
,t2.fsrcbillno
,t1.fbilltypename
,t2.fentryid
,t1.fbasecurrnumber
,t2.funitnumber
,t1.fsettleorgid
;
总结
山重水复疑无路,柳暗花明又一村。出自宋代陆游的《游山西村》,描绘的是当地路漫漫,长途跋涉以为前方没有路要放弃时,突然发现不远的地方有个村庄的场景,带有一种豁然开朗的喜悦。 当我们面临复杂的SQL关联时,因为SQL内部机制会导致结果和我们想象中的不一样,这时不妨换个思路,可能就会“柳暗花明又一村”了。
|