Observations :
- Count(1) or Count(*) – This is never expanded on each column individually so will work perfectly fine on complete data.
- Count(1) is more optimized then Count(*)
- Count(source.*) – source represents “Left table” of “Left Outer Join”:
- This will be evaluated as Count(source.col1, source.col2, …. source.colN )
- So, if any column has NULL, then the complete row will get filtered out in the count result.
- Rows, in which all columns from the Source query has “NOT NULLs” will be counted only
- Count(target.*) – target represents “Right table” of “Left Outer Join”
- Same explanation as above.
- This will be evaluated as Count(target.col1, target.col2, …. target.colN )
- So, if any column has NULL, then the complete row will get filtered out in the count result.
- Rows, in which all columns from the Target query has “NOT NULLs” will be counted only
- Solution to count target table – Use any column (not limited to key column with non NULL values) for which we will have no NULLs (e.g. bookmark/last_modified_ts column, as this, is linked to each row and will have current_timestamp value)
The above was analysed from the Query plan. Please refer to the below queries :
Source Table — stg.nik_test_1
Target Table — stg.nik_test_2
DDLs:
create table stg.nik_test_1 (id int, first string, last string);
create table stg.nik_test_2 (id int, first string, last string);
insert into stg.nik_test_1 values(1,’NN’,’A1′);
insert into stg.nik_test_1 values(2,NULL,’A1′);
insert into stg.nik_test_1 values(NULL,’M1′,NULL);
insert into stg.nik_test_2 values(4,’Y1′,’S1′);
insert into stg.nik_test_2 values(5,NULL,’S1′);
insert into stg.nik_test_2 values(NULL,’N1′,NULL);
insert into stg.nik_test_2 values(2,’Y1′,’S1′);
Case -1:
explain using json select count(*) from
(select * from stg.nik_test_1) source
left outer join
(select * from stg.nik_test_2) target
on source.id = target.id
;
-- OUTPUT
{"GlobalStats":{"partitionsTotal":2,"partitionsAssigned":2,"bytesAssigned":3072},
"Operations":[[
{"id":0,"operation":"Result","expressions":["COUNT(*)"]},
{"id":1,"parent":0,"operation":"Aggregate","expressions":["aggExprs: [COUNT(*)]"]},
{"id":2,"parent":1,"operation":"LeftOuterJoin","expressions":["joinKey: (NIK_TEST_1.ID = NIK_TEST_2.ID)"]},
{"id":3,"parent":2,"operation":"TableScan","objects":["STG.NIK_TEST_1"],"expressions":["ID"],"partitionsAssigned":1,"partitionsTotal":1,"bytesAssigned":1536},
{"id":4,"parent":2,"operation":"Filter","expressions":["NIK_TEST_2.ID IS NOT NULL"]},
{"id":5,"parent":4,"operation":"JoinFilter","expressions":["joinKey: (NIK_TEST_1.ID = NIK_TEST_2.ID)"]},
{"id":6,"parent":5,"operation":"TableScan","objects":["STG.NIK_TEST_2"],"expressions":["ID"],"partitionsAssigned":1,"partitionsTotal":1,"bytesAssigned":1536}
]]}
Case-2:
explain using json select count(source.*) from
(select * from stg.nik_test_1) source
left outer join
(select * from stg.nik_test_2) target
on source.id = target.id
;
{"GlobalStats":{"partitionsTotal":2,"partitionsAssigned":2,"bytesAssigned":3072},"Operations":[[
{"id":0,"operation":"Result","expressions":["COUNT(COUNT(COUNT(NIK_TEST_1.ID, NIK_TEST_1.FIRST, NIK_TEST_1.LAST)))"]},
{"id":1,"parent":0,"operation":"LeftOuterJoin","expressions":["joinKey: (NIK_TEST_1.ID = NIK_TEST_2.ID)"]},
{"id":2,"parent":1,"operation":"TableScan","objects":["STG.NIK_TEST_1"],"expressions":["ID","FIRST","LAST"],"partitionsAssigned":1,"partitionsTotal":1,"bytesAssigned":1536},
{"id":3,"parent":1,"operation":"Filter","expressions":["NIK_TEST_2.ID IS NOT NULL"]},
{"id":4,"parent":3,"operation":"JoinFilter","expressions":["joinKey: (NIK_TEST_1.ID = NIK_TEST_2.ID)"]},
{"id":5,"parent":4,"operation":"TableScan","objects":["STG.NIK_TEST_2"],"expressions":["ID"],"partitionsAssigned":1,"partitionsTotal":1,"bytesAssigned":1536}
]]}
Case-3:
explain using json select count(target.*) from
(select * from stg.nik_test_1) source
left outer join
(select * from stg.nik_test_2) target
on source.id = target.id
;
{"GlobalStats":{"partitionsTotal":2,"partitionsAssigned":2,"bytesAssigned":3072},"Operations":[[
{"id":0,"operation":"Result","expressions":["COUNT(NIK_TEST_2.ID, NIK_TEST_2.FIRST, NIK_TEST_2.LAST)"]},
{"id":1,"parent":0,"operation":"Aggregate","expressions":["aggExprs: [COUNT(NIK_TEST_2.ID, NIK_TEST_2.FIRST, NIK_TEST_2.LAST)]"]},
{"id":2,"parent":1,"operation":"LeftOuterJoin","expressions":["joinKey: (NIK_TEST_1.ID = NIK_TEST_2.ID)"]},
{"id":3,"parent":2,"operation":"TableScan","objects":["STG.NIK_TEST_1"],"expressions":["ID"],"partitionsAssigned":1,"partitionsTotal":1,"bytesAssigned":1536},
{"id":4,"parent":2,"operation":"Filter","expressions":["NIK_TEST_2.ID IS NOT NULL"]},
{"id":5,"parent":4,"operation":"JoinFilter","expressions":["joinKey: (NIK_TEST_1.ID = NIK_TEST_2.ID)"]},
{"id":6,"parent":5,"operation":"TableScan","objects":["STG.NIK_TEST_2"],"expressions":["ID","FIRST","LAST"],"partitionsAssigned":1,"partitionsTotal":1,"bytesAssigned":1536}
]]}