Observations :
- Count(1) or Count(*) – This is never expanded on each column individually so will work perfectly fine on complete data.
- Count(1) is more optimized then Count(*)
- Count(source.*) – source represents “Left table” of “Left Outer Join”:
- This will be evaluated as Count(source.col1, source.col2, …. source.colN )
- So, if any column has NULL, then the complete row will get filtered out in the count result.
- Rows, in which all columns from the Source query has “NOT NULLs” will be counted only
- Count(target.*) – target represents “Right table” of “Left Outer Join”
- Same explanation as above.
- This will be evaluated as Count(target.col1, target.col2, …. target.colN )
- So, if any column has NULL, then the complete row will get filtered out in the count result.
- Rows, in which all columns from the Target query has “NOT NULLs” will be counted only
- Solution to count target table – Use any column (not limited to key column with non NULL values) for which we will have no NULLs (e.g. bookmark/last_modified_ts column, as this, is linked to each row and will have current_timestamp value)
The above was analysed from the Query plan. Please refer to the below queries :
Source Table — stg.nik_test_1
Target Table — stg.nik_test_2
DDLs:
create table stg.nik_test_1 (id int, first string, last string);
create table stg.nik_test_2 (id int, first string, last string);
insert into stg.nik_test_1 values(1,’NN’,’A1′);
insert into stg.nik_test_1 values(2,NULL,’A1′);
insert into stg.nik_test_1 values(NULL,’M1′,NULL);
insert into stg.nik_test_2 values(4,’Y1′,’S1′);
insert into stg.nik_test_2 values(5,NULL,’S1′);
insert into stg.nik_test_2 values(NULL,’N1′,NULL);
insert into stg.nik_test_2 values(2,’Y1′,’S1′);
Case -1:
explain using json select count(*) from (select * from stg.nik_test_1) source left outer join (select * from stg.nik_test_2) target on source.id = target.id ; -- OUTPUT {"GlobalStats":{"partitionsTotal":2,"partitionsAssigned":2,"bytesAssigned":3072}, "Operations":[[ {"id":0,"operation":"Result","expressions":["COUNT(*)"]}, {"id":1,"parent":0,"operation":"Aggregate","expressions":["aggExprs: [COUNT(*)]"]}, {"id":2,"parent":1,"operation":"LeftOuterJoin","expressions":["joinKey: (NIK_TEST_1.ID = NIK_TEST_2.ID)"]}, {"id":3,"parent":2,"operation":"TableScan","objects":["STG.NIK_TEST_1"],"expressions":["ID"],"partitionsAssigned":1,"partitionsTotal":1,"bytesAssigned":1536}, {"id":4,"parent":2,"operation":"Filter","expressions":["NIK_TEST_2.ID IS NOT NULL"]}, {"id":5,"parent":4,"operation":"JoinFilter","expressions":["joinKey: (NIK_TEST_1.ID = NIK_TEST_2.ID)"]}, {"id":6,"parent":5,"operation":"TableScan","objects":["STG.NIK_TEST_2"],"expressions":["ID"],"partitionsAssigned":1,"partitionsTotal":1,"bytesAssigned":1536} ]]}
Case-2:
explain using json select count(source.*) from (select * from stg.nik_test_1) source left outer join (select * from stg.nik_test_2) target on source.id = target.id ; {"GlobalStats":{"partitionsTotal":2,"partitionsAssigned":2,"bytesAssigned":3072},"Operations":[[ {"id":0,"operation":"Result","expressions":["COUNT(COUNT(COUNT(NIK_TEST_1.ID, NIK_TEST_1.FIRST, NIK_TEST_1.LAST)))"]}, {"id":1,"parent":0,"operation":"LeftOuterJoin","expressions":["joinKey: (NIK_TEST_1.ID = NIK_TEST_2.ID)"]}, {"id":2,"parent":1,"operation":"TableScan","objects":["STG.NIK_TEST_1"],"expressions":["ID","FIRST","LAST"],"partitionsAssigned":1,"partitionsTotal":1,"bytesAssigned":1536}, {"id":3,"parent":1,"operation":"Filter","expressions":["NIK_TEST_2.ID IS NOT NULL"]}, {"id":4,"parent":3,"operation":"JoinFilter","expressions":["joinKey: (NIK_TEST_1.ID = NIK_TEST_2.ID)"]}, {"id":5,"parent":4,"operation":"TableScan","objects":["STG.NIK_TEST_2"],"expressions":["ID"],"partitionsAssigned":1,"partitionsTotal":1,"bytesAssigned":1536} ]]}
Case-3:
explain using json select count(target.*) from (select * from stg.nik_test_1) source left outer join (select * from stg.nik_test_2) target on source.id = target.id ; {"GlobalStats":{"partitionsTotal":2,"partitionsAssigned":2,"bytesAssigned":3072},"Operations":[[ {"id":0,"operation":"Result","expressions":["COUNT(NIK_TEST_2.ID, NIK_TEST_2.FIRST, NIK_TEST_2.LAST)"]}, {"id":1,"parent":0,"operation":"Aggregate","expressions":["aggExprs: [COUNT(NIK_TEST_2.ID, NIK_TEST_2.FIRST, NIK_TEST_2.LAST)]"]}, {"id":2,"parent":1,"operation":"LeftOuterJoin","expressions":["joinKey: (NIK_TEST_1.ID = NIK_TEST_2.ID)"]}, {"id":3,"parent":2,"operation":"TableScan","objects":["STG.NIK_TEST_1"],"expressions":["ID"],"partitionsAssigned":1,"partitionsTotal":1,"bytesAssigned":1536}, {"id":4,"parent":2,"operation":"Filter","expressions":["NIK_TEST_2.ID IS NOT NULL"]}, {"id":5,"parent":4,"operation":"JoinFilter","expressions":["joinKey: (NIK_TEST_1.ID = NIK_TEST_2.ID)"]}, {"id":6,"parent":5,"operation":"TableScan","objects":["STG.NIK_TEST_2"],"expressions":["ID","FIRST","LAST"],"partitionsAssigned":1,"partitionsTotal":1,"bytesAssigned":1536} ]]}