非分布key join使用hash join性能一般最好
最近更新时间: 2025-02-18 16:02:00
为了提高tp类业务查询的性能,我们经常需要对一些字段建立索引,使用有索引字段join时系统往往也会使用Merge Cond和nestloop。
mydb=# CREATE TABLE t1(f1 serial not null,f2 text,f3 text,f4 text,f5 text,f6 text,f7 text,f8 text,f9 text,f10 text,f11 text,f12 text) distribute by shard(f1);
CREATE TABLE
Time: 481.042 ms
mydb=# create index t1_f1_idx on t1(f2);
CREATE INDEX
Time: 85.521 ms
mydb=# CREATE TABLE t2(f1 serial not null,f2 text,f3 text,f4 text,f5 text,f6 text,f7 text,f8 text,f9 text,f10 text,f11 text,f12 text) distribute by shard(f1);
CREATE TABLE
Time: 75.973 ms
mydb=# create index t2_f1_idx on t2(f2);
CREATE INDEX
Time: 29.890 ms
mydb=# insert into t1 select t,md5(t::text),md5(t::text),md5(t::text),md5(t::text),md5(t::text),md5(t::text),md5(t::text),md5(t::text),md5(t::text),md5(t::text),md5(t::text) from generate_series(1,1000000) as t;
INSERT 0 1000000
Time: 16450.623 ms (00:16.451)
mydb=# insert into t2 select t,md5(t::text),md5(t::text),md5(t::text),md5(t::text),md5(t::text),md5(t::text),md5(t::text),md5(t::text),md5(t::text),md5(t::text),md5(t::text) from generate_series(1,1000000) as t;
INSERT 0 1000000
Time: 17218.738 ms (00:17.219)
mydb=# analyze t1;
ANALYZE
Time: 2219.341 ms (00:02.219)
mydb=# analyze t2;
ANALYZE
Time: 1649.506 ms (00:01.650)
mydb=#
--merge join
mydb=# explain select t1.* from t1,t2 where t1.f2=t2.f2 limit 10;
QUERY PLAN
-------------------------------------------------------------------------------------------------------------------------
Limit (cost=100.25..102.78 rows=10 width=367)
-> Remote Subquery Scan on all (dn001,dn002) (cost=100.25..102.78 rows=10 width=367)
-> Limit (cost=0.25..2.73 rows=10 width=367)
-> Merge Join (cost=0.25..248056.80 rows=1000000 width=367)
Merge Cond: (t1.f2 = t2.f2)
-> Remote Subquery Scan on all (dn001,dn002) (cost=100.12..487380.85 rows=1000000 width=367)
Distribute results by S: f2
-> Index Scan using t1_f1_idx on t1 (cost=0.12..115280.85 rows=1000000 width=367)
-> Materialize (cost=100.12..155875.95 rows=1000000 width=33)
-> Remote Subquery Scan on all (dn001,dn002) (cost=100.12..153375.95 rows=1000000 width=33)
Distribute results by S: f2
-> Index Only Scan using t2_f1_idx on t2 (cost=0.12..115275.95 rows=1000000 width=33)
(12 rows)
Time: 4.183 ms
mydb=# explain analyze select t1.* from t1,t2 where t1.f2=t2.f2 limit 10;
QUERY PLAN
-------------------------------------------------------------------------------------------------------------------------------------------
Limit (cost=100.25..102.78 rows=10 width=367) (actual time=6555.346..6556.296 rows=10 loops=1)
-> Remote Subquery Scan on all (dn001,dn002) (cost=100.25..102.78 rows=10 width=367) (actual time=6555.343..6555.349 rows=10 loops=1)
Planning time: 0.473 ms
Execution time: 6569.828 ms
(4 rows)
Time: 6614.439 ms (00:06.614)
--nested loop
mydb=# set enable_mergejoin to off;
SET
Time: 0.422 ms
mydb=# explain select t1.* from t1,t2 where t1.f2=t2.f2 limit 10;
QUERY PLAN
--------------------------------------------------------------------------------------------------------------------
Limit (cost=100.12..103.57 rows=10 width=367)
-> Remote Subquery Scan on all (dn001,dn002) (cost=100.12..103.57 rows=10 width=367)
-> Limit (cost=0.12..3.52 rows=10 width=367)
-> Nested Loop (cost=0.12..339232.00 rows=1000000 width=367)
-> Remote Subquery Scan on all (dn001,dn002) (cost=100.00..434740.00 rows=1000000 width=367)
Distribute results by S: f2
-> Seq Scan on t1 (cost=0.00..62640.00 rows=1000000 width=367)
-> Materialize (cost=100.12..100.31 rows=1 width=33)
-> Remote Subquery Scan on all (dn001,dn002) (cost=100.12..100.30 rows=1 width=33)
Distribute results by S: f2
-> Index Only Scan using t2_f1_idx on t2 (cost=0.12..0.27 rows=1 width=33)
Index Cond: (f2 = t1.f2)
(12 rows)
Time: 1.033 ms
mydb=# explain analyze select t1.* from t1,t2 where t1.f2=t2.f2 limit 10;
QUERY PLAN
-------------------------------------------------------------------------------------------------------------------------------------------
Limit (cost=100.12..103.57 rows=10 width=367) (actual time=5608.326..5609.571 rows=10 loops=1)
-> Remote Subquery Scan on all (dn001,dn002) (cost=100.12..103.57 rows=10 width=367) (actual time=5608.323..5608.349 rows=10 loops=1)
Planning time: 0.347 ms
Execution time: 5669.901 ms
(4 rows)
Time: 5672.584 ms (00:05.673)
mydb=# set enable_nestloop to off;
SET
Time: 0.436 ms
mydb=# explain select t1.* from t1,t2 where t1.f2=t2.f2 limit 10;
QUERY PLAN
-------------------------------------------------------------------------------------------------------------------------
Limit (cost=85983.00..85984.94 rows=10 width=367)
-> Remote Subquery Scan on all (dn001,dn002) (cost=85983.00..85984.94 rows=10 width=367)
-> Limit (cost=85883.00..85884.89 rows=10 width=367)
-> Hash Join (cost=85883.00..274580.00 rows=1000000 width=367)
Hash Cond: (t1.f2 = t2.f2)
-> Remote Subquery Scan on all (dn001,dn002) (cost=100.00..434740.00 rows=1000000 width=367)
Distribute results by S: f2
-> Seq Scan on t1 (cost=0.00..62640.00 rows=1000000 width=367)
-> Hash (cost=100740.00..100740.00 rows=1000000 width=33)
-> Remote Subquery Scan on all (dn001,dn002) (cost=100.00..100740.00 rows=1000000 width=33)
Distribute results by S: f2
-> Seq Scan on t2 (cost=0.00..62640.00 rows=1000000 width=33)
(12 rows)
Time: 1.141 ms
mydb=# explain analyze select t1.* from t1,t2 where t1.f2=t2.f2 limit 10;
QUERY PLAN
-----------------------------------------------------------------------------------------------------------------------------------------------
Limit (cost=85983.00..85984.94 rows=10 width=367) (actual time=1083.691..1085.962 rows=10 loops=1)
-> Remote Subquery Scan on all (dn001,dn002) (cost=85983.00..85984.94 rows=10 width=367) (actual time=1083.688..1083.699 rows=10 loops=1)
Planning time: 0.530 ms
Execution time: 1108.830 ms
(4 rows)
Time: 1117.713 ms (00:01.118)
mydb=#