非分布key join使用hash join性能一般最好

最近更新时间: 2025-02-18 16:02:00

为了提高tp类业务查询的性能,我们经常需要对一些字段建立索引,使用有索引字段join时系统往往也会使用Merge Cond和nestloop。

  • 复制
    复制成功
mydb=# CREATE TABLE t1(f1 serial not null,f2 text,f3 text,f4 text,f5 text,f6 text,f7 text,f8 text,f9 text,f10 text,f11 text,f12 text) distribute by shard(f1); 
CREATE TABLE
Time: 481.042 ms

mydb=# create index t1_f1_idx on t1(f2); 
CREATE INDEX
Time: 85.521 ms

mydb=# CREATE TABLE t2(f1 serial not null,f2 text,f3 text,f4 text,f5 text,f6 text,f7 text,f8 text,f9 text,f10 text,f11 text,f12 text) distribute by shard(f1); 
CREATE TABLE
Time: 75.973 ms

mydb=# create index t2_f1_idx on t2(f2);  
CREATE INDEX
Time: 29.890 ms

mydb=# insert into t1 select t,md5(t::text),md5(t::text),md5(t::text),md5(t::text),md5(t::text),md5(t::text),md5(t::text),md5(t::text),md5(t::text),md5(t::text),md5(t::text) from generate_series(1,1000000) as t;
INSERT 0 1000000
Time: 16450.623 ms (00:16.451)

mydb=# insert into t2 select t,md5(t::text),md5(t::text),md5(t::text),md5(t::text),md5(t::text),md5(t::text),md5(t::text),md5(t::text),md5(t::text),md5(t::text),md5(t::text) from generate_series(1,1000000) as t; 
INSERT 0 1000000
Time: 17218.738 ms (00:17.219)
mydb=# analyze t1;
ANALYZE
Time: 2219.341 ms (00:02.219)
mydb=# analyze t2;
ANALYZE
Time: 1649.506 ms (00:01.650)
mydb=# 
--merge join
mydb=# explain select t1.* from t1,t2 where t1.f2=t2.f2 limit 10;
                                    QUERY PLAN                                                        
-------------------------------------------------------------------------------------------------------------------------
 Limit  (cost=100.25..102.78 rows=10 width=367)
   ->  Remote Subquery Scan on all (dn001,dn002)  (cost=100.25..102.78 rows=10 width=367)
         ->  Limit  (cost=0.25..2.73 rows=10 width=367)
               ->  Merge Join  (cost=0.25..248056.80 rows=1000000 width=367)
                     Merge Cond: (t1.f2 = t2.f2)
                     ->  Remote Subquery Scan on all (dn001,dn002)  (cost=100.12..487380.85 rows=1000000 width=367)
                           Distribute results by S: f2
                           ->  Index Scan using t1_f1_idx on t1  (cost=0.12..115280.85 rows=1000000 width=367)
                     ->  Materialize  (cost=100.12..155875.95 rows=1000000 width=33)
                           ->  Remote Subquery Scan on all (dn001,dn002)  (cost=100.12..153375.95 rows=1000000 width=33)
                                 Distribute results by S: f2
                                 ->  Index Only Scan using t2_f1_idx on t2  (cost=0.12..115275.95 rows=1000000 width=33)
(12 rows)

Time: 4.183 ms
mydb=# explain analyze select t1.* from t1,t2 where t1.f2=t2.f2 limit 10;
                                          QUERY PLAN                                                                 
-------------------------------------------------------------------------------------------------------------------------------------------
 Limit  (cost=100.25..102.78 rows=10 width=367) (actual time=6555.346..6556.296 rows=10 loops=1)
   ->  Remote Subquery Scan on all (dn001,dn002)  (cost=100.25..102.78 rows=10 width=367) (actual time=6555.343..6555.349 rows=10 loops=1)
 Planning time: 0.473 ms
 Execution time: 6569.828 ms
(4 rows)

Time: 6614.439 ms (00:06.614)

--nested loop

mydb=# set enable_mergejoin to off;
SET
Time: 0.422 ms
mydb=# explain select t1.* from t1,t2 where t1.f2=t2.f2 limit 10;
                                          QUERY PLAN                                                     
--------------------------------------------------------------------------------------------------------------------
 Limit  (cost=100.12..103.57 rows=10 width=367)
   ->  Remote Subquery Scan on all (dn001,dn002)  (cost=100.12..103.57 rows=10 width=367)
         ->  Limit  (cost=0.12..3.52 rows=10 width=367)
               ->  Nested Loop  (cost=0.12..339232.00 rows=1000000 width=367)
                     ->  Remote Subquery Scan on all (dn001,dn002)  (cost=100.00..434740.00 rows=1000000 width=367)
                           Distribute results by S: f2
                           ->  Seq Scan on t1  (cost=0.00..62640.00 rows=1000000 width=367)
                     ->  Materialize  (cost=100.12..100.31 rows=1 width=33)
                           ->  Remote Subquery Scan on all (dn001,dn002)  (cost=100.12..100.30 rows=1 width=33)
                                 Distribute results by S: f2
                                 ->  Index Only Scan using t2_f1_idx on t2  (cost=0.12..0.27 rows=1 width=33)
                                       Index Cond: (f2 = t1.f2)
(12 rows)

Time: 1.033 ms
mydb=# explain analyze select t1.* from t1,t2 where t1.f2=t2.f2 limit 10;
                                              QUERY PLAN                                                                 
-------------------------------------------------------------------------------------------------------------------------------------------
 Limit  (cost=100.12..103.57 rows=10 width=367) (actual time=5608.326..5609.571 rows=10 loops=1)
   ->  Remote Subquery Scan on all (dn001,dn002)  (cost=100.12..103.57 rows=10 width=367) (actual time=5608.323..5608.349 rows=10 loops=1)
 Planning time: 0.347 ms
 Execution time: 5669.901 ms
(4 rows)

Time: 5672.584 ms (00:05.673)

mydb=# set enable_nestloop to off;
SET
Time: 0.436 ms
mydb=#  explain select t1.* from t1,t2 where t1.f2=t2.f2 limit 10;
                                             QUERY PLAN                                                        
-------------------------------------------------------------------------------------------------------------------------
 Limit  (cost=85983.00..85984.94 rows=10 width=367)
   ->  Remote Subquery Scan on all (dn001,dn002)  (cost=85983.00..85984.94 rows=10 width=367)
         ->  Limit  (cost=85883.00..85884.89 rows=10 width=367)
               ->  Hash Join  (cost=85883.00..274580.00 rows=1000000 width=367)
                     Hash Cond: (t1.f2 = t2.f2)
                     ->  Remote Subquery Scan on all (dn001,dn002)  (cost=100.00..434740.00 rows=1000000 width=367)
                           Distribute results by S: f2
                           ->  Seq Scan on t1  (cost=0.00..62640.00 rows=1000000 width=367)
                     ->  Hash  (cost=100740.00..100740.00 rows=1000000 width=33)
                           ->  Remote Subquery Scan on all (dn001,dn002)  (cost=100.00..100740.00 rows=1000000 width=33)
                                 Distribute results by S: f2
                                 ->  Seq Scan on t2  (cost=0.00..62640.00 rows=1000000 width=33)
(12 rows)
Time: 1.141 ms
mydb=#  explain analyze select t1.* from t1,t2 where t1.f2=t2.f2 limit 10;
                                                                  QUERY PLAN                                                                   
-----------------------------------------------------------------------------------------------------------------------------------------------
 Limit  (cost=85983.00..85984.94 rows=10 width=367) (actual time=1083.691..1085.962 rows=10 loops=1)
   ->  Remote Subquery Scan on all (dn001,dn002)  (cost=85983.00..85984.94 rows=10 width=367) (actual time=1083.688..1083.699 rows=10 loops=1)
 Planning time: 0.530 ms
 Execution time: 1108.830 ms
(4 rows)
Time: 1117.713 ms (00:01.118)
mydb=#