chore: remove the static filter for merge into (#14092)

databendlabs · Dec 27, 2023 · a226452 · a226452
2 parents 51f3514 + 9b63dd3
commit a226452
Show file tree

Hide file tree

Showing 7 changed files with 101 additions and 41 deletions.
diff --git a/src/query/service/src/interpreters/interpreter_merge_into.rs b/src/query/service/src/interpreters/interpreter_merge_into.rs
@@ -193,12 +193,10 @@ impl MergeIntoInterpreter {
             (input, false)
         };
 
-        // let optimized_input =
-        //     Self::build_static_filter(&input, meta_data, self.ctx.clone(), check_table).await?;
         let mut builder = PhysicalPlanBuilder::new(meta_data.clone(), self.ctx.clone(), false);
-
         // build source for MergeInto
-        let join_input = builder.build(input.as_ref(), *columns_set.clone()).await?;
+        let join_input = builder.build(&input, *columns_set.clone()).await?;
+
 
         // find row_id column index
         let join_output_schema = join_input.output_schema()?;

diff --git a/src/query/service/src/interpreters/mod.rs b/src/query/service/src/interpreters/mod.rs
@@ -48,7 +48,6 @@ mod interpreter_index_refresh;
 mod interpreter_insert;
 mod interpreter_kill;
 mod interpreter_merge_into;
-mod interpreter_merge_into_static_filter;
 mod interpreter_metrics;
 mod interpreter_network_policies_show;
 mod interpreter_network_policy_alter;

diff --git a/src/query/service/src/pipelines/builders/builder_merge_into.rs b/src/query/service/src/pipelines/builders/builder_merge_into.rs
@@ -738,7 +738,12 @@ impl PipelineBuilder {
                 // insert only
                 (output_lens, 0)
             } else {
-                // (with row_id and without row_number/unmatched) or (without row_id and with row_number/unmatched)
+                // I. (with row_id and without row_number/unmatched) (need_match and !need_unmatch)
+                // II. (without row_id and with row_number/unmatched) (!need_match and need_unmatch)
+                // Strictly speaking, case II should be (output_lens - 1, 1); there
+                // output_lens = 1, so it would be (0, 1) and we only need to append one
+                // dummy item. We use (output_lens - 1, 0) instead because both forms
+                // produce the same result (exactly one dummy item is appended).
                 (output_lens - 1, 0)
             };
             table.cluster_gen_for_append_with_specified_len(

diff --git a/...llogictests/suites/base/09_fuse_engine/09_0033_distributed_merge_into_without_enable.test b/...llogictests/suites/base/09_fuse_engine/09_0033_distributed_merge_into_without_enable.test
@@ -10,7 +10,7 @@ statement ok
 drop table if exists distributed_source_test;
 
 statement ok
-create table distributed_target_test(a int,b string);
+create table distributed_target_test(a int,b string) cluster by(a,b);
 
 ## multi blocks
 statement ok
@@ -42,7 +42,7 @@ select * from distributed_target_test order by a;
 8 x
 
 statement ok
-create table distributed_source_test(a int,b string,is_databend_deleted bool);
+create table distributed_source_test(a int,b string,is_databend_deleted bool) cluster by(a,b);
 
 statement ok
 insert into distributed_source_test values(1,'d',true),(2,'e',true),(3,'f',false),(4,'e',true),(5,'f',false);
@@ -82,13 +82,13 @@ statement ok
 drop table if exists corner_target_table;
 
 statement ok
-create table corner_target_table(a int,b string,c string);
+create table corner_target_table(a int,b string,c string) cluster by(a,b);
 
 statement ok
 drop table if exists corner_source_table;
 
 statement ok
-create table corner_source_table(a int,b string,c string);
+create table corner_source_table(a int,b string,c string) cluster by(a,b);
 
 ## add block1
 statement ok
@@ -164,7 +164,7 @@ statement ok
 merge into distributed_test_order as t using (select id,34 as id1,238 as id2, id3, id4, id5, id6, id7,s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,d1, d2, d3, d4, d5, d6, d7, d8, d9, d10,insert_time,insert_time1,insert_time2,insert_time3,i from distributed_random_store) as s on t.id = s.id and t.insert_time = s.insert_time when matched then update * when not matched then insert *;
 
 statement ok
-create table orders2(a int,b string,c string);
+create table orders2(a int,b string,c string) cluster by(a,b);
 
 statement ok
 insert into orders2 values(1,'a1','b1'),(2,'a2','b2'),(3,'a3','b3');

diff --git a/.../sqllogictests/suites/base/09_fuse_engine/09_0034_pr13848_without_distributed_enable.test b/.../sqllogictests/suites/base/09_fuse_engine/09_0034_pr13848_without_distributed_enable.test
@@ -3,10 +3,10 @@ statement ok
 set enable_experimental_merge_into = 1;
 
 statement ok
-create table merge_target_0(a int,b string);
+create table merge_target_0(a int,b string) cluster by(a,b);
 
 statement ok
-create table merge_source_0(a int,b string);
+create table merge_source_0(a int,b string) cluster by(a,b);
 
 statement ok
 insert into merge_target_0 values(1,'a1'),(2,'b1');
@@ -87,7 +87,7 @@ select * from merge_target_0 order by a,b;
 
 ### test copy into table unsupport
 statement ok
-create table copy_table_test0(a int,b string);
+create table copy_table_test0(a int,b string) cluster by(a,b);
 
 statement ok
 create stage parquet_table0 FILE_FORMAT = (TYPE = PARQUET);

diff --git a/.../base/09_fuse_engine/09_0035_merge_into_separate_pipeline_without_distributed_enable.test b/.../base/09_fuse_engine/09_0035_merge_into_separate_pipeline_without_distributed_enable.test
@@ -8,7 +8,7 @@ statement ok
 drop table if exists t2_separate;
 
 statement ok
-create table t1_separate(a int,b string, c string);
+create table t1_separate(a int,b string, c string) cluster by(a,b);
 
 statement ok
 create table t2_separate(a int,b string, c string);
@@ -92,5 +92,63 @@ select * from t1_separate order by a,b,c;
 8 a8 b8
 9 a9 b9
 
+## test insert-only cluster by
+statement ok
+truncate table t1_separate;
+
+statement ok
+truncate table t2_separate;
+
+statement ok
+insert into t2_separate values(8,'a8','b8'),(9,'a9','b9'),(1,'a5','b5'),(3,'a6','b6');
+
+query T
+merge into t1_separate as t1 using (select * from t2_separate) as t2 on t1.a = t2.a when not matched then insert *;
+----
+4
+
+## without order by
+query TTT
+select * from t1_separate;
+----
+1 a5 b5
+3 a6 b6
+8 a8 b8
+9 a9 b9
+
+## test matched-only cluster by
+query T
+merge into t1_separate as t1 using (select * from t2_separate) as t2 on t1.a = t2.a when matched then update *;
+----
+4
+
+query TTT
+select * from t1_separate;
+----
+1 a5 b5
+3 a6 b6
+8 a8 b8
+9 a9 b9
+
+## test full operation cluster by
+statement ok
+insert into t2_separate values(5,'a5','b5'),(7,'a7','b7');
+
+query TT
+merge into t1_separate as t1 using (select * from t2_separate) as t2 on t1.a = t2.a when matched then update * when not matched then insert *;
+----
+2 4
+
+## we will do compact
+query TTT
+select * from t1_separate;
+----
+1 a5 b5
+3 a6 b6
+5 a5 b5
+7 a7 b7
+8 a8 b8
+9 a9 b9
+
 statement ok
 set enable_experimental_merge_into = 0;
diff --git a/...llogictests/suites/base/09_fuse_engine/09_0036_merge_into_without_distributed_enable.test b/...llogictests/suites/base/09_fuse_engine/09_0036_merge_into_without_distributed_enable.test
@@ -8,10 +8,10 @@ statement ok
 drop table if exists t2;
 
 statement ok
-create table t1(a int,b string, c string);
+create table t1(a int,b string, c string) cluster by(a,b);
 
 statement ok
-create table t2(a int,b string, c string);
+create table t2(a int,b string, c string) cluster by(a,b);
 
 statement ok
 insert into t1 values(1,'b1','c1'),(2,'b2','c2');
@@ -276,7 +276,7 @@ statement ok
 drop table if exists target_table;
 
 statement ok
-create table target_table(a int,b string,c string);
+create table target_table(a int,b string,c string) cluster by(a,b);
 
 statement ok
 insert into target_table values(1,'a_1','b_1'),(2,'a_2','b_2');
@@ -288,7 +288,7 @@ select * from target_table order by a,b,c;
 2 a_2 b_2
 
 statement ok
-create table test_stage(a int,b string,c string);
+create table test_stage(a int,b string,c string) cluster by(a,b);
 
 statement ok
 insert into test_stage values(1,'a1','b1'),(2,'a2','b2'),(3,'a3','b3');
@@ -468,10 +468,10 @@ select * from t1 order by a,b,c;
 1 a1 b1
 
 statement ok
-CREATE TABLE employees (employee_id INT, employee_name VARCHAR(255),department VARCHAR(255));
+CREATE TABLE employees (employee_id INT, employee_name VARCHAR(255),department VARCHAR(255))  cluster by(employee_id,employee_name);
 
 statement ok
-CREATE TABLE salaries (employee_id INT,salary DECIMAL(10, 2));
+CREATE TABLE salaries (employee_id INT,salary DECIMAL(10, 2)) cluster by(employee_id,salary);
 
 statement ok
 INSERT INTO employees VALUES(1, 'Alice', 'HR'),(2, 'Bob', 'IT'),(3, 'Charlie', 'Finance'),(4, 'David', 'HR');
@@ -494,10 +494,10 @@ select * from salaries order by employee_id;
 
 ## null cast bug fix
 statement ok
-create table t1_target(a int not null);
+create table t1_target(a int not null) cluster by(a);
 
 statement ok
-create table t2_source(a int not null);
+create table t2_source(a int not null) cluster by(a);
 
 statement ok
 insert into t1_target values(1);
@@ -559,13 +559,13 @@ statement ok
 drop table if exists source_test;
 
 statement ok
-create table target_test(a int,b string);
+create table target_test(a int,b string) cluster by(a,b);
 
 statement ok
 insert into target_test values(1,'a'),(2,'b'),(3,'c');
 
 statement ok
-create table source_test(a int,b string,delete_flag bool);
+create table source_test(a int,b string,delete_flag bool) cluster by(a,b);
 
 statement ok
 insert into source_test values(1,'d',true),(2,'e',true),(3,'f',false),(4,'e',true),(5,'f',false);
@@ -609,10 +609,10 @@ merge into test_order as t using (select id,34 as id1,238 as id2, id3, id4, id5,
 
 ## test update list #13297
 statement ok
-create table t11(a int,b string, c string);
+create table t11(a int,b string, c string) cluster by(a,b);
 
 statement ok
-create table t12(a int,b string, c string);
+create table t12(a int,b string, c string) cluster by(a,b);
 
 statement ok
 insert into t11 values(1,'b1','c1'),(2,'b2','c2');
@@ -628,7 +628,7 @@ merge into t11 using (select a, c from t12) as t12 on t11.a = t12.a when matched
 
 ## test issue #13287
 statement ok
-create table tt1 (a int, b int);
+create table tt1 (a int, b int) cluster by(a,b);
 
 statement error 1065
 merge into tt1 using(select 10, 20) as tt2 on tt1.a = 1 when not matched and tt1.b = 2 then insert values (10, 20);
@@ -645,7 +645,7 @@ select count(*) from tt1;
 
 ## test issue #13367
 statement ok
-create table tt2(a bool, b variant, c map(string, string));
+create table tt2(a bool, b variant, c map(string, string)) cluster by(a);
 
 statement ok
 insert into tt2 values (true, '10', {'k1':'v1'}), (false, '20', {'k2':'v2'})
@@ -669,10 +669,10 @@ statement ok
 drop table if exists t2;
 
 statement ok
-create table t1(a int);
+create table t1(a int) cluster by(a);
 
 statement ok
-create table t2(a int);
+create table t2(a int) cluster by(a);
 
 statement ok
 insert into t1 values(1);
@@ -697,10 +697,10 @@ statement ok
 drop table if exists t2;
 
 statement ok
-create table t1(b int);
+create table t1(b int) cluster by(b);
 
 statement ok
-create table t2(a int);
+create table t2(a int) cluster by(a);
 
 statement ok
 insert into t1 values(1);
@@ -719,10 +719,10 @@ statement ok
 drop table if exists t2;
 
 statement ok
-create table t1(a int,b string,c bool);
+create table t1(a int,b string,c bool) cluster by(a,b);
 
 statement ok
-create table t2(a int,b string,c bool);
+create table t2(a int,b string,c bool) cluster by(a,b);
 
 statement ok
 insert into t1 values(1,'a1',true),(2,'a2',false),(3,'a3',true);
@@ -783,7 +783,7 @@ statement ok
 drop table if exists tt1;
 
 statement ok
-create table tt1(a bool, b int);
+create table tt1(a bool, b int) cluster by(a,b);
 
 statement ok
 insert into tt1 values (true, 1), (false, 2);
@@ -806,10 +806,10 @@ statement ok
 drop table if exists t12;
 
 statement ok
-create table t12 (a int, b int);
+create table t12 (a int, b int) cluster by(a,b);
 
 statement ok
-create table t11 (a int, b int);
+create table t11 (a int, b int) cluster by(a,b);
 
 statement ok
 insert into t11 values (1, 10),(2, 20),(3, 30),(4, 40);
@@ -1006,7 +1006,7 @@ FROM orders;
 64.16764110 6.416764110000 1.97683658 19.29134884
 
 statement ok
-create table tb_01 (id int,c1 varchar,c2 datetime(0),c3 json);
+create table tb_01 (id int,c1 varchar,c2 datetime(0),c3 json) cluster by(c1,c2);
 
 statement ok
 create table tmp_01 like tb_01;
@@ -1026,10 +1026,10 @@ select id,c1,to_date(c2),c3 from tb_01;
 
 ## test #issue13932
 statement ok
-create table null_target(a int not null,b text);
+create table null_target(a int not null,b text) cluster by(a,b);
 
 statement ok
-create table null_source(a int not null,b text);
+create table null_source(a int not null,b text) cluster by(a,b);
 
 statement ok
 insert into null_target values(1,'a1');