fix: do compact first, then do sort for cluster table (#14707)
* add block count test

* fix flaky

* fix

* fix test
JackTan25 authored Feb 21, 2024
1 parent c5f2be7 · commit 926ac0a
Showing 2 changed files with 61 additions and 33 deletions.
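This commit reorders two stages of the merge-into append pipeline: block compaction (TransformCompact driving a BlockCompactor) becomes step 3 and now runs before the cluster sort, which moves to step 4. A plausible reading of the swap: merging blocks that the cluster sort has already ordered internally yields a block that is no longer ordered, so compaction must happen before the sort rather than after it; compacting first also means the sort and the writer both see fewer, larger blocks. The toy example below illustrates the ordering argument; the functions are made up for illustration and are not Databend's types.

    // Per-block "cluster sort": each block ends up ordered internally.
    fn sort_block(mut b: Vec<i32>) -> Vec<i32> {
        b.sort_unstable();
        b
    }

    // Naive "compaction": merge small blocks into one bigger block.
    fn compact(blocks: Vec<Vec<i32>>) -> Vec<Vec<i32>> {
        vec![blocks.into_iter().flatten().collect()]
    }

    fn main() {
        let blocks = vec![vec![3, 1], vec![2, 0]];

        // sort, then compact: [[1, 3, 0, 2]] -- merging breaks the ordering
        let sort_then_compact =
            compact(blocks.clone().into_iter().map(sort_block).collect());

        // compact, then sort: [[0, 1, 2, 3]] -- one block, still ordered
        let compact_then_sort: Vec<Vec<i32>> =
            compact(blocks).into_iter().map(sort_block).collect();

        println!("{sort_then_compact:?} vs {compact_then_sort:?}");
    }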
src/query/service/src/pipelines/builders/builder_merge_into.rs (56 additions, 33 deletions)
@@ -229,18 +229,9 @@ impl PipelineBuilder {
             self.main_pipeline.add_pipe(builder.finalize());
         }
 
-        // 3. cluster sort
-        let block_thresholds = table.get_block_thresholds();
-        table.cluster_gen_for_append_with_specified_len(
-            self.ctx.clone(),
-            &mut self.main_pipeline,
-            block_thresholds,
-            1,
-            1,
-        )?;
-
-        // 4. we should avoid too much little block write, because for s3 write, there are too many
+        // 3. we should avoid too much little block write, because for s3 write, there are too many
         // little blocks, it will cause high latency.
+        let block_thresholds = table.get_block_thresholds();
         let mut builder = self.main_pipeline.add_transform_with_specified_len(
             |transform_input_port, transform_output_port| {
                 Ok(ProcessorPtr::create(TransformCompact::try_create(
@@ -254,6 +245,15 @@
         builder.add_items(vec![create_dummy_item()]);
         self.main_pipeline.add_pipe(builder.finalize());
 
+        // 4. cluster sort
+        table.cluster_gen_for_append_with_specified_len(
+            self.ctx.clone(),
+            &mut self.main_pipeline,
+            block_thresholds,
+            1,
+            1,
+        )?;
+
         // 5. serialize block
         let cluster_stats_gen =
             table.get_cluster_stats_gen(self.ctx.clone(), 0, block_thresholds, None)?;
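As the comment in the hunk above says, the point of the compactor is to avoid writing many little blocks: on S3-style object storage each block is a separate request, so coalescing small blocks cuts write latency. Below is a minimal sketch of threshold-based compaction in the spirit of BlockCompactor::new(block_thresholds); the struct fields and the greedy merge strategy are assumptions for illustration, not Databend's actual implementation.

    // Greedy threshold-based compaction: pass large blocks through,
    // accumulate small ones until they reach the minimum size.
    #[derive(Clone, Copy)]
    struct BlockThresholds {
        min_rows_per_block: usize, // illustrative field, not the real struct
    }

    #[derive(Debug)]
    struct DataBlock {
        rows: usize,
    }

    fn compact(blocks: Vec<DataBlock>, t: BlockThresholds) -> Vec<DataBlock> {
        let mut out = Vec::new();
        let mut pending = 0usize;
        for b in blocks {
            if b.rows >= t.min_rows_per_block {
                out.push(b); // already large enough: pass through
            } else {
                pending += b.rows; // buffer small blocks
                if pending >= t.min_rows_per_block {
                    out.push(DataBlock { rows: pending });
                    pending = 0;
                }
            }
        }
        if pending > 0 {
            out.push(DataBlock { rows: pending }); // flush the undersized tail
        }
        out
    }

    fn main() {
        let t = BlockThresholds { min_rows_per_block: 800 };
        let tiny: Vec<DataBlock> = (0..10).map(|_| DataBlock { rows: 100 }).collect();
        // ten 100-row blocks collapse into two writes instead of ten
        println!("{:?}", compact(tiny, t));
    }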
@@ -747,6 +747,28 @@ impl PipelineBuilder {
             output_lens
         };
 
+        // we should avoid too much little block write, because for s3 write, there are too many
+        // little blocks, it will cause high latency.
+        let mut builder = self.main_pipeline.add_transform_with_specified_len(
+            |transform_input_port, transform_output_port| {
+                Ok(ProcessorPtr::create(TransformCompact::try_create(
+                    transform_input_port,
+                    transform_output_port,
+                    BlockCompactor::new(block_thresholds),
+                )?))
+            },
+            mid_len,
+        )?;
+        if need_match {
+            builder.add_items_prepend(vec![create_dummy_item()]);
+        }
+
+        // need to receive row_number, we should give a dummy item here.
+        if *distributed && need_unmatch && !*change_join_order {
+            builder.add_items(vec![create_dummy_item()]);
+        }
+        self.main_pipeline.add_pipe(builder.finalize());
+
         table.cluster_gen_for_append_with_specified_len(
             self.ctx.clone(),
             &mut self.main_pipeline,
@@ -776,6 +798,29 @@
             // arrive the same result (that's appending only one dummy item)
             (output_lens - 1, 0)
         };
+
+        // we should avoid too much little block write, because for s3 write, there are too many
+        // little blocks, it will cause high latency.
+        let mut builder = self.main_pipeline.add_transform_with_specified_len(
+            |transform_input_port, transform_output_port| {
+                Ok(ProcessorPtr::create(TransformCompact::try_create(
+                    transform_input_port,
+                    transform_output_port,
+                    BlockCompactor::new(block_thresholds),
+                )?))
+            },
+            mid_len,
+        )?;
+        if need_match {
+            builder.add_items_prepend(vec![create_dummy_item()]);
+        }
+
+        // need to receive row_number, we should give a dummy item here.
+        if *distributed && need_unmatch && !*change_join_order {
+            builder.add_items(vec![create_dummy_item()]);
+        }
+        self.main_pipeline.add_pipe(builder.finalize());
+
         table.cluster_gen_for_append_with_specified_len(
             self.ctx.clone(),
             &mut self.main_pipeline,
@@ -787,28 +832,6 @@
         };
         pipe_items.clear();
 
-        // we should avoid too much little block write, because for s3 write, there are too many
-        // little blocks, it will cause high latency.
-        let mut builder = self.main_pipeline.add_transform_with_specified_len(
-            |transform_input_port, transform_output_port| {
-                Ok(ProcessorPtr::create(TransformCompact::try_create(
-                    transform_input_port,
-                    transform_output_port,
-                    BlockCompactor::new(block_thresholds),
-                )?))
-            },
-            serialize_len,
-        )?;
-        if need_match {
-            builder.add_items_prepend(vec![create_dummy_item()]);
-        }
-
-        // need to receive row_number, we should give a dummy item here.
-        if *distributed && need_unmatch && !*change_join_order {
-            builder.add_items(vec![create_dummy_item()]);
-        }
-        self.main_pipeline.add_pipe(builder.finalize());
-
         if need_match {
             // rowid should be accumulated in main node.
             if *change_join_order && *distributed {
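Both distributed branches above install the compact transform with add_transform_with_specified_len over mid_len ports only, then pad the pipe with create_dummy_item(): one dummy is prepended when need_match holds, so the row-id port passes through untouched, and one is appended when a row_number port still has to be received, keeping input and output port counts aligned. The sketch below shows that port-alignment idea with hypothetical types; Databend's real Pipe/PipeItem API is richer than this.

    // A pipe is one item per parallel port; ports that must not be
    // transformed get a pass-through "dummy" item so counts stay aligned.
    enum Item {
        Transform(fn(Vec<u8>) -> Vec<u8>),
        Dummy, // forwards its input unchanged (row-id / row_number ports)
    }

    fn run_pipe(items: &[Item], ports: Vec<Vec<u8>>) -> Vec<Vec<u8>> {
        assert_eq!(items.len(), ports.len(), "one item per port");
        items
            .iter()
            .zip(ports)
            .map(|(item, data)| match item {
                Item::Transform(f) => f(data),
                Item::Dummy => data,
            })
            .collect()
    }

    fn main() {
        // two data ports get a toy "compact" transform; the trailing
        // row_number port is forwarded untouched by a dummy item
        let compact: fn(Vec<u8>) -> Vec<u8> = |mut v| {
            v.dedup();
            v
        };
        let items = [Item::Transform(compact), Item::Transform(compact), Item::Dummy];
        let ports = vec![vec![1, 1, 2], vec![3, 3], vec![9, 9]];
        println!("{:?}", run_pipe(&items, ports)); // [[1, 2], [3], [9, 9]]
    }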
@@ -139,6 +139,11 @@ merge into t1_separate as t1 using (select * from t2_separate) as t2 on t1.a = t
 ----
 2 4
 
+query T
+select count(*) from fuse_block('default','t1_separate');
+----
+1
+
 ## we will do compact
 query TTT
 select * from t1_separate;
