feat: support histogram for equal condition

databendlabs · May 4, 2023 · ef90684 · ef90684
1 parent a40976e
commit ef90684
Show file tree

Hide file tree

Showing 8 changed files with 65 additions and 66 deletions.
diff --git a/src/query/service/src/pipelines/processors/transforms/window/transform_window.rs b/src/query/service/src/pipelines/processors/transforms/window/transform_window.rs
@@ -951,8 +951,8 @@ macro_rules! impl_advance_frame_bound_method {
                             .column_at(&self.[<frame_ $bound>], ref_idx)
                             .as_nullable()
                             .unwrap();
-                        let valdity = &col.validity;
-                        if unsafe { !valdity.get_bit_unchecked(self.[<frame_ $bound>].row) } {
+                        let validity = &col.validity;
+                        if unsafe { !validity.get_bit_unchecked(self.[<frame_ $bound>].row) } {
                             // Need to skip null rows.
                             if nulls_first {
                                 // The null rows are at front.

diff --git a/src/query/sql/src/planner/optimizer/property/selectivity.rs b/src/query/sql/src/planner/optimizer/property/selectivity.rs
@@ -287,7 +287,7 @@ fn is_true_constant_predicate(constant: &ConstantExpr) -> bool {
 fn evaluate_equal(column_stat: &ColumnStat, constant: &ConstantExpr, const_datum: &Datum) -> f64 {
     if column_stat.histogram.is_some() {
         let res = evaluate_equal_by_histogram(const_datum, column_stat);
-        if let Some(res) = res {
+        if let Ok(res) = res {
             return res;
         }
     }
@@ -382,58 +382,57 @@ fn update_statistic(
     Ok(())
 }
 
-fn evaluate_equal_by_histogram(const_datum: &Datum, col_stat: &ColumnStat) -> Option<f64> {
+fn evaluate_equal_by_histogram(const_datum: &Datum, col_stat: &ColumnStat) -> Result<f64> {
     let hist = col_stat.histogram.as_ref().unwrap();
-    let min = &col_stat.min;
-    let max = &col_stat.max;
-    // Find how many buckets in [min, max]
-    let mut num_buckets = 0;
+    let min = col_stat.min.to_double()?;
+    let max = col_stat.max.to_double()?;
+    let mut range = 0.0;
     for (idx, bucket) in hist.buckets_iter().enumerate() {
         if idx == 0 {
             continue;
         }
+        let bucket_min = hist.buckets[idx - 1].upper_bound().to_double()?;
+        let bucket_max = bucket.upper_bound().to_double()?;
         // If the bucket max is less than min, skip it
-        if let Ok(ord) = bucket.upper_bound().compare(min) {
-            if ord == Ordering::Less || ord == Ordering::Equal {
-                continue;
-            }
-        } else {
-            return None;
+        if bucket_max < min {
+            continue;
         }
         // If the bucket min is greater than max, stop iteration
-        if let Ok(ord) = hist.buckets[idx - 1].upper_bound().compare(max) {
-            if ord == Ordering::Greater || ord == Ordering::Equal {
-                break;
-            }
-        } else {
-            return None;
+        if bucket_min >= max {
+            break;
+        }
+        if bucket_min > min && bucket_max <= max {
+            // ---min---bucket_min---bucket_max---max---
+            range += 1.0;
+        } else if bucket_min <= min {
+            // bucket starts at or before min: count only the fraction of the bucket above min
+            range += (bucket_max - min) / (bucket_max - bucket_min);
+        } else if bucket_max > max {
+            // bucket extends past max: count only the fraction of the bucket below max
+            range += (max - bucket_min) / (bucket_max - bucket_min);
         }
-        num_buckets += 1;
     }
 
-    // Find how many buckets in [const_datum, const_datum]
-    let mut num_equal_buckets = 0;
+    let mut equal_range = 0.0;
+    let const_value = const_datum.to_double()?;
     for (idx, bucket) in hist.buckets_iter().enumerate() {
         if idx == 0 {
             continue;
         }
-        if let Ok(ord) = hist.buckets[idx - 1].upper_bound().compare(const_datum) {
-            if ord == Ordering::Less || ord == Ordering::Equal {
-                if let Ok(ord) = bucket.upper_bound().compare(const_datum) {
-                    if ord == Ordering::Greater || ord == Ordering::Equal {
-                        num_equal_buckets += 1;
-                    }
-                } else {
-                    return None;
-                }
-            }
-        } else {
-            return None;
+        let bucket_min = hist.buckets[idx - 1].upper_bound().to_double()?;
+        let bucket_max = bucket.upper_bound().to_double()?;
+        // If the bucket max is less than the constant value, skip it
+        if bucket_max < const_value {
+            continue;
         }
+        // If the bucket min is greater than or equal to the constant value, stop iteration
+        if bucket_min >= const_value {
+            break;
+        }
+        equal_range += 1.0 / bucket.num_distinct()
     }
-    if num_buckets == 0 {
-        return Some(0.0);
+    if range == 0.0 {
+        return Ok(0.0);
     }
-
-    Some(num_equal_buckets as f64 / num_buckets as f64)
+    Ok(equal_range / range)
 }
diff --git a/tests/sqllogictests/suites/mode/standalone/explain/bloom_filter.test b/tests/sqllogictests/suites/mode/standalone/explain/bloom_filter.test
@@ -16,10 +16,10 @@ explain select * from bloom_test_t where c1 = 5
 ----
 EvalScalar
 ├── expressions: [bloom_test_t.c1 (#0), bloom_test_t.c2 (#1)]
-├── estimated rows: 1.00
+├── estimated rows: 1.50
 └── Filter
     ├── filters: [bloom_test_t.c1 (#0) = 5]
-    ├── estimated rows: 1.00
+    ├── estimated rows: 1.50
     └── TableScan
         ├── table: default.default.bloom_test_t
         ├── read rows: 3
@@ -115,10 +115,10 @@ explain select * from bloom_test_t where c2=3;
 ----
 EvalScalar
 ├── expressions: [bloom_test_t.c1 (#0), bloom_test_t.c2 (#1)]
-├── estimated rows: 1.00
+├── estimated rows: 1.33
 └── Filter
     ├── filters: [bloom_test_t.c2 (#1) = 3]
-    ├── estimated rows: 1.00
+    ├── estimated rows: 1.33
     └── TableScan
         ├── table: default.default.bloom_test_t
         ├── read rows: 6
@@ -146,10 +146,10 @@ explain select * from bloom_test_nullable_t where c1 = 5 and c2 > 1;
 ----
 EvalScalar
 ├── expressions: [bloom_test_nullable_t.c1 (#0), bloom_test_nullable_t.c2 (#1)]
-├── estimated rows: 0.80
+├── estimated rows: 1.20
 └── Filter
     ├── filters: [is_true(bloom_test_nullable_t.c1 (#0) = 5), is_true(bloom_test_nullable_t.c2 (#1) > 1)]
-    ├── estimated rows: 0.80
+    ├── estimated rows: 1.20
     └── TableScan
         ├── table: default.default.bloom_test_nullable_t
         ├── read rows: 3

diff --git a/tests/sqllogictests/suites/mode/standalone/explain/eliminate_outer_join.test b/tests/sqllogictests/suites/mode/standalone/explain/eliminate_outer_join.test
@@ -287,16 +287,16 @@ explain select * from t left join t t1 on t.a = t1.a where t1.a = 1
 ----
 EvalScalar
 ├── expressions: [t.a (#0), t1.a (#1)]
-├── estimated rows: 1.00
+├── estimated rows: 1.56
 └── HashJoin
     ├── join type: INNER
     ├── build keys: [t1.a (#1)]
     ├── probe keys: [t.a (#0)]
     ├── filters: []
-    ├── estimated rows: 1.00
+    ├── estimated rows: 1.56
     ├── Filter(Build)
     │   ├── filters: [is_true(t1.a (#1) = 1)]
-    │   ├── estimated rows: 1.00
+    │   ├── estimated rows: 1.25
     │   └── TableScan
     │       ├── table: default.eliminate_outer_join.t
     │       ├── read rows: 10
@@ -308,7 +308,7 @@ EvalScalar
     │       └── estimated rows: 10.00
     └── Filter(Probe)
         ├── filters: [is_true(t.a (#0) = 1)]
-        ├── estimated rows: 1.00
+        ├── estimated rows: 1.25
         └── TableScan
             ├── table: default.eliminate_outer_join.t
             ├── read rows: 10
@@ -398,16 +398,16 @@ explain select * from t left join t t1 on t.a = t1.a where t1.a <> 1
 ----
 EvalScalar
 ├── expressions: [t.a (#0), t1.a (#1)]
-├── estimated rows: 9.00
+├── estimated rows: 9.57
 └── HashJoin
     ├── join type: INNER
     ├── build keys: [t1.a (#1)]
     ├── probe keys: [t.a (#0)]
     ├── filters: []
-    ├── estimated rows: 9.00
+    ├── estimated rows: 9.57
     ├── Filter(Build)
     │   ├── filters: [is_true(t1.a (#1) <> 1)]
-    │   ├── estimated rows: 9.00
+    │   ├── estimated rows: 8.75
     │   └── TableScan
     │       ├── table: default.eliminate_outer_join.t
     │       ├── read rows: 10
@@ -419,7 +419,7 @@ EvalScalar
     │       └── estimated rows: 10.00
     └── Filter(Probe)
         ├── filters: [is_true(t.a (#0) <> 1)]
-        ├── estimated rows: 9.00
+        ├── estimated rows: 8.75
         └── TableScan
             ├── table: default.eliminate_outer_join.t
             ├── read rows: 10

diff --git a/tests/sqllogictests/suites/mode/standalone/explain/explain.test b/tests/sqllogictests/suites/mode/standalone/explain/explain.test
@@ -678,10 +678,10 @@ explain select * from t1,t2, t3 where (t1.a > 1 and t2.a > 2) or (t1.b < 3 and t
 ----
 EvalScalar
 ├── expressions: [t1.a (#0), t1.b (#1), t2.a (#2), t2.b (#3), t3.a (#4), t3.b (#5)]
-├── estimated rows: 19.06
+├── estimated rows: 19.92
 └── Filter
     ├── filters: [t1.a (#0) > 1 AND t2.a (#2) > 2 OR t1.b (#1) < 3 AND t2.b (#3) < 4 OR t3.a (#4) = 2]
-    ├── estimated rows: 19.06
+    ├── estimated rows: 19.92
     └── HashJoin
         ├── join type: CROSS
         ├── build keys: []

diff --git a/tests/sqllogictests/suites/mode/standalone/explain/join.test b/tests/sqllogictests/suites/mode/standalone/explain/join.test
@@ -707,16 +707,16 @@ explain SELECT o.x, t.y FROM onecolumn o INNER JOIN twocolumn t ON (o.x=t.x AND
 ----
 EvalScalar
 ├── expressions: [o.x (#0), t.y (#2)]
-├── estimated rows: 1.00
+├── estimated rows: 1.33
 └── HashJoin
     ├── join type: INNER
     ├── build keys: [t.x (#1)]
     ├── probe keys: [o.x (#0)]
     ├── filters: []
-    ├── estimated rows: 1.00
+    ├── estimated rows: 1.33
     ├── Filter(Build)
     │   ├── filters: [is_true(t.y (#2) = 53)]
-    │   ├── estimated rows: 1.00
+    │   ├── estimated rows: 1.33
     │   └── TableScan
     │       ├── table: default.default.twocolumn
     │       ├── read rows: 8

diff --git a/tests/sqllogictests/suites/mode/standalone/explain_native/bloom_filter.test b/tests/sqllogictests/suites/mode/standalone/explain_native/bloom_filter.test
@@ -16,7 +16,7 @@ explain select * from bloom_test_t where c1 = 5
 ----
 EvalScalar
 ├── expressions: [bloom_test_t.c1 (#0), bloom_test_t.c2 (#1)]
-├── estimated rows: 1.00
+├── estimated rows: 1.50
 └── TableScan
     ├── table: default.default.bloom_test_t
     ├── read rows: 3
@@ -25,7 +25,7 @@ EvalScalar
     ├── partitions scanned: 1
     ├── pruning stats: [segments: <range pruning: 2 to 2>, blocks: <range pruning: 2 to 2, bloom pruning: 2 to 1>]
     ├── push downs: [filters: [bloom_test_t.c1 (#0) = 5], limit: NONE]
-    └── estimated rows: 1.00
+    └── estimated rows: 1.50
 
 query T
 select * from bloom_test_t where c1 = 5
@@ -109,7 +109,7 @@ explain select * from bloom_test_t where c2=3;
 ----
 EvalScalar
 ├── expressions: [bloom_test_t.c1 (#0), bloom_test_t.c2 (#1)]
-├── estimated rows: 1.00
+├── estimated rows: 1.33
 └── TableScan
     ├── table: default.default.bloom_test_t
     ├── read rows: 6
@@ -118,7 +118,7 @@ EvalScalar
     ├── partitions scanned: 2
     ├── pruning stats: [segments: <range pruning: 3 to 3>, blocks: <range pruning: 3 to 3, bloom pruning: 3 to 2>]
     ├── push downs: [filters: [bloom_test_t.c2 (#1) = 3], limit: NONE]
-    └── estimated rows: 1.00
+    └── estimated rows: 1.33
 
 statement ok
 drop table bloom_test_t
@@ -137,7 +137,7 @@ explain select * from bloom_test_nullable_t where c1 = 5 and c2 > 1;
 ----
 EvalScalar
 ├── expressions: [bloom_test_nullable_t.c1 (#0), bloom_test_nullable_t.c2 (#1)]
-├── estimated rows: 0.80
+├── estimated rows: 1.20
 └── TableScan
     ├── table: default.default.bloom_test_nullable_t
     ├── read rows: 3
@@ -146,7 +146,7 @@ EvalScalar
     ├── partitions scanned: 1
     ├── pruning stats: [segments: <range pruning: 2 to 2>, blocks: <range pruning: 2 to 2, bloom pruning: 2 to 1>]
     ├── push downs: [filters: [and_filters(bloom_test_nullable_t.c1 (#0) = 5, bloom_test_nullable_t.c2 (#1) > 1)], limit: NONE]
-    └── estimated rows: 0.80
+    └── estimated rows: 1.20
 
 statement ok
 drop table bloom_test_nullable_t
diff --git a/tests/sqllogictests/suites/mode/standalone/explain_native/explain.test b/tests/sqllogictests/suites/mode/standalone/explain_native/explain.test
@@ -660,10 +660,10 @@ explain select * from t1,t2, t3 where (t1.a > 1 and t2.a > 2) or (t1.b < 3 and t
 ----
 EvalScalar
 ├── expressions: [t1.a (#0), t1.b (#1), t2.a (#2), t2.b (#3), t3.a (#4), t3.b (#5)]
-├── estimated rows: 19.06
+├── estimated rows: 19.92
 └── Filter
     ├── filters: [t1.a (#0) > 1 AND t2.a (#2) > 2 OR t1.b (#1) < 3 AND t2.b (#3) < 4 OR t3.a (#4) = 2]
-    ├── estimated rows: 19.06
+    ├── estimated rows: 19.92
     └── HashJoin
         ├── join type: CROSS
         ├── build keys: []