Skip to content

Commit

Permalink
feat: support histogram for equal condition
Browse files Browse the repository at this point in the history
  • Loading branch information
xudong963 committed May 4, 2023
1 parent a40976e commit ef90684
Show file tree
Hide file tree
Showing 8 changed files with 65 additions and 66 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -951,8 +951,8 @@ macro_rules! impl_advance_frame_bound_method {
.column_at(&self.[<frame_ $bound>], ref_idx)
.as_nullable()
.unwrap();
let valdity = &col.validity;
if unsafe { !valdity.get_bit_unchecked(self.[<frame_ $bound>].row) } {
let validity = &col.validity;
if unsafe { !validity.get_bit_unchecked(self.[<frame_ $bound>].row) } {
// Need to skip null rows.
if nulls_first {
// The null rows are at front.
Expand Down
73 changes: 36 additions & 37 deletions src/query/sql/src/planner/optimizer/property/selectivity.rs
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,7 @@ fn is_true_constant_predicate(constant: &ConstantExpr) -> bool {
fn evaluate_equal(column_stat: &ColumnStat, constant: &ConstantExpr, const_datum: &Datum) -> f64 {
if column_stat.histogram.is_some() {
let res = evaluate_equal_by_histogram(const_datum, column_stat);
if let Some(res) = res {
if let Ok(res) = res {
return res;
}
}
Expand Down Expand Up @@ -382,58 +382,57 @@ fn update_statistic(
Ok(())
}

fn evaluate_equal_by_histogram(const_datum: &Datum, col_stat: &ColumnStat) -> Option<f64> {
fn evaluate_equal_by_histogram(const_datum: &Datum, col_stat: &ColumnStat) -> Result<f64> {
let hist = col_stat.histogram.as_ref().unwrap();
let min = &col_stat.min;
let max = &col_stat.max;
// Find how many buckets in [min, max]
let mut num_buckets = 0;
let min = col_stat.min.to_double()?;
let max = col_stat.max.to_double()?;
let mut range = 0.0;
for (idx, bucket) in hist.buckets_iter().enumerate() {
if idx == 0 {
continue;
}
let bucket_min = hist.buckets[idx - 1].upper_bound().to_double()?;
let bucket_max = bucket.upper_bound().to_double()?;
// If the bucket max is less than min, skip it
if let Ok(ord) = bucket.upper_bound().compare(min) {
if ord == Ordering::Less || ord == Ordering::Equal {
continue;
}
} else {
return None;
if bucket_max < min {
continue;
}
// If the bucket min is greater than or equal to max, stop iteration
if let Ok(ord) = hist.buckets[idx - 1].upper_bound().compare(max) {
if ord == Ordering::Greater || ord == Ordering::Equal {
break;
}
} else {
return None;
if bucket_min >= max {
break;
}
if bucket_min > min && bucket_max <= max {
// ---min---bucket_min---bucket_max---max---
range += 1.0;
} else if bucket_min <= min {
// left part
range += (bucket_max - min) / (bucket_max - bucket_min);
} else if bucket_max > max {
// right part
range += (max - bucket_min) / (bucket_max - bucket_min);
}
num_buckets += 1;
}

// Find how many buckets in [const_datum, const_datum]
let mut num_equal_buckets = 0;
let mut equal_range = 0.0;
let const_value = const_datum.to_double()?;
for (idx, bucket) in hist.buckets_iter().enumerate() {
if idx == 0 {
continue;
}
if let Ok(ord) = hist.buckets[idx - 1].upper_bound().compare(const_datum) {
if ord == Ordering::Less || ord == Ordering::Equal {
if let Ok(ord) = bucket.upper_bound().compare(const_datum) {
if ord == Ordering::Greater || ord == Ordering::Equal {
num_equal_buckets += 1;
}
} else {
return None;
}
}
} else {
return None;
let bucket_min = hist.buckets[idx - 1].upper_bound().to_double()?;
let bucket_max = bucket.upper_bound().to_double()?;
// If the bucket max is less than the constant value, skip it
if bucket_max < const_value {
continue;
}
// If the bucket min is greater than or equal to the constant value, stop iteration
if bucket_min >= const_value {
break;
}
equal_range += 1.0 / bucket.num_distinct()
}
if num_buckets == 0 {
return Some(0.0);
if range == 0.0 {
return Ok(0.0);
}

Some(num_equal_buckets as f64 / num_buckets as f64)
Ok(equal_range / range)
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@ explain select * from bloom_test_t where c1 = 5
----
EvalScalar
├── expressions: [bloom_test_t.c1 (#0), bloom_test_t.c2 (#1)]
├── estimated rows: 1.00
├── estimated rows: 1.50
└── Filter
├── filters: [bloom_test_t.c1 (#0) = 5]
├── estimated rows: 1.00
├── estimated rows: 1.50
└── TableScan
├── table: default.default.bloom_test_t
├── read rows: 3
Expand Down Expand Up @@ -115,10 +115,10 @@ explain select * from bloom_test_t where c2=3;
----
EvalScalar
├── expressions: [bloom_test_t.c1 (#0), bloom_test_t.c2 (#1)]
├── estimated rows: 1.00
├── estimated rows: 1.33
└── Filter
├── filters: [bloom_test_t.c2 (#1) = 3]
├── estimated rows: 1.00
├── estimated rows: 1.33
└── TableScan
├── table: default.default.bloom_test_t
├── read rows: 6
Expand Down Expand Up @@ -146,10 +146,10 @@ explain select * from bloom_test_nullable_t where c1 = 5 and c2 > 1;
----
EvalScalar
├── expressions: [bloom_test_nullable_t.c1 (#0), bloom_test_nullable_t.c2 (#1)]
├── estimated rows: 0.80
├── estimated rows: 1.20
└── Filter
├── filters: [is_true(bloom_test_nullable_t.c1 (#0) = 5), is_true(bloom_test_nullable_t.c2 (#1) > 1)]
├── estimated rows: 0.80
├── estimated rows: 1.20
└── TableScan
├── table: default.default.bloom_test_nullable_t
├── read rows: 3
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -287,16 +287,16 @@ explain select * from t left join t t1 on t.a = t1.a where t1.a = 1
----
EvalScalar
├── expressions: [t.a (#0), t1.a (#1)]
├── estimated rows: 1.00
├── estimated rows: 1.56
└── HashJoin
├── join type: INNER
├── build keys: [t1.a (#1)]
├── probe keys: [t.a (#0)]
├── filters: []
├── estimated rows: 1.00
├── estimated rows: 1.56
├── Filter(Build)
│ ├── filters: [is_true(t1.a (#1) = 1)]
│ ├── estimated rows: 1.00
│ ├── estimated rows: 1.25
│ └── TableScan
│ ├── table: default.eliminate_outer_join.t
│ ├── read rows: 10
Expand All @@ -308,7 +308,7 @@ EvalScalar
│ └── estimated rows: 10.00
└── Filter(Probe)
├── filters: [is_true(t.a (#0) = 1)]
├── estimated rows: 1.00
├── estimated rows: 1.25
└── TableScan
├── table: default.eliminate_outer_join.t
├── read rows: 10
Expand Down Expand Up @@ -398,16 +398,16 @@ explain select * from t left join t t1 on t.a = t1.a where t1.a <> 1
----
EvalScalar
├── expressions: [t.a (#0), t1.a (#1)]
├── estimated rows: 9.00
├── estimated rows: 9.57
└── HashJoin
├── join type: INNER
├── build keys: [t1.a (#1)]
├── probe keys: [t.a (#0)]
├── filters: []
├── estimated rows: 9.00
├── estimated rows: 9.57
├── Filter(Build)
│ ├── filters: [is_true(t1.a (#1) <> 1)]
│ ├── estimated rows: 9.00
│ ├── estimated rows: 8.75
│ └── TableScan
│ ├── table: default.eliminate_outer_join.t
│ ├── read rows: 10
Expand All @@ -419,7 +419,7 @@ EvalScalar
│ └── estimated rows: 10.00
└── Filter(Probe)
├── filters: [is_true(t.a (#0) <> 1)]
├── estimated rows: 9.00
├── estimated rows: 8.75
└── TableScan
├── table: default.eliminate_outer_join.t
├── read rows: 10
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -678,10 +678,10 @@ explain select * from t1,t2, t3 where (t1.a > 1 and t2.a > 2) or (t1.b < 3 and t
----
EvalScalar
├── expressions: [t1.a (#0), t1.b (#1), t2.a (#2), t2.b (#3), t3.a (#4), t3.b (#5)]
├── estimated rows: 19.06
├── estimated rows: 19.92
└── Filter
├── filters: [t1.a (#0) > 1 AND t2.a (#2) > 2 OR t1.b (#1) < 3 AND t2.b (#3) < 4 OR t3.a (#4) = 2]
├── estimated rows: 19.06
├── estimated rows: 19.92
└── HashJoin
├── join type: CROSS
├── build keys: []
Expand Down
6 changes: 3 additions & 3 deletions tests/sqllogictests/suites/mode/standalone/explain/join.test
Original file line number Diff line number Diff line change
Expand Up @@ -707,16 +707,16 @@ explain SELECT o.x, t.y FROM onecolumn o INNER JOIN twocolumn t ON (o.x=t.x AND
----
EvalScalar
├── expressions: [o.x (#0), t.y (#2)]
├── estimated rows: 1.00
├── estimated rows: 1.33
└── HashJoin
├── join type: INNER
├── build keys: [t.x (#1)]
├── probe keys: [o.x (#0)]
├── filters: []
├── estimated rows: 1.00
├── estimated rows: 1.33
├── Filter(Build)
│ ├── filters: [is_true(t.y (#2) = 53)]
│ ├── estimated rows: 1.00
│ ├── estimated rows: 1.33
│ └── TableScan
│ ├── table: default.default.twocolumn
│ ├── read rows: 8
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ explain select * from bloom_test_t where c1 = 5
----
EvalScalar
├── expressions: [bloom_test_t.c1 (#0), bloom_test_t.c2 (#1)]
├── estimated rows: 1.00
├── estimated rows: 1.50
└── TableScan
├── table: default.default.bloom_test_t
├── read rows: 3
Expand All @@ -25,7 +25,7 @@ EvalScalar
├── partitions scanned: 1
├── pruning stats: [segments: <range pruning: 2 to 2>, blocks: <range pruning: 2 to 2, bloom pruning: 2 to 1>]
├── push downs: [filters: [bloom_test_t.c1 (#0) = 5], limit: NONE]
└── estimated rows: 1.00
└── estimated rows: 1.50

query T
select * from bloom_test_t where c1 = 5
Expand Down Expand Up @@ -109,7 +109,7 @@ explain select * from bloom_test_t where c2=3;
----
EvalScalar
├── expressions: [bloom_test_t.c1 (#0), bloom_test_t.c2 (#1)]
├── estimated rows: 1.00
├── estimated rows: 1.33
└── TableScan
├── table: default.default.bloom_test_t
├── read rows: 6
Expand All @@ -118,7 +118,7 @@ EvalScalar
├── partitions scanned: 2
├── pruning stats: [segments: <range pruning: 3 to 3>, blocks: <range pruning: 3 to 3, bloom pruning: 3 to 2>]
├── push downs: [filters: [bloom_test_t.c2 (#1) = 3], limit: NONE]
└── estimated rows: 1.00
└── estimated rows: 1.33

statement ok
drop table bloom_test_t
Expand All @@ -137,7 +137,7 @@ explain select * from bloom_test_nullable_t where c1 = 5 and c2 > 1;
----
EvalScalar
├── expressions: [bloom_test_nullable_t.c1 (#0), bloom_test_nullable_t.c2 (#1)]
├── estimated rows: 0.80
├── estimated rows: 1.20
└── TableScan
├── table: default.default.bloom_test_nullable_t
├── read rows: 3
Expand All @@ -146,7 +146,7 @@ EvalScalar
├── partitions scanned: 1
├── pruning stats: [segments: <range pruning: 2 to 2>, blocks: <range pruning: 2 to 2, bloom pruning: 2 to 1>]
├── push downs: [filters: [and_filters(bloom_test_nullable_t.c1 (#0) = 5, bloom_test_nullable_t.c2 (#1) > 1)], limit: NONE]
└── estimated rows: 0.80
└── estimated rows: 1.20

statement ok
drop table bloom_test_nullable_t
Original file line number Diff line number Diff line change
Expand Up @@ -660,10 +660,10 @@ explain select * from t1,t2, t3 where (t1.a > 1 and t2.a > 2) or (t1.b < 3 and t
----
EvalScalar
├── expressions: [t1.a (#0), t1.b (#1), t2.a (#2), t2.b (#3), t3.a (#4), t3.b (#5)]
├── estimated rows: 19.06
├── estimated rows: 19.92
└── Filter
├── filters: [t1.a (#0) > 1 AND t2.a (#2) > 2 OR t1.b (#1) < 3 AND t2.b (#3) < 4 OR t3.a (#4) = 2]
├── estimated rows: 19.06
├── estimated rows: 19.92
└── HashJoin
├── join type: CROSS
├── build keys: []
Expand Down

0 comments on commit ef90684

Please sign in to comment.