Skip to content

Commit

Permalink
feat(query): Inverted index search function support options (#16256)
Browse files Browse the repository at this point in the history
* feat(query): Inverted index search function support options

* add tests

* create query in pruner

* add tests

* fix
  • Loading branch information
b41sh authored Aug 20, 2024
1 parent 6136e6d commit d14f7a5
Show file tree
Hide file tree
Showing 8 changed files with 358 additions and 167 deletions.
20 changes: 19 additions & 1 deletion src/query/catalog/src/plan/pushdown.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,22 @@ pub struct PrewhereInfo {
pub virtual_columns: Option<Vec<VirtualColumnInfo>>,
}

/// Additional options that configure how inverted index search functions behave.
#[derive(serde::Serialize, serde::Deserialize, Clone, Debug, PartialEq, Eq)]
pub struct InvertedIndexOption {
/// Fuzzy matching: terms within this Levenshtein distance of a query term are matched.
/// See https://en.wikipedia.org/wiki/Levenshtein_distance
/// For example, if fuzziness is 1 and the query text is `fox`,
/// the term `box` will be matched.
pub fuzziness: Option<u8>,
/// Operator used to combine query terms: `true` is AND, `false` is OR; the default is OR.
/// For example, the query text `happy tax payer` is equal to `happy OR tax OR payer`,
/// but if operator is `true`, it is equal to `happy AND tax AND payer`.
pub operator: bool,
/// Whether to parse the query leniently, ignoring an invalid query; the default is false.
pub lenient: bool,
}

/// Information about inverted index.
#[derive(serde::Serialize, serde::Deserialize, Clone, Debug, PartialEq, Eq)]
pub struct InvertedIndexInfo {
Expand All @@ -91,8 +107,10 @@ pub struct InvertedIndexInfo {
pub query_fields: Vec<(String, Option<F32>)>,
/// The search query text with query syntax.
pub query_text: String,
/// whether search with score function
/// Whether search with score function.
pub has_score: bool,
/// Optional search configuration, such as fuzziness and lenient parsing.
pub inverted_index_option: Option<InvertedIndexOption>,
}

/// Extras is a wrapper for push down items.
Expand Down
105 changes: 47 additions & 58 deletions src/query/ee/tests/it/inverted_index/index_refresh.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,20 @@
use std::collections::BTreeMap;

use databend_common_base::base::tokio;
use databend_common_catalog::plan::InvertedIndexInfo;
use databend_common_catalog::table::Table;
use databend_common_catalog::table::TableExt;
use databend_common_exception::Result;
use databend_common_expression::types::DataType;
use databend_common_expression::DataField;
use databend_common_expression::DataSchema;
use databend_common_meta_app::schema::CreateOption;
use databend_common_meta_app::schema::CreateTableIndexReq;
use databend_common_sql::plans::RefreshTableIndexPlan;
use databend_common_storages_fuse::io::read::InvertedIndexReader;
use databend_common_storages_fuse::io::MetaReaders;
use databend_common_storages_fuse::io::TableMetaLocationGenerator;
use databend_common_storages_fuse::pruning::create_inverted_index_query;
use databend_common_storages_fuse::FuseTable;
use databend_common_storages_fuse::TableContext;
use databend_enterprise_inverted_index::get_inverted_index_handler;
Expand All @@ -33,11 +38,6 @@ use databend_query::interpreters::RefreshTableIndexInterpreter;
use databend_query::test_kits::append_string_sample_data;
use databend_query::test_kits::*;
use databend_storages_common_cache::LoadParams;
use tantivy::schema::Field;
use tantivy::tokenizer::LowerCaser;
use tantivy::tokenizer::SimpleTokenizer;
use tantivy::tokenizer::TextAnalyzer;
use tantivy::tokenizer::TokenizerManager;

#[tokio::test(flavor = "multi_thread")]
async fn test_fuse_do_refresh_inverted_index() -> Result<()> {
Expand Down Expand Up @@ -74,7 +74,7 @@ async fn test_fuse_do_refresh_inverted_index() -> Result<()> {
name: index_name.clone(),
column_ids: vec![0, 1],
sync_creation: false,
options,
options: options.clone(),
};

let res = handler.do_create_table_index(catalog.clone(), req).await;
Expand Down Expand Up @@ -127,69 +127,58 @@ async fn test_fuse_do_refresh_inverted_index() -> Result<()> {
let block_meta = &block_metas[0];

let dal = new_fuse_table.get_operator_ref();
let fields = ["title".to_string(), "content".to_string()];
let query_fields = vec![("title".to_string(), None), ("content".to_string(), None)];
let index_schema = DataSchema::new(vec![
DataField::new("title", DataType::String),
DataField::new("content", DataType::String),
]);

let index_loc = TableMetaLocationGenerator::gen_inverted_index_location_from_block_location(
&block_meta.location.0,
&index_name,
&index_version,
);

let field_nums = fields.len();
let field_nums = query_fields.len();
let has_score = true;
let need_position = false;

let mut query_fields = Vec::with_capacity(fields.len());
let query_field_boosts = Vec::new();
for i in 0..fields.len() {
let field = Field::from_field_id(i as u32);
query_fields.push(field);
let index_reader =
InvertedIndexReader::try_create(dal.clone(), field_nums, need_position, &index_loc).await?;

let queries = vec![
("rust".to_string(), vec![0, 1]),
("java".to_string(), vec![2]),
("data".to_string(), vec![4, 1, 5]),
];

for (query_text, ids) in queries.into_iter() {
let inverted_index_info = InvertedIndexInfo {
index_name: index_name.clone(),
index_version: index_version.clone(),
index_options: options.clone(),
index_schema: index_schema.clone(),
query_fields: query_fields.clone(),
query_text,
has_score,
inverted_index_option: None,
};

let (query, tokenizer_manager) = create_inverted_index_query(&inverted_index_info)?;

let matched_rows = index_reader.clone().do_filter(
has_score,
&query,
tokenizer_manager,
block_meta.row_count,
)?;
assert!(matched_rows.is_some());
let matched_rows = matched_rows.unwrap();
assert_eq!(matched_rows.len(), ids.len());
for (matched_row, id) in matched_rows.iter().zip(ids.iter()) {
assert_eq!(matched_row.0, *id);
}
}
let tokenizer_manager = TokenizerManager::new();
let english_analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
.filter(LowerCaser)
.build();
tokenizer_manager.register("english", english_analyzer);

let index_reader = InvertedIndexReader::try_create(
dal.clone(),
field_nums,
has_score,
need_position,
query_fields,
query_field_boosts,
tokenizer_manager,
&index_loc,
)
.await?;

let query = "rust";
let matched_rows = index_reader
.clone()
.do_filter(query, block_meta.row_count)?;
assert!(matched_rows.is_some());
let matched_rows = matched_rows.unwrap();
assert_eq!(matched_rows.len(), 2);
assert_eq!(matched_rows[0].0, 0);
assert_eq!(matched_rows[1].0, 1);

let query = "java";
let matched_rows = index_reader
.clone()
.do_filter(query, block_meta.row_count)?;
assert!(matched_rows.is_some());
let matched_rows = matched_rows.unwrap();
assert_eq!(matched_rows.len(), 1);
assert_eq!(matched_rows[0].0, 2);

let query = "data";
let matched_rows = index_reader.do_filter(query, block_meta.row_count)?;
assert!(matched_rows.is_some());
let matched_rows = matched_rows.unwrap();
assert_eq!(matched_rows.len(), 3);
assert_eq!(matched_rows[0].0, 4);
assert_eq!(matched_rows[1].0, 1);
assert_eq!(matched_rows[2].0, 5);

Ok(())
}
15 changes: 15 additions & 0 deletions src/query/ee/tests/it/inverted_index/pruning.rs
Original file line number Diff line number Diff line change
Expand Up @@ -552,6 +552,7 @@ async fn test_block_pruner() -> Result<()> {
query_fields: vec![("idiom".to_string(), None)],
query_text: "test".to_string(),
has_score: false,
inverted_index_option: None,
}),
..Default::default()
};
Expand All @@ -564,6 +565,7 @@ async fn test_block_pruner() -> Result<()> {
query_fields: vec![("idiom".to_string(), None)],
query_text: "save".to_string(),
has_score: false,
inverted_index_option: None,
}),
..Default::default()
};
Expand All @@ -576,6 +578,7 @@ async fn test_block_pruner() -> Result<()> {
query_fields: vec![("idiom".to_string(), None)],
query_text: "one".to_string(),
has_score: false,
inverted_index_option: None,
}),
..Default::default()
};
Expand All @@ -588,6 +591,7 @@ async fn test_block_pruner() -> Result<()> {
query_fields: vec![("idiom".to_string(), None)],
query_text: "the".to_string(),
has_score: false,
inverted_index_option: None,
}),
..Default::default()
};
Expand All @@ -600,6 +604,7 @@ async fn test_block_pruner() -> Result<()> {
query_fields: vec![("idiom".to_string(), None)],
query_text: "光阴".to_string(),
has_score: false,
inverted_index_option: None,
}),
..Default::default()
};
Expand All @@ -612,6 +617,7 @@ async fn test_block_pruner() -> Result<()> {
query_fields: vec![("idiom".to_string(), None)],
query_text: "人生".to_string(),
has_score: false,
inverted_index_option: None,
}),
..Default::default()
};
Expand All @@ -624,6 +630,7 @@ async fn test_block_pruner() -> Result<()> {
query_fields: vec![("meaning".to_string(), None)],
query_text: "people".to_string(),
has_score: false,
inverted_index_option: None,
}),
..Default::default()
};
Expand All @@ -636,6 +643,7 @@ async fn test_block_pruner() -> Result<()> {
query_fields: vec![("meaning".to_string(), None)],
query_text: "bad".to_string(),
has_score: false,
inverted_index_option: None,
}),
..Default::default()
};
Expand All @@ -648,6 +656,7 @@ async fn test_block_pruner() -> Result<()> {
query_fields: vec![("meaning".to_string(), None)],
query_text: "黄金".to_string(),
has_score: false,
inverted_index_option: None,
}),
..Default::default()
};
Expand All @@ -660,6 +669,7 @@ async fn test_block_pruner() -> Result<()> {
query_fields: vec![("meaning".to_string(), None)],
query_text: "时间".to_string(),
has_score: false,
inverted_index_option: None,
}),
..Default::default()
};
Expand All @@ -675,6 +685,7 @@ async fn test_block_pruner() -> Result<()> {
],
query_text: "you".to_string(),
has_score: false,
inverted_index_option: None,
}),
..Default::default()
};
Expand All @@ -690,6 +701,7 @@ async fn test_block_pruner() -> Result<()> {
],
query_text: "光阴".to_string(),
has_score: false,
inverted_index_option: None,
}),
..Default::default()
};
Expand All @@ -702,6 +714,7 @@ async fn test_block_pruner() -> Result<()> {
query_fields: vec![("extras".to_string(), None)],
query_text: "extras.title:Blockchain".to_string(),
has_score: false,
inverted_index_option: None,
}),
..Default::default()
};
Expand All @@ -714,6 +727,7 @@ async fn test_block_pruner() -> Result<()> {
query_fields: vec![("extras".to_string(), None)],
query_text: "extras.metadata.author:David".to_string(),
has_score: false,
inverted_index_option: None,
}),
..Default::default()
};
Expand All @@ -726,6 +740,7 @@ async fn test_block_pruner() -> Result<()> {
query_fields: vec![("extras".to_string(), None)],
query_text: "extras.metadata.tags:技术".to_string(),
has_score: false,
inverted_index_option: None,
}),
..Default::default()
};
Expand Down
Loading

0 comments on commit d14f7a5

Please sign in to comment.