From 2dadbf70d657d23effcc4ccccbf9bbc7b35e1269 Mon Sep 17 00:00:00 2001 From: Mikkel Denker Date: Wed, 20 Mar 2024 21:36:44 +0100 Subject: [PATCH] Schema fields as traits (#185) * refactor data that is re-used across fields for a particular page during indexing into an 'FnCache' * automatically generate ALL_FIELDS and ALL_SIGNALS arrays with strum macro. ensures the arrays are always fully up to date * split up schema fields into submodules * add textfield trait with enum-dispatch * add fastfield trait with enum-dispatch * move field names into trait * move some trivial functions from 'FastFieldEnum' and 'TextFieldEnum' into their respective traits * move methods from Field into TextField and FastField traits * extract html .as_tantivy into textfield trait * extract html .as_tantivy into fastfield trait * extract webpage .as_tantivy into field traits * fix indexer example cleanup --- .gitignore | 1 + Cargo.lock | 40 +- Cargo.toml | 2 + assets/licenses.html | 7 +- crates/core/Cargo.toml | 2 + crates/core/examples/indexer.rs | 2 +- crates/core/src/collector.rs | 28 +- crates/core/src/enum_map.rs | 43 +- crates/core/src/fastfield_reader.rs | 10 +- crates/core/src/inverted_index.rs | 69 +- crates/core/src/mapreduce/dht/mod.rs | 8 +- crates/core/src/mapreduce/dht/network/mod.rs | 20 +- crates/core/src/mapreduce/dht/network/raft.rs | 28 +- crates/core/src/query/mod.rs | 4 +- crates/core/src/query/optic.rs | 20 +- crates/core/src/query/parser/as_tantivy.rs | 20 +- crates/core/src/query/parser/mod.rs | 5 +- crates/core/src/query/pattern_query/mod.rs | 8 +- crates/core/src/query/pattern_query/scorer.rs | 8 +- crates/core/src/query/pattern_query/weight.rs | 42 +- .../src/ranking/pipeline/stages/recall.rs | 6 +- crates/core/src/ranking/signal.rs | 158 +- crates/core/src/schema.rs | 850 ---------- crates/core/src/schema/fast_field.rs | 1233 ++++++++++++++ crates/core/src/schema/mod.rs | 139 ++ crates/core/src/schema/text_field.rs | 1427 +++++++++++++++++ crates/core/src/searcher/api/mod.rs | 6 +- crates/core/src/searcher/local.rs | 4 +- crates/core/src/webpage/html/fn_cache.rs | 136 ++ crates/core/src/webpage/html/into_tantivy.rs | 531 +----- crates/core/src/webpage/html/microformats.rs | 28 +- crates/core/src/webpage/html/mod.rs | 3 + crates/core/src/webpage/html/robots_meta.rs | 11 +- crates/core/src/webpage/mod.rs | 177 +- 34 files changed, 3325 insertions(+), 1751 deletions(-) delete mode 100644 crates/core/src/schema.rs create mode 100644 crates/core/src/schema/fast_field.rs create mode 100644 crates/core/src/schema/mod.rs create mode 100644 crates/core/src/schema/text_field.rs create mode 100644 crates/core/src/webpage/html/fn_cache.rs diff --git a/.gitignore b/.gitignore index 8fc1d80a..e94fafa8 100644 --- a/.gitignore +++ b/.gitignore @@ -16,3 +16,4 @@ proptest-regressions *.pending-snap .ipynb_checkpoints .zed +rustc-ice-* diff --git a/Cargo.lock b/Cargo.lock index b22de01f..213c5a0c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1373,6 +1373,18 @@ dependencies = [ "syn 2.0.52", ] +[[package]] +name = "enum_dispatch" +version = "0.3.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f33313078bb8d4d05a2733a94ac4c2d8a0df9a2b84424ebf4f33bfc224a890e" +dependencies = [ + "once_cell", + "proc-macro2", + "quote", + "syn 2.0.52", +] + [[package]] name = "equivalent" version = "1.0.1" @@ -4174,8 +4186,8 @@ dependencies = [ "smallvec", "snap", "socket2", - "strum", - "strum_macros", + "strum 0.23.0", + "strum_macros 0.23.1", "thiserror", "tokio", "tracing", @@ -4536,6 +4548,7 @@ dependencies = [ "csv", "dashmap", "encoding_rs", + "enum_dispatch", "eventsource-stream", "fend-core", "flate2", @@ -4586,6 +4599,7 @@ dependencies = [ "serde", "serde_json", "serde_urlencoded", + "strum 0.26.2", "tantivy", "thiserror", "tikv-jemallocator", @@ -4650,6 +4664,15 @@ version = "0.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cae14b91c7d11c9a851d3fbc80a963198998c2a64eec840477fa92d8ce9b70bb" +[[package]] +name = "strum" +version = "0.26.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d8cec3501a5194c432b2b7976db6b7d10ec95c253208b45f83f7136aa985e29" +dependencies = [ + "strum_macros 0.26.2", +] + [[package]] name = "strum_macros" version = "0.23.1" @@ -4663,6 +4686,19 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "strum_macros" +version = "0.26.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6cf59daf282c0a494ba14fd21610a0325f9f90ec9d1231dea26bcb1d696c946" +dependencies = [ + "heck 0.4.1", + "proc-macro2", + "quote", + "rustversion", + "syn 2.0.52", +] + [[package]] name = "subtle" version = "2.5.0" diff --git a/Cargo.toml b/Cargo.toml index f7c975af..a0e16e54 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -40,6 +40,7 @@ crossbeam-channel = "0.5.6" csv = "1.1.6" dashmap = { version = "5.4.0", features = ["rayon"] } encoding_rs = "0.8.31" +enum_dispatch = "0.3.12" eventsource-stream = "0.2.3" fend-core = "1.2.2" flate2 = "1.0.28" @@ -95,6 +96,7 @@ scylla = { version = "0.12.0", features = ["chrono"] } serde = { version = "1.0.137", features = ["rc", "derive"] } serde_json = "1.0.81" serde_urlencoded = "0.7.1" +strum = { version = "0.26.2", features = ["derive"] } tantivy = { git = "https://github.com/quickwit-oss/tantivy", rev = "182f58cea" } thiserror = "1.0.31" tikv-jemallocator = "0.5" diff --git a/assets/licenses.html b/assets/licenses.html index f9d0aa04..cdb3b33a 100644 --- a/assets/licenses.html +++ b/assets/licenses.html @@ -44,8 +44,8 @@

Third Party Licenses

Overview of licenses: