Schema fields as traits (#185)
* refactor data that is re-used across fields for a particular page during indexing into a 'FnCache'

* automatically generate the ALL_FIELDS and ALL_SIGNALS arrays with a strum macro, ensuring the arrays are always fully up to date

* split up schema fields into submodules

* add textfield trait with enum_dispatch (a sketch of the pattern follows the change summary below)

* add fastfield trait with enum_dispatch

* move field names into trait

* move some trivial functions from 'FastFieldEnum' and 'TextFieldEnum' into their respective traits

* move methods from Field into TextField and FastField traits

* extract html .as_tantivy into textfield trait

* extract html .as_tantivy into fastfield trait

* extract webpage .as_tantivy into field traits

* fix indexer example cleanup
mikkeldenker authored Mar 20, 2024
1 parent 72ac622 commit 2dadbf7
Showing 34 changed files with 3,325 additions and 1,751 deletions.
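For readers unfamiliar with the two new dependencies, the commit combines enum_dispatch (static dispatch over an enum whose variants are per-field types, plus generated From conversions) with strum (derive macros for enumerating an enum's variants). The following is a minimal sketch of that pattern, not the crate's actual code: the field names and the use of EnumIter are illustrative assumptions, and the real definitions live in crates/core/src/schema.

use enum_dispatch::enum_dispatch;
use strum::{EnumIter, IntoEnumIterator};

// Behaviour shared by every fast field. `#[enum_dispatch]` lets the enum
// below implement this trait by forwarding to each variant's own impl,
// without `dyn Trait` or a hand-written match.
#[enum_dispatch]
pub trait FastField {
    fn name(&self) -> &str;
}

// Each field is a small unit struct implementing the trait.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub struct SiteHash1;
impl FastField for SiteHash1 {
    fn name(&self) -> &str {
        "site_hash1"
    }
}

#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub struct SiteHash2;
impl FastField for SiteHash2 {
    fn name(&self) -> &str {
        "site_hash2"
    }
}

// enum_dispatch also generates `From<SiteHash1> for FastFieldEnum` (and the
// reverse TryFrom), which is why call sites in the diff can write
// `fast_field::SiteHash1.into()`. strum's EnumIter yields every variant, so
// an ALL_FIELDS-style list can never silently miss a field.
#[enum_dispatch(FastField)]
#[derive(Clone, Copy, Debug, PartialEq, Eq, EnumIter)]
pub enum FastFieldEnum {
    SiteHash1(SiteHash1),
    SiteHash2(SiteHash2),
}

fn main() {
    let all_fields: Vec<FastFieldEnum> = FastFieldEnum::iter().collect();
    assert_eq!(all_fields.len(), 2);

    let field: FastFieldEnum = SiteHash1.into();
    assert_eq!(field.name(), "site_hash1");
}

The payoff visible in the diff below is that call sites can pass the unit structs directly (e.g. fast_field::SiteHash1.into()) and that the full list of fields is derived from the enum itself rather than maintained by hand.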
1 change: 1 addition & 0 deletions .gitignore
@@ -16,3 +16,4 @@ proptest-regressions
*.pending-snap
.ipynb_checkpoints
.zed
rustc-ice-*
40 changes: 38 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 2 additions & 0 deletions Cargo.toml
@@ -40,6 +40,7 @@ crossbeam-channel = "0.5.6"
csv = "1.1.6"
dashmap = { version = "5.4.0", features = ["rayon"] }
encoding_rs = "0.8.31"
+enum_dispatch = "0.3.12"
eventsource-stream = "0.2.3"
fend-core = "1.2.2"
flate2 = "1.0.28"
@@ -95,6 +96,7 @@ scylla = { version = "0.12.0", features = ["chrono"] }
serde = { version = "1.0.137", features = ["rc", "derive"] }
serde_json = "1.0.81"
serde_urlencoded = "0.7.1"
+strum = { version = "0.26.2", features = ["derive"] }
tantivy = { git = "https://github.com/quickwit-oss/tantivy", rev = "182f58cea" }
thiserror = "1.0.31"
tikv-jemallocator = "0.5"
7 changes: 5 additions & 2 deletions assets/licenses.html
@@ -44,8 +44,8 @@ <h1>Third Party Licenses</h1>

<h2>Overview of licenses:</h2>
<ul class="licenses-overview">
<li><a href="#Apache-2.0">Apache License 2.0</a> (394)</li>
<li><a href="#MIT">MIT License</a> (180)</li>
<li><a href="#Apache-2.0">Apache License 2.0</a> (395)</li>
<li><a href="#MIT">MIT License</a> (182)</li>
<li><a href="#MPL-2.0">Mozilla Public License 2.0</a> (9)</li>
<li><a href="#BSD-3-Clause">BSD 3-Clause &quot;New&quot; or &quot;Revised&quot; License</a> (8)</li>
<li><a href="#Unicode-DFS-2016">Unicode License Agreement - Data Files and Software (2016)</a> (5)</li>
@@ -9889,6 +9889,7 @@ <h4>Used by:</h4>
<li><a href=" https://github.com/zrzka/anes-rs ">anes 0.1.6</a></li>
<li><a href=" https://github.com/huggingface/candle ">candle-nn 0.3.3</a></li>
<li><a href=" https://github.com/huggingface/candle ">candle-transformers 0.3.3</a></li>
<li><a href=" https://gitlab.com/antonok/enum_dispatch ">enum_dispatch 0.3.12</a></li>
<li><a href=" https://github.com/jpopesculian/eventsource-stream ">eventsource-stream 0.2.3</a></li>
<li><a href=" https://github.com/cbreeden/fxhash ">fxhash 0.2.1</a></li>
<li><a href=" https://github.com/starkat99/half-rs ">half 2.4.0</a></li>
@@ -13161,7 +13162,9 @@ <h3 id="MIT">MIT License</h3>
<h4>Used by:</h4>
<ul class="license-used-by">
<li><a href=" https://github.com/Peternator7/strum ">strum 0.23.0</a></li>
<li><a href=" https://github.com/Peternator7/strum ">strum 0.26.2</a></li>
<li><a href=" https://github.com/Peternator7/strum ">strum_macros 0.23.1</a></li>
<li><a href=" https://github.com/Peternator7/strum ">strum_macros 0.26.2</a></li>
</ul>
<pre class="license-text">MIT License

2 changes: 2 additions & 0 deletions crates/core/Cargo.toml
@@ -45,6 +45,7 @@ crossbeam-channel = { workspace = true }
csv = { workspace = true }
dashmap = { workspace = true }
encoding_rs = { workspace = true }
+enum_dispatch = { workspace = true }
eventsource-stream = { workspace = true }
fend-core = { workspace = true }
flate2 = { workspace = true }
@@ -91,6 +92,7 @@ scylla = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
serde_urlencoded = { workspace = true }
+strum = { workspace = true }
tantivy = { workspace = true }
thiserror = { workspace = true }
tokenizers = { workspace = true }
2 changes: 1 addition & 1 deletion crates/core/examples/indexer.rs
@@ -62,6 +62,6 @@ fn main() -> anyhow::Result<()> {

println!("Indexing took {:?}", start.elapsed());

-std::fs::remove_dir(path)?;
+std::fs::remove_dir_all(path)?;
Ok(())
}
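The one-line change above is the "fix indexer example cleanup" item from the commit message: std::fs::remove_dir only removes empty directories and returns an error otherwise, whereas std::fs::remove_dir_all recursively deletes the directory and its contents, so the freshly built index directory is actually cleaned up.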
28 changes: 20 additions & 8 deletions crates/core/src/collector.rs
@@ -30,7 +30,7 @@ use crate::{
inverted_index::{DocAddress, WebpagePointer},
prehashed::Prehashed,
ranking::initial::{InitialScoreTweaker, Score},
-schema::FastField,
+schema::{fast_field, FastFieldEnum},
simhash,
};

Expand Down Expand Up @@ -139,7 +139,7 @@ pub struct TopSegmentCollector {
}

impl TopSegmentCollector {
-fn get_hash(&self, doc: DocId, field1: FastField, field2: FastField) -> Prehashed {
+fn get_hash(&self, doc: DocId, field1: FastFieldEnum, field2: FastFieldEnum) -> Prehashed {
let field_reader = self.fastfield_segment_reader.get_field_reader(doc);

let hash = [
@@ -169,19 +169,31 @@ impl TopSegmentCollector {
let simhash: Option<u64> = self
.fastfield_segment_reader
.get_field_reader(doc)
-.get(FastField::SimHash)
+.get(fast_field::SimHash.into())
.unwrap()
.into();

self.bucket_collector.insert(SegmentDoc {
hashes: Hashes {
-site: self.get_hash(doc, FastField::SiteHash1, FastField::SiteHash2),
-title: self.get_hash(doc, FastField::TitleHash1, FastField::TitleHash2),
-url: self.get_hash(doc, FastField::UrlHash1, FastField::UrlHash2),
+site: self.get_hash(
+doc,
+fast_field::SiteHash1.into(),
+fast_field::SiteHash2.into(),
+),
+title: self.get_hash(
+doc,
+fast_field::TitleHash1.into(),
+fast_field::TitleHash2.into(),
+),
+url: self.get_hash(
+doc,
+fast_field::UrlHash1.into(),
+fast_field::UrlHash2.into(),
+),
url_without_tld: self.get_hash(
doc,
-FastField::UrlWithoutTldHash1,
-FastField::UrlWithoutTldHash2,
+fast_field::UrlWithoutTldHash1.into(),
+fast_field::UrlWithoutTldHash2.into(),
),
simhash: simhash.unwrap(),
},
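The .into() calls above rely on conversions from per-field unit structs into the shared enum. A minimal, self-contained illustration of that conversion follows; in the real code the From impls are generated by enum_dispatch, and the names below are placeholders.

// Placeholder types: one unit struct per field, converted into a shared enum.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
struct SimHash;

#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum FastFieldEnum {
    SimHash(SimHash),
}

impl From<SimHash> for FastFieldEnum {
    fn from(field: SimHash) -> Self {
        FastFieldEnum::SimHash(field)
    }
}

fn main() {
    // Mirrors call sites such as `.get(fast_field::SimHash.into())` above.
    let field: FastFieldEnum = SimHash.into();
    assert_eq!(field, FastFieldEnum::SimHash(SimHash));
}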
43 changes: 24 additions & 19 deletions crates/core/src/enum_map.rs
@@ -16,22 +16,30 @@

use serde::{Deserialize, Serialize};

+pub trait InsertEnumMapKey: Sized {
+fn into_usize(self) -> usize;
+}
+
+pub trait GetEnumMapKey: Sized {
+fn from_usize(value: usize) -> Option<Self>;
+}

#[derive(Clone, Debug, Serialize, Deserialize)]
-pub struct EnumMap<K: Into<usize>, V> {
+pub struct EnumMap<K: InsertEnumMapKey, V> {
inner: Vec<Option<V>>,
len: usize,
_phantom: std::marker::PhantomData<K>,
}

-impl<K: Into<usize>, V> Default for EnumMap<K, V> {
+impl<K: InsertEnumMapKey, V> Default for EnumMap<K, V> {
fn default() -> Self {
Self::new()
}
}

impl<K, V> EnumMap<K, V>
where
-K: Into<usize>,
+K: InsertEnumMapKey,
{
pub fn new() -> Self {
Self {
@@ -42,7 +50,7 @@ where
}

pub fn insert(&mut self, key: K, value: V) {
-let key = key.into();
+let key = key.into_usize();

if key >= self.inner.len() {
self.inner.resize_with(key + 1, || None);
@@ -60,7 +68,7 @@ }
}

pub fn get(&self, key: K) -> Option<&V> {
-let key = key.into();
+let key = key.into_usize();
if key >= self.inner.len() {
None
} else {
@@ -81,7 +89,7 @@ }
}

pub fn get_mut(&mut self, key: K) -> Option<&mut V> {
-let key = key.into();
+let key = key.into_usize();
if key >= self.inner.len() {
None
} else {
@@ -92,19 +100,19 @@

impl<K, V> EnumMap<K, V>
where
-K: TryFrom<usize> + Into<usize>,
+K: GetEnumMapKey + InsertEnumMapKey,
{
pub fn keys(&self) -> impl Iterator<Item = K> + '_ {
self.inner
.iter()
.enumerate()
-.filter_map(|(key, value)| value.as_ref().and_then(|_| K::try_from(key).ok()))
+.filter_map(|(key, value)| value.as_ref().and_then(|_| K::from_usize(key)))
}
}

impl<K, V> FromIterator<(K, V)> for EnumMap<K, V>
where
-K: Into<usize>,
+K: InsertEnumMapKey,
{
fn from_iter<T: IntoIterator<Item = (K, V)>>(iter: T) -> Self {
let mut map = Self::new();
@@ -118,17 +126,17 @@ where
}

#[derive(Clone, Debug, Serialize, Deserialize)]
-pub struct EnumSet<K: Into<usize>> {
+pub struct EnumSet<K: InsertEnumMapKey> {
map: EnumMap<K, ()>,
}

-impl<K: Into<usize>> Default for EnumSet<K> {
+impl<K: InsertEnumMapKey> Default for EnumSet<K> {
fn default() -> Self {
Self::new()
}
}

-impl<K: Into<usize>> EnumSet<K> {
+impl<K: InsertEnumMapKey> EnumSet<K> {
pub fn new() -> Self {
Self {
map: EnumMap::new(),
@@ -148,10 +156,7 @@ impl<K: Into<usize>> EnumSet<K> {
}
}

-impl<K> EnumSet<K>
-where
-K: TryFrom<usize> + Into<usize>,
-{
+impl<K: InsertEnumMapKey + GetEnumMapKey> EnumSet<K> {
pub fn iter(&self) -> impl Iterator<Item = K> + '_ {
self.map.keys()
}
@@ -168,9 +173,9 @@ mod tests {
C,
}

-impl From<TestEnum> for usize {
-fn from(val: TestEnum) -> Self {
-val as usize
+impl InsertEnumMapKey for TestEnum {
+fn into_usize(self) -> usize {
+self as usize
}
}

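A quick usage sketch of the reworked map, written as it could sit in the tests module above (assuming `use super::*;`): Color and its trait impls are made-up examples, while EnumMap, InsertEnumMapKey and GetEnumMapKey are the types from this file.

// A made-up key type showing how a key plugs into the reworked EnumMap.
#[derive(Clone, Copy, Debug, PartialEq)]
enum Color {
    Red = 0,
    Green = 1,
    Blue = 2,
}

impl InsertEnumMapKey for Color {
    fn into_usize(self) -> usize {
        self as usize
    }
}

impl GetEnumMapKey for Color {
    fn from_usize(value: usize) -> Option<Self> {
        match value {
            0 => Some(Color::Red),
            1 => Some(Color::Green),
            2 => Some(Color::Blue),
            _ => None,
        }
    }
}

#[test]
fn color_map_roundtrip() {
    let mut map: EnumMap<Color, &str> = EnumMap::new();
    map.insert(Color::Red, "warm");
    map.insert(Color::Blue, "cold");

    assert_eq!(map.get(Color::Red), Some(&"warm"));
    assert_eq!(map.get(Color::Green), None);

    // keys() walks the backing Vec and maps indices back through from_usize.
    let keys: Vec<Color> = map.keys().collect();
    assert_eq!(keys, vec![Color::Red, Color::Blue]);
}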
10 changes: 5 additions & 5 deletions crates/core/src/fastfield_reader.rs
@@ -20,7 +20,7 @@ use tantivy::{columnar::ColumnValues, DocId, SegmentId};

use crate::{
enum_map::EnumMap,
-schema::{DataType, FastField, Field},
+schema::{fast_field::FastField, DataType, FastFieldEnum, Field},
};

#[derive(Default, Clone)]
@@ -53,7 +53,7 @@ impl FastFieldReader {
let mut u64s = EnumMap::new();
let mut bytes = EnumMap::new();

-for field in Field::all().filter_map(Field::as_fast) {
+for field in Field::all().filter_map(|f| f.as_fast()) {
match field.data_type() {
DataType::U64 => {
if let Ok(reader) = fastfield_readers.u64(field.name()) {
@@ -83,8 +83,8 @@ }
}

struct AllReaders {
-u64s: EnumMap<FastField, tantivy::columnar::Column<u64>>,
-bytes: EnumMap<FastField, tantivy::columnar::BytesColumn>,
+u64s: EnumMap<FastFieldEnum, tantivy::columnar::Column<u64>>,
+bytes: EnumMap<FastFieldEnum, tantivy::columnar::BytesColumn>,
}

pub enum Value {
@@ -147,7 +147,7 @@ pub struct FieldReader<'a> {
}

impl<'a> FieldReader<'a> {
-pub fn get(&self, field: FastField) -> Option<Value> {
+pub fn get(&self, field: FastFieldEnum) -> Option<Value> {
match field.data_type() {
DataType::U64 => Some(
self.readers
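To make the Field::all().filter_map(|f| f.as_fast()) loop above easier to follow, here is a purely illustrative sketch of how an umbrella Field type can relate to FastFieldEnum. The real enums live in crate::schema with many more variants, and the exact shape of Field shown here is an assumption.

// Illustrative stand-ins only; not the crate's real definitions.
#[derive(Clone, Copy, Debug)]
enum TextFieldEnum {
    Title,
}

#[derive(Clone, Copy, Debug)]
enum FastFieldEnum {
    SimHash,
    NumUrlTokens,
}

// Assumed shape: the umbrella Field type wraps either a text or a fast field.
#[derive(Clone, Copy, Debug)]
enum Field {
    Text(TextFieldEnum),
    Fast(FastFieldEnum),
}

impl Field {
    // Stand-in for Field::all(); per the commit message, the real list is
    // generated with a strum macro so it cannot drift out of sync.
    fn all() -> impl Iterator<Item = Field> {
        [
            Field::Text(TextFieldEnum::Title),
            Field::Fast(FastFieldEnum::SimHash),
            Field::Fast(FastFieldEnum::NumUrlTokens),
        ]
        .into_iter()
    }

    // Stand-in for as_fast(): only fast fields yield a column to open.
    fn as_fast(&self) -> Option<FastFieldEnum> {
        match self {
            Field::Fast(f) => Some(*f),
            Field::Text(_) => None,
        }
    }
}

fn main() {
    // Mirrors the reader-construction loop in the diff above.
    for field in Field::all().filter_map(|f| f.as_fast()) {
        println!("would open a fast-field column for {:?}", field);
    }
}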