Skip to content

Commit

Permalink
extract webpage .as_tantivy into field traits
Browse files Browse the repository at this point in the history
  • Loading branch information
mikkeldenker committed Mar 20, 2024
1 parent 656c118 commit cdedda8
Show file tree
Hide file tree
Showing 3 changed files with 246 additions and 172 deletions.
159 changes: 158 additions & 1 deletion crates/core/src/schema/fast_field.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ use tantivy::{
use crate::{
enum_map::InsertEnumMapKey,
from_discriminant, simhash,
webpage::{html::FnCache, Html},
webpage::{html::FnCache, Html, Webpage},
Result,
};

Expand All @@ -41,6 +41,15 @@ pub trait FastField: Clone + Copy + std::fmt::Debug + PartialEq + Eq + std::hash
schema: &tantivy::schema::Schema,
) -> Result<()>;

/// Adds the value this field derives from a whole `Webpage` to `_doc`.
///
/// The default implementation ignores all of its arguments and returns
/// `Ok(())`. Fields whose value comes from the `Webpage` itself override
/// this method to write into the document.
fn add_webpage_tantivy(
&self,
_webpage: &crate::webpage::Webpage,
_doc: &mut TantivyDocument,
_schema: &tantivy::schema::Schema,
) -> Result<()> {
Ok(())
}

/// Returns the `DataType` of this field.
///
/// The default implementation returns `DataType::U64`.
fn data_type(&self) -> DataType {
DataType::U64
}
Expand Down Expand Up @@ -231,6 +240,20 @@ impl FastField for HostCentrality {
) -> Result<()> {
Ok(())
}

/// Writes the webpage's host centrality into `doc` as a scaled integer.
///
/// `webpage.host_centrality` is multiplied by `FLOAT_SCALING` and then cast
/// to `u64` (the cast discards the fractional part) before being stored
/// under this field's tantivy handle. Always returns `Ok(())`.
fn add_webpage_tantivy(
    &self,
    webpage: &Webpage,
    doc: &mut TantivyDocument,
    schema: &tantivy::schema::Schema,
) -> Result<()> {
    let field = self.tantivy_field(schema);
    let scaled = webpage.host_centrality * FLOAT_SCALING as f64;
    doc.add_u64(field, scaled as u64);

    Ok(())
}
}

#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
Expand All @@ -249,6 +272,17 @@ impl FastField for HostCentralityRank {
) -> Result<()> {
Ok(())
}

/// Writes `webpage.host_centrality_rank` into `doc` under this field.
///
/// The value is stored unchanged. Always returns `Ok(())`.
fn add_webpage_tantivy(
    &self,
    webpage: &Webpage,
    doc: &mut TantivyDocument,
    schema: &tantivy::schema::Schema,
) -> Result<()> {
    let field = self.tantivy_field(schema);
    let rank = webpage.host_centrality_rank;
    doc.add_u64(field, rank);

    Ok(())
}
}

#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
Expand All @@ -266,6 +300,20 @@ impl FastField for PageCentrality {
) -> Result<()> {
Ok(())
}

/// Writes the webpage's page centrality into `doc` as a scaled integer.
///
/// `webpage.page_centrality` is multiplied by `FLOAT_SCALING` and then cast
/// to `u64` (the cast discards the fractional part) before being stored
/// under this field's tantivy handle. Always returns `Ok(())`.
fn add_webpage_tantivy(
    &self,
    webpage: &Webpage,
    doc: &mut TantivyDocument,
    schema: &tantivy::schema::Schema,
) -> Result<()> {
    let field = self.tantivy_field(schema);
    let scaled = webpage.page_centrality * FLOAT_SCALING as f64;
    doc.add_u64(field, scaled as u64);

    Ok(())
}
}

#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
Expand All @@ -283,6 +331,17 @@ impl FastField for PageCentralityRank {
) -> Result<()> {
Ok(())
}

/// Writes `webpage.page_centrality_rank` into `doc` under this field.
///
/// The value is stored unchanged. Always returns `Ok(())`.
fn add_webpage_tantivy(
    &self,
    webpage: &Webpage,
    doc: &mut TantivyDocument,
    schema: &tantivy::schema::Schema,
) -> Result<()> {
    let field = self.tantivy_field(schema);
    let rank = webpage.page_centrality_rank;
    doc.add_u64(field, rank);

    Ok(())
}
}

#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
Expand All @@ -300,6 +359,17 @@ impl FastField for FetchTimeMs {
) -> Result<()> {
Ok(())
}

/// Writes `webpage.fetch_time_ms` into `doc` under this field.
///
/// The value is stored unchanged. Always returns `Ok(())`.
fn add_webpage_tantivy(
    &self,
    webpage: &Webpage,
    doc: &mut TantivyDocument,
    schema: &tantivy::schema::Schema,
) -> Result<()> {
    let field = self.tantivy_field(schema);
    let fetch_time_ms = webpage.fetch_time_ms;
    doc.add_u64(field, fetch_time_ms);

    Ok(())
}
}

#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
Expand Down Expand Up @@ -370,6 +440,25 @@ impl FastField for Region {
) -> Result<()> {
Ok(())
}

/// Writes the id of the region guessed for `webpage` into `doc`.
///
/// The region comes from `crate::webpage::region::Region::guess_from`. If
/// that call returns an error, the error is discarded and the id of
/// `Region::All` is stored instead. Always returns `Ok(())`.
fn add_webpage_tantivy(
    &self,
    webpage: &Webpage,
    doc: &mut TantivyDocument,
    schema: &tantivy::schema::Schema,
) -> Result<()> {
    // Aliased because the fast field implementing this method is itself
    // called `Region`.
    use crate::webpage::region::Region as WebpageRegion;

    let region = WebpageRegion::guess_from(webpage).unwrap_or(WebpageRegion::All);
    doc.add_u64(self.tantivy_field(schema), region.id());

    Ok(())
}
}

#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
Expand Down Expand Up @@ -802,6 +891,20 @@ impl FastField for PreComputedScore {
) -> Result<()> {
Ok(())
}

/// Writes the webpage's pre-computed score into `doc` as a scaled integer.
///
/// `webpage.pre_computed_score` is multiplied by `FLOAT_SCALING` and then
/// cast to `u64` (the cast discards the fractional part) before being
/// stored under this field's tantivy handle. Always returns `Ok(())`.
fn add_webpage_tantivy(
    &self,
    webpage: &Webpage,
    doc: &mut TantivyDocument,
    schema: &tantivy::schema::Schema,
) -> Result<()> {
    let field = self.tantivy_field(schema);
    let scaled = webpage.pre_computed_score * FLOAT_SCALING as f64;
    doc.add_u64(field, scaled as u64);

    Ok(())
}
}

#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
Expand All @@ -824,6 +927,24 @@ impl FastField for HostNodeID {
) -> Result<()> {
Ok(())
}

/// Writes the webpage's node id into `doc` under this field.
///
/// When `webpage.node_id` is `Some`, its `as_u64()` value is stored.
/// When it is `None`, `u64::MAX` is stored in its place.
/// Always returns `Ok(())`.
fn add_webpage_tantivy(
    &self,
    webpage: &Webpage,
    doc: &mut TantivyDocument,
    schema: &tantivy::schema::Schema,
) -> Result<()> {
    let id = webpage
        .node_id
        .as_ref()
        .map_or(u64::MAX, |node| node.as_u64());
    doc.add_u64(self.tantivy_field(schema), id);

    Ok(())
}
}

#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
Expand Down Expand Up @@ -1047,6 +1168,24 @@ impl FastField for TitleEmbeddings {
) -> Result<()> {
Ok(())
}

/// Writes the serialized title embedding into `doc` under this field.
///
/// When `webpage.title_embedding` is present it is serialized with
/// `write_bytes` into a byte buffer; otherwise the buffer stays empty.
/// In both cases the buffer is added to the document.
///
/// # Errors
///
/// Propagates any error returned by `write_bytes`; nothing is added to
/// `doc` in that case.
fn add_webpage_tantivy(
    &self,
    webpage: &Webpage,
    doc: &mut TantivyDocument,
    schema: &tantivy::schema::Schema,
) -> Result<()> {
    let mut bytes: Vec<u8> = Vec::new();
    if let Some(embedding) = &webpage.title_embedding {
        embedding.write_bytes(&mut bytes)?;
    }
    doc.add_bytes(self.tantivy_field(schema), bytes);

    Ok(())
}
}

#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
Expand All @@ -1073,4 +1212,22 @@ impl FastField for KeywordEmbeddings {
) -> Result<()> {
Ok(())
}

/// Writes the serialized keyword embedding into `doc` under this field.
///
/// When `webpage.keyword_embedding` is present it is serialized with
/// `write_bytes` into a byte buffer; otherwise the buffer stays empty.
/// In both cases the buffer is added to the document.
///
/// # Errors
///
/// Propagates any error returned by `write_bytes`; nothing is added to
/// `doc` in that case.
fn add_webpage_tantivy(
    &self,
    webpage: &Webpage,
    doc: &mut TantivyDocument,
    schema: &tantivy::schema::Schema,
) -> Result<()> {
    let mut bytes: Vec<u8> = Vec::new();
    if let Some(embedding) = &webpage.keyword_embedding {
        embedding.write_bytes(&mut bytes)?;
    }
    doc.add_bytes(self.tantivy_field(schema), bytes);

    Ok(())
}
}
81 changes: 81 additions & 0 deletions crates/core/src/schema/text_field.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ use enum_dispatch::enum_dispatch;
use strum::{EnumDiscriminants, VariantArray};
use tantivy::{
schema::{IndexRecordOption, TextFieldIndexing, TextOptions},
time::OffsetDateTime,
tokenizer::PreTokenizedString,
TantivyDocument,
};
Expand Down Expand Up @@ -50,6 +51,15 @@ pub trait TextField:
schema: &tantivy::schema::Schema,
) -> Result<()>;

/// Adds the value this field derives from a whole `Webpage` to `_doc`.
///
/// The default implementation ignores all of its arguments and returns
/// `Ok(())`. Fields whose value comes from the `Webpage` itself override
/// this method to write into the document.
fn add_webpage_tantivy(
&self,
_webpage: &crate::webpage::Webpage,
_doc: &mut TantivyDocument,
_schema: &tantivy::schema::Schema,
) -> Result<()> {
Ok(())
}

/// Returns the `Tokenizer` associated with indexing this field.
///
/// The default implementation returns `Tokenizer::default()`.
// NOTE(review): presumably this is the tokenizer applied at index time —
// confirm against where the schema is built.
fn indexing_tokenizer(&self) -> Tokenizer {
Tokenizer::default()
}
Expand Down Expand Up @@ -919,6 +929,20 @@ impl TextField for BacklinkText {
) -> Result<()> {
Ok(())
}

/// Writes the webpage's backlink labels into `doc` as a single text value.
///
/// The entries of `webpage.backlink_labels` are joined with `"\n"` and the
/// resulting string is stored under this field. Always returns `Ok(())`.
fn add_webpage_tantivy(
    &self,
    webpage: &crate::webpage::Webpage,
    doc: &mut TantivyDocument,
    schema: &tantivy::schema::Schema,
) -> Result<()> {
    let field = self.tantivy_field(schema);
    let labels = webpage.backlink_labels.join("\n");
    doc.add_text(field, labels);

    Ok(())
}
}

#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
Expand Down Expand Up @@ -978,6 +1002,20 @@ impl TextField for DmozDescription {
) -> Result<()> {
Ok(())
}

/// Writes the webpage's DMOZ description into `doc` under this field.
///
/// Uses the value of `webpage.dmoz_description()`, falling back to that
/// type's default value when it is `None`. Always returns `Ok(())`.
fn add_webpage_tantivy(
    &self,
    webpage: &crate::webpage::Webpage,
    doc: &mut TantivyDocument,
    schema: &tantivy::schema::Schema,
) -> Result<()> {
    let field = self.tantivy_field(schema);
    let description = webpage.dmoz_description().unwrap_or_default();
    doc.add_text(field, description);

    Ok(())
}
}

#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
Expand Down Expand Up @@ -1264,6 +1302,22 @@ impl TextField for SafetyClassification {
) -> Result<()> {
Ok(())
}

/// Writes the webpage's safety classification label into `doc` as text.
///
/// When `webpage.safety_classification` is `Some`, the label's
/// `to_string()` form is stored; when it is `None`, the default (empty)
/// string is stored instead. Always returns `Ok(())`.
fn add_webpage_tantivy(
    &self,
    webpage: &crate::webpage::Webpage,
    doc: &mut TantivyDocument,
    schema: &tantivy::schema::Schema,
) -> Result<()> {
    let safety = match webpage.safety_classification {
        Some(label) => label.to_string(),
        None => Default::default(),
    };

    doc.add_text(self.tantivy_field(schema), safety);

    Ok(())
}
}

#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
Expand All @@ -1290,6 +1344,22 @@ impl TextField for InsertionTimestamp {
) -> Result<()> {
Ok(())
}

/// Writes the webpage's insertion time into `doc` as a tantivy date.
///
/// `webpage.inserted_at.timestamp()` is converted to an `OffsetDateTime`
/// with `OffsetDateTime::from_unix_timestamp`, wrapped in a
/// `tantivy::DateTime` via `from_utc`, and stored under this field.
///
/// # Errors
///
/// Propagates the error from `OffsetDateTime::from_unix_timestamp` when the
/// timestamp cannot be converted; nothing is added to `doc` in that case.
fn add_webpage_tantivy(
    &self,
    webpage: &crate::webpage::Webpage,
    doc: &mut TantivyDocument,
    schema: &tantivy::schema::Schema,
) -> Result<()> {
    let field = self.tantivy_field(schema);
    let unix_timestamp = webpage.inserted_at.timestamp();
    let inserted_at = OffsetDateTime::from_unix_timestamp(unix_timestamp)?;
    doc.add_date(field, tantivy::DateTime::from_utc(inserted_at));

    Ok(())
}
}

#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
Expand Down Expand Up @@ -1343,4 +1413,15 @@ impl TextField for Keywords {
) -> Result<()> {
Ok(())
}

/// Writes the webpage's keywords into `doc` as a single text value.
///
/// The entries of `webpage.keywords` are joined with `"\n"` and the
/// resulting string is stored under this field. Always returns `Ok(())`.
fn add_webpage_tantivy(
    &self,
    webpage: &crate::webpage::Webpage,
    doc: &mut TantivyDocument,
    schema: &tantivy::schema::Schema,
) -> Result<()> {
    let field = self.tantivy_field(schema);
    let keywords = webpage.keywords.join("\n");
    doc.add_text(field, keywords);

    Ok(())
}
}
Loading

0 comments on commit cdedda8

Please sign in to comment.