// Patch summary (descriptive preamble placed above the first `diff --git` header).
//
// Purpose: move the per-field population of webpage-level data out of
// `Webpage::as_tantivy` and into the schema field types themselves.
//
// crates/core/src/schema/fast_field.rs
//   * Imports `Webpage` from `crate::webpage`.
//   * Adds `FastField::add_webpage_tantivy(&self, webpage, doc, schema) -> Result<()>`
//     with a default body that ignores its arguments and returns `Ok(())`.
//   * Overrides it for: HostCentrality, HostCentralityRank, PageCentrality,
//     PageCentralityRank, FetchTimeMs, Region, PreComputedScore, HostNodeID,
//     TitleEmbeddings, KeywordEmbeddings.
//     - Centralities and the pre-computed score are multiplied by `FLOAT_SCALING`
//       (as f64) and cast to u64 before `doc.add_u64`.
//     - Region falls back to `Region::All.id()` when `Region::guess_from` returns Err.
//     - HostNodeID writes `u64::MAX` when `webpage.node_id` is `None`.
//     - The embedding fields write an empty `Vec` when the embedding is `None`,
//       otherwise the bytes produced by `emb.write_bytes`.
//
// crates/core/src/schema/text_field.rs
//   * Imports `tantivy::time::OffsetDateTime`.
//   * Adds the same defaulted `add_webpage_tantivy` method to `TextField`.
//   * Overrides it for: BacklinkText, DmozDescription, SafetyClassification,
//     InsertionTimestamp, Keywords.
//     - BacklinkText and Keywords join their string lists with "\n"
//       (BacklinkText previously used `itertools::intersperse`).
//     - DmozDescription and SafetyClassification use `unwrap_or_default()` when
//       the value is absent.
//     - InsertionTimestamp calls `doc.add_date` with a value built from
//       `webpage.inserted_at.timestamp()`.
//
// crates/core/src/webpage/mod.rs
//   * `Webpage::dmoz_description` becomes `pub` so the schema module can call it.
//   * `Webpage::as_tantivy` now starts from `self.html.as_tantivy(schema)?` and then
//     loops over `Field::all()`, dispatching `Field::Fast` / `Field::Text` to
//     `add_webpage_tantivy`; the hand-written `schema.get_field(..).expect(..)`
//     blocks and the imports they needed are removed.
//
// NOTE(review): the new code resolves fields through `self.tantivy_field(schema)`,
// whose definition is not part of this patch - confirm it reports a missing field
// in a way that is acceptable compared with the removed `expect(..)` calls.
// NOTE(review): this copy of the patch looks whitespace-collapsed, and some generic
// parameters appear to have been stripped (e.g. `-> Option {`, `-> Result {`,
// `see .`) - presumably an extraction artifact; regenerate the patch from the
// repository before trying to apply it.
diff --git a/crates/core/src/schema/fast_field.rs b/crates/core/src/schema/fast_field.rs index 18712d39..44413bce 100644 --- a/crates/core/src/schema/fast_field.rs +++ b/crates/core/src/schema/fast_field.rs @@ -24,7 +24,7 @@ use tantivy::{ use crate::{ enum_map::InsertEnumMapKey, from_discriminant, simhash, - webpage::{html::FnCache, Html}, + webpage::{html::FnCache, Html, Webpage}, Result, }; @@ -41,6 +41,15 @@ pub trait FastField: Clone + Copy + std::fmt::Debug + PartialEq + Eq + std::hash schema: &tantivy::schema::Schema, ) -> Result<()>; + fn add_webpage_tantivy( + &self, + _webpage: &crate::webpage::Webpage, + _doc: &mut TantivyDocument, + _schema: &tantivy::schema::Schema, + ) -> Result<()> { + Ok(()) + } + fn data_type(&self) -> DataType { DataType::U64 } @@ -231,6 +240,20 @@ impl FastField for HostCentrality { ) -> Result<()> { Ok(()) } + + fn add_webpage_tantivy( + &self, + webpage: &Webpage, + doc: &mut TantivyDocument, + schema: &tantivy::schema::Schema, + ) -> Result<()> { + doc.add_u64( + self.tantivy_field(schema), + (webpage.host_centrality * FLOAT_SCALING as f64) as u64, + ); + + Ok(()) + } } #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] @@ -249,6 +272,17 @@ impl FastField for HostCentralityRank { ) -> Result<()> { Ok(()) } + + fn add_webpage_tantivy( + &self, + webpage: &Webpage, + doc: &mut TantivyDocument, + schema: &tantivy::schema::Schema, + ) -> Result<()> { + doc.add_u64(self.tantivy_field(schema), webpage.host_centrality_rank); + + Ok(()) + } } #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] @@ -266,6 +300,20 @@ impl FastField for PageCentrality { ) -> Result<()> { Ok(()) } + + fn add_webpage_tantivy( + &self, + webpage: &Webpage, + doc: &mut TantivyDocument, + schema: &tantivy::schema::Schema, + ) -> Result<()> { + doc.add_u64( + self.tantivy_field(schema), + (webpage.page_centrality * FLOAT_SCALING as f64) as u64, + ); + + Ok(()) + } } #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] @@ -283,6 +331,17 @@ impl FastField for 
PageCentralityRank { ) -> Result<()> { Ok(()) } + + fn add_webpage_tantivy( + &self, + webpage: &Webpage, + doc: &mut TantivyDocument, + schema: &tantivy::schema::Schema, + ) -> Result<()> { + doc.add_u64(self.tantivy_field(schema), webpage.page_centrality_rank); + + Ok(()) + } } #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] @@ -300,6 +359,17 @@ impl FastField for FetchTimeMs { ) -> Result<()> { Ok(()) } + + fn add_webpage_tantivy( + &self, + webpage: &Webpage, + doc: &mut TantivyDocument, + schema: &tantivy::schema::Schema, + ) -> Result<()> { + doc.add_u64(self.tantivy_field(schema), webpage.fetch_time_ms); + + Ok(()) + } } #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] @@ -370,6 +440,25 @@ impl FastField for Region { ) -> Result<()> { Ok(()) } + + fn add_webpage_tantivy( + &self, + webpage: &Webpage, + doc: &mut TantivyDocument, + schema: &tantivy::schema::Schema, + ) -> Result<()> { + let region = crate::webpage::region::Region::guess_from(webpage); + if let Ok(region) = region { + doc.add_u64(self.tantivy_field(schema), region.id()); + } else { + doc.add_u64( + self.tantivy_field(schema), + crate::webpage::region::Region::All.id(), + ); + } + + Ok(()) + } } #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] @@ -802,6 +891,20 @@ impl FastField for PreComputedScore { ) -> Result<()> { Ok(()) } + + fn add_webpage_tantivy( + &self, + webpage: &Webpage, + doc: &mut TantivyDocument, + schema: &tantivy::schema::Schema, + ) -> Result<()> { + doc.add_u64( + self.tantivy_field(schema), + (webpage.pre_computed_score * FLOAT_SCALING as f64) as u64, + ); + + Ok(()) + } } #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] @@ -824,6 +927,24 @@ impl FastField for HostNodeID { ) -> Result<()> { Ok(()) } + + fn add_webpage_tantivy( + &self, + webpage: &Webpage, + doc: &mut TantivyDocument, + schema: &tantivy::schema::Schema, + ) -> Result<()> { + match &webpage.node_id { + Some(node_id) => { + doc.add_u64(self.tantivy_field(schema), node_id.as_u64()); + } + None => { + 
doc.add_u64(self.tantivy_field(schema), u64::MAX); + } + } + + Ok(()) + } } #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] @@ -1047,6 +1168,24 @@ impl FastField for TitleEmbeddings { ) -> Result<()> { Ok(()) } + + fn add_webpage_tantivy( + &self, + webpage: &Webpage, + doc: &mut TantivyDocument, + schema: &tantivy::schema::Schema, + ) -> Result<()> { + if let Some(emb) = &webpage.title_embedding { + let mut serialized = Vec::new(); + emb.write_bytes(&mut serialized)?; + + doc.add_bytes(self.tantivy_field(schema), serialized); + } else { + doc.add_bytes(self.tantivy_field(schema), Vec::new()); + } + + Ok(()) + } } #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] @@ -1073,4 +1212,22 @@ impl FastField for KeywordEmbeddings { ) -> Result<()> { Ok(()) } + + fn add_webpage_tantivy( + &self, + webpage: &Webpage, + doc: &mut TantivyDocument, + schema: &tantivy::schema::Schema, + ) -> Result<()> { + if let Some(emb) = &webpage.keyword_embedding { + let mut serialized = Vec::new(); + emb.write_bytes(&mut serialized)?; + + doc.add_bytes(self.tantivy_field(schema), serialized); + } else { + doc.add_bytes(self.tantivy_field(schema), Vec::new()); + } + + Ok(()) + } } diff --git a/crates/core/src/schema/text_field.rs b/crates/core/src/schema/text_field.rs index 079679e9..a69cbfd0 100644 --- a/crates/core/src/schema/text_field.rs +++ b/crates/core/src/schema/text_field.rs @@ -18,6 +18,7 @@ use enum_dispatch::enum_dispatch; use strum::{EnumDiscriminants, VariantArray}; use tantivy::{ schema::{IndexRecordOption, TextFieldIndexing, TextOptions}, + time::OffsetDateTime, tokenizer::PreTokenizedString, TantivyDocument, }; @@ -50,6 +51,15 @@ pub trait TextField: schema: &tantivy::schema::Schema, ) -> Result<()>; + fn add_webpage_tantivy( + &self, + _webpage: &crate::webpage::Webpage, + _doc: &mut TantivyDocument, + _schema: &tantivy::schema::Schema, + ) -> Result<()> { + Ok(()) + } + fn indexing_tokenizer(&self) -> Tokenizer { Tokenizer::default() } @@ -919,6 +929,20 @@ impl 
TextField for BacklinkText { ) -> Result<()> { Ok(()) } + + fn add_webpage_tantivy( + &self, + webpage: &crate::webpage::Webpage, + doc: &mut TantivyDocument, + schema: &tantivy::schema::Schema, + ) -> Result<()> { + doc.add_text( + self.tantivy_field(schema), + webpage.backlink_labels.join("\n"), + ); + + Ok(()) + } } #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] @@ -978,6 +1002,20 @@ impl TextField for DmozDescription { ) -> Result<()> { Ok(()) } + + fn add_webpage_tantivy( + &self, + webpage: &crate::webpage::Webpage, + doc: &mut TantivyDocument, + schema: &tantivy::schema::Schema, + ) -> Result<()> { + doc.add_text( + self.tantivy_field(schema), + webpage.dmoz_description().unwrap_or_default(), + ); + + Ok(()) + } } #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] @@ -1264,6 +1302,22 @@ impl TextField for SafetyClassification { ) -> Result<()> { Ok(()) } + + fn add_webpage_tantivy( + &self, + webpage: &crate::webpage::Webpage, + doc: &mut TantivyDocument, + schema: &tantivy::schema::Schema, + ) -> Result<()> { + let safety = webpage + .safety_classification + .map(|label| label.to_string()) + .unwrap_or_default(); + + doc.add_text(self.tantivy_field(schema), safety); + + Ok(()) + } } #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] @@ -1290,6 +1344,22 @@ impl TextField for InsertionTimestamp { ) -> Result<()> { Ok(()) } + + fn add_webpage_tantivy( + &self, + webpage: &crate::webpage::Webpage, + doc: &mut TantivyDocument, + schema: &tantivy::schema::Schema, + ) -> Result<()> { + doc.add_date( + self.tantivy_field(schema), + tantivy::DateTime::from_utc(OffsetDateTime::from_unix_timestamp( + webpage.inserted_at.timestamp(), + )?), + ); + + Ok(()) + } } #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] @@ -1343,4 +1413,15 @@ impl TextField for Keywords { ) -> Result<()> { Ok(()) } + + fn add_webpage_tantivy( + &self, + webpage: &crate::webpage::Webpage, + doc: &mut TantivyDocument, + schema: &tantivy::schema::Schema, + ) -> Result<()> { + 
doc.add_text(self.tantivy_field(schema), webpage.keywords.join("\n")); + + Ok(()) + } } diff --git a/crates/core/src/webpage/mod.rs b/crates/core/src/webpage/mod.rs index 87b7b159..ea507f42 100644 --- a/crates/core/src/webpage/mod.rs +++ b/crates/core/src/webpage/mod.rs @@ -15,10 +15,7 @@ // along with this program. If not, see . use crate::{ - schema::{ - fast_field::{self, FastField}, - text_field::{self, TextField}, - }, + schema::{fast_field::FastField, text_field::TextField, Field}, webgraph::NodeID, Result, }; @@ -26,13 +23,9 @@ use candle_core::Tensor; use chrono::{DateTime, Utc}; use std::collections::HashMap; -use tantivy::{time::OffsetDateTime, TantivyDocument}; +use tantivy::TantivyDocument; use url::Url; -use crate::schema::FLOAT_SCALING; - -use self::region::Region; - mod adservers; pub mod html; mod just_text; @@ -117,7 +110,7 @@ impl Webpage { }) } - fn dmoz_description(&self) -> Option { + pub fn dmoz_description(&self) -> Option { self.dmoz_description.as_ref().and_then(|desc| { if !self.html.metadata().iter().any(|metadata| { if let Some(content) = metadata.get(&"content".to_string()) { @@ -134,172 +127,15 @@ impl Webpage { } pub fn as_tantivy(&self, schema: &tantivy::schema::Schema) -> Result { - let region = Region::guess_from(self); - - let dmoz_description = self.dmoz_description(); - let mut doc = self.html.as_tantivy(schema)?; - if let Ok(region) = region { - doc.add_u64( - schema - .get_field(fast_field::Region.name()) - .expect("Failed to get region field"), - region.id(), - ); - } else { - doc.add_u64( - schema - .get_field(fast_field::Region.name()) - .expect("Failed to get region field"), - Region::All.id(), - ); - } - - let backlink_text: String = - itertools::intersperse(self.backlink_labels.clone(), "\n".to_string()).collect(); - - doc.add_text( - schema - .get_field(text_field::BacklinkText.name()) - .expect("Failed to get backlink-text field"), - backlink_text, - ); - - doc.add_text( - schema - 
.get_field(text_field::Keywords.name()) - .expect("Failed to get keywords field"), - self.keywords.join("\n"), - ); - - doc.add_date( - schema - .get_field(text_field::InsertionTimestamp.name()) - .expect("Failed to get insertion-timestamp field"), - tantivy::DateTime::from_utc(OffsetDateTime::from_unix_timestamp( - self.inserted_at.timestamp(), - )?), - ); - - let safety = self - .safety_classification - .map(|label| label.to_string()) - .unwrap_or_default(); - - doc.add_text( - schema - .get_field(text_field::SafetyClassification.name()) - .expect("Failed to get safety_classification field"), - safety, - ); - - doc.add_u64( - schema - .get_field(fast_field::HostCentrality.name()) - .expect("Failed to get host_centrality field"), - (self.host_centrality * FLOAT_SCALING as f64) as u64, - ); - - doc.add_u64( - schema - .get_field(fast_field::HostCentralityRank.name()) - .expect("Failed to get host_centrality_rank field"), - self.host_centrality_rank, - ); - - doc.add_u64( - schema - .get_field(fast_field::PageCentrality.name()) - .expect("Failed to get page_centrality field"), - (self.page_centrality * FLOAT_SCALING as f64) as u64, - ); - - doc.add_u64( - schema - .get_field(fast_field::PageCentralityRank.name()) - .expect("Failed to get page_centrality_rank field"), - self.page_centrality_rank, - ); - - doc.add_u64( - schema - .get_field(fast_field::FetchTimeMs.name()) - .expect("Failed to get fetch_time_ms field"), - self.fetch_time_ms, - ); - - doc.add_u64( - schema - .get_field(fast_field::PreComputedScore.name()) - .expect("failed to get pre_computed_score field"), - (self.pre_computed_score * FLOAT_SCALING as f64) as u64, - ); - - if let Some(emb) = &self.title_embedding { - let mut serialized = Vec::new(); - emb.write_bytes(&mut serialized)?; - - doc.add_bytes( - schema - .get_field(fast_field::TitleEmbeddings.name()) - .expect("Failed to get title_embeddings field"), - serialized, - ); - } else { - doc.add_bytes( - schema - 
.get_field(fast_field::TitleEmbeddings.name()) - .expect("Failed to get title_embeddings field"), - Vec::new(), - ); - } - - if let Some(emb) = &self.keyword_embedding { - let mut serialized = Vec::new(); - emb.write_bytes(&mut serialized)?; - - doc.add_bytes( - schema - .get_field(fast_field::KeywordEmbeddings.name()) - .expect("Failed to get keyword_embeddings field"), - serialized, - ); - } else { - doc.add_bytes( - schema - .get_field(fast_field::KeywordEmbeddings.name()) - .expect("Failed to get keyword_embeddings field"), - Vec::new(), - ); - } - - match &self.node_id { - Some(node_id) => { - doc.add_u64( - schema - .get_field(fast_field::HostNodeID.name()) - .expect("Failed to get node_id field"), - node_id.as_u64(), - ); - } - None => { - doc.add_u64( - schema - .get_field(fast_field::HostNodeID.name()) - .expect("Failed to get node_id field"), - u64::MAX, - ); + for field in Field::all() { + match field { + Field::Fast(f) => f.add_webpage_tantivy(self, &mut doc, schema)?, + Field::Text(f) => f.add_webpage_tantivy(self, &mut doc, schema)?, } } - doc.add_text( - schema - .get_field(text_field::DmozDescription.name()) - .expect("failed to get dmoz_description field"), - dmoz_description.unwrap_or_default(), - ); - Ok(doc) } }