diff --git a/crates/core/src/schema/fast_field.rs b/crates/core/src/schema/fast_field.rs
index 18712d39..44413bce 100644
--- a/crates/core/src/schema/fast_field.rs
+++ b/crates/core/src/schema/fast_field.rs
@@ -24,7 +24,7 @@ use tantivy::{
use crate::{
enum_map::InsertEnumMapKey,
from_discriminant, simhash,
- webpage::{html::FnCache, Html},
+ webpage::{html::FnCache, Html, Webpage},
Result,
};
@@ -41,6 +41,15 @@ pub trait FastField: Clone + Copy + std::fmt::Debug + PartialEq + Eq + std::hash
schema: &tantivy::schema::Schema,
) -> Result<()>;
+    /// Adds this field's value, taken from `_webpage`, to `_doc`.
+    ///
+    /// The default implementation adds nothing and returns `Ok(())`;
+    /// fields that are filled from webpage-level data override it.
+    fn add_webpage_tantivy(
+        &self,
+        _webpage: &Webpage,
+        _doc: &mut TantivyDocument,
+        _schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        // Deliberate no-op.
+        Ok(())
+    }
+
fn data_type(&self) -> DataType {
DataType::U64
}
@@ -231,6 +240,20 @@ impl FastField for HostCentrality {
) -> Result<()> {
Ok(())
}
+
+    /// Writes the webpage's host centrality to `doc` as a `u64`.
+    ///
+    /// The floating-point value is multiplied by `FLOAT_SCALING` and then
+    /// converted with an `as u64` cast.
+    fn add_webpage_tantivy(
+        &self,
+        webpage: &Webpage,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        let field = self.tantivy_field(schema);
+        let scaled = (webpage.host_centrality * FLOAT_SCALING as f64) as u64;
+        doc.add_u64(field, scaled);
+
+        Ok(())
+    }
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
@@ -249,6 +272,17 @@ impl FastField for HostCentralityRank {
) -> Result<()> {
Ok(())
}
+
+    /// Writes the webpage's `host_centrality_rank` to `doc` unchanged.
+    fn add_webpage_tantivy(
+        &self,
+        webpage: &Webpage,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        let field = self.tantivy_field(schema);
+        doc.add_u64(field, webpage.host_centrality_rank);
+
+        Ok(())
+    }
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
@@ -266,6 +300,20 @@ impl FastField for PageCentrality {
) -> Result<()> {
Ok(())
}
+
+    /// Writes the webpage's page centrality to `doc` as a `u64`.
+    ///
+    /// The floating-point value is multiplied by `FLOAT_SCALING` and then
+    /// converted with an `as u64` cast.
+    fn add_webpage_tantivy(
+        &self,
+        webpage: &Webpage,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        let field = self.tantivy_field(schema);
+        let scaled = (webpage.page_centrality * FLOAT_SCALING as f64) as u64;
+        doc.add_u64(field, scaled);
+
+        Ok(())
+    }
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
@@ -283,6 +331,17 @@ impl FastField for PageCentralityRank {
) -> Result<()> {
Ok(())
}
+
+    /// Writes the webpage's `page_centrality_rank` to `doc` unchanged.
+    fn add_webpage_tantivy(
+        &self,
+        webpage: &Webpage,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        let field = self.tantivy_field(schema);
+        doc.add_u64(field, webpage.page_centrality_rank);
+
+        Ok(())
+    }
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
@@ -300,6 +359,17 @@ impl FastField for FetchTimeMs {
) -> Result<()> {
Ok(())
}
+
+    /// Writes the webpage's `fetch_time_ms` to `doc` unchanged.
+    fn add_webpage_tantivy(
+        &self,
+        webpage: &Webpage,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        let field = self.tantivy_field(schema);
+        doc.add_u64(field, webpage.fetch_time_ms);
+
+        Ok(())
+    }
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
@@ -370,6 +440,25 @@ impl FastField for Region {
) -> Result<()> {
Ok(())
}
+
+    /// Writes the id of the region guessed for `webpage` to `doc`.
+    ///
+    /// When the guess fails, the id of the catch-all region (`All`) is
+    /// written instead, so a value is always added.
+    fn add_webpage_tantivy(
+        &self,
+        webpage: &Webpage,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        // Aliased because `Region` in this module is the fast field itself.
+        use crate::webpage::region::Region as PageRegion;
+
+        let guess = PageRegion::guess_from(webpage);
+        let field = self.tantivy_field(schema);
+        let region_id = match guess {
+            Ok(region) => region.id(),
+            Err(_) => PageRegion::All.id(),
+        };
+        doc.add_u64(field, region_id);
+
+        Ok(())
+    }
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
@@ -802,6 +891,20 @@ impl FastField for PreComputedScore {
) -> Result<()> {
Ok(())
}
+
+    /// Writes the webpage's pre-computed score to `doc` as a `u64`.
+    ///
+    /// The floating-point value is multiplied by `FLOAT_SCALING` and then
+    /// converted with an `as u64` cast.
+    fn add_webpage_tantivy(
+        &self,
+        webpage: &Webpage,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        let field = self.tantivy_field(schema);
+        let scaled = (webpage.pre_computed_score * FLOAT_SCALING as f64) as u64;
+        doc.add_u64(field, scaled);
+
+        Ok(())
+    }
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
@@ -824,6 +927,24 @@ impl FastField for HostNodeID {
) -> Result<()> {
Ok(())
}
+
+    /// Writes the webpage's node id to `doc` as a `u64`.
+    ///
+    /// `u64::MAX` is written when the webpage has no node id.
+    fn add_webpage_tantivy(
+        &self,
+        webpage: &Webpage,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        let field = self.tantivy_field(schema);
+        let id = webpage
+            .node_id
+            .as_ref()
+            .map_or(u64::MAX, |node_id| node_id.as_u64());
+        doc.add_u64(field, id);
+
+        Ok(())
+    }
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
@@ -1047,6 +1168,24 @@ impl FastField for TitleEmbeddings {
) -> Result<()> {
Ok(())
}
+
+    /// Writes the serialized title embedding to `doc` as bytes.
+    ///
+    /// An empty byte vector is written when the webpage has no title
+    /// embedding. A failure while serializing is returned to the caller
+    /// before anything is added.
+    fn add_webpage_tantivy(
+        &self,
+        webpage: &Webpage,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        // Stays empty unless there is an embedding to serialize.
+        let mut bytes = Vec::new();
+        if let Some(embedding) = webpage.title_embedding.as_ref() {
+            embedding.write_bytes(&mut bytes)?;
+        }
+        doc.add_bytes(self.tantivy_field(schema), bytes);
+
+        Ok(())
+    }
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
@@ -1073,4 +1212,22 @@ impl FastField for KeywordEmbeddings {
) -> Result<()> {
Ok(())
}
+
+    /// Writes the serialized keyword embedding to `doc` as bytes.
+    ///
+    /// An empty byte vector is written when the webpage has no keyword
+    /// embedding. A failure while serializing is returned to the caller
+    /// before anything is added.
+    fn add_webpage_tantivy(
+        &self,
+        webpage: &Webpage,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        // Stays empty unless there is an embedding to serialize.
+        let mut bytes = Vec::new();
+        if let Some(embedding) = webpage.keyword_embedding.as_ref() {
+            embedding.write_bytes(&mut bytes)?;
+        }
+        doc.add_bytes(self.tantivy_field(schema), bytes);
+
+        Ok(())
+    }
}
diff --git a/crates/core/src/schema/text_field.rs b/crates/core/src/schema/text_field.rs
index 079679e9..a69cbfd0 100644
--- a/crates/core/src/schema/text_field.rs
+++ b/crates/core/src/schema/text_field.rs
@@ -18,6 +18,7 @@ use enum_dispatch::enum_dispatch;
use strum::{EnumDiscriminants, VariantArray};
use tantivy::{
schema::{IndexRecordOption, TextFieldIndexing, TextOptions},
+ time::OffsetDateTime,
tokenizer::PreTokenizedString,
TantivyDocument,
};
@@ -50,6 +51,15 @@ pub trait TextField:
schema: &tantivy::schema::Schema,
) -> Result<()>;
+    /// Adds this field's value, taken from `_webpage`, to `_doc`.
+    ///
+    /// The default implementation adds nothing and returns `Ok(())`;
+    /// fields that are filled from webpage-level data override it.
+    fn add_webpage_tantivy(
+        &self,
+        _webpage: &crate::webpage::Webpage,
+        _doc: &mut TantivyDocument,
+        _schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        // Deliberate no-op.
+        Ok(())
+    }
+
fn indexing_tokenizer(&self) -> Tokenizer {
Tokenizer::default()
}
@@ -919,6 +929,20 @@ impl TextField for BacklinkText {
) -> Result<()> {
Ok(())
}
+
+    /// Writes the webpage's backlink labels to `doc` as a single text
+    /// value, with the labels separated by newlines.
+    fn add_webpage_tantivy(
+        &self,
+        webpage: &crate::webpage::Webpage,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        let field = self.tantivy_field(schema);
+        let labels = webpage.backlink_labels.join("\n");
+        doc.add_text(field, labels);
+
+        Ok(())
+    }
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
@@ -978,6 +1002,20 @@ impl TextField for DmozDescription {
) -> Result<()> {
Ok(())
}
+
+    /// Writes the result of `Webpage::dmoz_description` to `doc` as text.
+    ///
+    /// The type's default value is written when no description is returned.
+    fn add_webpage_tantivy(
+        &self,
+        webpage: &crate::webpage::Webpage,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        let field = self.tantivy_field(schema);
+        let description = webpage.dmoz_description().unwrap_or_default();
+        doc.add_text(field, description);
+
+        Ok(())
+    }
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
@@ -1264,6 +1302,22 @@ impl TextField for SafetyClassification {
) -> Result<()> {
Ok(())
}
+
+    /// Writes the webpage's safety classification to `doc` as text.
+    ///
+    /// The label is rendered with `to_string`; an empty string is written
+    /// when the webpage has no classification.
+    fn add_webpage_tantivy(
+        &self,
+        webpage: &crate::webpage::Webpage,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        let label = match webpage.safety_classification {
+            Some(classification) => classification.to_string(),
+            None => String::new(),
+        };
+        doc.add_text(self.tantivy_field(schema), label);
+
+        Ok(())
+    }
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
@@ -1290,6 +1344,22 @@ impl TextField for InsertionTimestamp {
) -> Result<()> {
Ok(())
}
+
+    /// Writes the webpage's insertion time to `doc` as a date value.
+    ///
+    /// `inserted_at` is reduced to its Unix timestamp before being turned
+    /// into a tantivy date. An error is returned if that timestamp cannot
+    /// be converted to an `OffsetDateTime`.
+    fn add_webpage_tantivy(
+        &self,
+        webpage: &crate::webpage::Webpage,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        let field = self.tantivy_field(schema);
+        let unix_timestamp = webpage.inserted_at.timestamp();
+        let inserted_at = OffsetDateTime::from_unix_timestamp(unix_timestamp)?;
+        doc.add_date(field, tantivy::DateTime::from_utc(inserted_at));
+
+        Ok(())
+    }
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
@@ -1343,4 +1413,15 @@ impl TextField for Keywords {
) -> Result<()> {
Ok(())
}
+
+    /// Writes the webpage's keywords to `doc` as a single text value,
+    /// with the keywords separated by newlines.
+    fn add_webpage_tantivy(
+        &self,
+        webpage: &crate::webpage::Webpage,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        let field = self.tantivy_field(schema);
+        let keywords = webpage.keywords.join("\n");
+        doc.add_text(field, keywords);
+
+        Ok(())
+    }
}
diff --git a/crates/core/src/webpage/mod.rs b/crates/core/src/webpage/mod.rs
index 87b7b159..ea507f42 100644
--- a/crates/core/src/webpage/mod.rs
+++ b/crates/core/src/webpage/mod.rs
@@ -15,10 +15,7 @@
// along with this program. If not, see <https://www.gnu.org/licenses/>.
use crate::{
- schema::{
- fast_field::{self, FastField},
- text_field::{self, TextField},
- },
+ schema::{fast_field::FastField, text_field::TextField, Field},
webgraph::NodeID,
Result,
};
@@ -26,13 +23,9 @@ use candle_core::Tensor;
use chrono::{DateTime, Utc};
use std::collections::HashMap;
-use tantivy::{time::OffsetDateTime, TantivyDocument};
+use tantivy::TantivyDocument;
use url::Url;
-use crate::schema::FLOAT_SCALING;
-
-use self::region::Region;
-
mod adservers;
pub mod html;
mod just_text;
@@ -117,7 +110,7 @@ impl Webpage {
})
}
- fn dmoz_description(&self) -> Option {
+ pub fn dmoz_description(&self) -> Option {
self.dmoz_description.as_ref().and_then(|desc| {
if !self.html.metadata().iter().any(|metadata| {
if let Some(content) = metadata.get(&"content".to_string()) {
@@ -134,172 +127,15 @@ impl Webpage {
}
pub fn as_tantivy(&self, schema: &tantivy::schema::Schema) -> Result {
- let region = Region::guess_from(self);
-
- let dmoz_description = self.dmoz_description();
-
let mut doc = self.html.as_tantivy(schema)?;
- if let Ok(region) = region {
- doc.add_u64(
- schema
- .get_field(fast_field::Region.name())
- .expect("Failed to get region field"),
- region.id(),
- );
- } else {
- doc.add_u64(
- schema
- .get_field(fast_field::Region.name())
- .expect("Failed to get region field"),
- Region::All.id(),
- );
- }
-
- let backlink_text: String =
- itertools::intersperse(self.backlink_labels.clone(), "\n".to_string()).collect();
-
- doc.add_text(
- schema
- .get_field(text_field::BacklinkText.name())
- .expect("Failed to get backlink-text field"),
- backlink_text,
- );
-
- doc.add_text(
- schema
- .get_field(text_field::Keywords.name())
- .expect("Failed to get keywords field"),
- self.keywords.join("\n"),
- );
-
- doc.add_date(
- schema
- .get_field(text_field::InsertionTimestamp.name())
- .expect("Failed to get insertion-timestamp field"),
- tantivy::DateTime::from_utc(OffsetDateTime::from_unix_timestamp(
- self.inserted_at.timestamp(),
- )?),
- );
-
- let safety = self
- .safety_classification
- .map(|label| label.to_string())
- .unwrap_or_default();
-
- doc.add_text(
- schema
- .get_field(text_field::SafetyClassification.name())
- .expect("Failed to get safety_classification field"),
- safety,
- );
-
- doc.add_u64(
- schema
- .get_field(fast_field::HostCentrality.name())
- .expect("Failed to get host_centrality field"),
- (self.host_centrality * FLOAT_SCALING as f64) as u64,
- );
-
- doc.add_u64(
- schema
- .get_field(fast_field::HostCentralityRank.name())
- .expect("Failed to get host_centrality_rank field"),
- self.host_centrality_rank,
- );
-
- doc.add_u64(
- schema
- .get_field(fast_field::PageCentrality.name())
- .expect("Failed to get page_centrality field"),
- (self.page_centrality * FLOAT_SCALING as f64) as u64,
- );
-
- doc.add_u64(
- schema
- .get_field(fast_field::PageCentralityRank.name())
- .expect("Failed to get page_centrality_rank field"),
- self.page_centrality_rank,
- );
-
- doc.add_u64(
- schema
- .get_field(fast_field::FetchTimeMs.name())
- .expect("Failed to get fetch_time_ms field"),
- self.fetch_time_ms,
- );
-
- doc.add_u64(
- schema
- .get_field(fast_field::PreComputedScore.name())
- .expect("failed to get pre_computed_score field"),
- (self.pre_computed_score * FLOAT_SCALING as f64) as u64,
- );
-
- if let Some(emb) = &self.title_embedding {
- let mut serialized = Vec::new();
- emb.write_bytes(&mut serialized)?;
-
- doc.add_bytes(
- schema
- .get_field(fast_field::TitleEmbeddings.name())
- .expect("Failed to get title_embeddings field"),
- serialized,
- );
- } else {
- doc.add_bytes(
- schema
- .get_field(fast_field::TitleEmbeddings.name())
- .expect("Failed to get title_embeddings field"),
- Vec::new(),
- );
- }
-
- if let Some(emb) = &self.keyword_embedding {
- let mut serialized = Vec::new();
- emb.write_bytes(&mut serialized)?;
-
- doc.add_bytes(
- schema
- .get_field(fast_field::KeywordEmbeddings.name())
- .expect("Failed to get keyword_embeddings field"),
- serialized,
- );
- } else {
- doc.add_bytes(
- schema
- .get_field(fast_field::KeywordEmbeddings.name())
- .expect("Failed to get keyword_embeddings field"),
- Vec::new(),
- );
- }
-
- match &self.node_id {
- Some(node_id) => {
- doc.add_u64(
- schema
- .get_field(fast_field::HostNodeID.name())
- .expect("Failed to get node_id field"),
- node_id.as_u64(),
- );
- }
- None => {
- doc.add_u64(
- schema
- .get_field(fast_field::HostNodeID.name())
- .expect("Failed to get node_id field"),
- u64::MAX,
- );
+        // Give every schema field the chance to add the value it derives
+        // from this webpage; the first failure aborts the conversion.
+        for field in Field::all() {
+            let added = match field {
+                Field::Fast(fast) => fast.add_webpage_tantivy(self, &mut doc, schema),
+                Field::Text(text) => text.add_webpage_tantivy(self, &mut doc, schema),
+            };
+            added?;
+        }
- doc.add_text(
- schema
- .get_field(text_field::DmozDescription.name())
- .expect("failed to get dmoz_description field"),
- dmoz_description.unwrap_or_default(),
- );
-
Ok(doc)
}
}