From 714656060b76efd545a0805417f021da64073f0d Mon Sep 17 00:00:00 2001 From: Mikkel Denker Date: Wed, 20 Mar 2024 10:14:21 +0100 Subject: [PATCH] split up schema fields into submodules --- crates/core/src/enum_map.rs | 43 ++- crates/core/src/ranking/signal.rs | 7 +- crates/core/src/schema/fast_field.rs | 114 ++++++ crates/core/src/{schema.rs => schema/mod.rs} | 362 +------------------ crates/core/src/schema/text_field.rs | 286 +++++++++++++++ crates/core/src/webpage/html/microformats.rs | 28 +- crates/core/src/webpage/html/robots_meta.rs | 11 +- 7 files changed, 455 insertions(+), 396 deletions(-) create mode 100644 crates/core/src/schema/fast_field.rs rename crates/core/src/{schema.rs => schema/mod.rs} (54%) create mode 100644 crates/core/src/schema/text_field.rs diff --git a/crates/core/src/enum_map.rs b/crates/core/src/enum_map.rs index ed3cf07a..45eeb56f 100644 --- a/crates/core/src/enum_map.rs +++ b/crates/core/src/enum_map.rs @@ -16,14 +16,22 @@ use serde::{Deserialize, Serialize}; +pub trait InsertEnumMapKey: Sized { + fn into_usize(self) -> usize; +} + +pub trait GetEnumMapKey: Sized { + fn from_usize(value: usize) -> Option; +} + #[derive(Clone, Debug, Serialize, Deserialize)] -pub struct EnumMap, V> { +pub struct EnumMap { inner: Vec>, len: usize, _phantom: std::marker::PhantomData, } -impl, V> Default for EnumMap { +impl Default for EnumMap { fn default() -> Self { Self::new() } @@ -31,7 +39,7 @@ impl, V> Default for EnumMap { impl EnumMap where - K: Into, + K: InsertEnumMapKey, { pub fn new() -> Self { Self { @@ -42,7 +50,7 @@ where } pub fn insert(&mut self, key: K, value: V) { - let key = key.into(); + let key = key.into_usize(); if key >= self.inner.len() { self.inner.resize_with(key + 1, || None); @@ -60,7 +68,7 @@ where } pub fn get(&self, key: K) -> Option<&V> { - let key = key.into(); + let key = key.into_usize(); if key >= self.inner.len() { None } else { @@ -81,7 +89,7 @@ where } pub fn get_mut(&mut self, key: K) -> Option<&mut V> { - let key = key.into(); + let key = key.into_usize(); if key >= self.inner.len() { None } else { @@ -92,19 +100,19 @@ where impl EnumMap where - K: TryFrom + Into, + K: GetEnumMapKey + InsertEnumMapKey, { pub fn keys(&self) -> impl Iterator + '_ { self.inner .iter() .enumerate() - .filter_map(|(key, value)| value.as_ref().and_then(|_| K::try_from(key).ok())) + .filter_map(|(key, value)| value.as_ref().and_then(|_| K::from_usize(key))) } } impl FromIterator<(K, V)> for EnumMap where - K: Into, + K: InsertEnumMapKey, { fn from_iter>(iter: T) -> Self { let mut map = Self::new(); @@ -118,17 +126,17 @@ where } #[derive(Clone, Debug, Serialize, Deserialize)] -pub struct EnumSet> { +pub struct EnumSet { map: EnumMap, } -impl> Default for EnumSet { +impl Default for EnumSet { fn default() -> Self { Self::new() } } -impl> EnumSet { +impl EnumSet { pub fn new() -> Self { Self { map: EnumMap::new(), @@ -148,10 +156,7 @@ impl> EnumSet { } } -impl EnumSet -where - K: TryFrom + Into, -{ +impl EnumSet { pub fn iter(&self) -> impl Iterator + '_ { self.map.keys() } @@ -168,9 +173,9 @@ mod tests { C, } - impl From for usize { - fn from(val: TestEnum) -> Self { - val as usize + impl InsertEnumMapKey for TestEnum { + fn into_usize(self) -> usize { + self as usize } } diff --git a/crates/core/src/ranking/signal.rs b/crates/core/src/ranking/signal.rs index 47a8c599..037de53e 100644 --- a/crates/core/src/ranking/signal.rs +++ b/crates/core/src/ranking/signal.rs @@ -14,6 +14,7 @@ // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . +use crate::enum_map::InsertEnumMapKey; use crate::query::optic::AsSearchableRule; use crate::query::Query; use crate::Result; @@ -143,9 +144,9 @@ pub enum Signal { KeywordEmbeddingSimilarity, } -impl From for usize { - fn from(signal: Signal) -> Self { - signal as usize +impl InsertEnumMapKey for Signal { + fn into_usize(self) -> usize { + self as usize } } diff --git a/crates/core/src/schema/fast_field.rs b/crates/core/src/schema/fast_field.rs new file mode 100644 index 00000000..e19301a2 --- /dev/null +++ b/crates/core/src/schema/fast_field.rs @@ -0,0 +1,114 @@ +// Stract is an open source web search engine. +// Copyright (C) 2024 Stract ApS +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as +// published by the Free Software Foundation, either version 3 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see + +use strum::VariantArray; + +use crate::enum_map::InsertEnumMapKey; + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, VariantArray)] +pub enum FastField { + IsHomepage, + HostCentrality, + HostCentralityRank, + PageCentrality, + PageCentralityRank, + FetchTimeMs, + LastUpdated, + TrackerScore, + Region, + NumUrlTokens, + NumTitleTokens, + NumCleanBodyTokens, + NumDescriptionTokens, + NumUrlForSiteOperatorTokens, + NumDomainTokens, + NumMicroformatTagsTokens, + SiteHash1, + SiteHash2, + UrlWithoutQueryHash1, + UrlWithoutQueryHash2, + TitleHash1, + TitleHash2, + UrlHash1, + UrlHash2, + DomainHash1, + DomainHash2, + UrlWithoutTldHash1, + UrlWithoutTldHash2, + PreComputedScore, + HostNodeID, + SimHash, + NumFlattenedSchemaTokens, + NumPathAndQuerySlashes, + NumPathAndQueryDigits, + LikelyHasAds, + LikelyHasPaywall, + LinkDensity, + TitleEmbeddings, + KeywordEmbeddings, +} + +impl FastField { + pub fn name(&self) -> &str { + match self { + FastField::HostCentrality => "host_centrality", + FastField::HostCentralityRank => "host_centrality_rank", + FastField::PageCentrality => "page_centrality", + FastField::PageCentralityRank => "page_centrality_rank", + FastField::IsHomepage => "is_homepage", + FastField::FetchTimeMs => "fetch_time_ms", + FastField::LastUpdated => "last_updated", + FastField::TrackerScore => "tracker_score", + FastField::Region => "region", + FastField::NumUrlTokens => "num_url_tokens", + FastField::NumTitleTokens => "num_title_tokens", + FastField::NumCleanBodyTokens => "num_clean_body_tokens", + FastField::NumDescriptionTokens => "num_description_tokens", + FastField::NumDomainTokens => "num_domain_tokens", + FastField::NumUrlForSiteOperatorTokens => "num_url_for_site_operator_tokens", + FastField::NumFlattenedSchemaTokens => "num_flattened_schema_tokens", + FastField::NumMicroformatTagsTokens => "num_microformat_tags_tokens", + FastField::SiteHash1 => "site_hash1", + FastField::SiteHash2 => "site_hash2", + FastField::UrlWithoutQueryHash1 => "url_without_query_hash1", + FastField::UrlWithoutQueryHash2 => "url_without_query_hash2", + FastField::TitleHash1 => "title_hash1", + FastField::TitleHash2 => "title_hash2", + FastField::UrlHash1 => "url_hash1", + FastField::UrlHash2 => "url_hash2", + FastField::DomainHash1 => "domain_hash1", + FastField::DomainHash2 => "domain_hash2", + FastField::UrlWithoutTldHash1 => "url_without_tld_hash1", + FastField::UrlWithoutTldHash2 => "url_without_tld_hash2", + FastField::PreComputedScore => "pre_computed_score", + FastField::HostNodeID => "host_node_id", + FastField::SimHash => "sim_hash", + FastField::NumPathAndQuerySlashes => "num_path_and_query_slashes", + FastField::NumPathAndQueryDigits => "num_path_and_query_digits", + FastField::LikelyHasAds => "likely_has_ads", + FastField::LikelyHasPaywall => "likely_has_paywall", + FastField::LinkDensity => "link_density", + FastField::TitleEmbeddings => "title_embeddings", + FastField::KeywordEmbeddings => "keyword_embeddings", + } + } +} + +impl InsertEnumMapKey for FastField { + fn into_usize(self) -> usize { + self as usize + } +} diff --git a/crates/core/src/schema.rs b/crates/core/src/schema/mod.rs similarity index 54% rename from crates/core/src/schema.rs rename to crates/core/src/schema/mod.rs index 2f272639..d8b6b973 100644 --- a/crates/core/src/schema.rs +++ b/crates/core/src/schema/mod.rs @@ -1,5 +1,5 @@ // Stract is an open source web search engine. -// Copyright (C) 2023 Stract ApS +// Copyright (C) 2024 Stract ApS // // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU Affero General Public License as @@ -14,368 +14,20 @@ // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . +mod fast_field; +mod text_field; + use strum::VariantArray; use tantivy::{ - schema::{BytesOptions, IndexRecordOption, NumericOptions, TextFieldIndexing, TextOptions}, + schema::{BytesOptions, NumericOptions, TextFieldIndexing, TextOptions}, DateOptions, }; -use crate::tokenizer::{ - BigramTokenizer, Identity, JsonField, SiteOperatorUrlTokenizer, Tokenizer, TrigramTokenizer, -}; +pub use fast_field::FastField; +pub use text_field::TextField; pub const FLOAT_SCALING: u64 = 1_000_000_000; -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, VariantArray)] -pub enum TextField { - Title, - CleanBody, - StemmedTitle, - StemmedCleanBody, - AllBody, - Url, - UrlNoTokenizer, - UrlForSiteOperator, - SiteWithout, - Domain, - SiteNoTokenizer, - DomainNoTokenizer, - DomainNameNoTokenizer, - /// this field is only set if the webpage is the homepage for the site. Allows us to boost - SiteIfHomepageNoTokenizer, - /// this field is only set if the webpage is the homepage for the site. Allows us to boost - DomainIfHomepage, - /// this field is only set if the webpage is the homepage for the site. Allows us to boost - DomainNameIfHomepageNoTokenizer, - /// this field is only set if the webpage is the homepage for the site. Allows us to boost - DomainIfHomepageNoTokenizer, - /// this field is only set if the webpage is the homepage for the site. Allows us to boost - TitleIfHomepage, - BacklinkText, - Description, - DmozDescription, - SchemaOrgJson, - FlattenedSchemaOrgJson, - CleanBodyBigrams, - TitleBigrams, - CleanBodyTrigrams, - TitleTrigrams, - MicroformatTags, - /// can either be NSFW or SFW (see safety classifier) - SafetyClassification, - InsertionTimestamp, - RecipeFirstIngredientTagId, - Keywords, -} - -impl From for usize { - fn from(value: TextField) -> Self { - value as usize - } -} - -impl TextField { - pub fn ngram_size(&self) -> usize { - match self { - TextField::Title => 1, - TextField::CleanBody => 1, - TextField::StemmedTitle => 1, - TextField::StemmedCleanBody => 1, - TextField::AllBody => 1, - TextField::Url => 1, - TextField::UrlNoTokenizer => 1, - TextField::UrlForSiteOperator => 1, - TextField::SiteWithout => 1, - TextField::Domain => 1, - TextField::SiteNoTokenizer => 1, - TextField::DomainNoTokenizer => 1, - TextField::DomainNameNoTokenizer => 1, - TextField::SiteIfHomepageNoTokenizer => 1, - TextField::DomainIfHomepage => 1, - TextField::DomainNameIfHomepageNoTokenizer => 1, - TextField::DomainIfHomepageNoTokenizer => 1, - TextField::TitleIfHomepage => 1, - TextField::BacklinkText => 1, - TextField::Description => 1, - TextField::DmozDescription => 1, - TextField::SchemaOrgJson => 1, - TextField::FlattenedSchemaOrgJson => 1, - TextField::CleanBodyBigrams => 2, - TextField::TitleBigrams => 2, - TextField::CleanBodyTrigrams => 3, - TextField::TitleTrigrams => 3, - TextField::MicroformatTags => 1, - TextField::SafetyClassification => 1, - TextField::InsertionTimestamp => 1, - TextField::RecipeFirstIngredientTagId => 1, - TextField::Keywords => 1, - } - } - - pub fn monogram_field(&self) -> TextField { - match self { - TextField::Title => TextField::Title, - TextField::CleanBody => TextField::CleanBody, - TextField::StemmedTitle => TextField::StemmedTitle, - TextField::StemmedCleanBody => TextField::StemmedCleanBody, - TextField::AllBody => TextField::AllBody, - TextField::Url => TextField::Url, - TextField::UrlNoTokenizer => TextField::UrlNoTokenizer, - TextField::UrlForSiteOperator => TextField::UrlForSiteOperator, - TextField::SiteWithout => TextField::SiteWithout, - TextField::Domain => TextField::Domain, - TextField::SiteNoTokenizer => TextField::SiteNoTokenizer, - TextField::DomainNoTokenizer => TextField::DomainNoTokenizer, - TextField::DomainNameNoTokenizer => TextField::DomainNameNoTokenizer, - TextField::SiteIfHomepageNoTokenizer => TextField::SiteIfHomepageNoTokenizer, - TextField::DomainIfHomepage => TextField::DomainIfHomepage, - TextField::DomainNameIfHomepageNoTokenizer => { - TextField::DomainNameIfHomepageNoTokenizer - } - TextField::DomainIfHomepageNoTokenizer => TextField::DomainIfHomepageNoTokenizer, - TextField::TitleIfHomepage => TextField::TitleIfHomepage, - TextField::BacklinkText => TextField::BacklinkText, - TextField::Description => TextField::Description, - TextField::DmozDescription => TextField::DmozDescription, - TextField::SchemaOrgJson => TextField::SchemaOrgJson, - TextField::FlattenedSchemaOrgJson => TextField::FlattenedSchemaOrgJson, - TextField::CleanBodyBigrams => TextField::CleanBody, - TextField::TitleBigrams => TextField::Title, - TextField::CleanBodyTrigrams => TextField::CleanBody, - TextField::TitleTrigrams => TextField::Title, - TextField::MicroformatTags => TextField::MicroformatTags, - TextField::SafetyClassification => TextField::SafetyClassification, - TextField::InsertionTimestamp => TextField::InsertionTimestamp, - TextField::RecipeFirstIngredientTagId => TextField::RecipeFirstIngredientTagId, - TextField::Keywords => TextField::Keywords, - } - } - - pub fn query_tokenizer(&self) -> Tokenizer { - match self { - TextField::TitleBigrams => Tokenizer::default(), - TextField::CleanBodyBigrams => Tokenizer::default(), - TextField::TitleTrigrams => Tokenizer::default(), - TextField::CleanBodyTrigrams => Tokenizer::default(), - _ => self.indexing_tokenizer(), - } - } - - pub fn indexing_tokenizer(&self) -> Tokenizer { - match self { - TextField::Title => Tokenizer::default(), - TextField::CleanBody => Tokenizer::default(), - TextField::StemmedTitle => Tokenizer::new_stemmed(), - TextField::StemmedCleanBody => Tokenizer::new_stemmed(), - TextField::AllBody => Tokenizer::default(), - TextField::Url => Tokenizer::default(), - TextField::UrlNoTokenizer => Tokenizer::Identity(Identity {}), - TextField::UrlForSiteOperator => Tokenizer::SiteOperator(SiteOperatorUrlTokenizer), - TextField::SiteWithout => Tokenizer::default(), - TextField::Domain => Tokenizer::default(), - TextField::SiteNoTokenizer => Tokenizer::Identity(Identity {}), - TextField::SiteIfHomepageNoTokenizer => Tokenizer::Identity(Identity {}), - TextField::DomainNoTokenizer => Tokenizer::Identity(Identity {}), - TextField::DomainNameNoTokenizer => Tokenizer::Identity(Identity {}), - TextField::DomainIfHomepage => Tokenizer::default(), - TextField::DomainNameIfHomepageNoTokenizer => Tokenizer::Identity(Identity {}), - TextField::DomainIfHomepageNoTokenizer => Tokenizer::Identity(Identity {}), - TextField::TitleIfHomepage => Tokenizer::default(), - TextField::BacklinkText => Tokenizer::default(), - TextField::Description => Tokenizer::default(), - TextField::DmozDescription => Tokenizer::default(), - TextField::SchemaOrgJson => Tokenizer::Identity(Identity {}), - TextField::FlattenedSchemaOrgJson => Tokenizer::Json(JsonField), - TextField::CleanBodyBigrams => Tokenizer::Bigram(BigramTokenizer::default()), - TextField::TitleBigrams => Tokenizer::Bigram(BigramTokenizer::default()), - TextField::CleanBodyTrigrams => Tokenizer::Trigram(TrigramTokenizer::default()), - TextField::TitleTrigrams => Tokenizer::Trigram(TrigramTokenizer::default()), - TextField::MicroformatTags => Tokenizer::default(), - TextField::SafetyClassification => Tokenizer::Identity(Identity {}), - TextField::InsertionTimestamp => Tokenizer::Identity(Identity {}), - TextField::RecipeFirstIngredientTagId => Tokenizer::Identity(Identity {}), - TextField::Keywords => Tokenizer::default(), - } - } - - pub fn index_option(&self) -> IndexRecordOption { - if self.has_pos() { - IndexRecordOption::WithFreqsAndPositions - } else { - IndexRecordOption::WithFreqs - } - } - - pub fn has_pos(&self) -> bool { - match self { - TextField::Title => true, - TextField::CleanBody => true, - TextField::StemmedTitle => false, - TextField::StemmedCleanBody => false, - TextField::AllBody => false, - TextField::Url => true, - TextField::UrlNoTokenizer => false, - TextField::UrlForSiteOperator => true, - TextField::SiteWithout => true, - TextField::Domain => true, - TextField::SiteNoTokenizer => false, - TextField::SiteIfHomepageNoTokenizer => false, - TextField::DomainNoTokenizer => false, - TextField::DomainNameNoTokenizer => false, - TextField::DomainIfHomepage => false, - TextField::DomainNameIfHomepageNoTokenizer => false, - TextField::DomainIfHomepageNoTokenizer => false, - TextField::TitleIfHomepage => false, - TextField::BacklinkText => false, - TextField::Description => true, - TextField::DmozDescription => true, - TextField::SchemaOrgJson => false, - TextField::FlattenedSchemaOrgJson => true, - TextField::CleanBodyBigrams => false, - TextField::TitleBigrams => false, - TextField::CleanBodyTrigrams => false, - TextField::TitleTrigrams => false, - TextField::MicroformatTags => true, - TextField::SafetyClassification => false, - TextField::InsertionTimestamp => false, - TextField::RecipeFirstIngredientTagId => false, - TextField::Keywords => false, - } - } - - pub fn name(&self) -> &str { - match self { - TextField::Title => "title", - TextField::CleanBody => "body", - TextField::Url => "url", - TextField::UrlNoTokenizer => "url_no_tokenizer", - TextField::UrlForSiteOperator => "url_for_site_operator", - TextField::SiteWithout => "site", - TextField::Domain => "domain", - TextField::SiteNoTokenizer => "site_no_tokenizer", - TextField::SiteIfHomepageNoTokenizer => "site_if_homepage_no_tokenizer", - TextField::DomainNoTokenizer => "domain_no_tokenizer", - TextField::DomainNameNoTokenizer => "domain_name_no_tokenizer", - TextField::BacklinkText => "backlink_text", - TextField::StemmedTitle => "stemmed_title", - TextField::StemmedCleanBody => "stemmed_body", - TextField::DomainIfHomepage => "domain_if_homepage", - TextField::DomainNameIfHomepageNoTokenizer => "domain_name_if_homepage_no_tokenizer", - TextField::DomainIfHomepageNoTokenizer => "domain_if_homepage_no_tokenizer", - TextField::Description => "description", - TextField::TitleIfHomepage => "title_if_homepage", - TextField::AllBody => "all_body", - TextField::DmozDescription => "dmoz_description", - TextField::SchemaOrgJson => "schema_org_json", - TextField::FlattenedSchemaOrgJson => "flattened_schema_org_json", - TextField::CleanBodyBigrams => "clean_body_bigrams", - TextField::TitleBigrams => "title_bigrams", - TextField::CleanBodyTrigrams => "clean_body_trigrams", - TextField::TitleTrigrams => "title_trigrams", - TextField::MicroformatTags => "microformat_tags", - TextField::SafetyClassification => "safety_classification", - TextField::InsertionTimestamp => "insertion_timestamp", - TextField::RecipeFirstIngredientTagId => "recipe_first_ingredient_tag_id", - TextField::Keywords => "keywords", - } - } -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, VariantArray)] -pub enum FastField { - IsHomepage, - HostCentrality, - HostCentralityRank, - PageCentrality, - PageCentralityRank, - FetchTimeMs, - LastUpdated, - TrackerScore, - Region, - NumUrlTokens, - NumTitleTokens, - NumCleanBodyTokens, - NumDescriptionTokens, - NumUrlForSiteOperatorTokens, - NumDomainTokens, - NumMicroformatTagsTokens, - SiteHash1, - SiteHash2, - UrlWithoutQueryHash1, - UrlWithoutQueryHash2, - TitleHash1, - TitleHash2, - UrlHash1, - UrlHash2, - DomainHash1, - DomainHash2, - UrlWithoutTldHash1, - UrlWithoutTldHash2, - PreComputedScore, - HostNodeID, - SimHash, - NumFlattenedSchemaTokens, - NumPathAndQuerySlashes, - NumPathAndQueryDigits, - LikelyHasAds, - LikelyHasPaywall, - LinkDensity, - TitleEmbeddings, - KeywordEmbeddings, -} - -impl FastField { - pub fn name(&self) -> &str { - match self { - FastField::HostCentrality => "host_centrality", - FastField::HostCentralityRank => "host_centrality_rank", - FastField::PageCentrality => "page_centrality", - FastField::PageCentralityRank => "page_centrality_rank", - FastField::IsHomepage => "is_homepage", - FastField::FetchTimeMs => "fetch_time_ms", - FastField::LastUpdated => "last_updated", - FastField::TrackerScore => "tracker_score", - FastField::Region => "region", - FastField::NumUrlTokens => "num_url_tokens", - FastField::NumTitleTokens => "num_title_tokens", - FastField::NumCleanBodyTokens => "num_clean_body_tokens", - FastField::NumDescriptionTokens => "num_description_tokens", - FastField::NumDomainTokens => "num_domain_tokens", - FastField::NumUrlForSiteOperatorTokens => "num_url_for_site_operator_tokens", - FastField::NumFlattenedSchemaTokens => "num_flattened_schema_tokens", - FastField::NumMicroformatTagsTokens => "num_microformat_tags_tokens", - FastField::SiteHash1 => "site_hash1", - FastField::SiteHash2 => "site_hash2", - FastField::UrlWithoutQueryHash1 => "url_without_query_hash1", - FastField::UrlWithoutQueryHash2 => "url_without_query_hash2", - FastField::TitleHash1 => "title_hash1", - FastField::TitleHash2 => "title_hash2", - FastField::UrlHash1 => "url_hash1", - FastField::UrlHash2 => "url_hash2", - FastField::DomainHash1 => "domain_hash1", - FastField::DomainHash2 => "domain_hash2", - FastField::UrlWithoutTldHash1 => "url_without_tld_hash1", - FastField::UrlWithoutTldHash2 => "url_without_tld_hash2", - FastField::PreComputedScore => "pre_computed_score", - FastField::HostNodeID => "host_node_id", - FastField::SimHash => "sim_hash", - FastField::NumPathAndQuerySlashes => "num_path_and_query_slashes", - FastField::NumPathAndQueryDigits => "num_path_and_query_digits", - FastField::LikelyHasAds => "likely_has_ads", - FastField::LikelyHasPaywall => "likely_has_paywall", - FastField::LinkDensity => "link_density", - FastField::TitleEmbeddings => "title_embeddings", - FastField::KeywordEmbeddings => "keyword_embeddings", - } - } -} - -impl From for usize { - fn from(value: FastField) -> Self { - value as usize - } -} - #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum Field { Fast(FastField), diff --git a/crates/core/src/schema/text_field.rs b/crates/core/src/schema/text_field.rs new file mode 100644 index 00000000..bb7ec0fa --- /dev/null +++ b/crates/core/src/schema/text_field.rs @@ -0,0 +1,286 @@ +// Stract is an open source web search engine. +// Copyright (C) 2024 Stract ApS +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as +// published by the Free Software Foundation, either version 3 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . + +use strum::VariantArray; +use tantivy::schema::IndexRecordOption; + +use crate::{ + enum_map::InsertEnumMapKey, + tokenizer::{ + BigramTokenizer, Identity, JsonField, SiteOperatorUrlTokenizer, Tokenizer, TrigramTokenizer, + }, +}; + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, VariantArray)] +pub enum TextField { + Title, + CleanBody, + StemmedTitle, + StemmedCleanBody, + AllBody, + Url, + UrlNoTokenizer, + UrlForSiteOperator, + SiteWithout, + Domain, + SiteNoTokenizer, + DomainNoTokenizer, + DomainNameNoTokenizer, + /// this field is only set if the webpage is the homepage for the site. Allows us to boost + SiteIfHomepageNoTokenizer, + /// this field is only set if the webpage is the homepage for the site. Allows us to boost + DomainIfHomepage, + /// this field is only set if the webpage is the homepage for the site. Allows us to boost + DomainNameIfHomepageNoTokenizer, + /// this field is only set if the webpage is the homepage for the site. Allows us to boost + DomainIfHomepageNoTokenizer, + /// this field is only set if the webpage is the homepage for the site. Allows us to boost + TitleIfHomepage, + BacklinkText, + Description, + DmozDescription, + SchemaOrgJson, + FlattenedSchemaOrgJson, + CleanBodyBigrams, + TitleBigrams, + CleanBodyTrigrams, + TitleTrigrams, + MicroformatTags, + /// can either be NSFW or SFW (see safety classifier) + SafetyClassification, + InsertionTimestamp, + RecipeFirstIngredientTagId, + Keywords, +} + +impl From for usize { + fn from(value: TextField) -> Self { + value as usize + } +} + +impl TextField { + pub fn ngram_size(&self) -> usize { + match self { + TextField::Title => 1, + TextField::CleanBody => 1, + TextField::StemmedTitle => 1, + TextField::StemmedCleanBody => 1, + TextField::AllBody => 1, + TextField::Url => 1, + TextField::UrlNoTokenizer => 1, + TextField::UrlForSiteOperator => 1, + TextField::SiteWithout => 1, + TextField::Domain => 1, + TextField::SiteNoTokenizer => 1, + TextField::DomainNoTokenizer => 1, + TextField::DomainNameNoTokenizer => 1, + TextField::SiteIfHomepageNoTokenizer => 1, + TextField::DomainIfHomepage => 1, + TextField::DomainNameIfHomepageNoTokenizer => 1, + TextField::DomainIfHomepageNoTokenizer => 1, + TextField::TitleIfHomepage => 1, + TextField::BacklinkText => 1, + TextField::Description => 1, + TextField::DmozDescription => 1, + TextField::SchemaOrgJson => 1, + TextField::FlattenedSchemaOrgJson => 1, + TextField::CleanBodyBigrams => 2, + TextField::TitleBigrams => 2, + TextField::CleanBodyTrigrams => 3, + TextField::TitleTrigrams => 3, + TextField::MicroformatTags => 1, + TextField::SafetyClassification => 1, + TextField::InsertionTimestamp => 1, + TextField::RecipeFirstIngredientTagId => 1, + TextField::Keywords => 1, + } + } + + pub fn monogram_field(&self) -> TextField { + match self { + TextField::Title => TextField::Title, + TextField::CleanBody => TextField::CleanBody, + TextField::StemmedTitle => TextField::StemmedTitle, + TextField::StemmedCleanBody => TextField::StemmedCleanBody, + TextField::AllBody => TextField::AllBody, + TextField::Url => TextField::Url, + TextField::UrlNoTokenizer => TextField::UrlNoTokenizer, + TextField::UrlForSiteOperator => TextField::UrlForSiteOperator, + TextField::SiteWithout => TextField::SiteWithout, + TextField::Domain => TextField::Domain, + TextField::SiteNoTokenizer => TextField::SiteNoTokenizer, + TextField::DomainNoTokenizer => TextField::DomainNoTokenizer, + TextField::DomainNameNoTokenizer => TextField::DomainNameNoTokenizer, + TextField::SiteIfHomepageNoTokenizer => TextField::SiteIfHomepageNoTokenizer, + TextField::DomainIfHomepage => TextField::DomainIfHomepage, + TextField::DomainNameIfHomepageNoTokenizer => { + TextField::DomainNameIfHomepageNoTokenizer + } + TextField::DomainIfHomepageNoTokenizer => TextField::DomainIfHomepageNoTokenizer, + TextField::TitleIfHomepage => TextField::TitleIfHomepage, + TextField::BacklinkText => TextField::BacklinkText, + TextField::Description => TextField::Description, + TextField::DmozDescription => TextField::DmozDescription, + TextField::SchemaOrgJson => TextField::SchemaOrgJson, + TextField::FlattenedSchemaOrgJson => TextField::FlattenedSchemaOrgJson, + TextField::CleanBodyBigrams => TextField::CleanBody, + TextField::TitleBigrams => TextField::Title, + TextField::CleanBodyTrigrams => TextField::CleanBody, + TextField::TitleTrigrams => TextField::Title, + TextField::MicroformatTags => TextField::MicroformatTags, + TextField::SafetyClassification => TextField::SafetyClassification, + TextField::InsertionTimestamp => TextField::InsertionTimestamp, + TextField::RecipeFirstIngredientTagId => TextField::RecipeFirstIngredientTagId, + TextField::Keywords => TextField::Keywords, + } + } + + pub fn query_tokenizer(&self) -> Tokenizer { + match self { + TextField::TitleBigrams => Tokenizer::default(), + TextField::CleanBodyBigrams => Tokenizer::default(), + TextField::TitleTrigrams => Tokenizer::default(), + TextField::CleanBodyTrigrams => Tokenizer::default(), + _ => self.indexing_tokenizer(), + } + } + + pub fn indexing_tokenizer(&self) -> Tokenizer { + match self { + TextField::Title => Tokenizer::default(), + TextField::CleanBody => Tokenizer::default(), + TextField::StemmedTitle => Tokenizer::new_stemmed(), + TextField::StemmedCleanBody => Tokenizer::new_stemmed(), + TextField::AllBody => Tokenizer::default(), + TextField::Url => Tokenizer::default(), + TextField::UrlNoTokenizer => Tokenizer::Identity(Identity {}), + TextField::UrlForSiteOperator => Tokenizer::SiteOperator(SiteOperatorUrlTokenizer), + TextField::SiteWithout => Tokenizer::default(), + TextField::Domain => Tokenizer::default(), + TextField::SiteNoTokenizer => Tokenizer::Identity(Identity {}), + TextField::SiteIfHomepageNoTokenizer => Tokenizer::Identity(Identity {}), + TextField::DomainNoTokenizer => Tokenizer::Identity(Identity {}), + TextField::DomainNameNoTokenizer => Tokenizer::Identity(Identity {}), + TextField::DomainIfHomepage => Tokenizer::default(), + TextField::DomainNameIfHomepageNoTokenizer => Tokenizer::Identity(Identity {}), + TextField::DomainIfHomepageNoTokenizer => Tokenizer::Identity(Identity {}), + TextField::TitleIfHomepage => Tokenizer::default(), + TextField::BacklinkText => Tokenizer::default(), + TextField::Description => Tokenizer::default(), + TextField::DmozDescription => Tokenizer::default(), + TextField::SchemaOrgJson => Tokenizer::Identity(Identity {}), + TextField::FlattenedSchemaOrgJson => Tokenizer::Json(JsonField), + TextField::CleanBodyBigrams => Tokenizer::Bigram(BigramTokenizer::default()), + TextField::TitleBigrams => Tokenizer::Bigram(BigramTokenizer::default()), + TextField::CleanBodyTrigrams => Tokenizer::Trigram(TrigramTokenizer::default()), + TextField::TitleTrigrams => Tokenizer::Trigram(TrigramTokenizer::default()), + TextField::MicroformatTags => Tokenizer::default(), + TextField::SafetyClassification => Tokenizer::Identity(Identity {}), + TextField::InsertionTimestamp => Tokenizer::Identity(Identity {}), + TextField::RecipeFirstIngredientTagId => Tokenizer::Identity(Identity {}), + TextField::Keywords => Tokenizer::default(), + } + } + + pub fn index_option(&self) -> IndexRecordOption { + if self.has_pos() { + IndexRecordOption::WithFreqsAndPositions + } else { + IndexRecordOption::WithFreqs + } + } + + pub fn has_pos(&self) -> bool { + match self { + TextField::Title => true, + TextField::CleanBody => true, + TextField::StemmedTitle => false, + TextField::StemmedCleanBody => false, + TextField::AllBody => false, + TextField::Url => true, + TextField::UrlNoTokenizer => false, + TextField::UrlForSiteOperator => true, + TextField::SiteWithout => true, + TextField::Domain => true, + TextField::SiteNoTokenizer => false, + TextField::SiteIfHomepageNoTokenizer => false, + TextField::DomainNoTokenizer => false, + TextField::DomainNameNoTokenizer => false, + TextField::DomainIfHomepage => false, + TextField::DomainNameIfHomepageNoTokenizer => false, + TextField::DomainIfHomepageNoTokenizer => false, + TextField::TitleIfHomepage => false, + TextField::BacklinkText => false, + TextField::Description => true, + TextField::DmozDescription => true, + TextField::SchemaOrgJson => false, + TextField::FlattenedSchemaOrgJson => true, + TextField::CleanBodyBigrams => false, + TextField::TitleBigrams => false, + TextField::CleanBodyTrigrams => false, + TextField::TitleTrigrams => false, + TextField::MicroformatTags => true, + TextField::SafetyClassification => false, + TextField::InsertionTimestamp => false, + TextField::RecipeFirstIngredientTagId => false, + TextField::Keywords => false, + } + } + + pub fn name(&self) -> &str { + match self { + TextField::Title => "title", + TextField::CleanBody => "body", + TextField::Url => "url", + TextField::UrlNoTokenizer => "url_no_tokenizer", + TextField::UrlForSiteOperator => "url_for_site_operator", + TextField::SiteWithout => "site", + TextField::Domain => "domain", + TextField::SiteNoTokenizer => "site_no_tokenizer", + TextField::SiteIfHomepageNoTokenizer => "site_if_homepage_no_tokenizer", + TextField::DomainNoTokenizer => "domain_no_tokenizer", + TextField::DomainNameNoTokenizer => "domain_name_no_tokenizer", + TextField::BacklinkText => "backlink_text", + TextField::StemmedTitle => "stemmed_title", + TextField::StemmedCleanBody => "stemmed_body", + TextField::DomainIfHomepage => "domain_if_homepage", + TextField::DomainNameIfHomepageNoTokenizer => "domain_name_if_homepage_no_tokenizer", + TextField::DomainIfHomepageNoTokenizer => "domain_if_homepage_no_tokenizer", + TextField::Description => "description", + TextField::TitleIfHomepage => "title_if_homepage", + TextField::AllBody => "all_body", + TextField::DmozDescription => "dmoz_description", + TextField::SchemaOrgJson => "schema_org_json", + TextField::FlattenedSchemaOrgJson => "flattened_schema_org_json", + TextField::CleanBodyBigrams => "clean_body_bigrams", + TextField::TitleBigrams => "title_bigrams", + TextField::CleanBodyTrigrams => "clean_body_trigrams", + TextField::TitleTrigrams => "title_trigrams", + TextField::MicroformatTags => "microformat_tags", + TextField::SafetyClassification => "safety_classification", + TextField::InsertionTimestamp => "insertion_timestamp", + TextField::RecipeFirstIngredientTagId => "recipe_first_ingredient_tag_id", + TextField::Keywords => "keywords", + } + } +} + +impl InsertEnumMapKey for TextField { + fn into_usize(self) -> usize { + self as usize + } +} diff --git a/crates/core/src/webpage/html/microformats.rs b/crates/core/src/webpage/html/microformats.rs index 9ecba223..8e310e14 100644 --- a/crates/core/src/webpage/html/microformats.rs +++ b/crates/core/src/webpage/html/microformats.rs @@ -13,7 +13,7 @@ // // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . -use crate::{enum_map::EnumSet, Result}; +use crate::enum_map::{EnumSet, GetEnumMapKey, InsertEnumMapKey}; use super::Html; @@ -49,9 +49,9 @@ impl Microformat { } } -impl From for usize { - fn from(value: Microformat) -> Self { - match value { +impl InsertEnumMapKey for Microformat { + fn into_usize(self) -> usize { + match self { Microformat::HCard => 0, Microformat::HEvent => 1, Microformat::HEntry => 2, @@ -62,18 +62,16 @@ impl From for usize { } } -impl TryFrom for Microformat { - type Error = anyhow::Error; - - fn try_from(value: usize) -> Result { +impl GetEnumMapKey for Microformat { + fn from_usize(value: usize) -> Option { match value { - 0 => Ok(Microformat::HCard), - 1 => Ok(Microformat::HEvent), - 2 => Ok(Microformat::HEntry), - 3 => Ok(Microformat::HRecipe), - 4 => Ok(Microformat::HReview), - 5 => Ok(Microformat::HProduct), - _ => Err(anyhow::anyhow!("Unknown microformat")), + 0 => Some(Microformat::HCard), + 1 => Some(Microformat::HEvent), + 2 => Some(Microformat::HEntry), + 3 => Some(Microformat::HRecipe), + 4 => Some(Microformat::HReview), + 5 => Some(Microformat::HProduct), + _ => None, } } } diff --git a/crates/core/src/webpage/html/robots_meta.rs b/crates/core/src/webpage/html/robots_meta.rs index 08707624..b65adb11 100644 --- a/crates/core/src/webpage/html/robots_meta.rs +++ b/crates/core/src/webpage/html/robots_meta.rs @@ -16,7 +16,10 @@ use std::str::FromStr; -use crate::{enum_map::EnumSet, Error, Result}; +use crate::{ + enum_map::{EnumSet, InsertEnumMapKey}, + Error, Result, +}; use super::Html; @@ -38,9 +41,9 @@ impl FromStr for RobotsMeta { } } -impl From for usize { - fn from(val: RobotsMeta) -> Self { - match val { +impl InsertEnumMapKey for RobotsMeta { + fn into_usize(self) -> usize { + match self { RobotsMeta::NoIndex => 0, RobotsMeta::NoFollow => 1, }