Skip to content

Commit

Permalink
split up schema fields into submodules
Browse files (browse the repository at this point in the history)
  • Loading branch information
mikkeldenker committed Mar 20, 2024
1 parent ba8d2b2 commit 7146560
Show file tree
Hide file tree
Showing 7 changed files with 455 additions and 396 deletions.
43 changes: 24 additions & 19 deletions crates/core/src/enum_map.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,22 +16,30 @@

use serde::{Deserialize, Serialize};

/// A key that can be written into (and looked up in) an [`EnumMap`].
///
/// The map stores its values in a vector addressed by the index returned
/// from [`InsertEnumMapKey::into_usize`], and `insert` grows that vector to
/// `index + 1` slots when needed, so implementations should hand out small,
/// dense indices.
pub trait InsertEnumMapKey: Sized {
    /// Consumes the key and yields the slot index it occupies in the map.
    fn into_usize(self) -> usize;
}

/// A key that can be rebuilt from the slot index it is stored under.
///
/// This is the inverse direction of `InsertEnumMapKey`; `EnumMap::keys`
/// relies on it to turn occupied slot indices back into keys.
pub trait GetEnumMapKey: Sized {
    /// Returns the key stored at index `value`, or `None` when no key
    /// corresponds to that index (`EnumMap::keys` skips such slots).
    fn from_usize(value: usize) -> Option<Self>;
}

#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct EnumMap<K: Into<usize>, V> {
pub struct EnumMap<K: InsertEnumMapKey, V> {
inner: Vec<Option<V>>,
len: usize,
_phantom: std::marker::PhantomData<K>,
}

impl<K: Into<usize>, V> Default for EnumMap<K, V> {
impl<K: InsertEnumMapKey, V> Default for EnumMap<K, V> {
fn default() -> Self {
Self::new()
}
}

impl<K, V> EnumMap<K, V>
where
K: Into<usize>,
K: InsertEnumMapKey,
{
pub fn new() -> Self {
Self {
Expand All @@ -42,7 +50,7 @@ where
}

pub fn insert(&mut self, key: K, value: V) {
let key = key.into();
let key = key.into_usize();

if key >= self.inner.len() {
self.inner.resize_with(key + 1, || None);
Expand All @@ -60,7 +68,7 @@ where
}

pub fn get(&self, key: K) -> Option<&V> {
let key = key.into();
let key = key.into_usize();
if key >= self.inner.len() {
None
} else {
Expand All @@ -81,7 +89,7 @@ where
}

pub fn get_mut(&mut self, key: K) -> Option<&mut V> {
let key = key.into();
let key = key.into_usize();
if key >= self.inner.len() {
None
} else {
Expand All @@ -92,19 +100,19 @@ where

impl<K, V> EnumMap<K, V>
where
K: TryFrom<usize> + Into<usize>,
K: GetEnumMapKey + InsertEnumMapKey,
{
pub fn keys(&self) -> impl Iterator<Item = K> + '_ {
self.inner
.iter()
.enumerate()
.filter_map(|(key, value)| value.as_ref().and_then(|_| K::try_from(key).ok()))
.filter_map(|(key, value)| value.as_ref().and_then(|_| K::from_usize(key)))
}
}

impl<K, V> FromIterator<(K, V)> for EnumMap<K, V>
where
K: Into<usize>,
K: InsertEnumMapKey,
{
fn from_iter<T: IntoIterator<Item = (K, V)>>(iter: T) -> Self {
let mut map = Self::new();
Expand All @@ -118,17 +126,17 @@ where
}

#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct EnumSet<K: Into<usize>> {
pub struct EnumSet<K: InsertEnumMapKey> {
map: EnumMap<K, ()>,
}

impl<K: Into<usize>> Default for EnumSet<K> {
impl<K: InsertEnumMapKey> Default for EnumSet<K> {
fn default() -> Self {
Self::new()
}
}

impl<K: Into<usize>> EnumSet<K> {
impl<K: InsertEnumMapKey> EnumSet<K> {
pub fn new() -> Self {
Self {
map: EnumMap::new(),
Expand All @@ -148,10 +156,7 @@ impl<K: Into<usize>> EnumSet<K> {
}
}

impl<K> EnumSet<K>
where
K: TryFrom<usize> + Into<usize>,
{
impl<K: InsertEnumMapKey + GetEnumMapKey> EnumSet<K> {
pub fn iter(&self) -> impl Iterator<Item = K> + '_ {
self.map.keys()
}
Expand All @@ -168,9 +173,9 @@ mod tests {
C,
}

impl From<TestEnum> for usize {
fn from(val: TestEnum) -> Self {
val as usize
impl InsertEnumMapKey for TestEnum {
fn into_usize(self) -> usize {
self as usize
}
}

Expand Down
7 changes: 4 additions & 3 deletions crates/core/src/ranking/signal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.

use crate::enum_map::InsertEnumMapKey;
use crate::query::optic::AsSearchableRule;
use crate::query::Query;
use crate::Result;
Expand Down Expand Up @@ -143,9 +144,9 @@ pub enum Signal {
KeywordEmbeddingSimilarity,
}

impl From<Signal> for usize {
fn from(signal: Signal) -> Self {
signal as usize
impl InsertEnumMapKey for Signal {
fn into_usize(self) -> usize {
self as usize
}
}

Expand Down
114 changes: 114 additions & 0 deletions crates/core/src/schema/fast_field.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
// Stract is an open source web search engine.
// Copyright (C) 2024 Stract ApS
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>

use strum::VariantArray;

use crate::enum_map::InsertEnumMapKey;

/// Fast fields of the schema, one variant per field.
///
/// Every variant has a snake_case string name (see `FastField::name`) and can
/// be used as an enum-map key through its `InsertEnumMapKey` impl, which
/// casts the variant to `usize`.
///
/// The variants carry no explicit discriminants, so that cast yields the
/// zero-based declaration position. Inserting or reordering variants
/// therefore shifts the index of every variant after the change.
/// NOTE(review): whether these indices are persisted anywhere is not visible
/// from this file; confirm before reordering.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, VariantArray)]
pub enum FastField {
IsHomepage,
HostCentrality,
HostCentralityRank,
PageCentrality,
PageCentralityRank,
FetchTimeMs,
LastUpdated,
TrackerScore,
Region,
// Token counts, one per tokenized text field.
NumUrlTokens,
NumTitleTokens,
NumCleanBodyTokens,
NumDescriptionTokens,
NumUrlForSiteOperatorTokens,
NumDomainTokens,
NumMicroformatTagsTokens,
// Hashes come in `*1` / `*2` pairs; presumably the two halves of a wider
// hash value (TODO confirm against the code that writes them).
SiteHash1,
SiteHash2,
UrlWithoutQueryHash1,
UrlWithoutQueryHash2,
TitleHash1,
TitleHash2,
UrlHash1,
UrlHash2,
DomainHash1,
DomainHash2,
UrlWithoutTldHash1,
UrlWithoutTldHash2,
PreComputedScore,
HostNodeID,
SimHash,
NumFlattenedSchemaTokens,
NumPathAndQuerySlashes,
NumPathAndQueryDigits,
LikelyHasAds,
LikelyHasPaywall,
LinkDensity,
TitleEmbeddings,
KeywordEmbeddings,
}

impl FastField {
pub fn name(&self) -> &str {
match self {
FastField::HostCentrality => "host_centrality",
FastField::HostCentralityRank => "host_centrality_rank",
FastField::PageCentrality => "page_centrality",
FastField::PageCentralityRank => "page_centrality_rank",
FastField::IsHomepage => "is_homepage",
FastField::FetchTimeMs => "fetch_time_ms",
FastField::LastUpdated => "last_updated",
FastField::TrackerScore => "tracker_score",
FastField::Region => "region",
FastField::NumUrlTokens => "num_url_tokens",
FastField::NumTitleTokens => "num_title_tokens",
FastField::NumCleanBodyTokens => "num_clean_body_tokens",
FastField::NumDescriptionTokens => "num_description_tokens",
FastField::NumDomainTokens => "num_domain_tokens",
FastField::NumUrlForSiteOperatorTokens => "num_url_for_site_operator_tokens",
FastField::NumFlattenedSchemaTokens => "num_flattened_schema_tokens",
FastField::NumMicroformatTagsTokens => "num_microformat_tags_tokens",
FastField::SiteHash1 => "site_hash1",
FastField::SiteHash2 => "site_hash2",
FastField::UrlWithoutQueryHash1 => "url_without_query_hash1",
FastField::UrlWithoutQueryHash2 => "url_without_query_hash2",
FastField::TitleHash1 => "title_hash1",
FastField::TitleHash2 => "title_hash2",
FastField::UrlHash1 => "url_hash1",
FastField::UrlHash2 => "url_hash2",
FastField::DomainHash1 => "domain_hash1",
FastField::DomainHash2 => "domain_hash2",
FastField::UrlWithoutTldHash1 => "url_without_tld_hash1",
FastField::UrlWithoutTldHash2 => "url_without_tld_hash2",
FastField::PreComputedScore => "pre_computed_score",
FastField::HostNodeID => "host_node_id",
FastField::SimHash => "sim_hash",
FastField::NumPathAndQuerySlashes => "num_path_and_query_slashes",
FastField::NumPathAndQueryDigits => "num_path_and_query_digits",
FastField::LikelyHasAds => "likely_has_ads",
FastField::LikelyHasPaywall => "likely_has_paywall",
FastField::LinkDensity => "link_density",
FastField::TitleEmbeddings => "title_embeddings",
FastField::KeywordEmbeddings => "keyword_embeddings",
}
}
}

/// Lets `FastField` serve as a key of the enum-keyed containers.
impl InsertEnumMapKey for FastField {
    /// Returns the variant's discriminant as the key index. `FastField`
    /// declares no explicit discriminants, so this is the variant's
    /// zero-based position in the enum declaration.
    fn into_usize(self) -> usize {
        self as usize
    }
}
Loading

0 comments on commit 7146560

Please sign in to comment.