From 40c827d733f16fb8b6d49266aebb0ac5eac68b3c Mon Sep 17 00:00:00 2001 From: Mikkel Denker Date: Mon, 11 Mar 2024 13:39:11 +0100 Subject: [PATCH] handle missing fastfields in index gracefully --- crates/core/src/collector.rs | 5 +- crates/core/src/fastfield_reader.rs | 50 +++++++++---------- crates/core/src/query/pattern_query/scorer.rs | 4 +- .../src/ranking/pipeline/scorers/embedding.rs | 21 ++++++++ .../src/ranking/pipeline/stages/recall.rs | 4 +- crates/core/src/ranking/signal.rs | 27 ++++++---- 6 files changed, 67 insertions(+), 44 deletions(-) diff --git a/crates/core/src/collector.rs b/crates/core/src/collector.rs index 7bb13900..bd921c0b 100644 --- a/crates/core/src/collector.rs +++ b/crates/core/src/collector.rs @@ -143,8 +143,8 @@ impl TopSegmentCollector { let field_reader = self.fastfield_segment_reader.get_field_reader(doc); let hash = [ - field_reader.get(field1).as_u64().unwrap(), - field_reader.get(field2).as_u64().unwrap(), + field_reader.get(field1).unwrap().as_u64().unwrap(), + field_reader.get(field2).unwrap().as_u64().unwrap(), ]; combine_u64s(hash).into() } @@ -170,6 +170,7 @@ impl TopSegmentCollector { .fastfield_segment_reader .get_field_reader(doc) .get(FastField::SimHash) + .unwrap() .into(); self.bucket_collector.insert(SegmentDoc { diff --git a/crates/core/src/fastfield_reader.rs b/crates/core/src/fastfield_reader.rs index b65362bd..269b7a3a 100644 --- a/crates/core/src/fastfield_reader.rs +++ b/crates/core/src/fastfield_reader.rs @@ -52,12 +52,14 @@ impl FastFieldReader { for field in Field::all().filter_map(Field::as_fast) { match field.data_type() { DataType::U64 => { - let reader = fastfield_readers.u64(field.name()).unwrap(); - u64s.insert(field, reader); + if let Ok(reader) = fastfield_readers.u64(field.name()) { + u64s.insert(field, reader); + } } DataType::Bytes => { - let reader = fastfield_readers.bytes(field.name()).unwrap().unwrap(); - bytes.insert(field, reader); + if let Some(reader) = fastfield_readers.bytes(field.name()).ok().flatten() { + bytes.insert(field, reader); + } } }; } @@ -83,7 +85,7 @@ struct AllReaders { pub enum Value { U64(u64), - Bytes(Option>), + Bytes(Vec), } impl Value { @@ -96,7 +98,7 @@ impl Value { pub fn as_bytes(&self) -> Option<&[u8]> { match self { - Value::Bytes(Some(val)) => Some(val), + Value::Bytes(val) => Some(val), _ => None, } } @@ -110,12 +112,6 @@ impl From for Value { impl From> for Value { fn from(val: Vec) -> Self { - Value::Bytes(Some(val)) - } -} - -impl From>> for Value { - fn from(val: Option>) -> Self { Value::Bytes(val) } } @@ -135,7 +131,7 @@ impl<'a> From<&'a Value> for Option<&'a [u8]> { impl From for Option> { fn from(val: Value) -> Self { match val { - Value::Bytes(val) => val, + Value::Bytes(val) => Some(val), _ => None, } } @@ -147,32 +143,32 @@ pub struct FieldReader<'a> { } impl<'a> FieldReader<'a> { - pub fn get(&self, field: FastField) -> Value { + pub fn get(&self, field: FastField) -> Option { match field.data_type() { - DataType::U64 => self - .readers - .u64s - .get(field) - .unwrap() - .values - .get_val(self.doc) - .into(), + DataType::U64 => Some( + self.readers + .u64s + .get(field)? + .values + .get_val(self.doc) + .into(), + ), DataType::Bytes => { - let reader = self.readers.bytes.get(field).unwrap(); + let reader = self.readers.bytes.get(field)?; let ord = reader.ords().values.get_val(self.doc); - if ord > reader.num_terms() as u64 { - return Value::Bytes(None); + if ord > reader.num_terms() as u64 || reader.num_terms() == 0 { + return None; } let mut bytes = Vec::new(); reader.ord_to_bytes(ord, &mut bytes).unwrap(); if bytes.is_empty() { - Value::Bytes(None) + None } else { - bytes.into() + Some(bytes.into()) } } } diff --git a/crates/core/src/query/pattern_query/scorer.rs b/crates/core/src/query/pattern_query/scorer.rs index 15c6d3c9..3efafe66 100644 --- a/crates/core/src/query/pattern_query/scorer.rs +++ b/crates/core/src/query/pattern_query/scorer.rs @@ -136,7 +136,7 @@ impl EmptyFieldScorer { .segment_reader .get_field_reader(doc) .get(self.num_tokens_fastfield) - .into(); + .and_then(|v| v.as_u64()); s.unwrap_or_default() } } @@ -279,7 +279,7 @@ impl NormalPatternScorer { .segment_reader .get_field_reader(self.doc()) .get(self.num_tokens_field) - .into(); + .and_then(|v| v.as_u64()); let num_tokens_doc = num_tokens_doc.unwrap(); for (i, pattern_part) in self.pattern.iter().enumerate() { diff --git a/crates/core/src/ranking/pipeline/scorers/embedding.rs b/crates/core/src/ranking/pipeline/scorers/embedding.rs index 2fbf2db8..966b9442 100644 --- a/crates/core/src/ranking/pipeline/scorers/embedding.rs +++ b/crates/core/src/ranking/pipeline/scorers/embedding.rs @@ -142,6 +142,10 @@ impl ScoredWebpagePointer { impl> Scorer for EmbeddingScorer { fn score(&self, webpages: &mut [W]) { + if !webpages.iter().any(E::has_embedding) { + return; + } + if let Some((query_emb, coefficient)) = self.query_emb_and_coefficient(E::signal()) { let hidden_size = query_emb.size(); for webpage in webpages.iter_mut() { @@ -164,6 +168,7 @@ pub struct KeywordEmbeddings; pub trait EmbeddingSignal: Send + Sync { fn signal() -> Signal; + fn has_embedding(webpage: &W) -> bool; fn embedding(webpage: &W, hidden_size: usize) -> Option; fn insert_signal(webpage: &mut W, score: f64, coefficient: f64); } @@ -173,6 +178,10 @@ impl EmbeddingSignal for TitleEmbeddings { Signal::TitleEmbeddingSimilarity } + fn has_embedding(webpage: &ScoredWebpagePointer) -> bool { + webpage.as_ranking().title_embedding.is_some() + } + fn embedding(webpage: &ScoredWebpagePointer, hidden_size: usize) -> Option { webpage.title_emb(hidden_size) } @@ -194,6 +203,10 @@ impl EmbeddingSignal for TitleEmbeddings { Signal::TitleEmbeddingSimilarity } + fn has_embedding(webpage: &RecallRankingWebpage) -> bool { + webpage.title_embedding.is_some() + } + fn embedding(webpage: &RecallRankingWebpage, hidden_size: usize) -> Option { webpage.title_emb(hidden_size) } @@ -216,6 +229,10 @@ impl EmbeddingSignal for KeywordEmbeddings { Signal::KeywordEmbeddingSimilarity } + fn has_embedding(webpage: &ScoredWebpagePointer) -> bool { + webpage.as_ranking().keyword_embedding.is_some() + } + fn embedding(webpage: &ScoredWebpagePointer, hidden_size: usize) -> Option { webpage.keyword_emb(hidden_size) } @@ -237,6 +254,10 @@ impl EmbeddingSignal for KeywordEmbeddings { Signal::KeywordEmbeddingSimilarity } + fn has_embedding(webpage: &RecallRankingWebpage) -> bool { + webpage.keyword_embedding.is_some() + } + fn embedding(webpage: &RecallRankingWebpage, hidden_size: usize) -> Option { webpage.keyword_emb(hidden_size) } diff --git a/crates/core/src/ranking/pipeline/stages/recall.rs b/crates/core/src/ranking/pipeline/stages/recall.rs index 32d36a7a..11124e0f 100644 --- a/crates/core/src/ranking/pipeline/stages/recall.rs +++ b/crates/core/src/ranking/pipeline/stages/recall.rs @@ -59,14 +59,14 @@ impl RecallRankingWebpage { .unwrap() .get_field_reader(pointer.address.doc_id) .get(FastField::TitleEmbeddings) - .into(); + .and_then(|v| v.into()); let keyword_embedding: Option> = aggregator .fastfield_readers() .unwrap() .get_field_reader(pointer.address.doc_id) .get(FastField::KeywordEmbeddings) - .into(); + .and_then(|v| v.into()); let mut res = RecallRankingWebpage { signals: EnumMap::new(), diff --git a/crates/core/src/ranking/signal.rs b/crates/core/src/ranking/signal.rs index 8412fb81..fd4ac62d 100644 --- a/crates/core/src/ranking/signal.rs +++ b/crates/core/src/ranking/signal.rs @@ -294,6 +294,10 @@ fn idf_sum(field: &mut TextFieldData, doc: DocId) -> f64 { impl Signal { fn is_computable_before_search(&self) -> bool { self.as_fastfield().is_some() + && !matches!( + self, + Signal::TitleEmbeddingSimilarity | Signal::KeywordEmbeddingSimilarity + ) } pub fn default_coefficient(&self) -> f64 { @@ -356,7 +360,7 @@ impl Signal { let node_id = fastfield_reader .get(FastField::HostNodeID) - .as_u64() + .and_then(|n| n.as_u64()) .unwrap(); let host_id: Option = if node_id == u64::MAX { @@ -369,35 +373,35 @@ impl Signal { Signal::HostCentrality | Signal::PageCentrality => { let val = fastfield_reader .get(self.as_fastfield().unwrap()) - .as_u64() + .and_then(|v| v.as_u64()) .unwrap(); Some(val as f64 / FLOAT_SCALING as f64) } Signal::HostCentralityRank | Signal::PageCentralityRank => { let val = fastfield_reader .get(self.as_fastfield().unwrap()) - .as_u64() + .and_then(|v| v.as_u64()) .unwrap(); Some(score_rank(val as f64)) } Signal::IsHomepage => { let val = fastfield_reader .get(self.as_fastfield().unwrap()) - .as_u64() + .and_then(|v| v.as_u64()) .unwrap(); Some(val as f64) } Signal::LinkDensity => { let val = fastfield_reader .get(self.as_fastfield().unwrap()) - .as_u64() + .and_then(|v| v.as_u64()) .unwrap(); Some(score_link_density(val as f64 / FLOAT_SCALING as f64)) } Signal::FetchTimeMs => { let fetch_time_ms = fastfield_reader .get(self.as_fastfield().unwrap()) - .as_u64() + .and_then(|v| v.as_u64()) .unwrap() as usize; if fetch_time_ms >= signal_aggregator.fetch_time_ms_cache.len() { @@ -409,7 +413,7 @@ impl Signal { Signal::UpdateTimestamp => { let val = fastfield_reader .get(self.as_fastfield().unwrap()) - .as_u64() + .and_then(|v| v.as_u64()) .unwrap() as usize; Some(score_timestamp(val, signal_aggregator)) @@ -417,28 +421,28 @@ impl Signal { Signal::TrackerScore => { let val = fastfield_reader .get(self.as_fastfield().unwrap()) - .as_u64() + .and_then(|v| v.as_u64()) .unwrap(); Some(score_trackers(val as f64)) } Signal::UrlDigits => { let val = fastfield_reader .get(self.as_fastfield().unwrap()) - .as_u64() + .and_then(|v| v.as_u64()) .unwrap(); Some(score_digits(val as f64)) } Signal::UrlSlashes => { let val = fastfield_reader .get(self.as_fastfield().unwrap()) - .as_u64() + .and_then(|v| v.as_u64()) .unwrap(); Some(score_slashes(val as f64)) } Signal::Region => { let val = fastfield_reader .get(self.as_fastfield().unwrap()) - .as_u64() + .and_then(|v| v.as_u64()) .unwrap(); let region = Region::from_id(val); Some(score_region(region, signal_aggregator)) @@ -621,6 +625,7 @@ impl Signal { Signal::UrlDigits => Some(FastField::NumPathAndQueryDigits), Signal::LinkDensity => Some(FastField::LinkDensity), Signal::TitleEmbeddingSimilarity => Some(FastField::TitleEmbeddings), + Signal::KeywordEmbeddingSimilarity => Some(FastField::KeywordEmbeddings), _ => None, } }