From 9a2e98fa5b5fc86d60ac6fb8df9e43471de085a8 Mon Sep 17 00:00:00 2001 From: Mikkel Denker Date: Thu, 5 Dec 2024 10:00:46 +0100 Subject: [PATCH] document ranking pipeline --- crates/core/src/ranking/computer/mod.rs | 13 ++++++++++ crates/core/src/ranking/mod.rs | 6 +++++ .../core/src/ranking/models/cross_encoder.rs | 5 +++- crates/core/src/ranking/models/lambdamart.rs | 5 +++- crates/core/src/ranking/models/linear.rs | 2 +- crates/core/src/ranking/pipeline/mod.rs | 8 +++--- .../src/ranking/pipeline/modifiers/mod.rs | 24 +++++++++++++++-- .../ranking/pipeline/scorers/lambdamart.rs | 4 +-- .../core/src/ranking/pipeline/scorers/mod.rs | 26 ++++++++++++++++--- .../src/ranking/pipeline/scorers/reranker.rs | 2 +- .../src/ranking/pipeline/stages/precision.rs | 5 ++++ .../src/ranking/pipeline/stages/recall.rs | 4 +++ 12 files changed, 88 insertions(+), 16 deletions(-) diff --git a/crates/core/src/ranking/computer/mod.rs b/crates/core/src/ranking/computer/mod.rs index 86e2edf4..5dc99d6d 100644 --- a/crates/core/src/ranking/computer/mod.rs +++ b/crates/core/src/ranking/computer/mod.rs @@ -14,6 +14,19 @@ // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see <https://www.gnu.org/licenses/>. +//! The ranking computer is responsible for computing the core ranking signals for +//! each potential page in the result set. This module handles the initial ranking phase +//! that runs independently on each search node in the distributed search cluster. +//! +//! The computer evaluates a set of core ranking signals for each candidate page, +//! including text-based relevance scores like BM25 and authority scores (harmonic centrality). +//! These signals are combined using a linear model to produce an initial ranking score. +//! The top pages are passed to the coordinator node for the final ranking phase. +//! +//! The core signals computed here are designed to be fast to calculate while still +//! providing strong relevance signals. 
More expensive ranking features are deferred +//! to the final ranking phase on the coordinator. + use crate::query::optic::AsSearchableRule; use crate::query::{Query, MAX_TERMS_FOR_NGRAM_LOOKUPS}; use crate::ranking::bm25f::MultiBm25FWeight; diff --git a/crates/core/src/ranking/mod.rs b/crates/core/src/ranking/mod.rs index b402fdce..4f8e0e8b 100644 --- a/crates/core/src/ranking/mod.rs +++ b/crates/core/src/ranking/mod.rs @@ -14,6 +14,12 @@ // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see <https://www.gnu.org/licenses/>. +//! The ranking module is responsible for ranking pages based on their relevance to a query. +//! +//! The core ranking signals are computed by the `computer` module, which runs independently +//! on each search shard in the search cluster. Increasingly complex stages +//! run in the ranking pipeline on the coordinator node to produce the final ranking. + pub mod bitvec_similarity; pub mod bm25; pub mod bm25f; diff --git a/crates/core/src/ranking/models/cross_encoder.rs b/crates/core/src/ranking/models/cross_encoder.rs index 8cf95843..e829e97e 100644 --- a/crates/core/src/ranking/models/cross_encoder.rs +++ b/crates/core/src/ranking/models/cross_encoder.rs @@ -1,5 +1,5 @@ // Stract is an open source web search engine. -// Copyright (C) 2023 Stract ApS +// Copyright (C) 2024 Stract ApS // // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU Affero General Public License as @@ -29,6 +29,9 @@ use crate::models::bert::BertModel; const TRUNCATE_INPUT: usize = 128; +/// A cross-encoder model for ranking pages. +/// +/// Takes a query and a page body as input and returns a score for the page. 
pub struct CrossEncoderModel { tokenizer: tokenizers::Tokenizer, encoder: BertModel, diff --git a/crates/core/src/ranking/models/lambdamart.rs b/crates/core/src/ranking/models/lambdamart.rs index 0ec3b472..db14ae4e 100644 --- a/crates/core/src/ranking/models/lambdamart.rs +++ b/crates/core/src/ranking/models/lambdamart.rs @@ -1,5 +1,5 @@ // Stract is an open source web search engine. -// Copyright (C) 2023 Stract ApS +// Copyright (C) 2024 Stract ApS // // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU Affero General Public License as @@ -240,6 +240,9 @@ impl Header { } } +/// A LambdaMART model for ranking pages. +/// +/// Designed for efficient inference of LightGBM-compatible models. pub struct LambdaMART { trees: Vec, } diff --git a/crates/core/src/ranking/models/linear.rs b/crates/core/src/ranking/models/linear.rs index 0e5c63e6..cb550770 100644 --- a/crates/core/src/ranking/models/linear.rs +++ b/crates/core/src/ranking/models/linear.rs @@ -1,5 +1,5 @@ // Stract is an open source web search engine. 
-// Copyright (C) 2023 Stract ApS +// Copyright (C) 2024 Stract ApS // // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU Affero General Public License as diff --git a/crates/core/src/ranking/pipeline/mod.rs b/crates/core/src/ranking/pipeline/mod.rs index 6a25a08f..78f13796 100644 --- a/crates/core/src/ranking/pipeline/mod.rs +++ b/crates/core/src/ranking/pipeline/mod.rs @@ -68,10 +68,10 @@ impl StageOrModifier where T: RankableWebpage + Send + Sync, { - fn top_n(&self) -> Top { + fn top(&self) -> Top { match self { - StageOrModifier::Stage(stage) => stage.top_n(), - StageOrModifier::Modifier(modifier) => modifier.top_n(), + StageOrModifier::Stage(stage) => stage.top(), + StageOrModifier::Modifier(modifier) => modifier.top(), } } @@ -139,7 +139,7 @@ where let coefficients = query.signal_coefficients(); for stage_or_modifier in self.stages_or_modifiers.iter() { - let webpages = if let Top::Limit(top_n) = stage_or_modifier.top_n() { + let webpages = if let Top::Limit(top_n) = stage_or_modifier.top() { if query.offset() > top_n { continue; } diff --git a/crates/core/src/ranking/pipeline/modifiers/mod.rs b/crates/core/src/ranking/pipeline/modifiers/mod.rs index fa481e3c..3809a452 100644 --- a/crates/core/src/ranking/pipeline/modifiers/mod.rs +++ b/crates/core/src/ranking/pipeline/modifiers/mod.rs @@ -14,28 +14,48 @@ // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see <https://www.gnu.org/licenses/>. +//! Modifiers are used to modify the ranking of pages. +//! +//! Each page is ranked by a linear combination of the signals like +//! `score = boost * (signal_1 * weight_1 + signal_2 * weight_2 + ...)` +//! +//! Modifiers can either modify the multiplicative boost factor for +//! each page or override the ranking entirely (if we want to rank +//! for something other than the score). 
+ mod inbound_similarity; use super::{RankableWebpage, Top}; pub use inbound_similarity::InboundSimilarity; +/// A modifier that gives full control over the ranking. pub trait FullModifier: Send + Sync { type Webpage: RankableWebpage; + /// Modify the boost factor for each page. fn update_boosts(&self, webpages: &mut [Self::Webpage]); + /// Override ranking of the pages. fn rank(&self, webpages: &mut [Self::Webpage]) { webpages.sort_by(|a, b| b.score().partial_cmp(&a.score()).unwrap()); } - fn top_n(&self) -> Top { + /// The number of pages to return from this part of the pipeline. + fn top(&self) -> Top { Top::Unlimited } } +/// A modifier that modifies the multiplicative boost factor for each page. +/// +/// This is the most common type of modifier. pub trait Modifier: Send + Sync { type Webpage: RankableWebpage; + /// Modify the boost factor for a page. + /// + /// The new boost factor will be multiplied with the page's current boost factor. fn boost(&self, webpage: &Self::Webpage) -> f64; + /// The number of pages to return from this part of the pipeline. 
fn top(&self) -> Top { Top::Unlimited } @@ -54,7 +74,7 @@ where } } - fn top_n(&self) -> Top { + fn top(&self) -> Top { Modifier::top(self) } } diff --git a/crates/core/src/ranking/pipeline/scorers/lambdamart.rs b/crates/core/src/ranking/pipeline/scorers/lambdamart.rs index fa4932a9..9f039efa 100644 --- a/crates/core/src/ranking/pipeline/scorers/lambdamart.rs +++ b/crates/core/src/ranking/pipeline/scorers/lambdamart.rs @@ -36,7 +36,7 @@ impl RankingStage for Arc { ) } - fn top_n(&self) -> Top { + fn top(&self) -> Top { Top::Limit(20) } } @@ -59,7 +59,7 @@ impl RankingStage for PrecisionLambda { ) } - fn top_n(&self) -> Top { + fn top(&self) -> Top { Top::Limit(20) } } diff --git a/crates/core/src/ranking/pipeline/scorers/mod.rs b/crates/core/src/ranking/pipeline/scorers/mod.rs index bc561e52..2365397f 100644 --- a/crates/core/src/ranking/pipeline/scorers/mod.rs +++ b/crates/core/src/ranking/pipeline/scorers/mod.rs @@ -14,6 +14,10 @@ // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see <https://www.gnu.org/licenses/>. +//! Scorers are used to compute the ranking signals in the ranking pipeline. +//! +//! Each scorer computes a single signal which is then used to rank the pages. + pub mod embedding; pub mod inbound_similarity; pub mod lambdamart; @@ -26,14 +30,23 @@ use crate::ranking::{SignalCalculation, SignalCoefficients, SignalEnum}; use super::{RankableWebpage, Top}; +/// A ranking stage that computes some signals for each page. +/// +/// This trait is implemented for all scorers. +/// Most of the time you will want to implement the [`RankingStage`] trait instead, +/// but this trait gives you more control over the ranking pipeline. pub trait FullRankingStage: Send + Sync { type Webpage: RankableWebpage; + /// Compute the signal for each page. fn compute(&self, webpages: &mut [Self::Webpage]); - fn top_n(&self) -> Top { + + /// The number of pages to return from this part of the pipeline. 
+ fn top(&self) -> Top { Top::Unlimited } + /// Update the score for each page. fn update_scores(&self, webpages: &mut [Self::Webpage], coefficients: &SignalCoefficients) { for webpage in webpages.iter_mut() { webpage.set_raw_score(webpage.signals().iter().fold(0.0, |acc, (signal, calc)| { @@ -42,16 +55,21 @@ pub trait FullRankingStage: Send + Sync { } } + /// Rank the pages by their score. fn rank(&self, webpages: &mut [Self::Webpage]) { webpages.sort_by(|a, b| b.score().partial_cmp(&a.score()).unwrap()); } } +/// A ranking stage that computes a single signal for each page. pub trait RankingStage: Send + Sync { type Webpage: RankableWebpage; + /// Compute the signal for a single page. fn compute(&self, webpage: &Self::Webpage) -> (SignalEnum, SignalCalculation); - fn top_n(&self) -> Top { + + /// The number of pages to return from this part of the pipeline. + fn top(&self) -> Top { Top::Unlimited } } @@ -69,7 +87,7 @@ where } } - fn top_n(&self) -> Top { - self.top_n() + fn top(&self) -> Top { + self.top() } } diff --git a/crates/core/src/ranking/pipeline/scorers/reranker.rs b/crates/core/src/ranking/pipeline/scorers/reranker.rs index 6caac87d..d0bb0ebf 100644 --- a/crates/core/src/ranking/pipeline/scorers/reranker.rs +++ b/crates/core/src/ranking/pipeline/scorers/reranker.rs @@ -68,7 +68,7 @@ impl FullRankingStage for ReRanker { self.crossencoder_score_webpages(webpages); } - fn top_n(&self) -> Top { + fn top(&self) -> Top { Top::Limit(20) } } diff --git a/crates/core/src/ranking/pipeline/stages/precision.rs b/crates/core/src/ranking/pipeline/stages/precision.rs index 59847eaa..b336c8a6 100644 --- a/crates/core/src/ranking/pipeline/stages/precision.rs +++ b/crates/core/src/ranking/pipeline/stages/precision.rs @@ -14,6 +14,11 @@ // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see <https://www.gnu.org/licenses/>. +//! The precision stage of the ranking pipeline. +//! +//! 
This stage focuses on refining the first page of results +//! from the recall stage. + use std::sync::Arc; use crate::{ diff --git a/crates/core/src/ranking/pipeline/stages/recall.rs b/crates/core/src/ranking/pipeline/stages/recall.rs index 688af169..897012af 100644 --- a/crates/core/src/ranking/pipeline/stages/recall.rs +++ b/crates/core/src/ranking/pipeline/stages/recall.rs @@ -14,6 +14,10 @@ // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see <https://www.gnu.org/licenses/>. +//! The recall stage of the ranking pipeline. +//! +//! This stage focuses on getting the best pages into the precision stage. + use std::sync::Arc; use crate::{