diff --git a/crates/core/src/ranking/computer/mod.rs b/crates/core/src/ranking/computer/mod.rs
index 86e2edf4..5dc99d6d 100644
--- a/crates/core/src/ranking/computer/mod.rs
+++ b/crates/core/src/ranking/computer/mod.rs
@@ -14,6 +14,19 @@
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
+//! The ranking computer is responsible for computing the core ranking signals for
+//! each potential page in the result set. This module handles the initial ranking phase
+//! that runs independently on each search node in the distributed search cluster.
+//!
+//! The computer evaluates a set of core ranking signals for each candidate page,
+//! including text-based relevance scores like BM25 and authority scores (harmonic centrality).
+//! These signals are combined using a linear model to produce an initial ranking score.
+//! The top pages are passed to the coordinator node for the final ranking phase.
+//!
+//! The core signals computed here are designed to be fast to calculate while still
+//! providing strong relevance signals. More expensive ranking features are deferred
+//! to the final ranking phase on the coordinator.
+
use crate::query::optic::AsSearchableRule;
use crate::query::{Query, MAX_TERMS_FOR_NGRAM_LOOKUPS};
use crate::ranking::bm25f::MultiBm25FWeight;
diff --git a/crates/core/src/ranking/mod.rs b/crates/core/src/ranking/mod.rs
index b402fdce..4f8e0e8b 100644
--- a/crates/core/src/ranking/mod.rs
+++ b/crates/core/src/ranking/mod.rs
@@ -14,6 +14,12 @@
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
+//! The ranking module is responsible for ranking pages based on their relevance to a query.
+//!
+//! The core ranking signals are computed by the `computer` module, which runs independently
+//! on each search shard in the search cluster. Increasingly complex stages
+//! run in the ranking pipeline on the coordinator node to produce the final ranking.
+
pub mod bitvec_similarity;
pub mod bm25;
pub mod bm25f;
diff --git a/crates/core/src/ranking/models/cross_encoder.rs b/crates/core/src/ranking/models/cross_encoder.rs
index 8cf95843..e829e97e 100644
--- a/crates/core/src/ranking/models/cross_encoder.rs
+++ b/crates/core/src/ranking/models/cross_encoder.rs
@@ -1,5 +1,5 @@
// Stract is an open source web search engine.
-// Copyright (C) 2023 Stract ApS
+// Copyright (C) 2024 Stract ApS
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
@@ -29,6 +29,9 @@ use crate::models::bert::BertModel;
const TRUNCATE_INPUT: usize = 128;
+/// A cross-encoder model for ranking pages.
+///
+/// Takes a query and a page body as input and returns a score for the page.
pub struct CrossEncoderModel {
tokenizer: tokenizers::Tokenizer,
encoder: BertModel,
diff --git a/crates/core/src/ranking/models/lambdamart.rs b/crates/core/src/ranking/models/lambdamart.rs
index 0ec3b472..db14ae4e 100644
--- a/crates/core/src/ranking/models/lambdamart.rs
+++ b/crates/core/src/ranking/models/lambdamart.rs
@@ -1,5 +1,5 @@
// Stract is an open source web search engine.
-// Copyright (C) 2023 Stract ApS
+// Copyright (C) 2024 Stract ApS
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
@@ -240,6 +240,9 @@ impl Header {
}
}
+/// A LambdaMART model for ranking pages.
+///
+/// Designed for efficient inference of lightgbm compatible models.
pub struct LambdaMART {
trees: Vec,
}
diff --git a/crates/core/src/ranking/models/linear.rs b/crates/core/src/ranking/models/linear.rs
index 0e5c63e6..cb550770 100644
--- a/crates/core/src/ranking/models/linear.rs
+++ b/crates/core/src/ranking/models/linear.rs
@@ -1,5 +1,5 @@
// Stract is an open source web search engine.
-// Copyright (C) 2023 Stract ApS
+// Copyright (C) 2024 Stract ApS
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
diff --git a/crates/core/src/ranking/pipeline/mod.rs b/crates/core/src/ranking/pipeline/mod.rs
index 6a25a08f..78f13796 100644
--- a/crates/core/src/ranking/pipeline/mod.rs
+++ b/crates/core/src/ranking/pipeline/mod.rs
@@ -68,10 +68,10 @@ impl StageOrModifier
where
T: RankableWebpage + Send + Sync,
{
- fn top_n(&self) -> Top {
+ fn top(&self) -> Top {
match self {
- StageOrModifier::Stage(stage) => stage.top_n(),
- StageOrModifier::Modifier(modifier) => modifier.top_n(),
+ StageOrModifier::Stage(stage) => stage.top(),
+ StageOrModifier::Modifier(modifier) => modifier.top(),
}
}
@@ -139,7 +139,7 @@ where
let coefficients = query.signal_coefficients();
for stage_or_modifier in self.stages_or_modifiers.iter() {
- let webpages = if let Top::Limit(top_n) = stage_or_modifier.top_n() {
+ let webpages = if let Top::Limit(top_n) = stage_or_modifier.top() {
if query.offset() > top_n {
continue;
}
diff --git a/crates/core/src/ranking/pipeline/modifiers/mod.rs b/crates/core/src/ranking/pipeline/modifiers/mod.rs
index fa481e3c..3809a452 100644
--- a/crates/core/src/ranking/pipeline/modifiers/mod.rs
+++ b/crates/core/src/ranking/pipeline/modifiers/mod.rs
@@ -14,28 +14,48 @@
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
+//! Modifiers are used to modify the ranking of pages.
+//!
+//! Each page is scored by a linear combination of its signals, scaled by a boost factor:
+//! `score = boost * (signal_1 * weight_1 + signal_2 * weight_2 + ...)`
+//!
+//! Modifiers can either modify the multiplicative boost factor for
+//! each page or override the ranking entirely (if we want to rank
+//! for something other than the score).
+
mod inbound_similarity;
use super::{RankableWebpage, Top};
pub use inbound_similarity::InboundSimilarity;
+/// A modifier that gives full control over the ranking.
pub trait FullModifier: Send + Sync {
type Webpage: RankableWebpage;
+ /// Modify the boost factor for each page.
fn update_boosts(&self, webpages: &mut [Self::Webpage]);
+ /// Override ranking of the pages.
fn rank(&self, webpages: &mut [Self::Webpage]) {
webpages.sort_by(|a, b| b.score().partial_cmp(&a.score()).unwrap());
}
- fn top_n(&self) -> Top {
+ /// The number of pages to return from this part of the pipeline.
+ fn top(&self) -> Top {
Top::Unlimited
}
}
+/// A modifier that modifies the multiplicative boost factor for each page.
+///
+/// This is the most common type of modifier.
pub trait Modifier: Send + Sync {
type Webpage: RankableWebpage;
+ /// Modify the boost factor for a page.
+ ///
+ /// The new boost factor will be multiplied by the page's current boost factor.
fn boost(&self, webpage: &Self::Webpage) -> f64;
+ /// The number of pages to return from this part of the pipeline.
fn top(&self) -> Top {
Top::Unlimited
}
@@ -54,7 +74,7 @@ where
}
}
- fn top_n(&self) -> Top {
+ fn top(&self) -> Top {
Modifier::top(self)
}
}
diff --git a/crates/core/src/ranking/pipeline/scorers/lambdamart.rs b/crates/core/src/ranking/pipeline/scorers/lambdamart.rs
index fa4932a9..9f039efa 100644
--- a/crates/core/src/ranking/pipeline/scorers/lambdamart.rs
+++ b/crates/core/src/ranking/pipeline/scorers/lambdamart.rs
@@ -36,7 +36,7 @@ impl RankingStage for Arc {
)
}
- fn top_n(&self) -> Top {
+ fn top(&self) -> Top {
Top::Limit(20)
}
}
@@ -59,7 +59,7 @@ impl RankingStage for PrecisionLambda {
)
}
- fn top_n(&self) -> Top {
+ fn top(&self) -> Top {
Top::Limit(20)
}
}
diff --git a/crates/core/src/ranking/pipeline/scorers/mod.rs b/crates/core/src/ranking/pipeline/scorers/mod.rs
index bc561e52..2365397f 100644
--- a/crates/core/src/ranking/pipeline/scorers/mod.rs
+++ b/crates/core/src/ranking/pipeline/scorers/mod.rs
@@ -14,6 +14,10 @@
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
+//! Scorers are used to compute the ranking signals in the ranking pipeline.
+//!
+//! Each scorer computes a single signal which is then used to rank the pages.
+
pub mod embedding;
pub mod inbound_similarity;
pub mod lambdamart;
@@ -26,14 +30,23 @@ use crate::ranking::{SignalCalculation, SignalCoefficients, SignalEnum};
use super::{RankableWebpage, Top};
+/// A ranking stage that computes some signals for each page.
+///
+/// This trait is implemented for all scorers.
+/// Most of the time you will want to implement the [`RankingStage`] trait instead,
+/// but this trait gives you more control over the ranking pipeline.
pub trait FullRankingStage: Send + Sync {
type Webpage: RankableWebpage;
+ /// Compute the signal for each page.
fn compute(&self, webpages: &mut [Self::Webpage]);
- fn top_n(&self) -> Top {
+
+ /// The number of pages to return from this part of the pipeline.
+ fn top(&self) -> Top {
Top::Unlimited
}
+ /// Update the score for each page.
fn update_scores(&self, webpages: &mut [Self::Webpage], coefficients: &SignalCoefficients) {
for webpage in webpages.iter_mut() {
webpage.set_raw_score(webpage.signals().iter().fold(0.0, |acc, (signal, calc)| {
@@ -42,16 +55,21 @@ pub trait FullRankingStage: Send + Sync {
}
}
+ /// Rank the pages by their score.
fn rank(&self, webpages: &mut [Self::Webpage]) {
webpages.sort_by(|a, b| b.score().partial_cmp(&a.score()).unwrap());
}
}
+/// A ranking stage that computes a single signal for each page.
pub trait RankingStage: Send + Sync {
type Webpage: RankableWebpage;
+ /// Compute the signal for a single page.
fn compute(&self, webpage: &Self::Webpage) -> (SignalEnum, SignalCalculation);
- fn top_n(&self) -> Top {
+
+ /// The number of pages to return from this part of the pipeline.
+ fn top(&self) -> Top {
Top::Unlimited
}
}
@@ -69,7 +87,7 @@ where
}
}
- fn top_n(&self) -> Top {
- self.top_n()
+ fn top(&self) -> Top {
+ self.top()
}
}
diff --git a/crates/core/src/ranking/pipeline/scorers/reranker.rs b/crates/core/src/ranking/pipeline/scorers/reranker.rs
index 6caac87d..d0bb0ebf 100644
--- a/crates/core/src/ranking/pipeline/scorers/reranker.rs
+++ b/crates/core/src/ranking/pipeline/scorers/reranker.rs
@@ -68,7 +68,7 @@ impl FullRankingStage for ReRanker {
self.crossencoder_score_webpages(webpages);
}
- fn top_n(&self) -> Top {
+ fn top(&self) -> Top {
Top::Limit(20)
}
}
diff --git a/crates/core/src/ranking/pipeline/stages/precision.rs b/crates/core/src/ranking/pipeline/stages/precision.rs
index 59847eaa..b336c8a6 100644
--- a/crates/core/src/ranking/pipeline/stages/precision.rs
+++ b/crates/core/src/ranking/pipeline/stages/precision.rs
@@ -14,6 +14,11 @@
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
+//! The precision stage of the ranking pipeline.
+//!
+//! This stage focuses on refining the first page of results
+//! from the recall stage.
+
use std::sync::Arc;
use crate::{
diff --git a/crates/core/src/ranking/pipeline/stages/recall.rs b/crates/core/src/ranking/pipeline/stages/recall.rs
index 688af169..897012af 100644
--- a/crates/core/src/ranking/pipeline/stages/recall.rs
+++ b/crates/core/src/ranking/pipeline/stages/recall.rs
@@ -14,6 +14,10 @@
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
+//! The recall stage of the ranking pipeline.
+//!
+//! This stage focuses on getting the best pages into the precision stage.
+
use std::sync::Arc;
use crate::{