diff --git a/README.md b/README.md
index 9eadb71b..6eef0eda 100644
--- a/README.md
+++ b/README.md
@@ -44,7 +44,7 @@ We recommend everyone to use the hosted version at [stract.com](https://stract.c
# 💼 License
-Stract is offered under the terms defined under the [LICENSE.md](LICENSE.md) file.
+Stract is offered under the terms defined in the [LICENSE.md](LICENSE.md) file, unless otherwise specified in the relevant subdirectory.
# 📬 Contact
diff --git a/assets/licenses.html b/assets/licenses.html
index ab81f847..91c35754 100644
--- a/assets/licenses.html
+++ b/assets/licenses.html
@@ -45,8 +45,8 @@
MIT License
+
+Copyright (c) 2024 Stract ApS
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/crates/web-spell/Cargo.toml b/crates/web-spell/Cargo.toml
index c4b1f4ca..97913681 100644
--- a/crates/web-spell/Cargo.toml
+++ b/crates/web-spell/Cargo.toml
@@ -1,6 +1,6 @@
[package]
edition = "2021"
-license = "AGPL-3.0"
+license = "MIT"
name = "web-spell"
version = "0.1.0"
diff --git a/crates/web-spell/LICENSE b/crates/web-spell/LICENSE
new file mode 100644
index 00000000..409e7e2c
--- /dev/null
+++ b/crates/web-spell/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 Stract ApS
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/crates/web-spell/README.md b/crates/web-spell/README.md
index be6b40a2..1ac10707 100644
--- a/crates/web-spell/README.md
+++ b/crates/web-spell/README.md
@@ -1,13 +1,9 @@
# Web Spell
-Automatic spelling correction from web data. It is based on the paper
+Automatic spelling correction from web data. It is roughly based on the paper
[Using the Web for Language Independent Spellchecking and
Autocorrection](http://static.googleusercontent.com/media/research.google.com/en/us/pubs/archive/36180.pdf)
 from Google.
-## Usage
-```rust
-let checker = SpellChecker::open("", CorrectionConfig::default()).unwrap();
-let correction = checker.correct("hwllo", Lang::Eng);
-assert_eq!(correction.unwrap().terms, vec![CorrectionTerm::Corrected { orig: "hwllo".to_string(), correction: "hello".to_string() }]);
-```
+## License
+Web spell is licensed under the MIT license. See the [LICENSE](LICENSE) file for details.
\ No newline at end of file
diff --git a/crates/web-spell/src/config.rs b/crates/web-spell/src/config.rs
index 0a34217d..d9293a61 100644
--- a/crates/web-spell/src/config.rs
+++ b/crates/web-spell/src/config.rs
@@ -1,19 +1,3 @@
-// Stract is an open source web search engine.
-// Copyright (C) 2024 Stract ApS
-//
-// This program is free software: you can redistribute it and/or modify
-// it under the terms of the GNU Affero General Public License as
-// published by the Free Software Foundation, either version 3 of the
-// License, or (at your option) any later version.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU Affero General Public License for more details.
-//
-// You should have received a copy of the GNU Affero General Public License
-// along with this program. If not, see .
-
fn misspelled_prob() -> f64 {
0.1
}
diff --git a/crates/web-spell/src/error_model.rs b/crates/web-spell/src/error_model.rs
index 5a7fae95..69c2ee5c 100644
--- a/crates/web-spell/src/error_model.rs
+++ b/crates/web-spell/src/error_model.rs
@@ -1,19 +1,3 @@
-// Stract is an open source web search engine.
-// Copyright (C) 2024 Stract ApS
-//
-// This program is free software: you can redistribute it and/or modify
-// it under the terms of the GNU Affero General Public License as
-// published by the Free Software Foundation, either version 3 of the
-// License, or (at your option) any later version.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU Affero General Public License for more details.
-//
-// You should have received a copy of the GNU Affero General Public License
-// along with this program. If not, see .
-
use super::Result;
use std::{
collections::HashMap,
@@ -55,6 +39,7 @@ pub enum ErrorType {
)]
 pub struct ErrorSequence(Vec<ErrorType>);
+/// Return all the possible ways to transform one string into another with a single edit.
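+///
+/// A rough sketch of the expected behaviour (the exact `ErrorType` variants in the returned
+/// sequence depend on the implementation):
+///
+/// ```rust,ignore
+/// assert!(possible_errors("hello", "hello").is_none());
+/// assert!(possible_errors("hwllo", "hello").is_some());
+/// ```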
 pub fn possible_errors(a: &str, b: &str) -> Option<ErrorSequence> {
if a == b {
return None;
@@ -165,6 +150,7 @@ impl From for ErrorModel {
}
}
+/// A model for the probability of an error sequence.
#[derive(Debug)]
pub struct ErrorModel {
     errors: HashMap<ErrorSequence, u64>,
@@ -185,6 +171,7 @@ impl ErrorModel {
}
}
+ /// Save the error model to disk.
     pub fn save<P: AsRef<Path>>(self, path: P) -> Result<()> {
let file = OpenOptions::new()
.write(true)
@@ -199,6 +186,7 @@ impl ErrorModel {
Ok(())
}
+ /// Open the error model from disk.
     pub fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
let file = OpenOptions::new().read(true).open(path)?;
@@ -209,6 +197,7 @@ impl ErrorModel {
Ok(stored.into())
}
+ /// Add an error sequence to the error model.
pub fn add(&mut self, a: &str, b: &str) {
if let Some(errors) = possible_errors(a, b) {
*self.errors.entry(errors).or_insert(0) += 1;
@@ -216,11 +205,13 @@ impl ErrorModel {
}
}
+ /// Get the probability of an error sequence.
pub fn prob(&self, error: &ErrorSequence) -> f64 {
let count = self.errors.get(error).unwrap_or(&0);
*count as f64 / self.total as f64
}
+ /// Get the log probability of an error sequence.
pub fn log_prob(&self, error: &ErrorSequence) -> f64 {
match self.errors.get(error) {
Some(count) => (*count as f64).log2() - ((self.total + 1) as f64).log2(),
diff --git a/crates/web-spell/src/lib.rs b/crates/web-spell/src/lib.rs
index 54d5bfcb..f61f2b69 100644
--- a/crates/web-spell/src/lib.rs
+++ b/crates/web-spell/src/lib.rs
@@ -1,22 +1,24 @@
-// Stract is an open source web search engine.
-// Copyright (C) 2024 Stract ApS
-//
-// This program is free software: you can redistribute it and/or modify
-// it under the terms of the GNU Affero General Public License as
-// published by the Free Software Foundation, either version 3 of the
-// License, or (at your option) any later version.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU Affero General Public License for more details.
-//
-// You should have received a copy of the GNU Affero General Public License
-// along with this program. If not, see .
-
-//! This module contains the spell checker. It is based on the paper
+//! This module contains the spell checker. It is roughly based on the paper
//! http://static.googleusercontent.com/media/research.google.com/en/us/pubs/archive/36180.pdf
 //! from Google.
+//!
+//! # Usage
+//!
+//! ```rust
+//! # use std::path::Path;
+//! # use web_spell::{CorrectionConfig, SpellChecker, Lang};
+//!
+//! # let path = Path::new("../data/web_spell/checker");
+//!
+//! # if !path.exists() {
+//! # return;
+//! # }
+//!
+//! let checker = SpellChecker::open("path/to/checker", CorrectionConfig::default());
+//! # let checker = SpellChecker::open(path, CorrectionConfig::default());
+//! let correction = checker.unwrap().correct("hwllo", &Lang::Eng);
+//! ```
+
mod config;
mod error_model;
pub mod spell_checker;
@@ -26,6 +28,7 @@ mod trainer;
pub use config::CorrectionConfig;
pub use error_model::ErrorModel;
+pub use spell_checker::Lang;
pub use spell_checker::SpellChecker;
pub use stupid_backoff::StupidBackoff;
pub use term_freqs::TermDict;
@@ -108,6 +111,7 @@ impl From for String {
}
impl Correction {
+ /// Create an empty correction.
pub fn empty(original: String) -> Self {
Self {
original,
@@ -115,10 +119,12 @@ impl Correction {
}
}
+ /// Push a term to the correction.
pub fn push(&mut self, term: CorrectionTerm) {
self.terms.push(term);
}
+    /// Check whether none of the terms were corrected.
pub fn is_all_orig(&self) -> bool {
self.terms
.iter()
@@ -126,6 +132,13 @@ impl Correction {
}
}
+/// Split text into sentence ranges by detecting common sentence boundaries like periods, exclamation marks,
+/// question marks and newlines. Returns a Vec of byte ranges for each detected sentence.
+///
+/// The splitting is optimized for performance and simplicity rather than perfect accuracy. It handles
+/// common cases such as abbreviations, URLs, and ellipses, and trims surrounding whitespace.
+///
+/// Note that this is a heuristic approach and may not handle all edge cases correctly.
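+///
+/// A hypothetical usage sketch (the exact ranges depend on the trimming heuristics):
+///
+/// ```rust,ignore
+/// let text = "Dr. Foo went home. It was late!";
+/// for range in sentence_ranges(text) {
+///     println!("{}", &text[range]);
+/// }
+/// ```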
 pub fn sentence_ranges(text: &str) -> Vec<Range<usize>> {
let skip = ["mr.", "ms.", "dr."];
@@ -178,6 +191,7 @@ pub fn sentence_ranges(text: &str) -> Vec<Range<usize>> {
res
}
+/// Tokenize text into words.
 pub fn tokenize(text: &str) -> Vec<String> {
text.to_lowercase()
.split_whitespace()
@@ -188,11 +202,20 @@ pub fn tokenize(text: &str) -> Vec {
.map(|s| s.to_string())
.collect()
}
-pub struct MergePointer<'a> {
- pub term: String,
- pub value: u64,
- pub stream: fst::map::Stream<'a>,
- pub is_finished: bool,
+
+/// A pointer for merging two term streams.
+struct MergePointer<'a> {
+ /// The current head of the stream.
+ pub(crate) term: String,
+
+ /// The current head value.
+ pub(crate) value: u64,
+
+ /// The stream to merge.
+ pub(crate) stream: fst::map::Stream<'a>,
+
+ /// Whether the stream is finished.
+ pub(crate) is_finished: bool,
}
impl MergePointer<'_> {
@@ -234,6 +257,7 @@ impl PartialEq for MergePointer<'_> {
impl Eq for MergePointer<'_> {}
+/// Get the next character boundary at or after the given index.
fn ceil_char_boundary(str: &str, index: usize) -> usize {
let mut res = index;
diff --git a/crates/web-spell/src/spell_checker.rs b/crates/web-spell/src/spell_checker.rs
index 47399d74..a737e554 100644
--- a/crates/web-spell/src/spell_checker.rs
+++ b/crates/web-spell/src/spell_checker.rs
@@ -1,38 +1,31 @@
-// Stract is an open source web search engine.
-// Copyright (C) 2024 Stract ApS
-//
-// This program is free software: you can redistribute it and/or modify
-// it under the terms of the GNU Affero General Public License as
-// published by the Free Software Foundation, either version 3 of the
-// License, or (at your option) any later version.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU Affero General Public License for more details.
-//
-// You should have received a copy of the GNU Affero General Public License
-// along with this program. If not, see .
-
use super::Result;
use std::{path::Path, str::FromStr};
use fnv::FnvHashMap;
-use whatlang::Lang;
+pub use whatlang::Lang;
use crate::config::CorrectionConfig;
use crate::stupid_backoff::{IntoMiddle, LeftToRight, RightToLeft};
use super::{error_model, Correction, CorrectionTerm, Error, ErrorModel, StupidBackoff, TermDict};
+/// A spell checker for a specific language.
struct LangSpellChecker {
+ /// The term dictionary.
term_dict: TermDict,
+
+ /// The language model.
language_model: StupidBackoff,
+
+ /// Model of typical errors.
error_model: ErrorModel,
+
+ /// The correction configuration.
config: CorrectionConfig,
}
impl LangSpellChecker {
+ /// Open a spell checker for a specific language.
     fn open<P: AsRef<Path>>(path: P, config: CorrectionConfig) -> Result<Self> {
let term_dict = TermDict::open(path.as_ref().join("term_dict"))?;
let language_model = StupidBackoff::open(path.as_ref().join("stupid_backoff"))?;
@@ -46,6 +39,10 @@ impl LangSpellChecker {
})
}
+ /// Get the possible correction candidates for a given term.
+ ///
+    /// The candidates are found by searching the term dictionary within a maximum
+    /// edit distance that grows with the length of the term.
     fn candidates(&self, term: &str) -> Vec<String> {
// one edit for words of
// up to four characters, two edits for up to twelve
@@ -61,6 +58,7 @@ impl LangSpellChecker {
self.term_dict.search(term, max_edit_distance)
}
+ /// Return the log probability of the term given the surrounding context of terms.
fn lm_logprob(&self, term_idx: usize, context: &[String]) -> f64 {
if term_idx == 0 {
let strat = RightToLeft;
@@ -74,6 +72,11 @@ impl LangSpellChecker {
}
}
+ /// Score correction candidates for a given term using a combination of language model and error model probabilities.
+ ///
+ /// Returns the best candidate and its score, or None if no candidates are sufficiently better than the original term.
+ /// The score combines the language model probability of the candidate in context (weighted by config.lm_prob_weight)
+ /// and the error model probability of the transformations needed to get from the original term to the candidate.
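+    ///
+    /// As a rough sketch (the exact combination is in the implementation below):
+    ///
+    /// ```text
+    /// score(candidate) ≈ lm_prob_weight * log P_lm(candidate | context) + log P_err(term -> candidate)
+    /// ```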
fn score_candidates(
&self,
term: &str,
@@ -118,6 +121,7 @@ impl LangSpellChecker {
best_term
}
+    /// Run a single correction pass over the text, correcting one term at a time.
     fn correct_once(&self, text: &str) -> Option<Correction> {
let orig_terms = super::tokenize(text);
let mut terms = orig_terms.clone();
@@ -189,6 +193,7 @@ impl LangSpellChecker {
Some(res)
}
+ /// Correct all terms in a text.
     fn correct(&self, text: &str) -> Option<Correction> {
// TODO:
// sometimes the text should be corrected more than once.
@@ -198,11 +203,24 @@ impl LangSpellChecker {
}
}
+/// The main spell checker for detecting and correcting spelling mistakes.
+///
+/// This is the primary entry point for spell checking functionality. It analyzes text input
+/// and suggests corrections for misspelled words based on statistical models.
+///
+/// The spell checker uses statistical language and error models trained on a large text corpus, such as web data, to detect and correct
+/// spelling mistakes. The correction algorithm is roughly based on the approach described in Google's paper
+/// "Using the Web for Language Independent Spellchecking and Autocorrection".
+///
+/// Use [`SpellChecker::open`] to create a new instance from a model directory containing language-specific models.
+/// Then use [`SpellChecker::correct`] to correct text in a specific language.
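+///
+/// # Example
+///
+/// A minimal sketch mirroring the crate-level example; it is skipped when the bundled
+/// test models are not present.
+///
+/// ```rust
+/// # use std::path::Path;
+/// # use web_spell::{CorrectionConfig, SpellChecker, Lang};
+/// # let path = Path::new("../data/web_spell/checker");
+/// # if !path.exists() {
+/// #     return;
+/// # }
+/// let checker = SpellChecker::open("path/to/checker", CorrectionConfig::default());
+/// # let checker = SpellChecker::open(path, CorrectionConfig::default());
+/// let correction = checker.unwrap().correct("hwllo", &Lang::Eng);
+/// ```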
pub struct SpellChecker {
+ /// The language-specific spell checkers.
     lang_spell_checkers: FnvHashMap<Lang, LangSpellChecker>,
}
impl SpellChecker {
+ /// Open a spell checker from a model directory.
     pub fn open<P: AsRef<Path>>(path: P, config: CorrectionConfig) -> Result<Self> {
if !path.as_ref().exists() {
return Err(Error::CheckerNotFound);
@@ -243,6 +261,8 @@ impl SpellChecker {
lang_spell_checkers,
})
}
+
+ /// Correct a text in a specific language.
     pub fn correct(&self, text: &str, lang: &Lang) -> Option<Correction> {
self.lang_spell_checkers
.get(lang)
@@ -273,6 +293,7 @@ mod tests {
res
}
+
#[test]
fn simple() {
let path = Path::new("../data/web_spell/checker");
diff --git a/crates/web-spell/src/stupid_backoff.rs b/crates/web-spell/src/stupid_backoff.rs
index 2ebef6df..05d7c907 100644
--- a/crates/web-spell/src/stupid_backoff.rs
+++ b/crates/web-spell/src/stupid_backoff.rs
@@ -1,19 +1,3 @@
-// Stract is an open source web search engine.
-// Copyright (C) 2024 Stract ApS
-//
-// This program is free software: you can redistribute it and/or modify
-// it under the terms of the GNU Affero General Public License as
-// published by the Free Software Foundation, either version 3 of the
-// License, or (at your option) any later version.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU Affero General Public License for more details.
-//
-// You should have received a copy of the GNU Affero General Public License
-// along with this program. If not, see .
-
use super::{tokenize, MergePointer, Result};
use std::{
cmp::Reverse,
@@ -62,6 +46,9 @@ impl AsRef<[u8]> for StoredNgram {
}
}
+/// A trainer for the stupid backoff language model.
+///
+/// This is used to train the language model from a corpus of text.
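+///
+/// A rough usage sketch (the output path is illustrative):
+///
+/// ```rust,ignore
+/// let mut trainer = StupidBackoffTrainer::new(3);
+/// trainer.train(&web_spell::tokenize("the cat sat on the mat"));
+/// trainer.build("path/to/stupid_backoff")?;
+/// ```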
pub struct StupidBackoffTrainer {
max_ngram_size: usize,
ngrams: BTreeMap,
@@ -70,6 +57,8 @@ pub struct StupidBackoffTrainer {
}
impl StupidBackoffTrainer {
+ /// Create a new trainer for a given maximum n-gram size.
+    /// An n-gram size of 3 is usually a good choice.
pub fn new(max_ngram_size: usize) -> Self {
Self {
max_ngram_size,
@@ -79,6 +68,7 @@ impl StupidBackoffTrainer {
}
}
+ /// Train the model on a sequence of tokens.
pub fn train(&mut self, tokens: &[String]) {
for window in tokens.windows(self.max_ngram_size) {
for i in 1..=window.len() {
@@ -105,6 +95,7 @@ impl StupidBackoffTrainer {
}
}
+ /// Build the language model from the trainer.
     pub fn build<P: AsRef<Path>>(self, path: P) -> Result<()> {
if !path.as_ref().exists() {
std::fs::create_dir_all(path.as_ref())?;
@@ -151,6 +142,7 @@ impl StupidBackoffTrainer {
}
}
+/// Merge multiple streams into a single FST.
fn merge_streams(
mut builder: fst::MapBuilder>,
streams: Vec>,
@@ -208,6 +200,16 @@ fn merge_streams(
Ok(())
}
+/// A stupid backoff language model for scoring n-grams.
+///
+/// The model scores n-grams by recursively backing off to lower order n-grams when the full
+/// n-gram is not found in the training data. The backoff is done by multiplying the score
+/// by a constant factor (0.4).
+///
+/// The model stores n-grams in two FSTs:
+/// `ngrams` contains regular n-grams with their frequencies, while `rotated_ngrams` contains
+/// n-grams with their words rotated to enable efficient prefix queries. Additionally, the model
+/// maintains counts of total n-grams seen for each order n in `n_counts`.
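+///
+/// As a rough sketch of the scoring scheme (following the original "stupid backoff"
+/// formulation; the exact normalisation used in this implementation may differ):
+///
+/// ```text
+/// S(w3 | w1 w2) = freq(w1 w2 w3) / freq(w1 w2)    if the trigram was seen
+///               = 0.4 * S(w3 | w2)                otherwise
+/// ```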
pub struct StupidBackoff {
     ngrams: fst::Map<memmap2::Mmap>,
     rotated_ngrams: fst::Map<memmap2::Mmap>,
@@ -216,6 +218,7 @@ pub struct StupidBackoff {
}
impl StupidBackoff {
+ /// Open a language model from a model directory.
     pub fn open<P: AsRef<Path>>(folder: P) -> Result<Self> {
let mmap = unsafe { memmap2::Mmap::map(&File::open(folder.as_ref().join("ngrams.bin"))?)? };
let ngrams = fst::Map::new(mmap)?;
@@ -237,6 +240,7 @@ impl StupidBackoff {
})
}
+ /// Merge multiple language models into a single model.
     pub fn merge<P: AsRef<Path>>(models: Vec<Self>, folder: P) -> Result<Self> {
if !folder.as_ref().exists() {
std::fs::create_dir_all(folder.as_ref())?;
@@ -307,6 +311,7 @@ impl StupidBackoff {
})
}
+ /// Return the frequency of the n-gram.
     pub fn freq(&self, words: &[String]) -> Option<u64> {
if words.len() >= self.ngrams.len() || words.is_empty() {
return None;
@@ -319,6 +324,7 @@ impl StupidBackoff {
self.ngrams.get(ngram)
}
+ /// Return the log probability of the n-gram.
     pub fn log_prob<S: NextWordsStrategy>(&self, words: &[String], strat: S) -> f64 {
if words.len() >= self.ngrams.len() || words.is_empty() {
return -(self.n_counts[0] as f64).log2();
@@ -336,10 +342,12 @@ impl StupidBackoff {
}
}
+ /// Return the probability of the n-gram.
     pub fn prob<S: NextWordsStrategy>(&self, words: &[String], strat: S) -> f64 {
self.log_prob(words, strat).exp2()
}
+ /// Given a word, return all n-grams where that word appears in the middle of the n-gram.
     pub fn contexts(&self, word: &str) -> Vec<(Vec<String>, u64)> {
let q = word.to_string() + " ";
let automaton = fst::automaton::Str::new(&q).starts_with();
@@ -360,13 +368,20 @@ impl StupidBackoff {
}
}
+/// A trait for strategies that determine the next words to consider when backing off.
pub trait NextWordsStrategy: Sized {
+ /// The inverse strategy.
type Inv: NextWordsStrategy;
+ /// Return the next words to consider.
fn next_words<'a>(&mut self, words: &'a [String]) -> &'a [String];
+
+ /// Return the inverse strategy.
fn inverse(self) -> Self::Inv;
}
+/// A strategy that backs off by removing words from left to right. For example, given the sequence
+/// "the cat sat", it would first consider "cat sat", then just "sat".
pub struct LeftToRight;
impl NextWordsStrategy for LeftToRight {
@@ -381,6 +396,8 @@ impl NextWordsStrategy for LeftToRight {
}
}
+/// A strategy that backs off by removing words from right to left. For example, given the sequence
+/// "the cat sat", it would first consider "the cat", then just "the".
pub struct RightToLeft;
impl NextWordsStrategy for RightToLeft {
@@ -395,6 +412,8 @@ impl NextWordsStrategy for RightToLeft {
}
}
+/// A strategy that alternates between removing words from the left and from the right, so the
+/// considered n-grams converge towards the middle of the sequence. For example, given the sequence
+/// "the cat sat", it would consider "cat sat" followed by "cat", or "the cat" followed by "cat",
+/// depending on which side was removed last.
#[derive(Default)]
pub struct IntoMiddle {
last_left: bool,
diff --git a/crates/web-spell/src/term_freqs.rs b/crates/web-spell/src/term_freqs.rs
index 7cf1c421..e6071932 100644
--- a/crates/web-spell/src/term_freqs.rs
+++ b/crates/web-spell/src/term_freqs.rs
@@ -1,19 +1,3 @@
-// Stract is an open source web search engine.
-// Copyright (C) 2024 Stract ApS
-//
-// This program is free software: you can redistribute it and/or modify
-// it under the terms of the GNU Affero General Public License as
-// published by the Free Software Foundation, either version 3 of the
-// License, or (at your option) any later version.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU Affero General Public License for more details.
-//
-// You should have received a copy of the GNU Affero General Public License
-// along with this program. If not, see .
-
use super::{MergePointer, Result};
use fst::{IntoStreamer, Streamer};
@@ -161,6 +145,7 @@ struct Metadata {
dicts: Vec,
}
+/// A dictionary of terms and their frequencies.
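+///
+/// A rough usage sketch (the path is illustrative):
+///
+/// ```rust,ignore
+/// let mut dict = TermDict::open("path/to/term_dict")?;
+/// dict.insert("hello");
+/// dict.commit()?;
+/// let freq = dict.freq("hello");
+/// ```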
pub struct TermDict {
builder: DictBuilder,
stored: Vec,
@@ -169,6 +154,7 @@ pub struct TermDict {
}
impl TermDict {
+ /// Open a term dictionary from a model directory.
     pub fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
if path.as_ref().exists() {
let file = File::open(path.as_ref().join("meta.json"))?;
@@ -203,6 +189,7 @@ impl TermDict {
}
}
+ /// Insert a term into the dictionary.
pub fn insert(&mut self, term: &str) {
if term.len() <= 1 {
return;
@@ -235,6 +222,7 @@ impl TermDict {
self.builder.insert(term);
}
+ /// Save the current state of the dictionary to disk.
pub fn commit(&mut self) -> Result<()> {
let builder = std::mem::take(&mut self.builder);
@@ -251,6 +239,7 @@ impl TermDict {
Ok(())
}
+ /// Remove unused dictionaries from disk.
fn gc(&self) -> Result<()> {
let all_dicts = self
.path
@@ -277,6 +266,7 @@ impl TermDict {
Ok(())
}
+ /// Save the metadata to disk.
fn save_meta(&self) -> Result<()> {
let file = OpenOptions::new()
.create(true)
@@ -289,6 +279,7 @@ impl TermDict {
Ok(())
}
+ /// Merge all dictionary segments into a single dictionary.
pub fn merge_dicts(&mut self) -> Result<()> {
if self.stored.len() <= 1 {
return Ok(());
@@ -311,6 +302,7 @@ impl TermDict {
Ok(())
}
+ /// Get the frequency of a term across all dictionary segments.
     pub fn freq(&self, term: &str) -> Option<u64> {
let mut freqs = None;
@@ -326,6 +318,7 @@ impl TermDict {
freqs
}
+ /// Get all terms in the dictionary.
     pub fn terms(&self) -> Vec<String> {
let mut terms = Vec::new();
@@ -340,6 +333,7 @@ impl TermDict {
terms
}
+    /// Search for terms in the dictionary within a given maximum edit distance.
     pub fn search(&self, term: &str, max_edit_distance: u32) -> Vec<String> {
let mut res = Vec::new();
@@ -354,6 +348,7 @@ impl TermDict {
res
}
+ /// Merge another term dictionary into this one.
pub fn merge(&mut self, other: Self) -> Result<()> {
for stored in other.stored {
let uuid = uuid::Uuid::new_v4();
@@ -369,6 +364,7 @@ impl TermDict {
Ok(())
}
+ /// Get the path to the model directory.
pub(crate) fn path(&self) -> &Path {
&self.path
}
diff --git a/crates/web-spell/src/trainer.rs b/crates/web-spell/src/trainer.rs
index 13f4b19f..22101768 100644
--- a/crates/web-spell/src/trainer.rs
+++ b/crates/web-spell/src/trainer.rs
@@ -1,19 +1,3 @@
-// Stract is an open source web search engine.
-// Copyright (C) 2024 Stract ApS
-//
-// This program is free software: you can redistribute it and/or modify
-// it under the terms of the GNU Affero General Public License as
-// published by the Free Software Foundation, either version 3 of the
-// License, or (at your option) any later version.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU Affero General Public License for more details.
-//
-// You should have received a copy of the GNU Affero General Public License
-// along with this program. If not, see .
-
use indicatif::ParallelProgressIterator;
use rayon::prelude::*;
diff --git a/crates/zimba/LICENSE b/crates/zimba/LICENSE
new file mode 100644
index 00000000..409e7e2c
--- /dev/null
+++ b/crates/zimba/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 Stract ApS
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/crates/zimba/README.md b/crates/zimba/README.md
index 29744347..b7d55629 100644
--- a/crates/zimba/README.md
+++ b/crates/zimba/README.md
@@ -14,4 +14,7 @@ fn main() -> Result<(), Error> {
Ok(())
}
-```
\ No newline at end of file
+```
+
+## License
+Zimba is licensed under the MIT license. See the [LICENSE](LICENSE) file for details.
\ No newline at end of file
diff --git a/crates/zimba/src/lib.rs b/crates/zimba/src/lib.rs
index b936abd1..892d2575 100644
--- a/crates/zimba/src/lib.rs
+++ b/crates/zimba/src/lib.rs
@@ -1,21 +1,28 @@
-// Stract is an open source web search engine.
-// Copyright (C) 2024 Stract ApS
-//
-// This program is free software: you can redistribute it and/or modify
-// it under the terms of the GNU Affero General Public License as
-// published by the Free Software Foundation, either version 3 of the
-// License, or (at your option) any later version.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU Affero General Public License for more details.
-//
-// You should have received a copy of the GNU Affero General Public License
-// along with this program. If not, see .
-
//! Zim file reader.
//! https://wiki.openzim.org/wiki/ZIM_file_format
+//!
+//! The ZIM file format is used for storing web content in a highly compressed format.
+//! It is commonly used for offline storage of Wikipedia and other web content.
+//!
+//! A ZIM archive starts with a header that contains metadata about the file,
+//! including a magic number, version information, and pointers to various sections
+//! of the file. The header is followed by a list of MIME types, path pointers,
+//! title pointers, directory entries, and clusters.
+//!
+//! # Usage
+//! ```no_run
+//! use zimba::{ZimFile, Error};
+//!
+//! fn main() -> Result<(), Error> {
+//! let zim_file = ZimFile::open("path/to/file.zim")?;
+//!
+//! for article in zim_file.articles()? {
+//! println!("{}", article.title);
+//! }
+//!
+//! Ok(())
+//! }
+//! ```
pub mod wiki;
@@ -54,6 +61,7 @@ pub enum Error {
Lzma(#[from] lzma::Error),
}
+/// Read a zero-terminated string.
fn read_zero_terminated(bytes: &[u8]) -> IResult<&[u8], String> {
let (remaining, string) = map(take_while(|b| b != 0), |bytes: &[u8]| {
String::from_utf8_lossy(bytes).into_owned()
@@ -112,21 +120,58 @@ impl NomParseNumber for u8 {
}
}
+/// The ZIM file header.
#[derive(Debug)]
#[allow(unused)]
struct Header {
+ /// A 4-byte magic number. It must be `72_173_914` (0x44D495A) for a valid ZIM file.
magic: u32,
+
+ /// Major version of the ZIM archive format.
+    /// The major version is updated when an incompatible
+    /// change is integrated into the format (a library
+    /// built for version N will probably not be
+    /// able to read version N+1).
major_version: u16,
+
+ /// Minor version of the ZIM archive format.
+    /// The minor version is updated when a compatible
+    /// change is integrated (a library built for
+    /// minor version n will still be able to read
+    /// minor version n+1).
minor_version: u16,
+
+ /// Unique ID of this ZIM archive.
uuid: u128,
+
+ /// Number of entries in the ZIM archive.
entry_count: u32,
+
+ /// Number of clusters in the ZIM archive.
cluster_count: u32,
+
+ /// Position of the URL pointer list.
url_ptr_pos: u64,
+
+ /// Position of the title pointer list.
+    /// This is considered deprecated and its use should be avoided where possible.
title_ptr_pos: u64,
+
+ /// Position of the cluster pointer list.
cluster_ptr_pos: u64,
+
+ /// Position of the MIME type list.
mime_list_pos: u64,
+
+ /// Position of the main page or 0xFFFFFFFF if not set.
main_page: u32,
+
+ /// Position of the layout page or 0xFFFFFFFF if not set.
layout_page: u32,
+
+    /// Pointer to the MD5 checksum of this archive. The checksum is
+    /// calculated over the archive without the checksum itself.
+    /// It always points 16 bytes before the end of the archive.
checksum_pos: u64,
}
@@ -191,8 +236,10 @@ impl Header {
}
}
+/// A list of MIME types.
#[derive(Debug)]
 pub struct MimeTypes(Vec<String>);
+
impl MimeTypes {
fn from_bytes(bytes: &[u8]) -> Result {
let mut mime_types = Vec::new();
@@ -226,6 +273,7 @@ impl std::ops::Index for MimeTypes {
#[derive(Debug)]
pub struct UrlPointer(pub u64);
+/// A list of URL pointers.
#[derive(Debug)]
 pub struct UrlPointerList(Vec<UrlPointer>);
@@ -256,6 +304,7 @@ impl UrlPointerList {
}
}
+/// A title pointer.
#[derive(Debug)]
#[allow(unused)]
pub struct TitlePointer(u32);
@@ -291,6 +340,7 @@ impl TitlePointerList {
}
}
+/// A cluster pointer.
#[derive(Debug)]
struct ClusterPointer(u64);
@@ -325,6 +375,9 @@ impl ClusterPointerList {
}
}
+/// A directory entry in a ZIM file, representing either content or a redirect.
+/// Content entries contain actual data like articles or images, while redirect entries
+/// point to other entries in the archive.
#[derive(Debug)]
pub enum DirEntry {
Content {
@@ -459,11 +512,17 @@ impl std::io::Read for CompressedReader<'_> {
}
}
+/// An offset in a cluster.
#[derive(Debug)]
struct ClusterOffset {
offset: u64,
}
+/// A cluster.
+///
+/// Clusters contain the actual data of the directory entries.
+/// Clusters exist so that the data of more than one directory entry can be compressed together, which makes the compression much more efficient.
+/// Typically clusters have a size of about 1 MB.
#[derive(Debug)]
pub struct Cluster {
     blob_offsets: Vec<ClusterOffset>,
@@ -579,6 +638,7 @@ impl Cluster {
}
}
+/// A ZIM file.
pub struct ZimFile {
header: Header,
mime_types: MimeTypes,
@@ -589,6 +649,7 @@ pub struct ZimFile {
}
impl ZimFile {
+ /// Open a ZIM file. The file is memory-mapped and only the header and pointers are read into memory upfront.
     pub fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
let file = File::open(path)?;
let mmap = unsafe { memmap2::MmapOptions::new().map(&file)? };
@@ -627,6 +688,7 @@ impl ZimFile {
})
}
+ /// Get a directory entry by its index.
pub fn get_dir_entry(&self, index: usize) -> Result