Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add additional documentation to tokenizer and parser steps #13

Merged
merged 10 commits into from
Apr 5, 2024
2 changes: 2 additions & 0 deletions src/data/page_ref.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ pub struct PageRef<'t> {
}

impl<'t> PageRef<'t> {
/// Creates a [`PageRef`] with the given page and site.
#[inline]
pub fn page_and_site<S1, S2>(site: S1, page: S2) -> Self
where
Expand All @@ -52,6 +53,7 @@ impl<'t> PageRef<'t> {
}
}

/// Creates a [`PageRef`] with the given page and no site.
#[inline]
pub fn page_only<S>(page: S) -> Self
where
Expand Down
1 change: 0 additions & 1 deletion src/parsing/collect/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@ mod prelude {
pub use crate::parsing::prelude::*;
pub use crate::parsing::rule::Rule;
pub use crate::parsing::token::{ExtractedToken, Token};
pub use crate::text::FullText;
}

mod consume;
Expand Down
7 changes: 3 additions & 4 deletions src/parsing/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ mod prelude {
};
pub use crate::settings::WikitextSettings;
pub use crate::text::FullText;
pub use crate::tree::{Element, Elements, OwnedElementsIterator};
pub use crate::tree::{Element, Elements};
}

use self::depth::{process_depths, DepthItem, DepthList};
Expand Down Expand Up @@ -74,7 +74,7 @@ pub use self::token::{ExtractedToken, Token};

/// Parse through the given tokens and produce an AST.
///
/// This takes a list of `ExtractedToken` items produced by `tokenize()`.
/// This takes a list of [`ExtractedToken`] items produced by [tokenize](crate::tokenizer::tokenize()).
pub fn parse<'r, 't>(
tokenization: &'r Tokenization<'t>,
page_info: &'r PageInfo<'t>,
Expand Down Expand Up @@ -243,8 +243,7 @@ impl NextIndex<TableOfContentsIndex> for Incrementer {
}
}

// Parse internal result

/// Represents the result of an internal parse.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct UnstructuredParseResult<'r, 't> {
/// The returned result from parsing.
Expand Down
4 changes: 3 additions & 1 deletion src/parsing/paragraph/stack.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ pub struct ParagraphStack<'t> {
/// Elements being accumulated in the current paragraph.
current: Vec<Element<'t>>,

/// Previous elements created, to be outputted in the final `SyntaxTree`.
/// Previous elements created, to be outputted in the final [`SyntaxTree`].
finished: Vec<Element<'t>>,

/// Gathered errors from paragraph parsing.
Expand Down Expand Up @@ -91,6 +91,7 @@ impl<'t> ParagraphStack<'t> {
}
}

/// Creates a paragraph element out of this instance's current elements.
pub fn build_paragraph(&mut self) -> Option<Element<'t>> {
debug!(
"Building paragraph from current stack state (length {})",
Expand All @@ -111,6 +112,7 @@ impl<'t> ParagraphStack<'t> {
Some(element)
}

/// Sets the finished field in this struct to the built paragraph element.
pub fn end_paragraph(&mut self) {
debug!("Ending the current paragraph to push as a completed element");

Expand Down
3 changes: 2 additions & 1 deletion src/parsing/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ use std::{mem, ptr};

const MAX_RECURSION_DEPTH: usize = 100;

/// Parser for a set of tokens.
#[derive(Debug, Clone)]
pub struct Parser<'r, 't> {
// Page and parse information
Expand Down Expand Up @@ -204,7 +205,7 @@ impl<'r, 't> Parser<'r, 't> {
}
}

// Table of Contents
/// Adds a heading element to the table of contents.
pub fn push_table_of_contents_entry(
&mut self,
heading: HeadingLevel,
Expand Down
2 changes: 1 addition & 1 deletion src/parsing/rule/impls/block/blocks/later.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
//! (not to be confused with `MiniRecentThreads`) which only
//! outputted "later." and no other functionality.
//!
//! See https://twitter.com/wikidotbugs/status/1328588862218702850
//! See <https://twitter.com/wikidotbugs/status/1328588862218702850>

use super::prelude::*;

Expand Down
4 changes: 1 addition & 3 deletions src/parsing/rule/impls/block/blocks/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,9 @@

mod prelude {
pub use super::super::{Arguments, BlockRule};
pub use crate::parsing::collect::*;
pub use crate::parsing::condition::ParseCondition;
pub use crate::parsing::parser::Parser;
pub use crate::parsing::prelude::*;
pub use crate::parsing::{ParseError, Token};
pub use crate::parsing::ParseError;
pub use crate::tree::{Container, ContainerType, Element};

#[cfg(debug)]
Expand Down
5 changes: 2 additions & 3 deletions src/parsing/rule/impls/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,10 @@ mod prelude {
pub use crate::parsing::consume::consume;
pub use crate::parsing::error::{ParseError, ParseErrorKind};
pub use crate::parsing::parser::Parser;
pub use crate::parsing::result::{ParseResult, ParseSuccess};
pub use crate::parsing::result::ParseResult;
pub use crate::parsing::rule::{LineRequirement, Rule};
pub use crate::parsing::token::{ExtractedToken, Token};
pub use crate::text::FullText;
pub use crate::tree::{AttributeMap, Container, ContainerType, Element, Elements};
pub use crate::tree::{AttributeMap, ContainerType, Element, Elements};
}

mod anchor;
Expand Down
2 changes: 1 addition & 1 deletion src/parsing/rule/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ mod mapping;

pub mod impls;

pub use self::mapping::{get_rules_for_token, RULE_MAP};
pub use self::mapping::get_rules_for_token;

/// Defines a rule that can possibly match tokens and return an `Element`.
#[derive(Copy, Clone)]
Expand Down
13 changes: 11 additions & 2 deletions src/parsing/token/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ use pest::Parser;
use std::ops::Range;
use strum_macros::IntoStaticStr;

/// Struct that represents a token in a specific text.
#[derive(Serialize, Debug, Clone, PartialEq, Eq)]
pub struct ExtractedToken<'a> {
pub token: Token,
Expand All @@ -46,6 +47,8 @@ pub struct ExtractedToken<'a> {
}

impl<'a> ExtractedToken<'a> {
/// Returns a new object with the same values, except with span referring to the byte indices
/// of the text if it were in UTF-16 rather than in UTF-8.
#[must_use]
pub fn to_utf16_indices(&self, map: &Utf16IndexMap) -> Self {
// Copy fields
Expand All @@ -61,6 +64,8 @@ impl<'a> ExtractedToken<'a> {
}
}

/// Enum that represents the type of a parsed token. For a struct with additional context
/// surrounding the positioning and content of the token, see [`ExtractedToken`].
#[derive(
Serialize, Deserialize, Enum, IntoStaticStr, Debug, Copy, Clone, PartialEq, Eq,
)]
Expand Down Expand Up @@ -163,6 +168,10 @@ pub enum Token {
}

impl Token {
/// Extracts all tokens from the given text.
/// # Errors
/// If something goes wrong with the parsing process, the failure is recovered from by
/// producing a single raw-text [`Token`] containing all of the input.
pub(crate) fn extract_all(text: &str) -> Vec<ExtractedToken> {
info!("Running lexer on input");

Expand Down Expand Up @@ -196,7 +205,7 @@ impl Token {
}
}

/// Converts a single `Pair` from pest into its corresponding `ExtractedToken`.
/// Converts a single [`Pair`] from pest into its corresponding [`ExtractedToken`].
fn convert_pair(pair: Pair<Rule>) -> ExtractedToken {
// Extract values from the Pair
let rule = pair.as_rule();
Expand All @@ -212,7 +221,7 @@ impl Token {
ExtractedToken { token, slice, span }
}

/// Mapping of a pest `Rule` to its corresponding `Token` enum.
/// Maps each pest [`Rule`] to its corresponding [`Token`].
fn get_from_rule(rule: Rule) -> Token {
match rule {
// Symbols
Expand Down
2 changes: 1 addition & 1 deletion src/render/html/element/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ mod prelude {
pub use super::super::context::HtmlContext;
pub use super::super::random::Random;
pub use super::{render_element, render_elements};
pub use crate::tree::{Element, SyntaxTree};
pub use crate::tree::Element;
}

use self::bibliography::{render_bibcite, render_bibliography};
Expand Down
1 change: 1 addition & 0 deletions src/render/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

#[allow(unused_imports)]
mod prelude {
pub use super::Render;
pub use crate::data::PageInfo;
Expand Down
32 changes: 32 additions & 0 deletions src/settings/interwiki.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,23 @@ use once_cell::sync::Lazy;
use std::borrow::Cow;
use std::collections::HashMap;

/// An [`InterwikiSettings`] instance that has no prefixes.
pub static EMPTY_INTERWIKI: Lazy<InterwikiSettings> = Lazy::new(|| InterwikiSettings {
prefixes: hashmap! {},
});

#[allow(rustdoc::bare_urls)]
/// An [`InterwikiSettings`] instance that has the default prefixes.
///
/// These prefixes are:
/// - `wikipedia:path` => `https://wikipedia.org/wiki/path`
/// - `wp:path` => `https://wikipedia.org/wiki/path`
/// - `commons:path` => `https://commons.wikimedia.org/wiki/path`
/// - `google:path` => `https://google.com/search?q=path`
/// - `duckduckgo:path` => `https://duckduckgo.com/?q=path`
/// - `ddg:path` => `https://duckduckgo.com/?q=path`
/// - `dictionary:path` => `https://dictionary.com/browse/path`
/// - `thesaurus:path` => `https://thesaurus.com/browse/path`
pub static DEFAULT_INTERWIKI: Lazy<InterwikiSettings> = Lazy::new(|| InterwikiSettings {
prefixes: hashmap! {
cow!("wikipedia") => cow!("https://wikipedia.org/wiki/$$"),
Expand All @@ -38,18 +52,35 @@ pub static DEFAULT_INTERWIKI: Lazy<InterwikiSettings> = Lazy::new(|| InterwikiSe
},
});

/// Settings that determine how to turn [`interwiki links`](http://org.wikidot.com/doc:wiki-syntax#toc21)
/// into full URLs.
#[derive(Serialize, Deserialize, Debug, Default, Clone, PartialEq, Eq)]
pub struct InterwikiSettings {
#[serde(flatten)]
/// A map from each interwiki prefix to the interwiki URL. A '$$' in the URL indicates where the path specified in
/// the Wikijump interwiki block should go.
pub prefixes: HashMap<Cow<'static, str>, Cow<'static, str>>,
}

impl InterwikiSettings {
/// Creates a new instance with no prefixes.
#[inline]
pub fn new() -> Self {
InterwikiSettings::default()
}

/// Creates a full URL from an interwiki link.
/// # Example
/// ```
/// # use ftml::settings::*;
/// assert_eq!(DEFAULT_INTERWIKI.build("wikipedia:Mallard").unwrap(), "https://wikipedia.org/wiki/Mallard");
/// ```
emmiegit marked this conversation as resolved.
Show resolved Hide resolved
///
/// Returns None if:
/// - The link starts with a colon
/// - There is no colon in the link
/// - There is nothing after the colon
/// - The interwiki prefix is not found
pub fn build(&self, link: &str) -> Option<String> {
match link.find(':') {
// Starting with a colon is not interwiki, skip.
Expand Down Expand Up @@ -145,4 +176,5 @@ fn interwiki_prefixes() {
check!("thesaurus:oak", Some("https://thesaurus.com/browse/oak"));
check!("banana:fruit-salad", None);
check!(":empty", None);
check!("no-link:", None);
}
1 change: 1 addition & 0 deletions src/settings/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ pub struct WikitextSettings {
}

impl WikitextSettings {
/// Returns the default settings for the given [`WikitextMode`].
pub fn from_mode(mode: WikitextMode) -> Self {
let interwiki = DEFAULT_INTERWIKI.clone();

Expand Down
1 change: 1 addition & 0 deletions src/tokenizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
use crate::parsing::{ExtractedToken, Token};
use crate::text::FullText;

/// Struct that represents both a list of tokens and the text the tokens were generated from.
#[derive(Debug, Clone)]
pub struct Tokenization<'t> {
tokens: Vec<ExtractedToken<'t>>,
Expand Down
2 changes: 1 addition & 1 deletion src/tree/attribute/safe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ macro_rules! hashset_unicase {

/// List of safe attributes. All others will be filtered out.
///
/// See https://scuttle.atlassian.net/wiki/spaces/WD/pages/1030782977/Allowed+Attributes+in+Wikitext
/// See <https://scuttle.atlassian.net/wiki/spaces/WD/pages/1030782977/Allowed+Attributes+in+Wikitext>
pub static SAFE_ATTRIBUTES: Lazy<HashSet<UniCase<&'static str>>> = Lazy::new(|| {
hashset_unicase![
"accept",
Expand Down
3 changes: 2 additions & 1 deletion src/tree/element/object.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ use ref_map::*;
use std::borrow::Cow;
use std::num::NonZeroU32;

/// Represents an element to be rendered.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
#[serde(rename_all = "kebab-case", tag = "element", content = "data")]
pub enum Element<'t> {
Expand Down Expand Up @@ -367,7 +368,7 @@ impl Element<'_> {
/// This is to avoid making the call very expensive, but for a complete
/// understanding of the paragraph requirements, see the `Elements` return.
///
/// See https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#phrasing_content
/// See <https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#phrasing_content>
pub fn paragraph_safe(&self) -> bool {
match self {
Element::Container(container) => container.ctype().paragraph_safe(),
Expand Down
2 changes: 1 addition & 1 deletion src/tree/partial.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ pub enum PartialElement<'t> {

/// Text associated with a Ruby annotation.
///
/// Outputs HTML `<rt>`. See also https://developer.mozilla.org/en-US/docs/Web/HTML/Element/ruby.
/// Outputs HTML `<rt>`. See also <https://developer.mozilla.org/en-US/docs/Web/HTML/Element/ruby>.
RubyText(RubyText<'t>),
}

Expand Down
2 changes: 1 addition & 1 deletion src/utf16.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ pub struct Utf16IndexMap<'t> {
impl<'t> Utf16IndexMap<'t> {
/// Produces a mapping of UTF-8 byte index to UTF-16 index.
///
/// This enables objects to be converted into using character indices
/// This enables objects to be converted from UTF-8 into UTF-16 using character indices
/// for strings rather than byte indices. This is useful for environments
/// which do use UTF-16 strings, such as Javascript (via WebASM).
pub fn new(text: &'t str) -> Self {
Expand Down
Loading