diff --git a/crates/formality-core/src/parse/parser.rs b/crates/formality-core/src/parse/parser.rs index dea85e2f..96a63521 100644 --- a/crates/formality-core/src/parse/parser.rs +++ b/crates/formality-core/src/parse/parser.rs @@ -33,6 +33,7 @@ where nonterminal_name: &'static str, successes: Vec>, failures: Set>, + min_precedence_level: usize, } /// The *precedence* of a variant determines how to manage @@ -78,6 +79,7 @@ pub enum Associativity { Left, Right, None, + Both, } impl Precedence { @@ -98,16 +100,31 @@ impl Precedence { /// Construct a new precedence. fn new(level: usize, associativity: Associativity) -> Self { + // We require level to be STRICTLY LESS than usize::MAX + // so that we can always add 1. + assert!(level < std::usize::MAX); Self { level, - associativity, + associativity: associativity, } } } impl Default for Precedence { fn default() -> Self { - Self::new(std::usize::MAX, Associativity::None) + // Default precedence: + // + // Use MAX-1 because we sometimes try to add 1 when we detect recursion, and we don't want overflow. + // + // Use Right associativity because if you have a variant like `T = [T]` + // then you want to be able to (by default) embed arbitrary T in that recursive location. + // If you use LEFT or NONE, then this recursive T would have a minimum level of std::usize::MAX + // (and hence never be satisfied). + // + // Using RIGHT feels a bit weird here but seems to behave correctly all the time. + // It's tempting to add something like "Both" or "N/A" that would just not set a min + // prec level when there's recursion. + Self::new(std::usize::MAX - 1, Associativity::Both) } } @@ -172,7 +189,7 @@ where ) -> ParseResult<'t, T> { let text = skip_whitespace(text); - left_recursion::enter(scope, text, || { + left_recursion::enter(scope, text, |min_precedence_level| { let tracing_span = tracing::span!( tracing::Level::TRACE, "nonterminal", @@ -188,6 +205,7 @@ where nonterminal_name, successes: vec![], failures: set![], + min_precedence_level, }; op(&mut parser); @@ -230,6 +248,15 @@ where ); let guard = span.enter(); + if variant_precedence.level < self.min_precedence_level { + tracing::trace!( + "variant has precedence level {} which is below parser minimum of {}", + variant_precedence.level, + self.min_precedence_level, + ); + return; + } + let mut active_variant = ActiveVariant::new(variant_precedence, self.scope, self.start_text); let result = op(&mut active_variant); @@ -480,7 +507,7 @@ where } /// Accepts any of the given keywords. - #[tracing::instrument(level = "trace", ret)] + #[tracing::instrument(level = "trace", skip(self), ret)] pub fn expect_keyword_in(&mut self, expected: &[&str]) -> Result>> { let text0 = self.current_text; match self.identifier_like_string() { @@ -495,6 +522,7 @@ where /// Attempts to execute `op` and, if it successfully parses, then returns an error /// with the value (which is meant to be incorporated into a par) /// If `op` fails to parse then the result is `Ok`. + #[tracing::instrument(level = "trace", skip(self, op, err), ret)] pub fn reject( &self, op: impl Fn(&mut ActiveVariant<'s, 't, L>) -> Result>>, @@ -519,7 +547,6 @@ where /// Does not consume any input. /// You can this to implement positional keywords -- just before parsing an identifier or variable, /// you can invoke `reject_custom_keywords` to reject anything that you don't want to permit in this position. - #[tracing::instrument(level = "trace", ret)] pub fn reject_custom_keywords(&self, keywords: &[&str]) -> Result<(), Set>> { self.reject( |p| p.expect_keyword_in(keywords), @@ -534,7 +561,7 @@ where /// Extracts a string that meets the regex for an identifier /// (but it could also be a keyword). - #[tracing::instrument(level = "trace", ret)] + #[tracing::instrument(level = "trace", skip(self), ret)] pub fn identifier_like_string(&mut self) -> Result>> { self.string( |ch| matches!(ch, 'a'..='z' | 'A'..='Z' | '_'), @@ -547,7 +574,7 @@ where /// following the usual rules. **Disallows language keywords.** /// If you want to disallow additional keywords, /// see the `reject_custom_keywords` method. - #[tracing::instrument(level = "trace", ret)] + #[tracing::instrument(level = "trace", skip(self), ret)] pub fn identifier(&mut self) -> Result>> { self.reject_custom_keywords(L::KEYWORDS)?; self.identifier_like_string() @@ -610,7 +637,7 @@ where /// It also allows parsing where you use variables to stand for /// more complex parameters, which is kind of combining parsing /// and substitution and can be convenient in tests. - #[tracing::instrument(level = "trace", ret)] + #[tracing::instrument(level = "trace", skip(self), ret)] pub fn variable(&mut self) -> Result>> where R: Debug + DowncastFrom>, @@ -636,7 +663,7 @@ where } /// Extract a number from the input, erroring if the input does not start with a number. - #[tracing::instrument(level = "trace", ret)] + #[tracing::instrument(level = "trace", skip(self), ret)] pub fn number(&mut self) -> Result>> where T: FromStr + std::fmt::Debug, @@ -702,7 +729,7 @@ where /// because we consumed the open paren `(` from `T` but then encountered an error /// looking for the closing paren `)`. #[track_caller] - #[tracing::instrument(level = "Trace", ret)] + #[tracing::instrument(level = "Trace", skip(self), ret)] pub fn opt_nonterminal(&mut self) -> Result, Set>> where T: CoreParse, @@ -726,6 +753,7 @@ where /// Continue parsing instances of `T` while we can. /// This is a greedy parse. + #[tracing::instrument(level = "Trace", skip(self), ret)] pub fn many_nonterminal(&mut self) -> Result, Set>> where T: CoreParse, @@ -737,7 +765,7 @@ where Ok(result) } - #[tracing::instrument(level = "Trace", ret)] + #[tracing::instrument(level = "Trace", skip(self), ret)] pub fn delimited_nonterminal( &mut self, open: char, diff --git a/crates/formality-core/src/parse/parser/left_recursion.rs b/crates/formality-core/src/parse/parser/left_recursion.rs index f62f9247..862f77f4 100644 --- a/crates/formality-core/src/parse/parser/left_recursion.rs +++ b/crates/formality-core/src/parse/parser/left_recursion.rs @@ -7,11 +7,11 @@ //! `T` in our `thread_local!` and you can't have generic thread-local values. //! So we have to erase types. Annoying! -use std::{any::TypeId, cell::RefCell, fmt::Debug}; +use std::{any::TypeId, cell::RefCell, fmt::Debug, ops::ControlFlow}; use crate::{ language::Language, - parse::{ParseError, ParseResult, Scope, SuccessfulParse}, + parse::{parser::Associativity, ParseError, ParseResult, Scope, SuccessfulParse}, }; use super::Precedence; @@ -41,7 +41,7 @@ struct StackEntry { observed: bool, } -#[allow(dead_code)] +#[derive(Copy, Clone, Debug)] pub(super) struct CurrentState { pub left_right: LeftRight, pub precedence: Precedence, @@ -64,7 +64,7 @@ pub(super) struct CurrentState { /// e.g. `E = E + E + E`. Really we should consider any further /// recursions as `Other`, I suppose, but I'm too lazy to deal with that /// right now. -#[allow(dead_code)] +#[derive(Copy, Clone, Debug)] pub(super) enum LeftRight { /// Have not yet consumed any tokens. Left, @@ -89,7 +89,7 @@ impl StackEntry { } } - pub fn matches(&self, scope: &Scope, start_text: &str) -> bool + pub fn matches_start_state(&self, scope: &Scope, start_text: &str) -> bool where L: Language, T: Clone + 'static, @@ -100,39 +100,90 @@ impl StackEntry { scope == self.scope && start_text == self.start_text && self.type_id == type_id } + /// True if a call to parse a value of type `T` with the given scope scope/text + /// matches the current state of this stack frame -- this means that it is a recursive + /// call from this stack frame (directly or indirectly). + /// + /// # Example + /// + /// Consider this grammar: + /// + /// ```text + /// E = E + E + /// | ( E ) + /// | integer + /// ``` + /// + /// and sample input `"1 + (2 + 3)"`. + /// + /// We will start with the variant `E = E + E`. This will recursively try to parse + /// an `E` two times. + /// Each time it will invoke [`recurse`][] which will record the current text. + /// + /// The first time, the current text will be `"1 + (2 + 3)"`, same as the start text. + /// When we start parsing `E`, we'll invoke [`enter`][] and see a (left) match. + /// + /// Later, after we've parsed `1 + `, we'll try to parse another `E`, but with + /// the current text `"(2 + 3)"`. We'll again see a (right) match and eventually + /// reach the variant `E = ( E )`. + /// + /// This variant will recurse *again*. But this time the current text is `"2 + 3)"`. + /// This is not considered a recursive match. + pub fn matches_current_state<'t, L, T>( + &self, + scope: &Scope, + start_text: &'t str, + ) -> Option + where + L: Language, + T: 'static, + { + // Convert incoming scope/text to raw pointers. + let scope: *const () = erase_type(scope); + let type_id = TypeId::of::(); + + // Scope and type-id must match. + if scope != self.scope || type_id != self.type_id { + return None; + } + + // Start text must match *current* text of this frame. + let start_text: *const str = start_text; + let Some(current_state) = &self.current_state else { + panic!("observed a stack frame with no current state (forgot to call `recuse`?"); + }; + if start_text != current_state.current_text { + return None; + } + + // OK, we have a match! + Some(*current_state) + } + /// UNSAFE: Caller must guarantee that `self.value` pointer is valid. - pub unsafe fn observe<'t, T>(&mut self, start_text: &'t str) -> ParseResult<'t, T> + pub unsafe fn observe<'t, T>(&mut self, start_text: &'t str) -> Option> where T: Clone + 'static, { - assert_eq!(self.start_text, start_text as *const str); assert_eq!(self.type_id, TypeId::of::()); - assert!( - self.current_state.is_some(), - "observed a stack frame with no current state (forgot to call `recuse`?)" - ); + assert_eq!(self.start_text, start_text); // must be left-recursion + assert!(self.current_state.is_some()); self.observed = true; - match self.value { - Some(ptr) => { - let ptr = ptr as *const SuccessfulParse<'t, T>; - // UNSAFE: We rely on the caller to entry ptr is valid. - let ptr = unsafe { &*ptr }; - Ok(ptr.clone()) - } - None => Err(ParseError::at( - start_text, - format!("recursive grammar for `{}`", std::any::type_name::()), - )), - } + let ptr = self.value?; + let ptr = ptr as *const SuccessfulParse<'t, T>; + // UNSAFE: We rely on the caller to entry ptr is valid. + let ptr = unsafe { &*ptr }; + + Some(ptr.clone()) } } pub fn enter<'s, 't, L, T>( scope: &'s Scope, text: &'t str, - mut op: impl FnMut() -> ParseResult<'t, T>, + mut op: impl FnMut(usize) -> ParseResult<'t, T>, ) -> ParseResult<'t, T> where L: Language, @@ -146,37 +197,148 @@ where ); // First check whether we are already parsing this same text in this same scope as this same type. - let previous_result = STACK.with_borrow_mut(|stack| { - if let Some(entry) = stack - .iter_mut() - .find(|entry| entry.matches::(scope, text)) - { - // UNSAFE: We need to justify that `entry.value` will be valid. - // - // Each entry in `stack` corresponds to an active stack frame `F` on this thread - // and each entry in `stack` is only mutated by `F` - // - // The value in `entry.value` will either be `None` (in which case it is valid) - // or `Some(p)` where `p` is a pointer. - // - // `p` will have been assigned by `F` just before invoking `op()`. It is a reference - // to the last value in a vector owned by `F`. Since `F` is still active, that vector - // is still valid. The borrow to produce `p` is valid (by inspection) because there are no - // accesses to the vector until `op` completes - // (and, to arrive at this code, `op` has not yet completed). - unsafe { - let result = entry.observe::(text); - tracing::trace!("found left-recursive stack entry, result = {:?}", result); - Some(result) + let mut min_precedence_level = 0; + let return_value = STACK.with_borrow_mut(|stack| { + for entry in stack.iter_mut().rev() { + match entry.matches_current_state::(scope, text) { + // Keep searching. + None => (), + + // If this is left-recursion, then we will always return some value, but which value + // depends on a few factors. Consider `E = int | E + E | E * E` as our example. Because this + // is left-recursion, we know we are recursively parsing the first `E` in the `E + E` + // or `E * E` variant. + // + // The easy case is where there is no previous result. In that case, we return an error. + // This consitutes the 'base case' for a recursive grammar. + // We will only successfully parse the `int` case on that first round, but then we will try again. + // + // Otherwise, we have a prevous result for E, and we need to decide whether we can + // embed it into the variant we are currently parsing. Here we must consider the + // precedence of the variant we are parsing as well as the precedence level of the value + // we are attempting to reuse. The general rule is that you can embed higher precedence things + // into lower precedence things but not vice versa. + // + // If the current variant is left associative (as are all variants in our example), + // then we can accept values with the same precedence level as the variant or higher. + // So if the variant is `E * E` (precedence level = 2), we would only accept precedence + // level 2 (`E * E`) or 3 (`int`). If the variant were `E + E`, we could acccept anything. + // + // If the current variant is right or none associative, then we can accept values with + // strictly higher precedence level. So e.g. if `E + E` were level 1 and non-associative, + // then it would accept only things at level 2 or higher. + Some(CurrentState { + left_right: LeftRight::Left, + precedence: current_precedence, + .. + }) => { + let previous_result = unsafe { + // UNSAFE: See [1] below for justification. + entry.observe::(text) + }; + tracing::trace!( + "found left-recursive stack entry with precedence {:?}, previous_result = {:?}", + current_precedence, + previous_result + ); + let Some(previous_result) = previous_result else { + // Case 1: no previous value. + return ControlFlow::Break(Err(ParseError::at( + text, + format!( + "left-recursion on `{}` with no previous value", + std::any::type_name::() + ), + ))); + }; + + // If there is a previous value, check the precedence as described above. + let precedence_valid = match current_precedence.associativity { + Associativity::Left => { + previous_result.precedence.level >= current_precedence.level + } + Associativity::Right | Associativity::None => { + previous_result.precedence.level > current_precedence.level + } + Associativity::Both => true, + }; + tracing::trace!( + "precedence_valid = {}", + precedence_valid, + ); + if !precedence_valid { + return ControlFlow::Break(Err(ParseError::at( + text, + format!( + "left-recursion with invalid precedence \ + (current variant has precedence {:?}, previous value has level {})", + current_precedence, previous_result.precedence.level, + ), + ))); + } + + // Return the previous value. + return ControlFlow::Break(Ok(previous_result)); + + // [1] UNSAFE: We need to justify that `entry.value` will be valid. + // + // Each entry in `stack` corresponds to an active stack frame `F` on this thread + // and each entry in `stack` is only mutated by `F` + // + // The value in `entry.value` will either be `None` (in which case it is valid) + // or `Some(p)` where `p` is a pointer. + // + // `p` will have been assigned by `F` just before invoking `op()`. It is a reference + // to the last value in a vector owned by `F`. Since `F` is still active, that vector + // is still valid. The borrow to produce `p` is valid (by inspection) because there are no + // accesses to the vector until `op` completes + // (and, to arrive at this code, `op` has not yet completed). + } + + // If this is right-recursion, then we will not reuse the previous value, but we will + // consult the level of the variant to limit the variants we consider in this parse. + // Consider `E = int | E + E | E * E` as our example. Because this + // is right-recursion, we know we are recursively parsing the SECOND `E` in the `E + E` + // or `E * E` variant. + // + // If the current variant is left or none associative (as in our example), + // then we set the minimum precedence level to **one higher** than the variant's precedence. + // So if `E * E` is the current variant, we would only permit precedence level 2 (the `int` variant). + // So if `E + E` is the current variant, we would permit precedence level 1 or 2. + // + // If the current variant is right associative, then we can accept values with + // equal precedence. + Some(CurrentState { + left_right: LeftRight::Right, + precedence: current_precedence, + .. + }) => { + tracing::trace!( + "found right-recursive stack entry with precedence {:?}", + current_precedence, + ); + match current_precedence.associativity { + Associativity::Left | Associativity::None => { + min_precedence_level = current_precedence.level + 1; + } + Associativity::Right => { + min_precedence_level = current_precedence.level; + } + Associativity::Both => {} + }; + break; + } } - } else { - stack.push(StackEntry::new::(scope, text)); - None } + + stack.push(StackEntry::new::(scope, text)); + ControlFlow::Continue(()) }); - if let Some(previous_result) = previous_result { - return previous_result; + + if let ControlFlow::Break(return_value) = return_value { + return return_value; } + tracing::trace!("min_precedence_level = {}", min_precedence_level,); // Access the top stack frame. Use a macro because we don't support closures // that are generic over the return type. @@ -184,7 +346,7 @@ where (|$top:ident| $body:expr) => { STACK.with_borrow_mut(|stack| { let $top = stack.last_mut().unwrap(); - assert!($top.matches::(scope, text)); + assert!($top.matches_start_state::(scope, text)); $body }) }; @@ -193,7 +355,7 @@ where // Pop the stack before we return final_fn::final_fn!(STACK.with_borrow_mut(|stack| { let top = stack.pop().unwrap(); - assert!(top.matches::(scope, text)); + assert!(top.matches_start_state::(scope, text)); })); // EXAMPLE: Consider this grammar @@ -229,7 +391,7 @@ where // First round parse is a bit special, because if we get an error here, we can just return immediately, // as there is no base case to build from. let mut values = vec![]; - match op() { + match op(min_precedence_level) { Ok(v) => values.push(v), Err(errs) => return Err(errs), }; @@ -257,7 +419,7 @@ where // Invoke the operation. As noted above, if we get a failed parse NOW, // we know we already found the best result, so we can just use it. - let Ok(value1) = op() else { + let Ok(value1) = op(min_precedence_level) else { return Ok(values.pop().unwrap()); // If not, we are done. }; diff --git a/crates/formality-types/src/grammar/ty/parse_impls.rs b/crates/formality-types/src/grammar/ty/parse_impls.rs index 82765eb7..4c593196 100644 --- a/crates/formality-types/src/grammar/ty/parse_impls.rs +++ b/crates/formality-types/src/grammar/ty/parse_impls.rs @@ -18,7 +18,7 @@ use crate::rust::FormalityLang as Rust; // Implement custom parsing for rigid types. impl CoreParse for RigidTy { fn parse<'t>(scope: &Scope, text: &'t str) -> ParseResult<'t, Self> { - Parser::multi_variant(scope, text, "AliasTy", |parser| { + Parser::multi_variant(scope, text, "RigidTy", |parser| { // Parse a `ScalarId` (and upcast it to `RigidTy`) with the highest // precedence. If someone writes `u8`, we always interpret it as a // scalar-id. diff --git a/examples/formality-eg/grammar.rs b/examples/formality-eg/grammar.rs index 83f3ee64..a6dc25fc 100644 --- a/examples/formality-eg/grammar.rs +++ b/examples/formality-eg/grammar.rs @@ -84,17 +84,19 @@ pub enum Expr { StructLiteral(StructTy, Vec), #[grammar($v0 + $v1)] + #[precedence(1)] Add(Arc, Arc), #[grammar($v0 - $v1)] + #[precedence(1)] Sub(Arc, Arc), #[grammar($v0 * $v1)] - #[precedence(1)] + #[precedence(2)] Mul(Arc, Arc), #[grammar($v0 / $v1)] - #[precedence(1)] + #[precedence(2)] Div(Arc, Arc), #[grammar(($v0))] diff --git a/tests/parser-torture-tests/main.rs b/tests/parser-torture-tests/main.rs index 8a922c31..39836bd1 100644 --- a/tests/parser-torture-tests/main.rs +++ b/tests/parser-torture-tests/main.rs @@ -1,5 +1,6 @@ mod ambiguity; mod grammar; +mod path; mod precedence; formality_core::declare_language! { diff --git a/tests/parser-torture-tests/path.rs b/tests/parser-torture-tests/path.rs new file mode 100644 index 00000000..ddefc017 --- /dev/null +++ b/tests/parser-torture-tests/path.rs @@ -0,0 +1,41 @@ +use formality_core::{term, test}; +use std::sync::Arc; + +#[term] +pub enum Path { + #[cast] + Id(Id), + + #[grammar($v0 . $v1)] + Field(Arc, Id), + + #[grammar($v0 [ $v1 ])] + Index(Arc, Arc), +} + +formality_core::id!(Id); + +#[test] +fn path() { + let term: Path = crate::ptt::term("a.b[c.d].e"); + expect_test::expect![[r#" + Field( + Index( + Field( + Id( + a, + ), + b, + ), + Field( + Id( + c, + ), + d, + ), + ), + e, + ) + "#]] + .assert_debug_eq(&term); +} diff --git a/tests/parser-torture-tests/precedence.rs b/tests/parser-torture-tests/precedence.rs index 85af2408..d40931e2 100644 --- a/tests/parser-torture-tests/precedence.rs +++ b/tests/parser-torture-tests/precedence.rs @@ -1,4 +1,4 @@ -use formality_core::term; +use formality_core::{term, test}; use std::sync::Arc; #[term] @@ -39,22 +39,21 @@ fn mul_is_higher_precedence() { } #[test] -// FIXME #[should_panic(expected = "ambiguous parse")] -fn equal_precedence_panics() { +fn left_associative() { let term: Expr = crate::ptt::term("a + b + c"); expect_test::expect![[r#" Add( - Id( - a, - ), Add( Id( - b, + a, ), Id( - c, + b, ), ), + Id( + c, + ), ) "#]] .assert_debug_eq(&term);