diff --git a/Cargo.lock b/Cargo.lock index 6762fcfa1e3e..4e7876f6e42d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -303,6 +303,12 @@ dependencies = [ "simd-abstraction", ] +[[package]] +name = "beef" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a8241f3ebb85c056b509d4327ad0358fbbba6ffb340bf388f26350aeda225b1" + [[package]] name = "better_scoped_tls" version = "1.0.0" @@ -1893,7 +1899,7 @@ checksum = "cdc6457c0eb62c71aac4bc17216026d8410337c4126773b9c5daba343f17964f" dependencies = [ "atomic-polyfill", "hash32", - "rustc_version 0.4.0", + "rustc_version 0.4.1", "spin", "stable_deref_trait", ] @@ -2611,6 +2617,40 @@ version = "0.4.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" +[[package]] +name = "logos" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab6f536c1af4c7cc81edf73da1f8029896e7e1e16a219ef09b184e76a296f3db" +dependencies = [ + "logos-derive", +] + +[[package]] +name = "logos-codegen" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "189bbfd0b61330abea797e5e9276408f2edbe4f822d7ad08685d67419aafb34e" +dependencies = [ + "beef", + "fnv", + "lazy_static", + "proc-macro2", + "quote", + "regex-syntax 0.8.4", + "rustc_version 0.4.1", + "syn 2.0.87", +] + +[[package]] +name = "logos-derive" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebfe8e1a19049ddbfccbd14ac834b215e11b85b90bab0c2dba7c7b92fb5d5cba" +dependencies = [ + "logos-codegen", +] + [[package]] name = "lru" version = "0.10.1" @@ -3786,9 +3826,9 @@ dependencies = [ [[package]] name = "rustc_version" -version = "0.4.0" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" dependencies = [ "semver 1.0.23", ] @@ -5217,6 +5257,7 @@ dependencies = [ "codspeed-criterion-compat", "criterion", "either", + "logos", "new_debug_unreachable", "num-bigint", "num-traits", @@ -5230,6 +5271,7 @@ dependencies = [ "swc_atoms", "swc_common", "swc_ecma_ast", + "swc_ecma_raw_lexer", "swc_ecma_visit", "swc_malloc", "testing", @@ -5293,6 +5335,17 @@ dependencies = [ "syn 2.0.87", ] +[[package]] +name = "swc_ecma_raw_lexer" +version = "0.1.0" +dependencies = [ + "logos", + "new_debug_unreachable", + "pretty_assertions", + "regex", + "swc_common", +] + [[package]] name = "swc_ecma_testing" version = "5.0.0" diff --git a/Cargo.toml b/Cargo.toml index a5107594fbd3..cfe3f63e996c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -58,6 +58,7 @@ resolver = "2" jsonc-parser = "0.21.0" lazy_static = "1.4.0" lexical = "6.1.0" + logos = "0.15.0" lru = "0.10.0" memchr = "2.6.1" miette = "7.2.0" @@ -129,7 +130,7 @@ lto = true # We use CARGO_PROFILE_RELEASE_LTO for production builds # lto = "fat" -# debug = true +debug = true # opt-level = 'z' [profile.bench] diff --git a/crates/swc_common/src/input.rs b/crates/swc_common/src/input.rs index 52c325c6b705..c7565721dff3 100644 --- a/crates/swc_common/src/input.rs +++ b/crates/swc_common/src/input.rs @@ -40,7 +40,7 @@ impl<'a> StringInput<'a> { } #[inline(always)] - pub fn as_str(&self) -> &str { + pub fn as_str(&self) -> &'a str { self.iter.as_str() } diff --git a/crates/swc_ecma_parser/Cargo.toml b/crates/swc_ecma_parser/Cargo.toml index f4405098f25a..0a6168b5ea57 100644 --- a/crates/swc_ecma_parser/Cargo.toml +++ b/crates/swc_ecma_parser/Cargo.toml @@ -25,21 +25,23 @@ typescript = [] verify = ["swc_ecma_visit"] [dependencies] -either = { workspace = true } -num-bigint = { workspace = true } -num-traits = { workspace = true } -serde = { workspace = true, features = ["derive"] } -smallvec = { workspace = true } -smartstring = { workspace = true } -tracing = { workspace = true } -typed-arena = { workspace = true } - +either = { workspace = true } +logos = { workspace = true } new_debug_unreachable = { workspace = true } +num-bigint = { workspace = true } +num-traits = { workspace = true } phf = { workspace = true, features = ["macros"] } -swc_atoms = { version = "3.0.2", path = "../swc_atoms" } -swc_common = { version = "5.0.0", path = "../swc_common" } -swc_ecma_ast = { version = "5.0.1", path = "../swc_ecma_ast" } -swc_ecma_visit = { version = "5.0.0", path = "../swc_ecma_visit", optional = true } +serde = { workspace = true, features = ["derive"] } +smallvec = { workspace = true } +smartstring = { workspace = true } +tracing = { workspace = true } +typed-arena = { workspace = true } + +swc_atoms = { version = "3.0.2", path = "../swc_atoms" } +swc_common = { version = "5.0.0", path = "../swc_common" } +swc_ecma_ast = { version = "5.0.1", path = "../swc_ecma_ast" } +swc_ecma_raw_lexer = { version = "0.1.0", path = "../swc_ecma_raw_lexer" } +swc_ecma_visit = { version = "5.0.0", path = "../swc_ecma_visit", optional = true } [target.'cfg(not(any(target_arch = "wasm32", target_arch = "arm")))'.dependencies] stacker = { version = "0.1.15", optional = true } diff --git a/crates/swc_ecma_parser/src/error.rs b/crates/swc_ecma_parser/src/error.rs index 37ebffe5b5ea..f2e65b532d52 100644 --- a/crates/swc_ecma_parser/src/error.rs +++ b/crates/swc_ecma_parser/src/error.rs @@ -7,6 +7,7 @@ use swc_common::{ errors::{DiagnosticBuilder, Handler}, Span, Spanned, }; +use swc_ecma_raw_lexer::LogosError; use crate::token::Token; @@ -292,6 +293,8 @@ pub enum SyntaxError { ReservedTypeAssertion, ReservedArrowTypeParam, + + UnexpectedCharFromLexer, } impl SyntaxError { @@ -758,6 +761,7 @@ impl SyntaxError { as in `() => ...`." .into(), SyntaxError::InvalidAssignTarget => "Invalid assignment target".into(), + SyntaxError::UnexpectedCharFromLexer => "Unexpected character".into(), } } } @@ -800,3 +804,12 @@ impl Error { fn size_of_error() { assert_eq!(std::mem::size_of::(), 8); } + +impl From for SyntaxError { + fn from(e: LogosError) -> Self { + match e { + LogosError::UnterminatedStr => SyntaxError::UnterminatedStrLit, + LogosError::UnknownChar => SyntaxError::UnexpectedCharFromLexer, + } + } +} diff --git a/crates/swc_ecma_parser/src/lexer/jsx.rs b/crates/swc_ecma_parser/src/lexer/jsx.rs index b3c8852773c1..50a1abed7e20 100644 --- a/crates/swc_ecma_parser/src/lexer/jsx.rs +++ b/crates/swc_ecma_parser/src/lexer/jsx.rs @@ -1,18 +1,18 @@ use either::Either; use smartstring::{LazyCompact, SmartString}; +use swc_ecma_raw_lexer::RawToken; use super::*; impl Lexer<'_> { - pub(super) fn read_jsx_token(&mut self) -> LexResult> { + pub(super) fn read_jsx_token(&mut self, start: &mut BytePos) -> LexResult> { debug_assert!(self.syntax.jsx()); - let start = self.input.cur_pos(); let mut chunk_start = self.input.cur_pos(); let mut value = String::new(); loop { - let cur = match self.input.cur() { + let cur = match self.cur()? { Some(c) => c, None => { let start = self.state.start; @@ -22,25 +22,25 @@ impl Lexer<'_> { let cur_pos = self.input.cur_pos(); match cur { - '<' if self.had_line_break_before_last() && self.is_str("<<<<<< ") => { + RawToken::ConflictMarker => { let span = Span::new(cur_pos, cur_pos + BytePos(7)); self.emit_error_span(span, SyntaxError::TS1185); - self.skip_line_comment(6); - self.skip_space::(); - return self.read_token(); + // Bump conflict marker + self.input.next(); + + *start = self.input.cur_pos(); + return self.try_read_token(start); } - '<' | '{' => { + RawToken::LtOp | RawToken::LBrace => { // if cur_pos == self.state.start { - if cur == '<' && self.state.is_expr_allowed { - unsafe { - // Safety: cur() was Some('<') - self.input.bump(); - } + if cur == RawToken::LtOp && self.state.is_expr_allowed { + self.input.next(); return Ok(Some(Token::JSXTagStart)); } - return self.read_token(); + *start = self.input.cur_pos(); + return self.try_read_token(start); } let value = if value.is_empty() { @@ -61,38 +61,32 @@ impl Lexer<'_> { let raw = { let s = unsafe { // Safety: We already checked for the range - self.input.slice(start, cur_pos) + self.input.slice(*start, cur_pos) }; self.atoms.atom(s) }; return Ok(Some(Token::JSXText { raw, value })); } - '>' => { + RawToken::GtOp => { self.emit_error( cur_pos, SyntaxError::UnexpectedTokenWithSuggestions { candidate_list: vec!["`{'>'}`", "`>`"], }, ); - unsafe { - // Safety: cur() was Some('>') - self.input.bump() - } + self.input.next(); } - '}' => { + RawToken::RBrace => { self.emit_error( cur_pos, SyntaxError::UnexpectedTokenWithSuggestions { candidate_list: vec!["`{'}'}`", "`}`"], }, ); - unsafe { - // Safety: cur() was Some('}') - self.input.bump() - } + self.input.next(); } - '&' => { + RawToken::BitAndOp => { value.push_str(unsafe { // Safety: We already checked for the range self.input.slice(chunk_start, cur_pos) @@ -116,10 +110,7 @@ impl Lexer<'_> { } chunk_start = self.input.cur_pos(); } else { - unsafe { - // Safety: cur() was Some(c) - self.input.bump() - } + self.input.next(); } } } @@ -149,23 +140,23 @@ impl Lexer<'_> { let mut s = SmartString::::default(); - let c = self.input.cur(); - debug_assert_eq!(c, Some('&')); + let c = self.cur()?; + debug_assert_eq!(c, Some(RawToken::BitAndOp)); unsafe { // Safety: cur() was Some('&') - self.input.bump(); + self.input.bump(1); } let start_pos = self.input.cur_pos(); for _ in 0..10 { - let c = match self.input.cur() { + let c = match self.input.cur_char() { Some(c) => c, None => break, }; unsafe { // Safety: cur() was Some(c) - self.input.bump(); + self.input.bump(1); } if c == ';' { @@ -205,17 +196,11 @@ impl Lexer<'_> { ) -> LexResult> { debug_assert!(self.syntax.jsx()); - let ch = self.input.cur().unwrap(); - unsafe { - // Safety: cur() was Some(ch) - self.input.bump(); - } + let ch = self.input.cur_char().unwrap(); + self.input.next(); - let out = if ch == '\r' && self.input.cur() == Some('\n') { - unsafe { - // Safety: cur() was Some('\n') - self.input.bump(); - } + let out = if ch == '\r' && self.input.cur_char() == Some('\n') { + self.input.next(); Either::Left(if normalize_crlf { "\n" } else { "\r\n" }) } else { Either::Right(ch) @@ -227,157 +212,22 @@ impl Lexer<'_> { Ok(out) } - pub(super) fn read_jsx_str(&mut self, quote: char) -> LexResult { + pub(super) fn read_jsx_str(&mut self) -> LexResult { debug_assert!(self.syntax.jsx()); - let start = self.input.cur_pos(); - - unsafe { - // Safety: cur() was Some(quote) - self.input.bump(); // `quote` - } - - let mut out = String::new(); - let mut chunk_start = self.input.cur_pos(); - - loop { - let ch = match self.input.cur() { - Some(c) => c, - None => { - let start = self.state.start; - self.emit_error(start, SyntaxError::UnterminatedStrLit); - break; - } - }; - - let cur_pos = self.input.cur_pos(); - - if ch == '\\' { - let value = unsafe { - // Safety: We already checked for the range - self.input.slice(chunk_start, cur_pos) - }; - - out.push_str(value); - out.push('\\'); - - self.bump(); - - chunk_start = self.input.cur_pos(); - - continue; - } - - if ch == quote { - break; - } - - if ch == '&' { - let value = unsafe { - // Safety: We already checked for the range - self.input.slice(chunk_start, cur_pos) - }; + let s = self.input.cur_slice(); + let value = &s[1..s.len() - 1]; - out.push_str(value); - - let jsx_entity = self.read_jsx_entity()?; - - out.push(jsx_entity.0); - - chunk_start = self.input.cur_pos(); - } else if ch.is_line_terminator() { - let value = unsafe { - // Safety: We already checked for the range - self.input.slice(chunk_start, cur_pos) - }; - - out.push_str(value); - - match self.read_jsx_new_line(false)? { - Either::Left(s) => { - out.push_str(s); - } - Either::Right(c) => { - out.push(c); - } - } - - chunk_start = cur_pos + BytePos(ch.len_utf8() as _); - } else { - unsafe { - // Safety: cur() was Some(ch) - self.input.bump(); - } - } - } - - let value = if out.is_empty() { - // Fast path: We don't need to allocate - - let cur_pos = self.input.cur_pos(); - let value = unsafe { - // Safety: We already checked for the range - self.input.slice(chunk_start, cur_pos) - }; - - self.atoms.atom(value) - } else { - let cur_pos = self.input.cur_pos(); - let value = unsafe { - // Safety: We already checked for the range - self.input.slice(chunk_start, cur_pos) - }; - - out.push_str(value); - - self.atoms.atom(out) - }; + let raw = self.atoms.atom(s); + let value = self.atoms.atom(value); // it might be at the end of the file when // the string literal is unterminated - if self.input.peek_ahead().is_some() { - unsafe { - // Safety: We called peek_ahead() which means cur() was Some - self.input.bump(); - } + if matches!(self.input.peek_ahead(), Ok(Some(..))) { + let _ = self.input.next(); } - let end = self.input.cur_pos(); - let raw = unsafe { - // Safety: Both of `start` and `end` are generated from `cur_pos()` - self.input.slice(start, end) - }; - - Ok(Token::Str { - value, - raw: self.atoms.atom(raw), - }) - } - - /// Read a JSX identifier (valid tag or attribute name). - /// - /// Optimized version since JSX identifiers can"t contain - /// escape characters and so can be read as single slice. - /// Also assumes that first character was already checked - /// by isIdentifierStart in readToken. - pub(super) fn read_jsx_word(&mut self) -> LexResult { - debug_assert!(self.syntax.jsx()); - debug_assert!(self.input.cur().is_some()); - debug_assert!(self.input.cur().unwrap().is_ident_start()); - - let mut first = true; - let slice = self.input.uncons_while(|c| { - if first { - first = false; - c.is_ident_start() - } else { - c.is_ident_part() || c == '-' - } - }); - - Ok(Token::JSXName { - name: self.atoms.atom(slice), - }) + Ok(Token::Str { value, raw }) } } diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs index 31bb91f29d3a..dea85b459cc3 100644 --- a/crates/swc_ecma_parser/src/lexer/mod.rs +++ b/crates/swc_ecma_parser/src/lexer/mod.rs @@ -1,26 +1,21 @@ //! ECMAScript lexer. -use std::{cell::RefCell, char, iter::FusedIterator, mem::transmute, rc::Rc}; +use std::{cell::RefCell, char, iter::FusedIterator, rc::Rc}; -use either::Either::{Left, Right}; use smallvec::{smallvec, SmallVec}; use swc_atoms::{Atom, AtomStoreCell}; use swc_common::{comments::Comments, input::StringInput, BytePos, Span}; -use swc_ecma_ast::{op, AssignOp, EsVersion, Ident}; +use swc_ecma_ast::{AssignOp, EsVersion}; +use swc_ecma_raw_lexer::{RawLexer, RawToken}; -use self::{ - comments_buffer::CommentsBuffer, - state::State, - table::{ByteHandler, BYTE_HANDLERS}, - util::*, -}; +use self::{comments_buffer::CommentsBuffer, state::State, util::CharExt}; pub use self::{ input::Input, state::{TokenContext, TokenContexts}, }; use crate::{ error::{Error, SyntaxError}, - token::{BinOpToken, IdentLike, Token, Word}, + token::{BinOpToken, IdentLike, Keyword, KnownIdent, Token, Word}, Context, Syntax, }; @@ -30,11 +25,9 @@ pub mod input; mod jsx; mod number; mod state; -mod table; #[cfg(test)] mod tests; pub mod util; -mod whitespace; pub(crate) type LexResult = Result; @@ -121,7 +114,7 @@ pub struct Lexer<'a> { comments_buffer: Option, pub(crate) ctx: Context, - input: StringInput<'a>, + input: RawLexer<'a>, start_pos: BytePos, state: State, @@ -151,7 +144,7 @@ impl<'a> Lexer<'a> { comments, comments_buffer: comments.is_some().then(CommentsBuffer::new), ctx: Default::default(), - input, + input: RawLexer::new(input), start_pos, state: State::new(syntax, start_pos), syntax, @@ -175,268 +168,292 @@ impl<'a> Lexer<'a> { op(self, &mut buf) } - /// babel: `getTokenFromCode` - fn read_token(&mut self) -> LexResult> { - let byte = match self.input.as_str().as_bytes().first() { - Some(&v) => v, + fn try_read_token(&mut self, start: &mut BytePos) -> LexResult> { + let cur = match self.cur()? { + Some(cur) => cur, None => return Ok(None), }; - let handler = unsafe { *(&BYTE_HANDLERS as *const ByteHandler).offset(byte as isize) }; - - match handler { - Some(handler) => handler(self), - None => { - let start = self.cur_pos(); - self.input.bump_bytes(1); - self.error_span( - pos_span(start), - SyntaxError::UnexpectedChar { c: byte as _ }, - ) - } - } - } - - /// `#` - fn read_token_number_sign(&mut self) -> LexResult> { - debug_assert!(self.cur().is_some()); - - unsafe { - // Safety: cur() is Some('#') - self.input.bump(); // '#' - } - - // `#` can also be a part of shebangs, however they should have been - // handled by `read_shebang()` - debug_assert!( - !self.input.is_at_start() || self.cur() != Some('!'), - "#! should have already been handled by read_shebang()" - ); - Ok(Some(Token::Hash)) + self.read_token(cur, start) } - /// Read a token given `.`. - /// - /// This is extracted as a method to reduce size of `read_token`. - #[inline(never)] - fn read_token_dot(&mut self) -> LexResult { - // Check for eof - let next = match self.input.peek() { - Some(next) => next, - None => { - unsafe { - // Safety: cur() is Some(',') - self.input.bump(); + /// babel: `getTokenFromCode` + fn read_token(&mut self, cur: RawToken, start: &mut BytePos) -> LexResult> { + let token = match cur { + RawToken::LegacyCommentOpen | RawToken::LegacyCommentClose => { + // XML style comment. ` - if self.state.had_line_break && c == b'-' && self.eat(b'>') { - self.emit_module_mode_error(start, SyntaxError::LegacyCommentInModule); - self.skip_line_comment(0); - self.skip_space::(); - return self.read_token(); - } - - if c == b'+' { - Token::PlusPlus - } else { - Token::MinusMinus - } - } else if self.input.eat_byte(b'=') { - Token::AssignOp(if c == b'+' { - AssignOp::AddAssign - } else { - AssignOp::SubAssign - }) - } else { - Token::BinOp(if c == b'+' { - BinOpToken::Add - } else { - BinOpToken::Sub - }) - })) - } - - fn read_token_bang_or_eq(&mut self, c: u8) -> LexResult> { - let start = self.cur_pos(); - let had_line_break_before_last = self.had_line_break_before_last(); - - unsafe { - // Safety: cur() is Some(c) if this method is called. - self.input.bump(); - } - - Ok(Some(if self.input.eat_byte(b'=') { - // "==" - - if self.input.eat_byte(b'=') { - if c == b'!' { - Token::BinOp(BinOpToken::NotEqEq) - } else { - // ======= - // ^ - if had_line_break_before_last && self.is_str("====") { - self.emit_error_span(fixed_len_span(start, 7), SyntaxError::TS1185); - self.skip_line_comment(4); - self.skip_space::(); - return self.read_token(); - } - - Token::BinOp(BinOpToken::EqEqEq) - } - } else if c == b'!' { - Token::BinOp(BinOpToken::NotEq) - } else { - Token::BinOp(BinOpToken::EqEq) - } - } else if c == b'=' && self.input.eat_byte(b'>') { - // "=>" - - Token::Arrow - } else if c == b'!' { - Token::Bang - } else { - Token::AssignOp(AssignOp::Assign) - })) - } } impl Lexer<'_> { - #[inline(never)] - fn read_slash(&mut self) -> LexResult> { - debug_assert_eq!(self.cur(), Some('/')); - - // Divide operator - self.bump(); - - Ok(Some(if self.eat(b'=') { - tok!("/=") - } else { - tok!('/') - })) - } - - #[inline(never)] - fn read_token_lt_gt(&mut self) -> LexResult> { - debug_assert!(self.cur() == Some('<') || self.cur() == Some('>')); - - let had_line_break_before_last = self.had_line_break_before_last(); - let start = self.cur_pos(); - let c = self.cur().unwrap(); - self.bump(); - - if self.syntax.typescript() && self.ctx.in_type && !self.ctx.should_not_lex_lt_or_gt_as_type - { - if c == '<' { - return Ok(Some(tok!('<'))); - } else if c == '>' { - return Ok(Some(tok!('>'))); - } - } - - // XML style comment. `")] + LegacyCommentClose, + + #[token("<<<<<<<")] + #[token(">>>>>>>")] + #[token("=======")] + #[token("|||||||")] + ConflictMarker, + + #[token("await")] + Await, + + #[token("break")] + Break, + + #[token("case")] + Case, + + #[token("catch")] + Catch, + + #[token("continue")] + Continue, + + #[token("debugger")] + Debugger, + + #[token("default")] + Default_, + + #[token("do")] + Do, + + #[token("else")] + Else, + + #[token("finally")] + Finally, + + #[token("for")] + For, + + #[token("function")] + Function, + + #[token("if")] + If, + + #[token("return")] + Return, + + #[token("switch")] + Switch, + + #[token("throw")] + Throw, + + #[token("try")] + Try, + + #[token("var")] + Var, + + #[token("let")] + Let, + + #[token("const")] + Const, + + #[token("while")] + While, + + #[token("with")] + With, + + #[token("new")] + New, + + #[token("this")] + This, + + #[token("super")] + Super, + + #[token("class")] + Class, + + #[token("extends")] + Extends, + + #[token("export")] + Export, + + #[token("import")] + Import, + + #[token("yield")] + Yield, + + #[token("in")] + In, + + #[token("instanceof")] + InstanceOf, + + #[token("typeof")] + TypeOf, + + #[token("void")] + Void, + + #[token("delete")] + Delete, + + #[token("abstract")] + Abstract, + + #[token("as")] + As, + + #[token("async")] + Async, + + #[token("from")] + From, + + #[token("of")] + Of, + + #[token("type")] + Type, + + #[token("global")] + Global, + + #[token("static")] + Static, + + #[token("using")] + Using, + + #[token("readonly")] + Readonly, + + #[token("unique")] + Unique, + + #[token("keyof")] + Keyof, + + #[token("declare")] + Declare, + + #[token("enum")] + Enum, + + #[token("is")] + Is, + + #[token("infer")] + Infer, + + Symbol, + + #[token("undefined")] + Undefined, + + #[token("interface")] + Interface, + + #[token("implements")] + Implements, + + #[token("asserts")] + Asserts, + + #[token("require")] + Require, + + #[token("get")] + Get, + + #[token("set")] + Set, + + #[token("any")] + Any, + + #[token("intrinsic")] + Intrinsic, + + #[token("unknown")] + Unknown, + + #[token("string")] + String, + + #[token("object")] + Object, + + #[token("number")] + Number, + + #[token("bigint")] + Bigint, + + #[token("boolean")] + Boolean, + + #[token("never")] + Never, + + #[token("assert")] + Assert, + + #[token("namespace")] + Namespace, + + #[token("accessor")] + Accessor, + + #[token("meta")] + Meta, + + #[token("target")] + Target, + + #[token("satisfies")] + Satisfies, + + #[token("package")] + Package, + + #[token("protected")] + Protected, + + #[token("private")] + Private, + + #[token("public")] + Public, +} + +fn newline_callback(l: &mut Lexer) -> Skip { + l.extras.had_line_break = true; + Skip +} + +fn whitespace_callback(_: &mut Lexer) -> Skip { + Skip +} + +impl RawToken { + pub fn is_line_terminator(&self) -> bool { + matches!(self, RawToken::NewLine) + } +} + +#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)] +pub enum LogosError { + #[default] + UnknownChar, + UnterminatedStr, +} diff --git a/crates/swc_ecma_raw_lexer/src/peek.rs b/crates/swc_ecma_raw_lexer/src/peek.rs new file mode 100644 index 000000000000..65686d3ef7b8 --- /dev/null +++ b/crates/swc_ecma_raw_lexer/src/peek.rs @@ -0,0 +1,84 @@ +//! Copied from `itertools` + +use std::collections::VecDeque; + +use crate::size_hint; + +/// See [`peek_nth()`] for more information. +#[derive(Clone, Debug)] +#[must_use = "iterator adaptors are lazy and do nothing unless consumed"] +pub struct PeekNth +where + I: Iterator, +{ + iter: I, + buf: VecDeque, +} + +/// A drop-in replacement for [`std::iter::Peekable`] which adds a `peek_nth` +/// method allowing the user to `peek` at a value several iterations forward +/// without advancing the base iterator. +/// +/// This differs from `multipeek` in that subsequent calls to `peek` or +/// `peek_nth` will always return the same value until `next` is called +/// (making `reset_peek` unnecessary). +pub fn peek_nth(iterable: I) -> PeekNth +where + I: Iterator, +{ + PeekNth { + iter: iterable, + buf: VecDeque::new(), + } +} + +impl PeekNth +where + I: Iterator, +{ + /// Works exactly like the `peek` method in [`std::iter::Peekable`]. + pub fn peek(&mut self) -> Option<&I::Item> { + self.peek_nth(0) + } + + pub fn peek_nth(&mut self, n: usize) -> Option<&I::Item> { + let unbuffered_items = (n + 1).saturating_sub(self.buf.len()); + + self.buf.extend(self.iter.by_ref().take(unbuffered_items)); + + self.buf.get(n) + } + + pub fn inner_mut(&mut self) -> &mut I { + &mut self.iter + } + + pub fn inner(&self) -> &I { + &self.iter + } +} + +impl Iterator for PeekNth +where + I: Iterator, +{ + type Item = I::Item; + + fn next(&mut self) -> Option { + self.buf.pop_front().or_else(|| self.iter.next()) + } + + fn size_hint(&self) -> (usize, Option) { + size_hint::add_scalar(self.iter.size_hint(), self.buf.len()) + } + + fn fold(self, mut init: B, mut f: F) -> B + where + F: FnMut(B, Self::Item) -> B, + { + init = self.buf.into_iter().fold(init, &mut f); + self.iter.fold(init, f) + } +} + +impl ExactSizeIterator for PeekNth where I: ExactSizeIterator {} diff --git a/crates/swc_ecma_raw_lexer/src/regexp.rs b/crates/swc_ecma_raw_lexer/src/regexp.rs new file mode 100644 index 000000000000..6665ff6324e7 --- /dev/null +++ b/crates/swc_ecma_raw_lexer/src/regexp.rs @@ -0,0 +1,43 @@ +use logos::Logos; + +use crate::{LogosError, RawLexer}; + +impl<'a> RawLexer<'a> { + /// Current token must be [`RawToken::DivOp`] + pub fn read_regexp(&mut self) -> Result<&'a str, LogosError> { + self.reset_peeked(); + + let s = self.lexer.inner().remainder(); + let remainder_len = s.len(); + + let mut lexer = RegexpContent::lexer(s); + + for token in lexer.by_ref().flatten() { + if token == RegexpContent::Terminate { + break; + } + } + + let consumed = remainder_len - lexer.remainder().len(); + + self.lexer.inner_mut().bump(consumed); + + Ok(s.get(..consumed).unwrap()) + } +} + +#[derive(Logos, Debug, Clone, Copy, PartialEq, Eq)] +#[logos(error = LogosError)] +enum RegexpContent { + #[regex(r#"\[[^\]]+\]"#)] + Class, + + #[regex(r#"\\."#)] + Escape, + + #[token(r"/")] + Terminate, + + #[regex(r#"[^\\\[/]+"#)] + Normal, +} diff --git a/crates/swc_ecma_raw_lexer/src/size_hint.rs b/crates/swc_ecma_raw_lexer/src/size_hint.rs new file mode 100644 index 000000000000..dea3e2eb1eb5 --- /dev/null +++ b/crates/swc_ecma_raw_lexer/src/size_hint.rs @@ -0,0 +1,95 @@ +//! Copied from `itertools` +//! +//! Arithmetic on `Iterator.size_hint()` values. + +use std::cmp; + +/// `SizeHint` is the return type of `Iterator::size_hint()`. +pub type SizeHint = (usize, Option); + +/// Add `SizeHint` correctly. +#[inline] +pub fn add(a: SizeHint, b: SizeHint) -> SizeHint { + let min = a.0.saturating_add(b.0); + let max = match (a.1, b.1) { + (Some(x), Some(y)) => x.checked_add(y), + _ => None, + }; + + (min, max) +} + +/// Add `x` correctly to a `SizeHint`. +#[inline] +pub fn add_scalar(sh: SizeHint, x: usize) -> SizeHint { + let (mut low, mut hi) = sh; + low = low.saturating_add(x); + hi = hi.and_then(|elt| elt.checked_add(x)); + (low, hi) +} + +/// Subtract `x` correctly from a `SizeHint`. +#[inline] +pub fn sub_scalar(sh: SizeHint, x: usize) -> SizeHint { + let (mut low, mut hi) = sh; + low = low.saturating_sub(x); + hi = hi.map(|elt| elt.saturating_sub(x)); + (low, hi) +} + +/// Multiply `SizeHint` correctly +#[inline] +pub fn mul(a: SizeHint, b: SizeHint) -> SizeHint { + let low = a.0.saturating_mul(b.0); + let hi = match (a.1, b.1) { + (Some(x), Some(y)) => x.checked_mul(y), + (Some(0), None) | (None, Some(0)) => Some(0), + _ => None, + }; + (low, hi) +} + +/// Multiply `x` correctly with a `SizeHint`. +#[inline] +pub fn mul_scalar(sh: SizeHint, x: usize) -> SizeHint { + let (mut low, mut hi) = sh; + low = low.saturating_mul(x); + hi = hi.and_then(|elt| elt.checked_mul(x)); + (low, hi) +} + +/// Return the maximum +#[inline] +pub fn max(a: SizeHint, b: SizeHint) -> SizeHint { + let (a_lower, a_upper) = a; + let (b_lower, b_upper) = b; + + let lower = cmp::max(a_lower, b_lower); + + let upper = match (a_upper, b_upper) { + (Some(x), Some(y)) => Some(cmp::max(x, y)), + _ => None, + }; + + (lower, upper) +} + +/// Return the minimum +#[inline] +pub fn min(a: SizeHint, b: SizeHint) -> SizeHint { + let (a_lower, a_upper) = a; + let (b_lower, b_upper) = b; + let lower = cmp::min(a_lower, b_lower); + let upper = match (a_upper, b_upper) { + (Some(u1), Some(u2)) => Some(cmp::min(u1, u2)), + _ => a_upper.or(b_upper), + }; + (lower, upper) +} + +#[test] +fn mul_size_hints() { + assert_eq!(mul((3, Some(4)), (3, Some(4))), (9, Some(16))); + assert_eq!(mul((3, Some(4)), (usize::MAX, None)), (usize::MAX, None)); + assert_eq!(mul((3, None), (0, Some(0))), (0, Some(0))); +} diff --git a/crates/swc_ecma_raw_lexer/src/string.rs b/crates/swc_ecma_raw_lexer/src/string.rs new file mode 100644 index 000000000000..7a32aca49b3e --- /dev/null +++ b/crates/swc_ecma_raw_lexer/src/string.rs @@ -0,0 +1,107 @@ +use logos::{Lexer, Logos}; + +use crate::{LogosError, RawToken}; + +pub fn consume_str_single_quote(lex: &mut Lexer) -> Result<(), LogosError> { + consume_str(lex, StrContent::SingleQuote) +} + +pub fn consume_str_double_quote(lex: &mut Lexer) -> Result<(), LogosError> { + consume_str(lex, StrContent::DoubleQuote) +} + +fn consume_str(lex: &mut Lexer, stop_token: StrContent) -> Result<(), LogosError> { + let remainder = lex.remainder(); + let total_len = remainder.len(); + + let mut str_lexer = Lexer::::new(remainder); + let mut terminated = false; + + while let Some(Ok(token)) = str_lexer.next() { + if token == stop_token { + terminated = true; + break; + } + } + + let left_len = str_lexer.remainder().len(); + let consumed = total_len - left_len; + lex.bump(consumed); + + if !terminated { + return Err(LogosError::UnterminatedStr); + } + + Ok(()) +} + +#[derive(Logos, Debug, Clone, Copy, PartialEq, Eq)] +enum StrContent { + #[regex(r#"\\["'\\bfnrtv]"#, priority = 100)] + #[regex(r#"\\0[0-7]*"#, priority = 100)] + #[regex(r#"\\x[0-9a-fA-F]{2}"#, priority = 100)] + #[regex(r#"\\u[0-9a-fA-F]{4}"#, priority = 100)] + #[regex(r#"\\[^'"\\]+"#)] + Escape, + + #[regex(r#"[^'"\\]+"#)] + Normal, + + #[regex(r#"'"#)] + SingleQuote, + + #[regex(r#"""#)] + DoubleQuote, +} + +#[cfg(test)] +mod tests { + use logos::Lexer; + use pretty_assertions::assert_eq; + + use super::StrContent; + + fn assert_str(text: &str, expected: &[StrContent]) { + dbg!(text); + dbg!(expected); + + let actual = Lexer::::new(text) + .map(|v| v.unwrap()) + .collect::>(); + + let mut lexer = Lexer::::new(text); + + while let Some(Ok(token)) = lexer.next() { + dbg!(&token); + dbg!(lexer.slice()); + } + + // Actual contains the last quote + + assert_eq!(expected.len() + 1, actual.len()); + assert_eq!(expected, &actual[..expected.len()]); + + assert!(matches!( + actual.last(), + Some(StrContent::SingleQuote | StrContent::DoubleQuote) + )); + } + + #[test] + fn test_newline() { + assert_str( + "hello\\nworld'", + &[StrContent::Normal, StrContent::Escape, StrContent::Normal], + ); + } + + #[test] + fn test_escape() { + assert_str(r#"\''"#, &[StrContent::Escape]); + } + + #[test] + fn test_escape_escape() { + assert_str(r#"\\'"#, &[StrContent::Escape]); + } +} diff --git a/crates/swc_ecma_raw_lexer/tests/raw_lexer.rs b/crates/swc_ecma_raw_lexer/tests/raw_lexer.rs new file mode 100644 index 000000000000..99fc9133b3d3 --- /dev/null +++ b/crates/swc_ecma_raw_lexer/tests/raw_lexer.rs @@ -0,0 +1,61 @@ +use logos::Logos; +use swc_ecma_raw_lexer::RawToken; + +fn assert_str(s: &str) { + println!("String input: `{s}`"); + + let mut tokens = RawToken::lexer(s); + + assert_eq!(tokens.next(), Some(Ok(RawToken::Str))); + assert_eq!(tokens.next(), None); + + println!("Done") +} + +fn assert_strs(s: &str) { + println!("String input: `{s}`"); + + let mut tokens = RawToken::lexer(s); + + while let Some(Ok(token)) = tokens.next() { + assert_eq!(token, RawToken::Str); + assert_eq!(tokens.next(), Some(Ok(RawToken::Semi))); + } + + assert_eq!(tokens.next(), None); +} + +#[test] +fn test_str_1() { + assert_str(r#""hello""#); + assert_str(r#"'hello'"#); +} + +#[test] +fn test_str_escape_single_char() { + assert_str(r#""hello\nworld""#); + assert_str(r#"'hello\nworld'"#); +} + +#[test] +fn test_str_escape_hex() { + assert_str(r#""use\x20strict""#); +} + +#[test] +fn test_str_escape_zero_octal() { + assert_str(r#""use\0strict""#); +} + +#[test] +fn test_str_escape_unicode() { + assert_str(r#""use\u2028strict""#); +} + +#[test] +fn test_str_escape_escape() { + assert_str(r#"'\\\\'"#); + assert_str(r#""\\\\""#); + + assert_strs(r#"'\\\\';"\\\\";"#); +}