diff --git a/src/lexer/char_scanner.rs b/src/lexer/char_scanner.rs index e7bdac3..ad5d2ca 100644 --- a/src/lexer/char_scanner.rs +++ b/src/lexer/char_scanner.rs @@ -82,7 +82,7 @@ impl Scannable for CharScanner { } // Returns the current character - fn peek(&self) -> char { + fn curr(&self) -> char { self.current } } diff --git a/src/lexer/lexem.rs b/src/lexer/lexem.rs index 6b97626..3b50063 100644 --- a/src/lexer/lexem.rs +++ b/src/lexer/lexem.rs @@ -1,4 +1,4 @@ -use std::fmt::Display; +use std::{error::Error, fmt::Display}; use crate::{ lexer::{keywords::Keyword, operators::Operator}, @@ -61,22 +61,68 @@ impl Lexem { pub struct LexemBuilder<'a> { scanner: &'a mut CharScanner, start: Position, + errors: &'a mut Vec, } -impl<'a> LexemBuilder<'a> { - pub fn new(scanner: &'a mut CharScanner) -> Self { - let start = (&*scanner).last_pos(); - Self { scanner, start } +#[derive(Debug, PartialEq, Eq)] +pub enum LexemErrorVariant { + CommentNeverEnds, + CommentTooLong, + StringNeverEnds, + StringTooLong, + IntegerPartTooBig, + DecimalPartTooBig, + IdentifierTooLong, + InvalidEscapeCharacter(char), + InvalidSequence(String), +} + +impl Display for LexemErrorVariant { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + LexemErrorVariant::CommentNeverEnds => f.write_str("comment never ends"), + LexemErrorVariant::CommentTooLong => f.write_str("comment too long"), + LexemErrorVariant::StringNeverEnds => f.write_str("string never ends"), + LexemErrorVariant::StringTooLong => f.write_str("string too long"), + LexemErrorVariant::IntegerPartTooBig => f.write_str("integer part too big"), + LexemErrorVariant::DecimalPartTooBig => f.write_str("decimal part too big"), + LexemErrorVariant::IdentifierTooLong => f.write_str("identifier too long"), + LexemErrorVariant::InvalidEscapeCharacter(c) => { + f.write_fmt(format_args!("invalid escape character `\\{}`", c)) + } + LexemErrorVariant::InvalidSequence(s) => { + f.write_fmt(format_args!("invalid sequence `{}`", s)) + } + } } +} + +#[derive(Debug)] +pub struct LexemError { + pub start: Position, + pub end: Position, + pub variant: LexemErrorVariant, +} - /// Lexem start position - pub fn get_start(&self) -> Position { - self.start +impl Display for LexemError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_fmt(format_args!( + "Error from {} to {}: {}", + self.start, self.end, self.variant + )) } +} + +impl Error for LexemError {} - /// Scanner position - pub fn get_here(&self) -> Position { - self.scanner.last_pos() +impl<'a> LexemBuilder<'a> { + pub fn new(scanner: &'a mut CharScanner, errors: &'a mut Vec) -> Self { + let start = (&*scanner).last_pos(); + Self { + scanner, + start, + errors, + } } /// Create a lexem @@ -88,6 +134,15 @@ impl<'a> LexemBuilder<'a> { pub fn bake(&self, token_type: LexemType) -> Option { Some(self.bake_raw(token_type)) } + + /// Reports an error that happen during building + pub fn error(&mut self, e: LexemErrorVariant) { + self.errors.push(LexemError { + start: self.start, + end: self.scanner.last_pos(), + variant: e, + }); + } } impl<'a> Scannable for LexemBuilder<'a> { @@ -97,8 +152,8 @@ impl<'a> Scannable for LexemBuilder<'a> { } #[inline] - fn peek(&self) -> char { - self.scanner.peek() + fn curr(&self) -> char { + self.scanner.curr() } } diff --git a/src/lexer/macros.rs b/src/lexer/macros.rs index 84cb23b..2d21cb3 100644 --- a/src/lexer/macros.rs +++ b/src/lexer/macros.rs @@ -11,7 +11,7 @@ macro_rules! char_match { } }; ($token_builder: expr, $default: expr, $($pattern: literal, $operator: expr), +) => { { $token_builder.pop(); - match $token_builder.peek() { + match $token_builder.curr() { $($pattern => { $token_builder.pop(); Some($operator) diff --git a/src/lexer/matchers/comment.rs b/src/lexer/matchers/comment.rs index 44d8926..ef311b5 100644 --- a/src/lexer/matchers/comment.rs +++ b/src/lexer/matchers/comment.rs @@ -1,6 +1,6 @@ use crate::{ lexer::{ - lexem::{Lexem, LexemBuilder, LexemType}, + lexem::{Lexem, LexemBuilder, LexemErrorVariant, LexemType}, operators::Operator, }, scannable::Scannable, @@ -10,30 +10,30 @@ use crate::{ /// - `/` - division /// - `//` - single line comment /// - `/* [...] */` - multi-line comment -pub fn match_comment_or_division(tb: &mut LexemBuilder) -> Option { - if tb.peek() == '/' { - tb.pop(); - match tb.peek() { - '*' => return Some(complete_multi_line_comment(tb)), - '/' => return Some(complete_single_line_comment(tb)), - _ => return tb.bake(LexemType::Operator(Operator::Slash)), +pub fn match_comment_or_division(lb: &mut LexemBuilder, max: usize) -> Option { + if lb.curr() == '/' { + lb.pop(); + match lb.curr() { + '*' => return Some(complete_multi_line_comment(lb, max)), + '/' => return Some(complete_single_line_comment(lb, max)), + _ => return lb.bake(LexemType::Operator(Operator::Slash)), } } None } /// Completes a multi-line comment -fn complete_multi_line_comment(tb: &mut LexemBuilder) -> Lexem { +fn complete_multi_line_comment(lb: &mut LexemBuilder, max: usize) -> Lexem { let mut content: Vec = vec![]; - tb.pop(); + lb.pop(); loop { - match tb.peek() { + match lb.curr() { '*' => { - tb.pop(); - match tb.peek() { + lb.pop(); + match lb.curr() { '/' => { - tb.pop(); - break tb.bake_raw(LexemType::Comment(content.into_iter().collect())); + lb.pop(); + break lb.bake_raw(LexemType::Comment(content.into_iter().collect())); } c => { content.push('*'); @@ -42,38 +42,48 @@ fn complete_multi_line_comment(tb: &mut LexemBuilder) -> Lexem { } } '\x03' => { - eprintln!("Comment started at {} never ends.", tb.get_start()); - let t = tb.bake_raw(LexemType::Comment(content.into_iter().collect())); - tb.pop(); + lb.error(LexemErrorVariant::CommentNeverEnds); + let t = lb.bake_raw(LexemType::Comment(content.into_iter().collect())); + lb.pop(); break t; } c => { content.push(c); } } - tb.pop(); + if content.len() > max { + content.pop(); + lb.error(LexemErrorVariant::CommentTooLong); + break lb.bake_raw(LexemType::Comment(content.into_iter().collect())); + } + lb.pop(); } } /// Completes a single-line comment -fn complete_single_line_comment(tb: &mut LexemBuilder) -> Lexem { +fn complete_single_line_comment(lb: &mut LexemBuilder, max: usize) -> Lexem { let mut content: Vec = vec![]; - tb.pop(); + lb.pop(); loop { - match tb.peek() { - '\n' | '\x03' => return tb.bake_raw(LexemType::Comment(content.into_iter().collect())), + match lb.curr() { + '\n' | '\x03' => break lb.bake_raw(LexemType::Comment(content.into_iter().collect())), c => { content.push(c); } } - tb.pop(); + if content.len() > max { + content.pop(); + lb.error(LexemErrorVariant::CommentTooLong); + break lb.bake_raw(LexemType::Comment(content.into_iter().collect())); + } + lb.pop(); } } #[cfg(test)] mod tests { use crate::lexer::{ - lexem::{Lexem, LexemType}, + lexem::{Lexem, LexemError, LexemErrorVariant, LexemType}, matchers::test_utils::{lexem_with, matcher_with}, operators::Operator, }; @@ -81,7 +91,13 @@ mod tests { use super::match_comment_or_division; fn matcher(string: &'static str) -> Option { - matcher_with(match_comment_or_division, string) + let r = matcher_with(|lb| match_comment_or_division(lb, 32), string); + assert!(r.1.is_empty()); + r.0 + } + + fn err_matcher(string: &'static str) -> (Option, Vec) { + matcher_with(|lb| match_comment_or_division(lb, 32), string) } fn comment_lexem( @@ -110,6 +126,25 @@ mod tests { fn com_single_multi() { assert_eq!(matcher("//a\nb"), comment_lexem("a", (1, 1), (1, 4))); } + + #[test] + fn com_single_max_long() { + assert_eq!( + matcher("//___a___b___a___c___a___b___a___d"), + comment_lexem("___a___b___a___c___a___b___a___d", (1, 1), (1, 35)) + ); + } + + #[test] + fn com_single_too_long() { + let (result, errors) = err_matcher("//___a___b___a___c___a___b___a___d_"); + assert_eq!( + result, + comment_lexem("___a___b___a___c___a___b___a___d", (1, 1), (1, 35)) + ); + assert!(errors[0].variant == LexemErrorVariant::CommentTooLong); + } + #[test] fn empty_com_single_multi() { assert_eq!(matcher("//\n"), comment_lexem("", (1, 1), (1, 3))); @@ -120,6 +155,24 @@ mod tests { assert_eq!(matcher("/*ab*/"), comment_lexem("ab", (1, 1), (1, 7))); } + #[test] + fn com_multi_max_long() { + assert_eq!( + matcher("/*___a___b___a___c___a___b___a___d*/"), + comment_lexem("___a___b___a___c___a___b___a___d", (1, 1), (1, 37)) + ); + } + + #[test] + fn com_multi_too_long() { + let (result, errors) = err_matcher("/*___a___b___a___c___a___b___a___d_*/"); + assert_eq!( + result, + comment_lexem("___a___b___a___c___a___b___a___d", (1, 1), (1, 35)) + ); + assert!(errors[0].variant == LexemErrorVariant::CommentTooLong); + } + #[test] fn empty_com_multi() { assert_eq!(matcher("/**/"), comment_lexem("", (1, 1), (1, 5))); @@ -132,7 +185,9 @@ mod tests { #[test] fn com_multi_no_end() { - assert_eq!(matcher("/*a\n"), comment_lexem("a\n", (1, 1), (2, 1))); + let (result, errors) = err_matcher("/*a\n"); + assert_eq!(result, comment_lexem("a\n", (1, 1), (2, 1))); + assert!(errors[0].variant == LexemErrorVariant::CommentNeverEnds); } #[test] diff --git a/src/lexer/matchers/identifier_or_keyword.rs b/src/lexer/matchers/identifier_or_keyword.rs index d972a02..3ba1705 100644 --- a/src/lexer/matchers/identifier_or_keyword.rs +++ b/src/lexer/matchers/identifier_or_keyword.rs @@ -1,6 +1,6 @@ use crate::{ lexer::keywords::Keyword, - lexer::lexem::{Lexem, LexemBuilder, LexemType}, + lexer::lexem::{Lexem, LexemBuilder, LexemErrorVariant, LexemType}, scannable::Scannable, }; @@ -17,42 +17,42 @@ fn can_continue(c: char) -> bool { } /// Matches an identifier or a keyword -pub fn match_identifier_or_keyword(tb: &mut LexemBuilder) -> Option { - if can_begin(tb.peek()) { - let mut name = vec![tb.peek()]; - tb.pop(); - while can_continue(tb.peek()) { // TODO maksymalna długość - name.push(tb.peek()); - tb.pop(); - } - let name: String = name.into_iter().collect(); - if let Some(token) = match_keyword(tb, &name) { - Some(token) - } else { - tb.bake(LexemType::Identifier(name)) +pub fn match_identifier_or_keyword(lb: &mut LexemBuilder, max: usize) -> Option { + if !can_begin(lb.curr()) { + return None; + } + let mut name = vec![lb.curr()]; + lb.pop(); + while can_continue(lb.curr()) { + name.push(lb.curr()); + if name.len() > max { + name.pop(); + lb.error(LexemErrorVariant::IdentifierTooLong); + break; } - } else { - None + lb.pop(); } + let name: String = name.into_iter().collect(); + match_keyword(lb, &name).or_else(|| lb.bake(LexemType::Identifier(name))) } /// Matches a keyword -fn match_keyword(tb: &mut LexemBuilder, name: &str) -> Option { +fn match_keyword(lb: &mut LexemBuilder, name: &str) -> Option { match name { - "int" => tb.bake(LexemType::Keyword(Keyword::Int)), - "float" => tb.bake(LexemType::Keyword(Keyword::Float)), - "bool" => tb.bake(LexemType::Keyword(Keyword::Bool)), - "string" => tb.bake(LexemType::Keyword(Keyword::String)), - "let" => tb.bake(LexemType::Keyword(Keyword::Let)), - "fn" => tb.bake(LexemType::Keyword(Keyword::Fn)), - "return" => tb.bake(LexemType::Keyword(Keyword::Return)), - "while" => tb.bake(LexemType::Keyword(Keyword::While)), - "for" => tb.bake(LexemType::Keyword(Keyword::For)), - "in" => tb.bake(LexemType::Keyword(Keyword::In)), - "if" => tb.bake(LexemType::Keyword(Keyword::If)), - "else" => tb.bake(LexemType::Keyword(Keyword::Else)), - "true" => tb.bake(LexemType::Keyword(Keyword::True)), - "false" => tb.bake(LexemType::Keyword(Keyword::False)), + "int" => lb.bake(LexemType::Keyword(Keyword::Int)), + "float" => lb.bake(LexemType::Keyword(Keyword::Float)), + "bool" => lb.bake(LexemType::Keyword(Keyword::Bool)), + "string" => lb.bake(LexemType::Keyword(Keyword::String)), + "let" => lb.bake(LexemType::Keyword(Keyword::Let)), + "fn" => lb.bake(LexemType::Keyword(Keyword::Fn)), + "return" => lb.bake(LexemType::Keyword(Keyword::Return)), + "while" => lb.bake(LexemType::Keyword(Keyword::While)), + "for" => lb.bake(LexemType::Keyword(Keyword::For)), + "in" => lb.bake(LexemType::Keyword(Keyword::In)), + "if" => lb.bake(LexemType::Keyword(Keyword::If)), + "else" => lb.bake(LexemType::Keyword(Keyword::Else)), + "true" => lb.bake(LexemType::Keyword(Keyword::True)), + "false" => lb.bake(LexemType::Keyword(Keyword::False)), _ => None, } } @@ -61,14 +61,20 @@ fn match_keyword(tb: &mut LexemBuilder, name: &str) -> Option { mod tests { use crate::lexer::{ keywords::Keyword, - lexem::{Lexem, LexemType}, + lexem::{Lexem, LexemError, LexemErrorVariant, LexemType}, matchers::test_utils::{lexem_with, matcher_with}, }; use super::match_identifier_or_keyword; fn matcher(string: &'static str) -> Option { - matcher_with(match_identifier_or_keyword, string) + let r = matcher_with(|lb| match_identifier_or_keyword(lb, 32), string); + assert!(r.1.is_empty()); + r.0 + } + + fn err_matcher(string: &'static str) -> (Option, Vec) { + matcher_with(|lb| match_identifier_or_keyword(lb, 32), string) } fn id_lexem( @@ -132,6 +138,24 @@ mod tests { assert_eq!(matcher("_"), id_lexem("_", (1, 1), (1, 2))); } + #[test] + fn id_max_long() { + assert_eq!( + matcher("___a___b___a___c___a___b___a___d"), + id_lexem("___a___b___a___c___a___b___a___d", (1, 1), (1, 33)) + ); + } + + #[test] + fn id_too_long() { + let (result, errors) = err_matcher("___a___b___a___c___a___b___a___d_"); + assert_eq!( + result, + id_lexem("___a___b___a___c___a___b___a___d", (1, 1), (1, 33)) + ); + assert!(errors[0].variant == LexemErrorVariant::IdentifierTooLong); + } + #[test] fn id_not() { assert_eq!(matcher("5sdas"), None); diff --git a/src/lexer/matchers/numerical.rs b/src/lexer/matchers/numerical.rs index 618d1de..ed771f0 100644 --- a/src/lexer/matchers/numerical.rs +++ b/src/lexer/matchers/numerical.rs @@ -1,97 +1,93 @@ use crate::{ - lexer::lexem::{Lexem, LexemBuilder, LexemType}, + lexer::lexem::{Lexem, LexemBuilder, LexemErrorVariant, LexemType}, scannable::Scannable, }; +/// Turns `0` - `9` characters to their `i64` representation +fn char2num(c: char) -> i64 { + c as i64 - '0' as i64 +} + +/// Performs a checked multiplication followed by addition of `a * b + c` +fn checked_mul_add(a: i64, b: i64, c: i64) -> Option { + a.checked_mul(b).and_then(|v| v.checked_add(c)) +} + /// Matches an integer or a float constant -pub fn match_numerical(tb: &mut LexemBuilder) -> Option { - if tb.peek().is_ascii_digit() { - let mut integer_part: i64 = tb.peek() as i64 - '0' as i64; // TODO wyjąc konwertowanie - if tb.peek() != '0' { - tb.pop(); - loop { // TODO użyć while - if tb.peek().is_ascii_digit() { - if let Some(new_integer_part) = integer_part.checked_mul(10) { - integer_part = new_integer_part; - integer_part += tb.peek() as i64 - '0' as i64; // TODO Sprawdzić czy nie ma przepełnienia - tb.pop(); - } else { - eprintln!( - "Integer too big from {} to {}.", - tb.get_start(), - tb.get_here() - ); - break; - } - } else if tb.peek() == '_' { - tb.pop(); - } else { - break; - } +pub fn match_numerical(lb: &mut LexemBuilder) -> Option { + if !lb.curr().is_ascii_digit() { + return None; + } + let mut integer_part: i64 = char2num(lb.curr()); + if lb.curr() != '0' { + lb.pop(); + while lb.curr().is_ascii_digit() || lb.curr() == '_' { + if lb.curr() == '_' { + lb.pop(); + } else if let Some(new_integer_part) = + checked_mul_add(integer_part, 10, char2num(lb.curr())) + { + integer_part = new_integer_part; + lb.pop(); + } else { + lb.error(LexemErrorVariant::IntegerPartTooBig); + break; } - } else { - tb.pop(); - } - if let Some(token) = match_float(tb, integer_part) { - Some(token) - } else { - tb.bake(LexemType::Int(integer_part)) } } else { - None + lb.pop(); } + match_float(lb, integer_part).or_else(|| lb.bake(LexemType::Int(integer_part))) } /// Matches a float constant -fn match_float(tb: &mut LexemBuilder, integer_part: i64) -> Option { - if tb.peek() == '.' { // TODO odwrócić warunek - tb.pop(); - if tb.peek().is_ascii_digit() { - let mut digits = 1; - let mut decimal_part: i64 = tb.peek() as i64 - '0' as i64; - tb.pop(); - loop { - if tb.peek().is_ascii_digit() { - if let Some(new_decimal_part) = decimal_part.checked_mul(10) { - decimal_part = new_decimal_part; - digits += 1; - decimal_part += tb.peek() as i64 - '0' as i64; - tb.pop(); - } else { - eprintln!( - "Decimal part too big from {} to {}.", - tb.get_start(), - tb.get_here() - ); - break; - } - } else if tb.peek() == '_' { - tb.pop(); - } else { - break; - } +fn match_float(lb: &mut LexemBuilder, integer_part: i64) -> Option { + if lb.curr() != '.' { + return None; + } + lb.pop(); + if lb.curr().is_ascii_digit() { + let mut digits = 1; + let mut decimal_part: i64 = char2num(lb.curr()); + lb.pop(); + while lb.curr().is_ascii_digit() || lb.curr() == '_' { + if lb.curr() == '_' { + lb.pop(); + } else if let Some(new_decimal_part) = + checked_mul_add(decimal_part, 10, char2num(lb.curr())) + { + decimal_part = new_decimal_part; + digits += 1; + lb.pop(); + } else { + lb.error(LexemErrorVariant::DecimalPartTooBig); + break; } - tb.bake(LexemType::Float( - integer_part as f64 + decimal_part as f64 / 10f64.powf(digits as f64), - )) - } else { - tb.bake(LexemType::Float(integer_part as f64)) } + lb.bake(LexemType::Float( + integer_part as f64 + decimal_part as f64 / 10f64.powf(digits as f64), + )) } else { - None + lb.bake(LexemType::Float(integer_part as f64)) } } #[cfg(test)] mod tests { use crate::lexer::{ - lexem::{Lexem, LexemType}, + lexem::{Lexem, LexemError, LexemErrorVariant, LexemType}, matchers::test_utils::{lexem_with, matcher_with}, }; use super::match_numerical; fn matcher(string: &'static str) -> Option { + let r = matcher_with(match_numerical, string); + assert!(r.1.is_empty()); + r.0 + } + + fn err_matcher(string: &'static str) -> (Option, Vec) { matcher_with(match_numerical, string) } @@ -129,19 +125,17 @@ mod tests { } #[test] - fn int_limit2() { - assert_eq!( - matcher("9_223_372_036_854_775_808"), - int_lexem(9223372036854775800, (1, 1), (1, 26)) - ); + fn int_just_above_limit() { + let (result, errors) = err_matcher("9_223_372_036_854_775_808"); + assert_eq!(result, int_lexem(922337203685477580, (1, 1), (1, 25))); + assert!(errors[0].variant == LexemErrorVariant::IntegerPartTooBig); } #[test] fn int_above_limit() { - assert_eq!( - matcher("101273576184162375213625468214"), - int_lexem(1012735761841623752, (1, 1), (1, 20)) - ); + let (result, errors) = err_matcher("101273576184162375213625468214"); + assert_eq!(result, int_lexem(1012735761841623752, (1, 1), (1, 20))); + assert!(errors[0].variant == LexemErrorVariant::IntegerPartTooBig); } #[test] @@ -187,11 +181,20 @@ mod tests { } #[test] - fn float_above_limit() { + fn float_just_above_limit() { + let (result, errors) = err_matcher("0.9_223_372_036_854_775_808"); assert_eq!( - matcher("0.101273576184162375213625468214"), - float_lexem(0.10127357618416238, (1, 1), (1, 22)) + result, + float_lexem(0.922_337_203_685_477_7, (1, 1), (1, 27)) ); + assert!(errors[0].variant == LexemErrorVariant::DecimalPartTooBig); + } + + #[test] + fn float_above_limit() { + let (result, errors) = err_matcher("0.101273576184162375213625468214"); + assert_eq!(result, float_lexem(0.10127357618416238, (1, 1), (1, 22))); + assert!(errors[0].variant == LexemErrorVariant::DecimalPartTooBig); } #[test] diff --git a/src/lexer/matchers/operator.rs b/src/lexer/matchers/operator.rs index a8a8156..af33725 100644 --- a/src/lexer/matchers/operator.rs +++ b/src/lexer/matchers/operator.rs @@ -8,30 +8,30 @@ use crate::{ type Op = Operator; /// Matches an operator -pub fn match_operator(t_b: &mut LexemBuilder) -> Option { - match t_b.peek() { - '+' => char_match!(t_b, Op::Plus), - '-' => char_match!(t_b, Op::Minus, '>', Op::Arrow), - '*' => char_match!(t_b, Op::Asterisk), - '=' => char_match!(t_b, Op::Equal, '=', Op::DoubleEqual), - '<' => char_match!(t_b, Op::Lesser, '=', Op::LesserEqual), - '>' => char_match!(t_b, Op::Greater, '=', Op::GreaterEqual), - '(' => char_match!(t_b, Op::OpenRoundBracket), - ')' => char_match!(t_b, Op::CloseRoundBracket), - '{' => char_match!(t_b, Op::OpenCurlyBracket), - '}' => char_match!(t_b, Op::CloseCurlyBracket), - '[' => char_match!(t_b, Op::OpenSquareBracket), - ']' => char_match!(t_b, Op::CloseSquareBracket), - ':' => char_match!(t_b, Op::Colon, ':', Op::DoubleColon), - '&' => char_match!(t_b, Op::And), - '|' => char_match!(t_b, Op::Or), - ';' => char_match!(t_b, Op::Semicolon), - ',' => char_match!(t_b, Op::Split), - '!' => char_match!(t_b, Op::ExclamationMark, '=', Op::Unequal), - '%' => char_match!(t_b, Op::Modulo), +pub fn match_operator(lb: &mut LexemBuilder) -> Option { + match lb.curr() { + '+' => char_match!(lb, Op::Plus), + '-' => char_match!(lb, Op::Minus, '>', Op::Arrow), + '*' => char_match!(lb, Op::Asterisk), + '=' => char_match!(lb, Op::Equal, '=', Op::DoubleEqual), + '<' => char_match!(lb, Op::Lesser, '=', Op::LesserEqual), + '>' => char_match!(lb, Op::Greater, '=', Op::GreaterEqual), + '(' => char_match!(lb, Op::OpenRoundBracket), + ')' => char_match!(lb, Op::CloseRoundBracket), + '{' => char_match!(lb, Op::OpenCurlyBracket), + '}' => char_match!(lb, Op::CloseCurlyBracket), + '[' => char_match!(lb, Op::OpenSquareBracket), + ']' => char_match!(lb, Op::CloseSquareBracket), + ':' => char_match!(lb, Op::Colon, ':', Op::DoubleColon), + '&' => char_match!(lb, Op::And), + '|' => char_match!(lb, Op::Or), + ';' => char_match!(lb, Op::Semicolon), + ',' => char_match!(lb, Op::Split), + '!' => char_match!(lb, Op::ExclamationMark, '=', Op::Unequal), + '%' => char_match!(lb, Op::Modulo), _ => None, } - .map(|operator| t_b.bake_raw(LexemType::Operator(operator))) + .map(|operator| lb.bake_raw(LexemType::Operator(operator))) } #[cfg(test)] @@ -45,7 +45,9 @@ mod tests { use super::match_operator; fn matcher(string: &'static str) -> Option { - matcher_with(match_operator, string) + let r = matcher_with(match_operator, string); + assert!(r.1.is_empty()); + r.0 } fn lexem(operator: Operator, start: (usize, usize), stop: (usize, usize)) -> Option { diff --git a/src/lexer/matchers/string.rs b/src/lexer/matchers/string.rs index 2051b43..c7fbf68 100644 --- a/src/lexer/matchers/string.rs +++ b/src/lexer/matchers/string.rs @@ -1,63 +1,81 @@ use crate::{ - lexer::lexem::{Lexem, LexemBuilder, LexemType}, + lexer::lexem::{Lexem, LexemBuilder, LexemErrorVariant, LexemType}, scannable::Scannable, }; /// Matches a string constant -pub fn match_string(tb: &mut LexemBuilder) -> Option { - if tb.peek() == '"' { - tb.pop(); - Some(complete_string(tb)) +pub fn match_string(lb: &mut LexemBuilder, max: usize) -> Option { + if lb.curr() == '"' { + lb.pop(); + Some(complete_string(lb, max)) } else { None } } +/// Handles different kinds of escape characters +fn escape_characters(lb: &mut LexemBuilder, content: &mut Vec) { + match lb.curr() { + '0' => content.push('\0'), + 'b' => content.push('\x08'), + 'f' => content.push('\x0c'), + 'n' => content.push('\n'), + 'r' => content.push('\r'), + 't' => content.push('\t'), + '"' => content.push('"'), + '\\' => content.push('\\'), + c => { + lb.error(LexemErrorVariant::InvalidEscapeCharacter(c)); + } + } +} + /// Completes a string constant -fn complete_string(tb: &mut LexemBuilder) -> Lexem { +fn complete_string(lb: &mut LexemBuilder, max: usize) -> Lexem { let mut content: Vec = vec![]; loop { - let c = tb.peek(); - match tb.peek() { + let c = lb.curr(); + match lb.curr() { '\\' => { - let pos = tb.get_here(); - tb.pop(); - match tb.peek() { // Więcej znaków ucieczki np. z JSON-a - '\\' => content.push('\\'), - '"' => content.push('"'), - c => { - eprintln!( // Zwracać błąd lepiej - "Unknown escape sequence `\\{}` inside string at {}.", - c, pos - ) - } - } + lb.pop(); + escape_characters(lb, &mut content); } '\x03' => { - eprintln!("String started at {} never ends.", tb.get_start()); - break tb.bake_raw(LexemType::String(content.into_iter().collect())); + lb.error(LexemErrorVariant::StringNeverEnds); + break lb.bake_raw(LexemType::String(content.into_iter().collect())); } '"' => { - tb.pop(); - break tb.bake_raw(LexemType::String(content.into_iter().collect())); + lb.pop(); + break lb.bake_raw(LexemType::String(content.into_iter().collect())); } _ => content.push(c), } - tb.pop(); + if content.len() > max { + content.pop(); + lb.error(LexemErrorVariant::StringTooLong); + break lb.bake_raw(LexemType::String(content.into_iter().collect())); + } + lb.pop(); } } #[cfg(test)] mod tests { use crate::lexer::{ - lexem::{Lexem, LexemType}, + lexem::{Lexem, LexemError, LexemErrorVariant, LexemType}, matchers::test_utils::{lexem_with, matcher_with}, }; use super::match_string; fn matcher(string: &'static str) -> Option { - matcher_with(match_string, string) + let r = matcher_with(|lb| match_string(lb, 32), string); + assert!(r.1.is_empty()); + r.0 + } + + fn err_matcher(string: &'static str) -> (Option, Vec) { + matcher_with(|lb| match_string(lb, 32), string) } fn lexem(string: &'static str, start: (usize, usize), stop: (usize, usize)) -> Option { @@ -86,7 +104,9 @@ mod tests { #[test] fn no_end() { - assert_eq!(matcher("\"abcd"), lexem("abcd", (1, 1), (1, 6))); + let (result, errors) = err_matcher("\"abcd"); + assert_eq!(result, lexem("abcd", (1, 1), (1, 6))); + assert!(errors[0].variant == LexemErrorVariant::StringNeverEnds); } #[test] @@ -96,7 +116,9 @@ mod tests { #[test] fn empty_no_end() { - assert_eq!(matcher("\""), lexem("", (1, 1), (1, 2))); + let (result, errors) = err_matcher("\""); + assert_eq!(result, lexem("", (1, 1), (1, 2))); + assert!(errors[0].variant == LexemErrorVariant::StringNeverEnds); } #[test] @@ -107,6 +129,12 @@ mod tests { #[test] fn escape() { assert_eq!(matcher("\"ab\\\"cd\""), lexem("ab\"cd", (1, 1), (1, 9))); + assert_eq!(matcher("\"\\0\""), lexem("\0", (1, 1), (1, 5))); + assert_eq!(matcher("\"\\b\""), lexem("\x08", (1, 1), (1, 5))); + assert_eq!(matcher("\"\\f\""), lexem("\x0c", (1, 1), (1, 5))); + assert_eq!(matcher("\"\\n\""), lexem("\n", (1, 1), (1, 5))); + assert_eq!(matcher("\"\\r\""), lexem("\r", (1, 1), (1, 5))); + assert_eq!(matcher("\"\\t\""), lexem("\t", (1, 1), (1, 5))); } #[test] @@ -116,7 +144,27 @@ mod tests { #[test] fn unknown_escape() { - assert_eq!(matcher("\"abc\\d\""), lexem("abc", (1, 1), (1, 8))); + let (result, errors) = err_matcher("\"abc\\j\""); + assert_eq!(result, lexem("abc", (1, 1), (1, 8))); + assert!(errors[0].variant == LexemErrorVariant::InvalidEscapeCharacter('j')); + } + + #[test] + fn max_long() { + assert_eq!( + matcher("\"___a___b___a___c___a___b___a___d\""), + lexem("___a___b___a___c___a___b___a___d", (1, 1), (1, 35)) + ); + } + + #[test] + fn too_long() { + let (result, errors) = err_matcher("\"___a___b___a___c___a___b___a___d_\""); + assert_eq!( + result, + lexem("___a___b___a___c___a___b___a___d", (1, 1), (1, 34)) + ); + assert!(errors[0].variant == LexemErrorVariant::StringTooLong); } #[test] diff --git a/src/lexer/matchers/test_utils.rs b/src/lexer/matchers/test_utils.rs index 021c434..5b6e6ab 100644 --- a/src/lexer/matchers/test_utils.rs +++ b/src/lexer/matchers/test_utils.rs @@ -2,17 +2,18 @@ use std::io::BufReader; use crate::lexer::{ char_scanner::CharScanner, - lexem::{Lexem, LexemBuilder, LexemType}, + lexem::{Lexem, LexemBuilder, LexemError, LexemType}, }; #[allow(dead_code)] pub fn matcher_with( matcher: fn(&mut LexemBuilder) -> Option, string: &'static str, -) -> Option { - let scanner = &mut CharScanner::new(BufReader::new(string.as_bytes())); - let lb = &mut LexemBuilder::new(scanner); - matcher(lb) +) -> (Option, Vec) { + let mut scanner = CharScanner::new(BufReader::new(string.as_bytes())); + let mut errors: Vec = vec![]; + let lb = &mut LexemBuilder::new(&mut scanner, &mut errors); + (matcher(lb), errors) } #[allow(dead_code)] diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs index 1b38103..f0da738 100644 --- a/src/lexer/mod.rs +++ b/src/lexer/mod.rs @@ -16,42 +16,49 @@ use matchers::{ use self::{ char_scanner::CharScanner, - lexem::{Lexem, LexemBuilder}, + lexem::{Lexem, LexemBuilder, LexemError, LexemErrorVariant}, }; pub struct Lexer { + max_identifier_length: usize, + max_string_length: usize, + max_comment_length: usize, pub scanner: CharScanner, - // TODO bufor na błędy + pub errors: Vec, } impl Lexer { pub fn new(source: impl BufRead + 'static) -> Self { Self { + max_identifier_length: 256, + max_string_length: 256, + max_comment_length: 256, scanner: CharScanner::new(source), + errors: vec![], } } /// Removes whitespace fn skip_whitespace(&mut self) { - while self.scanner.peek().is_whitespace() { + while self.scanner.curr().is_whitespace() { self.scanner.pop(); } } /// Matches lexems fn match_lexem(&mut self) -> Option { - let tb = &mut LexemBuilder::new(&mut self.scanner); - match_numerical(tb) - .or_else(|| match_identifier_or_keyword(tb)) - .or_else(|| match_operator(tb)) - .or_else(|| match_string(tb)) - .or_else(|| match_comment_or_division(tb)) + let lb = &mut LexemBuilder::new(&mut self.scanner, &mut self.errors); + match_numerical(lb) + .or_else(|| match_identifier_or_keyword(lb, self.max_identifier_length)) + .or_else(|| match_operator(lb)) + .or_else(|| match_string(lb, self.max_string_length)) + .or_else(|| match_comment_or_division(lb, self.max_comment_length)) } /// Skips whitespace and matches lexems or ETX fn skip_and_match(&mut self) -> Option> { self.skip_whitespace(); - if self.scanner.peek() == '\x03' { + if self.scanner.curr() == '\x03' { Some(None) } else { self.match_lexem().map(Some) @@ -69,37 +76,45 @@ impl Lexer { loop { if let Some(lexem) = self.skip_and_match() { if !invalid_sequence.is_empty() { - eprintln!( - "Invalid sequence of characters `{}` from {} to {}", - invalid_sequence.iter().collect::(), - sequence_start, - sequence_stop - ) + self.errors.push(LexemError { + start: sequence_start, + end: sequence_stop, + variant: LexemErrorVariant::InvalidSequence( + invalid_sequence.iter().collect::(), + ), + }); } break lexem; } else { - invalid_sequence.push(self.scanner.peek()); + invalid_sequence.push(self.scanner.curr()); self.scanner.pop(); sequence_stop = self.scanner.last_pos(); } } } } -} - -impl Iterator for Lexer { - type Item = Lexem; - fn next(&mut self) -> Option { + /// Returns lexems until it runs out + #[allow(dead_code)] + pub fn next(&mut self) -> Option { self.catch_invalid_sequence() } + + /// Returns all lexems + pub fn all(&mut self) -> Vec { + let mut lexems = vec![]; + while let Some(l) = self.catch_invalid_sequence() { + lexems.push(l); + } + lexems + } } #[cfg(test)] mod tests { use std::{fs::OpenOptions, io::BufReader}; - use crate::lexer::{keywords::Keyword, operators::Operator, Lexer}; + use crate::lexer::{keywords::Keyword, lexem::LexemErrorVariant, operators::Operator, Lexer}; use super::lexem::{Lexem, LexemType}; @@ -146,23 +161,25 @@ mod tests { .read(true) .open("snippets/short.txt") .unwrap(); - let parser = Lexer::new(BufReader::new(file)); - let output = parser.into_iter().collect::>(); + let mut parser = Lexer::new(BufReader::new(file)); + let output = parser.all(); assert_eq!(output, correct_output()); + assert!(parser.errors.is_empty()); } #[test] fn test_string() { let string = "// do nothing\nfn main() {\n let a = 5;\n}"; - let parser = Lexer::new(BufReader::new(string.as_bytes())); - let output = parser.into_iter().collect::>(); + let mut parser = Lexer::new(BufReader::new(string.as_bytes())); + let output = parser.all(); assert_eq!(output, correct_output()); + assert!(parser.errors.is_empty()); } #[test] fn invalid_sequence() { let string = "invalid $@#@$#@$#$@ sequence breaks$stuff 0#.323"; - let parser = Lexer::new(BufReader::new(string.as_bytes())); + let mut parser = Lexer::new(BufReader::new(string.as_bytes())); let correct_output = vec![ Lexem::new(LexemType::Identifier("invalid".to_owned()), (1, 1), (1, 8)), Lexem::new( @@ -175,14 +192,20 @@ mod tests { Lexem::new(LexemType::Int(0), (1, 43), (1, 44)), Lexem::new(LexemType::Int(323), (1, 46), (1, 49)), ]; - let output = parser.into_iter().collect::>(); + let output = parser.all(); assert_eq!(output, correct_output); + assert!( + parser.errors[0].variant + == LexemErrorVariant::InvalidSequence("$@#@$#@$#$@".to_owned()) + ); + assert!(parser.errors[1].variant == LexemErrorVariant::InvalidSequence("$".to_owned())); + assert!(parser.errors[2].variant == LexemErrorVariant::InvalidSequence("#.".to_owned())); } #[test] fn incomplete_string() { let string = "// do nothing\nfn main() \"{\n let a = 5;\n}\n"; - let parser = Lexer::new(BufReader::new(string.as_bytes())); + let mut parser = Lexer::new(BufReader::new(string.as_bytes())); let correct_output = vec![ Lexem::new( LexemType::Comment(" do nothing".to_owned()), @@ -207,14 +230,15 @@ mod tests { (5, 1), ), ]; - let output = parser.into_iter().collect::>(); + let output = parser.all(); assert_eq!(output, correct_output); + assert!(parser.errors[0].variant == LexemErrorVariant::StringNeverEnds); } #[test] fn incomplete_comment() { let string = "// do nothing\nfn main() /*{\n let a = 5;\n}\n"; - let parser = Lexer::new(BufReader::new(string.as_bytes())); + let mut parser = Lexer::new(BufReader::new(string.as_bytes())); let correct_output = vec![ Lexem::new( LexemType::Comment(" do nothing".to_owned()), @@ -239,7 +263,8 @@ mod tests { (5, 1), ), ]; - let output = parser.into_iter().collect::>(); + let output = parser.all(); assert_eq!(output, correct_output); + assert!(parser.errors[0].variant == LexemErrorVariant::CommentNeverEnds); } } diff --git a/src/main.rs b/src/main.rs index c7e0f49..027e215 100644 --- a/src/main.rs +++ b/src/main.rs @@ -63,11 +63,18 @@ fn parse_args() -> Result { /// Consumes and prints all lexems fn print_lexems(lexer: &mut Lexer) { - for token in lexer { + for token in lexer.all() { println!("{}", token); } } +/// Prints all errors +fn print_errors(lexer: &Lexer) { + for e in &lexer.errors { + eprintln!("{}", e); + } +} + /// Application error containing message and process return code struct AppError { msg: String, @@ -121,5 +128,7 @@ fn run(input: InputType) -> Result<(), AppError> { print_lexems(&mut lexer); // TEMPORARY + print_errors(&lexer); + Ok(()) } diff --git a/src/parser/tokens.rs b/src/parser/tokens.rs new file mode 100644 index 0000000..e69de29 diff --git a/src/scannable.rs b/src/scannable.rs index 0c441c3..fca18ca 100644 --- a/src/scannable.rs +++ b/src/scannable.rs @@ -1,5 +1,5 @@ /// Iterator which buffers last result pub trait Scannable { - fn peek(&self) -> T; + fn curr(&self) -> T; fn pop(&mut self) -> bool; }