Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(parser): the 1st version of a new combinator style parser #16876

Merged
merged 39 commits into from
May 24, 2024
Merged
Show file tree
Hide file tree
Changes from 33 commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
64132ed
feat(parser): the 1st version of a new parser-combinator style parser
TennyZhuang May 21, 2024
f68d97e
add parse_v2
TennyZhuang May 21, 2024
12bfadf
Update src/sqlparser/src/parser_v2/data_type.rs
TennyZhuang May 21, 2024
b0a0afd
minor fix
TennyZhuang May 22, 2024
8ea355a
minor improvement
TennyZhuang May 22, 2024
332234b
minor fix
TennyZhuang May 22, 2024
2ccd163
add some label
TennyZhuang May 22, 2024
5a3b34d
introduce TokenStreamWrapper for better readability
TennyZhuang May 22, 2024
6c9683a
fix parse_v2
TennyZhuang May 22, 2024
905fa6a
handle whitespace
TennyZhuang May 22, 2024
80208ef
use preceed
TennyZhuang May 22, 2024
9c9472f
remove dbg
TennyZhuang May 22, 2024
2d6692c
fix custom data type
TennyZhuang May 22, 2024
7e8166d
fix array parsing
TennyZhuang May 23, 2024
686217c
add a context
TennyZhuang May 23, 2024
51f9265
fix unused import
TennyZhuang May 23, 2024
89f67e5
fix struct parsing
TennyZhuang May 23, 2024
6b62577
fix float parsing
TennyZhuang May 23, 2024
32d29db
fix precision parsing
TennyZhuang May 23, 2024
c76c314
add many comments
TennyZhuang May 23, 2024
d00ec9f
fix clippy
TennyZhuang May 23, 2024
60f2019
simplify error def
TennyZhuang May 23, 2024
3b366ab
fix precision parsing
TennyZhuang May 23, 2024
26c7499
fix double
TennyZhuang May 23, 2024
b26810d
fix custom type parsing
TennyZhuang May 23, 2024
5cdab19
1..54
TennyZhuang May 23, 2024
79ce4ea
support TEXT
TennyZhuang May 23, 2024
df6137b
fix
TennyZhuang May 23, 2024
4504419
fix ut
TennyZhuang May 23, 2024
0e239cc
refine error message
TennyZhuang May 23, 2024
9ba9e97
fix stateful parsing
TennyZhuang May 23, 2024
3cf20d1
refine error
TennyZhuang May 23, 2024
88562ff
fix ut
TennyZhuang May 23, 2024
8452dc7
fix struct sep parsing
TennyZhuang May 24, 2024
f9d93c7
Update src/sqlparser/src/parser_v2/number.rs
TennyZhuang May 24, 2024
162a663
address comments
TennyZhuang May 24, 2024
cc9dbc3
fix warning
TennyZhuang May 24, 2024
d8b7d60
fix
TennyZhuang May 24, 2024
b7d79e1
revert a behavior
TennyZhuang May 24, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 19 additions & 9 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 3 additions & 4 deletions src/sqlparser/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,10 +1,7 @@
[package]
name = "risingwave_sqlparser"
license = "Apache-2.0"
include = [
"src/**/*.rs",
"Cargo.toml",
]
include = ["src/**/*.rs", "Cargo.toml"]
version = { workspace = true }
edition = { workspace = true }
homepage = { workspace = true }
Expand All @@ -27,8 +24,10 @@ normal = ["workspace-hack"]
[dependencies]
itertools = { workspace = true }
serde = { version = "1.0", features = ["derive"], optional = true }
thiserror = "1.0.61"
tracing = "0.1"
tracing-subscriber = "0.3"
winnow = { version = "0.6.8", git = "https://github.com/TennyZhuang/winnow.git", rev = "a6b1f04" }

[target.'cfg(not(madsim))'.dependencies]
workspace-hack = { path = "../workspace-hack" }
Expand Down
1 change: 1 addition & 0 deletions src/sqlparser/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ extern crate alloc;
pub mod ast;
pub mod keywords;
pub mod parser;
pub mod parser_v2;
pub mod tokenizer;

#[doc(hidden)]
Expand Down
180 changes: 42 additions & 138 deletions src/sqlparser/src/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ use tracing::{debug, instrument};

use crate::ast::*;
use crate::keywords::{self, Keyword};
use crate::parser_v2;
use crate::tokenizer::*;

pub(crate) const UPSTREAM_SOURCE_KEY: &str = "connector";
Expand Down Expand Up @@ -172,19 +173,51 @@ pub struct Parser {
tokens: Vec<TokenWithLocation>,
/// The index of the first unprocessed token in `self.tokens`
index: usize,
/// Since we cannot distinguish `>>` and double `>`, so use `angle_brackets_num` to store the
/// number of `<` to match `>` in sql like `struct<v1 struct<v2 int>>`.
angle_brackets_num: i32,
}

impl Parser {
/// Construct a parser over the specified token stream.
///
/// Parsing starts at the first token; `index` tracks the next
/// unprocessed token in `tokens`.
pub fn new(tokens: Vec<TokenWithLocation>) -> Self {
    Parser { tokens, index: 0 }
}

/// Adaptor for [`parser_v2`].
///
/// You can call a v2 parser from original parser by using this method.
pub(crate) fn parse_v2<'a, O>(
&'a mut self,
mut parse_next: impl winnow::Parser<
winnow::Located<parser_v2::TokenStreamWrapper<'a>>,
O,
winnow::error::ContextError,
>,
) -> Result<O, ParserError> {
use winnow::stream::Location;

let mut token_stream = winnow::Located::new(parser_v2::TokenStreamWrapper {
tokens: &self.tokens[self.index..],
});
let output = parse_next.parse_next(&mut token_stream).map_err(|e| {
let msg = if let Some(e) = e.into_inner()
&& let Some(cause) = e.cause()
{
format!(": {}", cause)
} else {
"".to_string()
};
ParserError::ParserError(format!(
"Unexpected {}{}",
if self.index + token_stream.location() >= self.tokens.len() {
&"EOF" as &dyn std::fmt::Display
} else {
&self.tokens[self.index + token_stream.location()] as &dyn std::fmt::Display
},
msg
))
Comment on lines +201 to +216
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we use ParseError here for better error reporting?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Which ParserError? If you mean v2, I'd prefer give user a consistent user experience now.

});
let offset = token_stream.location();
self.index += offset;
output
}

/// Parse a SQL statement and produce an Abstract Syntax Tree (AST)
Expand Down Expand Up @@ -3806,136 +3839,7 @@ impl Parser {
/// Parse a SQL datatype (in the context of a CREATE TABLE statement for
/// example) and convert into an array of that datatype if needed.
pub fn parse_data_type(&mut self) -> Result<DataType, ParserError> {
    // Parse the element (base) type first.
    let mut ty = self.parse_data_type_inner()?;
    // Each trailing `[]` pair wraps the type in one more array level,
    // e.g. `int[][]` becomes Array(Array(Int)).
    loop {
        if !self.consume_token(&Token::LBracket) {
            break;
        }
        self.expect_token(&Token::RBracket)?;
        ty = DataType::Array(Box::new(ty));
    }
    Ok(ty)
}

/// Parse struct `data_type` e.g.`<v1 int, v2 int, v3 struct<...>>`.
pub fn parse_struct_data_type(&mut self) -> Result<Vec<StructField>, ParserError> {
    let mut columns = vec![];
    // A struct type's field list must open with `<`.
    if !self.consume_token(&Token::Lt) {
        return self.expected("'<' after struct", self.peek_token());
    }
    // The tokenizer lexes `>>` as a single ShiftRight token, so a shared
    // depth counter on the parser tracks how many `<` remain open across
    // nested struct parses.
    self.angle_brackets_num += 1;

    loop {
        // Each field is `<identifier> <data_type>`; the data type may
        // itself be a struct, recursing back through parse_data_type.
        if let Token::Word(_) = self.peek_token().token {
            let name = self.parse_identifier_non_reserved()?;
            let data_type = self.parse_data_type()?;
            columns.push(StructField { name, data_type })
        } else {
            return self.expected("struct field name", self.peek_token());
        }
        if self.angle_brackets_num == 0 {
            // A nested field's `>>` already accounted for this level's
            // closing bracket — nothing left to consume here.
            break;
        } else if self.consume_token(&Token::Gt) {
            // Plain `>` closes exactly this level.
            self.angle_brackets_num -= 1;
            break;
        } else if self.consume_token(&Token::ShiftRight) {
            // `>>` closes this level and one enclosing level at once.
            if self.angle_brackets_num >= 1 {
                self.angle_brackets_num -= 2;
                break;
            } else {
                return parser_err!("too much '>'");
            }
        } else if !self.consume_token(&Token::Comma) {
            return self.expected("',' or '>' after column definition", self.peek_token());
        }
    }

    Ok(columns)
}

/// Parse a SQL datatype
pub fn parse_data_type_inner(&mut self) -> Result<DataType, ParserError> {
let token = self.next_token();
match token.token {
Token::Word(w) => match w.keyword {
Keyword::BOOLEAN | Keyword::BOOL => Ok(DataType::Boolean),
Keyword::FLOAT => {
let precision = self.parse_optional_precision()?;
match precision {
Some(0) => Err(ParserError::ParserError(
"precision for type float must be at least 1 bit".to_string(),
)),
Some(54..) => Err(ParserError::ParserError(
"precision for type float must be less than 54 bits".to_string(),
)),
_ => Ok(DataType::Float(precision)),
}
}
Keyword::REAL => Ok(DataType::Real),
Keyword::DOUBLE => {
let _ = self.parse_keyword(Keyword::PRECISION);
Ok(DataType::Double)
}
Keyword::SMALLINT => Ok(DataType::SmallInt),
Keyword::INT | Keyword::INTEGER => Ok(DataType::Int),
Keyword::BIGINT => Ok(DataType::BigInt),
Keyword::STRING | Keyword::VARCHAR => Ok(DataType::Varchar),
Keyword::CHAR | Keyword::CHARACTER => {
if self.parse_keyword(Keyword::VARYING) {
Ok(DataType::Varchar)
} else {
Ok(DataType::Char(self.parse_optional_precision()?))
}
}
Keyword::UUID => Ok(DataType::Uuid),
Keyword::DATE => Ok(DataType::Date),
Keyword::TIMESTAMP => {
let with_time_zone = self.parse_keyword(Keyword::WITH);
if with_time_zone || self.parse_keyword(Keyword::WITHOUT) {
self.expect_keywords(&[Keyword::TIME, Keyword::ZONE])?;
}
Ok(DataType::Timestamp(with_time_zone))
}
Keyword::TIME => {
let with_time_zone = self.parse_keyword(Keyword::WITH);
if with_time_zone || self.parse_keyword(Keyword::WITHOUT) {
self.expect_keywords(&[Keyword::TIME, Keyword::ZONE])?;
}
Ok(DataType::Time(with_time_zone))
}
// Interval types can be followed by a complicated interval
// qualifier that we don't currently support. See
// parse_interval_literal for a taste.
Keyword::INTERVAL => Ok(DataType::Interval),
Keyword::REGCLASS => Ok(DataType::Regclass),
Keyword::REGPROC => Ok(DataType::Regproc),
Keyword::TEXT => {
if self.consume_token(&Token::LBracket) {
// Note: this is postgresql-specific
self.expect_token(&Token::RBracket)?;
Ok(DataType::Array(Box::new(DataType::Text)))
} else {
Ok(DataType::Text)
}
}
Keyword::STRUCT => Ok(DataType::Struct(self.parse_struct_data_type()?)),
Keyword::BYTEA => Ok(DataType::Bytea),
Keyword::NUMERIC | Keyword::DECIMAL | Keyword::DEC => {
let (precision, scale) = self.parse_optional_precision_scale()?;
Ok(DataType::Decimal(precision, scale))
}
_ => {
self.prev_token();
let type_name = self.parse_object_name()?;
// JSONB is not a keyword
if type_name.to_string().eq_ignore_ascii_case("jsonb") {
Ok(DataType::Jsonb)
} else {
Ok(DataType::Custom(type_name))
}
}
},
unexpected => {
self.expected("a data type name", unexpected.with_location(token.location))
}
}
self.parse_v2(parser_v2::data_type)
}

/// Parse `AS identifier` (or simply `identifier` if it's not a reserved keyword)
Expand Down
Loading
Loading