Skip to content

Commit

Permalink
feat(parser): the 1st version of a new combinator style parser (#16876)
Browse files Browse the repository at this point in the history
Signed-off-by: TennyZhuang <[email protected]>
Co-authored-by: Bugen Zhao <[email protected]>
  • Loading branch information
TennyZhuang and BugenZhao authored May 24, 2024
1 parent 7a16a2c commit 1aa8579
Show file tree
Hide file tree
Showing 11 changed files with 719 additions and 159 deletions.
28 changes: 19 additions & 9 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 3 additions & 4 deletions src/sqlparser/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,10 +1,7 @@
[package]
name = "risingwave_sqlparser"
license = "Apache-2.0"
include = [
"src/**/*.rs",
"Cargo.toml",
]
include = ["src/**/*.rs", "Cargo.toml"]
version = { workspace = true }
edition = { workspace = true }
homepage = { workspace = true }
Expand All @@ -27,8 +24,10 @@ normal = ["workspace-hack"]
[dependencies]
itertools = { workspace = true }
serde = { version = "1.0", features = ["derive"], optional = true }
thiserror = "1.0.61"
tracing = "0.1"
tracing-subscriber = "0.3"
winnow = { version = "0.6.8", git = "https://github.com/TennyZhuang/winnow.git", rev = "a6b1f04" }

[target.'cfg(not(madsim))'.dependencies]
workspace-hack = { path = "../workspace-hack" }
Expand Down
1 change: 1 addition & 0 deletions src/sqlparser/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ extern crate alloc;
pub mod ast;
pub mod keywords;
pub mod parser;
pub mod parser_v2;
pub mod tokenizer;

#[doc(hidden)]
Expand Down
180 changes: 42 additions & 138 deletions src/sqlparser/src/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ use tracing::{debug, instrument};

use crate::ast::*;
use crate::keywords::{self, Keyword};
use crate::parser_v2;
use crate::tokenizer::*;

pub(crate) const UPSTREAM_SOURCE_KEY: &str = "connector";
Expand Down Expand Up @@ -172,19 +173,51 @@ pub struct Parser {
tokens: Vec<TokenWithLocation>,
/// The index of the first unprocessed token in `self.tokens`
index: usize,
/// Since we cannot distinguish `>>` from two consecutive `>`s, use `angle_brackets_num` to store
/// the number of `<` still awaiting a matching `>` in SQL like `struct<v1 struct<v2 int>>`.
angle_brackets_num: i32,
}

impl Parser {
/// Parse the specified tokens
pub fn new(tokens: Vec<TokenWithLocation>) -> Self {
Parser {
tokens,
index: 0,
angle_brackets_num: 0,
}
Parser { tokens, index: 0 }
}

/// Adaptor for [`parser_v2`].
///
/// You can call a v2 parser from the original parser by using this method.
///
/// The unprocessed suffix of the token buffer (from `self.index` on) is wrapped
/// in a located winnow stream and handed to `parse_next`. Afterwards the v1
/// cursor `self.index` is advanced by the number of tokens the v2 parser
/// consumed (this happens whether parsing succeeded or failed), so v1 parsing
/// resumes right after the consumed span. On failure the winnow error is
/// converted into a [`ParserError`] naming the offending token, or `EOF` when
/// the failure position is past the end of the token buffer.
pub(crate) fn parse_v2<'a, O>(
    &'a mut self,
    mut parse_next: impl winnow::Parser<
        winnow::Located<parser_v2::TokenStreamWrapper<'a>>,
        O,
        winnow::error::ContextError,
    >,
) -> Result<O, ParserError> {
    use winnow::stream::Location;

    // Only hand over the tokens that v1 has not processed yet.
    let mut token_stream = winnow::Located::new(parser_v2::TokenStreamWrapper {
        tokens: &self.tokens[self.index..],
    });
    let output = parse_next.parse_next(&mut token_stream).map_err(|e| {
        // Append the underlying cause (if the error carries one) to the message.
        let msg = if let Some(e) = e.into_inner()
            && let Some(cause) = e.cause()
        {
            format!(": {}", cause)
        } else {
            "".to_string()
        };
        ParserError::ParserError(format!(
            "Unexpected {}{}",
            // `location()` is an offset into the suffix we handed over, so add
            // `self.index` to index back into the full token buffer.
            if self.index + token_stream.location() >= self.tokens.len() {
                &"EOF" as &dyn std::fmt::Display
            } else {
                &self.tokens[self.index + token_stream.location()] as &dyn std::fmt::Display
            },
            msg
        ))
    });
    // Advance the v1 cursor by however many tokens the v2 parser consumed.
    let offset = token_stream.location();
    self.index += offset;
    output
}

/// Parse a SQL statement and produce an Abstract Syntax Tree (AST)
Expand Down Expand Up @@ -3824,136 +3857,7 @@ impl Parser {
/// Parses a SQL data type (e.g. in a CREATE TABLE statement), then wraps it
/// in [`DataType::Array`] once for every trailing `[]` pair (e.g. `int[][]`).
pub fn parse_data_type(&mut self) -> Result<DataType, ParserError> {
    // Parse the element type first, then apply array suffixes outside-in.
    let mut parsed = self.parse_data_type_inner()?;
    loop {
        if !self.consume_token(&Token::LBracket) {
            break;
        }
        // Each `[` must be immediately closed by a `]`.
        self.expect_token(&Token::RBracket)?;
        parsed = DataType::Array(Box::new(parsed));
    }
    Ok(parsed)
}

/// Parse struct `data_type` e.g.`<v1 int, v2 int, v3 struct<...>>`.
///
/// `self.angle_brackets_num` tracks unmatched `<` across nested invocations so
/// that a single `>>` token (which the tokenizer cannot split into two `>`s)
/// can close two levels of `struct<...>` at once.
///
/// Returns the parsed fields, or a [`ParserError`] when the `<`, field names,
/// separators, or closing brackets are malformed.
pub fn parse_struct_data_type(&mut self) -> Result<Vec<StructField>, ParserError> {
    let mut columns = vec![];
    if !self.consume_token(&Token::Lt) {
        return self.expected("'<' after struct", self.peek_token());
    }
    self.angle_brackets_num += 1;

    loop {
        if let Token::Word(_) = self.peek_token().token {
            let name = self.parse_identifier_non_reserved()?;
            // The field type may itself be a struct, recursing through
            // `parse_data_type` and sharing `angle_brackets_num`.
            let data_type = self.parse_data_type()?;
            columns.push(StructField { name, data_type })
        } else {
            return self.expected("struct field name", self.peek_token());
        }
        if self.angle_brackets_num == 0 {
            // A nested parse already consumed this level's `>` as part of a `>>`.
            break;
        } else if self.consume_token(&Token::Gt) {
            self.angle_brackets_num -= 1;
            break;
        } else if self.consume_token(&Token::ShiftRight) {
            if self.angle_brackets_num >= 1 {
                // `>>` closes this level and the enclosing one in one token.
                self.angle_brackets_num -= 2;
                break;
            } else {
                // Fix: grammatical error message (was "too much '>'").
                return parser_err!("too many '>'");
            }
        } else if !self.consume_token(&Token::Comma) {
            return self.expected("',' or '>' after column definition", self.peek_token());
        }
    }

    Ok(columns)
}

/// Parse a SQL datatype
pub fn parse_data_type_inner(&mut self) -> Result<DataType, ParserError> {
let token = self.next_token();
match token.token {
Token::Word(w) => match w.keyword {
Keyword::BOOLEAN | Keyword::BOOL => Ok(DataType::Boolean),
Keyword::FLOAT => {
let precision = self.parse_optional_precision()?;
match precision {
Some(0) => Err(ParserError::ParserError(
"precision for type float must be at least 1 bit".to_string(),
)),
Some(54..) => Err(ParserError::ParserError(
"precision for type float must be less than 54 bits".to_string(),
)),
_ => Ok(DataType::Float(precision)),
}
}
Keyword::REAL => Ok(DataType::Real),
Keyword::DOUBLE => {
let _ = self.parse_keyword(Keyword::PRECISION);
Ok(DataType::Double)
}
Keyword::SMALLINT => Ok(DataType::SmallInt),
Keyword::INT | Keyword::INTEGER => Ok(DataType::Int),
Keyword::BIGINT => Ok(DataType::BigInt),
Keyword::STRING | Keyword::VARCHAR => Ok(DataType::Varchar),
Keyword::CHAR | Keyword::CHARACTER => {
if self.parse_keyword(Keyword::VARYING) {
Ok(DataType::Varchar)
} else {
Ok(DataType::Char(self.parse_optional_precision()?))
}
}
Keyword::UUID => Ok(DataType::Uuid),
Keyword::DATE => Ok(DataType::Date),
Keyword::TIMESTAMP => {
let with_time_zone = self.parse_keyword(Keyword::WITH);
if with_time_zone || self.parse_keyword(Keyword::WITHOUT) {
self.expect_keywords(&[Keyword::TIME, Keyword::ZONE])?;
}
Ok(DataType::Timestamp(with_time_zone))
}
Keyword::TIME => {
let with_time_zone = self.parse_keyword(Keyword::WITH);
if with_time_zone || self.parse_keyword(Keyword::WITHOUT) {
self.expect_keywords(&[Keyword::TIME, Keyword::ZONE])?;
}
Ok(DataType::Time(with_time_zone))
}
// Interval types can be followed by a complicated interval
// qualifier that we don't currently support. See
// parse_interval_literal for a taste.
Keyword::INTERVAL => Ok(DataType::Interval),
Keyword::REGCLASS => Ok(DataType::Regclass),
Keyword::REGPROC => Ok(DataType::Regproc),
Keyword::TEXT => {
if self.consume_token(&Token::LBracket) {
// Note: this is postgresql-specific
self.expect_token(&Token::RBracket)?;
Ok(DataType::Array(Box::new(DataType::Text)))
} else {
Ok(DataType::Text)
}
}
Keyword::STRUCT => Ok(DataType::Struct(self.parse_struct_data_type()?)),
Keyword::BYTEA => Ok(DataType::Bytea),
Keyword::NUMERIC | Keyword::DECIMAL | Keyword::DEC => {
let (precision, scale) = self.parse_optional_precision_scale()?;
Ok(DataType::Decimal(precision, scale))
}
_ => {
self.prev_token();
let type_name = self.parse_object_name()?;
// JSONB is not a keyword
if type_name.to_string().eq_ignore_ascii_case("jsonb") {
Ok(DataType::Jsonb)
} else {
Ok(DataType::Custom(type_name))
}
}
},
unexpected => {
self.expected("a data type name", unexpected.with_location(token.location))
}
}
self.parse_v2(parser_v2::data_type)
}

/// Parse `AS identifier` (or simply `identifier` if it's not a reserved keyword)
Expand Down
Loading

0 comments on commit 1aa8579

Please sign in to comment.