-
Notifications
You must be signed in to change notification settings - Fork 0
/
preludio.g4
236 lines (193 loc) · 7.17 KB
/
preludio.g4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
// TODO:
// - Some rules are silent because we don't strictly need them, but that may be too rushed — maybe
//   we do actually want to know about e.g. comments in the parse tree (and potentially put them
//   into SQL comments).
// - Need to resolve how to handle "inline pipelines"; there is a rule here but it's not used or
//   tested. It's partly a language question — do those need to start with `from`? How do these
//   work in the midst of an `aggregate` transform?
grammar preludio;
// ---- Keywords, operators and punctuation ----
// All of these are declared ahead of IDENT. When two lexer rules match the same text with the
// same length, ANTLR emits the rule declared first, so words such as `func` or `and` lex as
// keywords rather than identifiers.

// Keywords that introduce a function definition, the program header and a variable declaration.
FUNC: 'func';
PRQL: 'prql';
LET: 'let';
// Separates a function's parameters from its body in `funcDef`.
ARROW: '->';
ASSIGN: '=';
// Arithmetic operators. `**` is longer than `*`, so the lexer's longest-match rule picks POW
// whenever two stars are adjacent.
PLUS: '+';
MINUS: '-';
STAR: '*';
POW: '**';
DIV: '/';
MOD: '%';
// Binary operator accepted by `expr`.
MODEL: '~';
// Comparison operators. `<` and `>` are tokenized as LANG / RANG further down, and `expr` accepts
// those two tokens as comparison operators, which is why LT / GT are commented out here.
EQ: '==';
NE: '!=';
LE: '<=';
// LT: '<';
GE: '>=';
// GT: '>';
// Punctuation.
BAR: '|';
COLON: ':';
COMMA: ',';
DOT: '.';
DOLLAR: '$';
// Range separator used by the range alternative of `literal`.
RANGE: '..';
// Angle brackets: delimiters of `typeDef`, and less-than / greater-than inside `expr`.
LANG: '<';
RANG: '>';
LBRACKET: '[';
RBRACKET: ']';
LPAREN: '(';
RPAREN: ')';
UNDERSCORE: '_';
BACKTICK: '`';
// Stand-alone quote tokens. No parser rule visible in this grammar references them; complete
// string literals are matched by the STRING rule instead.
DOUBLE_QUOTE: '"';
SINGLE_QUOTE: '\'';
TRIPLE_DOUBLE_QUOTE: '"""';
TRIPLE_SINGLE_QUOTE: '\'\'\'';
// Logical operators.
AND: 'and';
OR: 'or';
NOT: 'not';
// Null-coalescing operator.
COALESCE: '??';
// Literal keywords.
NULL_: 'null';
BOOLEAN: 'true' | 'false';
// Character-class helpers. Fragments are building blocks only and never become tokens.
fragment DIGIT: [0-9];
fragment LETTER: [a-zA-Z];
// Exponent suffix: `e` or `E`, an optional sign, then one or more digits.
fragment EXP: ('E' | 'e') ('+' | '-')? INTEGER;
// One or more decimal digits.
INTEGER: DIGIT+;
// Floating-point number, in one of three shapes:
//   digits '.' digits [exponent]     e.g. 1.5, 1.5e-3
//   digits exponent                  e.g. 1e5
//   '.' digits [exponent]            e.g. .5
// At least one digit is required after the decimal point. If `1.` were accepted as a FLOAT, the
// longest-match rule would split `1..5` into FLOAT(`1.`) FLOAT(`.5`) and the range alternative of
// `literal` could never match integer bounds. With this rule `1..5` lexes as INTEGER RANGE INTEGER.
// A consequence is that `1.` on its own now lexes as INTEGER DOT.
// The exponent is mandatory in the second shape because a bare digit run is already an INTEGER
// (declared first, so it wins the tie anyway).
FLOAT: DIGIT+ DOT DIGIT+ EXP? | DIGIT+ EXP | DOT DIGIT+ EXP?;
// A plain identifier: it starts with a letter, `$` or `_`, continues with letters, digits or `_`,
// and may be followed by dot-separated segments (`a.b.c`). A segment after a dot may also be `*`,
// so `e.*` is a single IDENT, whereas a bare `*` still lexes as STAR and cannot be confused with
// multiplication. Backtick-quoted names are handled by the `identBacktick` parser rule, not here.
IDENT: IDENT_START (DOT IDENT_NEXT)*;
// IDENT_START and IDENT_NEXT are only pieces of IDENT, so they are fragments. As ordinary lexer
// rules they were additional token types that could never be emitted: anything they match is
// matched at the same length by IDENT or STAR, both of which are declared earlier.
fragment IDENT_START: (LETTER | DOLLAR | UNDERSCORE) (
		LETTER
		| DIGIT
		| UNDERSCORE
	)*;
fragment IDENT_NEXT: IDENT_START | STAR;
// A single space or tab. The lexer discards it, so the parser never sees horizontal whitespace.
WHITESPACE: [ \t] -> skip;
// A line break, with or without a preceding carriage return. Unlike WHITESPACE it is kept as a
// token, because the parser uses it (through `nl`) to separate statements.
NEWLINE: '\n' | '\r' '\n';
// A `#` comment runs to the end of its line and includes the terminating line break. The final
// line of a file may have no line break, so end-of-input is accepted as a terminator too;
// otherwise a trailing `# ...` with no newline could not be tokenized at all.
// TODO: need to exclude `#` in strings (and maybe confirm whether this is the syntax we want).
COMMENT: '#' ~('\r' | '\n')* (NEWLINE | EOF);
// Line terminator as seen by the parser: either a bare line break or a comment token.
nl
	: NEWLINE
	| COMMENT
	;

// Entry rule: optional blank lines, an optional `prql` header, then any number of function
// definitions, statements and pipelines, each optionally followed by blank lines, up to EOF.
program
	: nl* programIntro? nl*
	  ( (funcDef | stmt | pipeline) nl* )*
	  EOF
	;

// The `prql` header line: the keyword, zero or more named arguments, then a line terminator.
programIntro
	: PRQL namedArg* nl
	;
// Function definition: `func`, the name, the parameter list, `->`, then the body expression.
funcDef
	: FUNC funcDefName funcDefParams ARROW expr
	;

// Function name with an optional type annotation.
funcDefName
	: IDENT typeDef?
	;

// Zero or more parameters, written one after another with no separator token.
funcDefParams
	: funcDefParam*
	;

// One parameter: either a named argument (`name: value`) or a bare identifier, optionally typed.
funcDefParam
	: (namedArg | IDENT) typeDef?
	;
// Type annotation: one or more `|`-separated type terms inside angle brackets, e.g. `<int>` or
// `<int | float>`. The repetition is grouped as `(BAR typeTerm)*`. Written as `BAR typeTerm*`,
// the `*` applied to `typeTerm` alone, which made the bar mandatory (rejecting `<int>`) and let
// only one bar appear (rejecting `<a | b | c>`).
typeDef: LANG typeTerm (BAR typeTerm)* RANG;
// A single type name, optionally parameterised by a nested type annotation.
typeTerm: IDENT typeDef?;
// A statement is an assignment, a `let` declaration, or a bare expression. The alternatives are
// tried in this order.
stmt
	: varAssignStmt
	| varDeclStmt
	| expr
	;

// `name = expr`
varAssignStmt
	: IDENT ASSIGN expr
	;

// `let name = expr`
varDeclStmt
	: LET IDENT ASSIGN expr
	;
// Original pipeline separator, kept for reference:
// pipe: nl | BAR;

// Multi-line pipeline: a starting expression or call, then one function call per following line,
// closed by a line terminator or the end of input.
pipeline
	: exprCall (nl funcCall)* (nl | EOF)
	;

// Single-line pipeline: the same stages, separated by `|` instead of line terminators.
inlinePipeline
	: exprCall (BAR funcCall)*
	;
// We include backticks because some DBs use them (e.g. BigQuery) and we don't, so we pass anything
// within them directly through, including otherwise invalid idents, like those with hyphens.
// Possibly we should consider applying this to expressions rather than just idents — we can adjust
// as we see more cases. ident: operator (keyword WHITESPACE) ident_start (DOT ident_next)*; //
// Either a normal ident (starting with a letter, `$` or `_`), or any string surrounded by //
// backticks. ident_start: ( (ASCII_ALPHA | DOLLAR | UNDERSCORE) ~ ( ASCII_ALPHANUMERIC | DOLLAR )*
// ) | identBackticks+; // We allow `e.*`, but not just `*`, since it might conflict with multiply
// in some cases. ident_next: ident_start | '*'; Anything surrounded by backticks, we pass through.
// Anything between two backticks on one line. This is a parser rule, so it matches a run of
// tokens (any token except NEWLINE and BACKTICK), not raw characters; spaces and tabs inside the
// backticks have already been dropped by the WHITESPACE rule before the parser runs.
identBacktick
	: BACKTICK ~(NEWLINE | BACKTICK)* BACKTICK
	;
// For sorting signedIdent: (PLUS | MINUS) IDENT;
// A central issue around the terms vs expr is that we want to be able to parse:
//   [foo bar + 1, 2]
// as:
// - foo bar + 1
//   - foo bar
//     - foo
//     - bar
//   - +
//   - 1
// - 2
// So this requires two non-silent rules:
// - A notion of list item that contains anything, including operators (but not commas); e.g.
//   `foo bar + 1`.
// - A notion of expr that aggregates things between operators, e.g. `foo bar`.
// So we call the list item `expr`, and the things between separators `terms`.
//
// We could have them be the same, but then we need logic in the parser to account for where the
// list item is in this parse tree:
// - foo bar
//   - foo
//   - bar
// - +
// - 1
// - 2
// whitespace is required to prevent matching s"string". Forbid `operator` so `a - b` can't parse as
// `a` & `-b`.
// Function call: the function name followed by zero or more space-separated parameters.
funcCall
	: IDENT funcCallParam*
	;

// A call parameter. The alternatives are tried in this order.
funcCallParam
	: namedArg
	| assign
	| multiAssign
	| expr
	;

// `name: value`, where the value is an assignment or an expression.
namedArg
	: IDENT COLON (assign | expr)
	;

// `name = value`
assign
	: IDENT ASSIGN exprCall
	;

// `[a, b] = value` — a list on the left-hand side.
multiAssign
	: list ASSIGN exprCall
	;

// assignCall: IDENT ASSIGN exprCall;

// Either an expression or a function call; an expression is tried first.
exprCall
	: expr
	| funcCall
	;
// Expressions. In an ANTLR4 left-recursive rule, alternatives listed earlier bind tighter, so the
// order below is the operator precedence, from highest to lowest:
//   1. `**`                    exponentiation, right-associative (`a ** b ** c` is `a ** (b ** c)`)
//   2. `*` `/` `%`             multiplicative
//   3. `-` `+`                 additive
//   4. `~`                     model operator
//   5. `==` `!=` `>=` `<=` `<` `>`   comparison
//   6. `??`                    coalesce
//   7. `and` `or`              logical
// Exponentiation is listed first so that `2 * 3 ** 2` groups as `2 * (3 ** 2)`. When it sat below
// the multiplicative and additive alternatives it bound looser than both, and was left-associative.
expr:
	<assoc=right> expr POW expr
	| expr (STAR | DIV | MOD) expr
	| expr (MINUS | PLUS) expr
	| expr MODEL expr
	| expr (EQ | NE | GE | LE | LANG | RANG) expr
	| expr COALESCE expr
	| expr (AND | OR) expr
	| LPAREN expr RPAREN
	| term;
// The operands that sit between operators in `expr`. The alternatives are tried in this order.
term
	: literal
	| identBacktick
	| exprUnary
	| list
	| nestedPipeline
	;

// exprUnary is for sorting: a `-`, `+` or `not` prefix applied to a nested pipeline, a literal or
// an identifier.
exprUnary
	: (MINUS | PLUS | NOT) (nestedPipeline | literal | IDENT)
	;
// Atomic values: identifiers, `null`, booleans, strings, numbers, intervals (a number followed by
// an interval unit) and ranges (`a..b`, where each bound is a number or an identifier).
// Not yet supported: timestamp, date, time, s_string, f_string.
literal
	: IDENT
	| NULL_
	| BOOLEAN
	| STRING
	| INTEGER
	| FLOAT
	| (INTEGER | FLOAT) INTERVAL_KIND
	| (INTEGER | FLOAT | IDENT) RANGE (INTEGER | FLOAT | IDENT)
	;

// Unit words for interval literals.
// NOTE(review): this rule is declared after IDENT, and every unit word is also a valid IDENT of
// the same length. ANTLR resolves such ties in favour of the rule declared first, so these words
// appear to lex as IDENT, which would make the interval alternative of `literal` unreachable.
// Confirm, and if so move this rule above IDENT (that would turn the unit words into reserved
// words).
INTERVAL_KIND
	: 'microseconds' | 'milliseconds' | 'seconds' | 'minutes'
	| 'hours' | 'days' | 'weeks' | 'months' | 'years'
	;
// Bracketed list: `[]`, or comma-separated items with an optional trailing comma. Line
// terminators are allowed before every item and once before the closing bracket. An item is an
// assignment, a multi-assignment, or an expression / function call.
list
	: LBRACKET
	  (
	    nl* (assign | multiAssign | exprCall)
	    ( COMMA nl* (assign | multiAssign | exprCall) )*
	    COMMA? nl?
	  )?
	  RBRACKET
	;
// A pipeline in parentheses, in either the multi-line or the `|`-separated form, with optional
// line terminators just inside the parentheses.
nestedPipeline
	: LPAREN nl*
	  (pipeline | inlinePipeline)
	  nl* RPAREN
	;
// We haven't implemented escapes — I think we can mostly pass those through to SQL, but there may
// be things we're missing. https://pest.rs/book/examples/rust/literals.html
// We need to have a non-silent rule which contains the quotes — `string` in this case — because of
// https://github.com/pest-parser/pest/issues/583. Then when converting to AST, we only keep the
// `string_inner` and discard the `string` given it contains the quotes.
//
// TODO: I'm still a bit unclear how preceding and trailing spaces are working -- it seems that
// inner spaces are included without an atomic operator (or with `ANY`), but preceding & trailing
// spaces require both `ANY` _and_ an atomic operator. We have some rudimentary tests for these.
// String literal in double or single quotes. Inside the quotes, each element is either an escape
// sequence or any character other than a backslash and the enclosing quote, so an escaped quote
// does not end the string. The token includes the surrounding quotes.
STRING
	: '"' (ESC | ~[\\"])*? '"'
	| '\'' (ESC | ~[\\'])*? '\''
	;

// Escape sequences: a backslash followed by one of the listed single characters, or a unicode,
// hex or octal escape.
fragment ESC
	: '\\' [abtnfrv"'\\]
	| UNICODE_ESCAPE
	| HEX_ESCAPE
	| OCTAL_ESCAPE
	;

// `\uXXXX` or `\u{XXXX}` with exactly four hex digits.
fragment UNICODE_ESCAPE
	: '\\' 'u'
	  ( HEXDIGIT HEXDIGIT HEXDIGIT HEXDIGIT
	  | '{' HEXDIGIT HEXDIGIT HEXDIGIT HEXDIGIT '}'
	  )
	;

// A backslash and one to three octal digits; the three-digit form must start with 0-3.
fragment OCTAL_ESCAPE
	: '\\' [0-3] [0-7] [0-7]
	| '\\' [0-7] [0-7]
	| '\\' [0-7]
	;

// A backslash followed by one or two hex digits (there is no `x` marker in this form).
fragment HEX_ESCAPE
	: '\\' HEXDIGIT HEXDIGIT?
	;

// A single hexadecimal digit, either case.
fragment HEXDIGIT
	: [0-9a-fA-F]
	;
// date: '@' date_inner end_expr; time: '@' time_inner end_expr; timestamp: '@' timestamp_inner
// end_expr;
// We use the `inner` types as containing the data that we want to retain in the AST. date_inner =
// ${ ASCII_DIGIT{4} ~ "-" ~ ASCII_DIGIT{2} ~ "-" ~ ASCII_DIGIT{2} } Times are liberally defined
// atm, we could make this more robust. time_inner = ${ ASCII_DIGIT{2} ~ (( ":" | "." ) ~
// ASCII_DIGIT* )* ~ ((( "+" | "-" ) ~ (ASCII_DIGIT | ":" )*) | "Z")? } timestamp_inner = ${
// date_inner ~ "T" ~ time_inner }
// We can use this when we want to ensure something is ending, like a date, so `@20-01-0` isn't treated
// like a time `@20-01` `-` (minus) `0`. (Not sure whether `..` should be here or in the items that
// allow it; feel free to demote it to those items if `end_expr` is used somewhere where it's not
// supported) end_expr: ',' | ')' | ']' | nl | '..';