From 15ca4c040061d99ffa594e56e52a911d34913f6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jannis=20Christopher=20K=C3=B6hl?= Date: Fri, 26 Jul 2024 17:26:20 +0200 Subject: [PATCH] Upgrade grammar to TQL2 --- package.json | 4 +- src/index.ts | 22 +--- src/syntax.grammar | 86 ------------- src/tokens.ts | 100 +++++++++++++++ src/tql.grammar | 135 ++++++++++++++++++++ src/{syntax.grammar.d.ts => tql.grammar.ts} | 0 test/cases.txt | 96 ++++++++++++-- 7 files changed, 328 insertions(+), 115 deletions(-) delete mode 100644 src/syntax.grammar create mode 100644 src/tokens.ts create mode 100644 src/tql.grammar rename src/{syntax.grammar.d.ts => tql.grammar.ts} (100%) diff --git a/package.json b/package.json index 063f8a2..0e56653 100644 --- a/package.json +++ b/package.json @@ -10,8 +10,8 @@ "version": "0.3.2", "description": "Tenzir Query Language (TQL) support for CodeMirror", "scripts": { - "build": "lezer-generator src/syntax.grammar -o src/parser && rollup -c", - "build-debug": "lezer-generator src/syntax.grammar --names -o src/parser && rollup -c", + "build": "lezer-generator src/tql.grammar -o src/parser && rollup -c", + "build-debug": "lezer-generator src/tql.grammar --names -o src/parser && rollup -c", "test": "mocha test/test.js" }, "type": "module", diff --git a/src/index.ts b/src/index.ts index 4ad631a..13d6a82 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,20 +1,19 @@ -import { parser } from "./syntax.grammar"; +import { parser } from "./tql.grammar"; import { LRLanguage, LanguageSupport } from "@codemirror/language"; import { styleTags, tags as t } from "@lezer/highlight"; -import { completeFromList } from "@codemirror/autocomplete"; +import { completeFromList, Completion } from "@codemirror/autocomplete"; import { data } from "../output.js"; export const TenzirQueryLang = LRLanguage.define({ parser: parser.configure({ props: [ styleTags({ - "Null Bool Number Ip String Time": t.literal, - "OperatorName!": t.name, - Punct: t.punctuation, - Type: t.typeName, - Pipe: t.separator, + "Scalar true false null DollarIdent": t.literal, + "String": t.string, + "StringEsc and else if in let match meta not or this": t.keyword, + "OpName! FnIdent": t.name, + "+ - \"*\" \"/\" = . ' : \"!\" < > \"?\" \"|\"": t.punctuation, "LineComment BlockComment": t.comment, - Meta: t.meta, }), ], // TODO: add folding later @@ -29,13 +28,6 @@ type GeneretedCompletion = { docLink: string; }; -type Completion = { - label: string; - type: string; - detail: string; - info: () => Node; -}; - const getCompletion = (completion: GeneretedCompletion): Completion => { return { label: completion.label, diff --git a/src/syntax.grammar b/src/syntax.grammar deleted file mode 100644 index c15acd0..0000000 --- a/src/syntax.grammar +++ /dev/null @@ -1,86 +0,0 @@ -// Note: This is rather incomplete and on a best-effort basis. Also, writing -// such a grammar would be easier with lookahead, but this does not seem to be -// supported by lezer. - -@top Root { - (Operator (Pipe Operator)*)? -} -Operator { OperatorName arg* } -OperatorName { Word } -arg { - Word | - Path | - Flag | - String | - Punct | - Time | - Number | - Ip | - Type | - Meta |  - Bool | - Null | - Parenthesis -} - -@tokens { - Word { $[a-zA-Z0-9-_/$:]+ } - Path { "/" ((Word | ".")* "/")+ (Word | ".")+ } - Identifier { $[a-zA-Z]$[a-zA-Z0-9]* } - String { - "\"" (!["] | "\\\"")* "\"" | - "'" (!['] | "\\'")* "'" | - "/" (![/] | "\\/")* "/" - } - Flag { "--" Word | "-" $[a-zA-Z0-9] whitespace } - Punct { "==" | "||" | "&" | "=" | "!=" | "<" | "<=" | ">" | ">=" | "in" | "not" | "+" | "-" | "*" | "/" | "." | "%" | "," } - Parenthesis { $[()[\]{}] } - Type { ":" ($[a-zA-Z_]+ | @eof) } - Meta { "#" $[a-zA-Z_]+ } - Number { Integer | Float | Count } - Count { Integer @whitespace* ("Ki" | "Mi" | "Gi" | "Ti" | "Pi" | "Ei") } - Integer { ("-" | "+")? $[0-9]+ } - Float { ("-" | "+")? @digit+ "." @digit* } - Bool { "true" | "false" } - Null { "null" } - Ip { - @digit+ "." @digit+ "." @digit+ "." @digit+ ("/" @digit*)? | - $[0-9a-fA-F]* "::" ($[0-9a-fA-F]+ ("::" | "."))* $[0-9a-fA-F]+ - } - Pipe { "|" } - Time { - "now" | - "@" (Integer | Float) | - $[1-2]$[0-9]$[0-9]$[0-9] "-" - $[0-1]?$[0-9] ( - "-" Integer ( - ("+" | "T" | " ") - (Integer (":" Integer)*)? - )? - )? | - (Integer | Float) @whitespace* TimeUnit (@whitespace+ "ago")? | - "in" @whitespace+ (Integer | Float) @whitespace* TimeUnit - } - TimeUnit { - "nanoseconds" | "nanosecond" | "nsecs" | "nsec" | "ns" | "microseconds" | "microsecond" | "usecs" | "usec" | "us" | "milliseconds" | "millisecond" | "msecs" | "msec" | "ms" | "seconds" | "second" | "secs" | "sec" | "s" | "minutes" | "minute" | "mins" | "min" | "m" | "hours" | "hour" | "hrs" | "h" | "days" | "day" | "d" | "weeks" | "week" | "w" | "years" | "year" | "y" - } - - whitespace { @whitespace } - LineComment { ("//" | "# " | "#!" ) ![\n]* } - - @precedence { "/*", LineComment, Time, Type, Ip, Number, Path, String, Flag, Punct, Null, Bool, Identifier, Word } -} - -@skip { whitespace | BlockComment | LineComment } - -@skip {} { - BlockComment { "/*" (blockCommentContent | blockCommentNewline)* blockCommentEnd } -} - -@local tokens { - blockCommentEnd { "*/" } - blockCommentNewline { "\n" } - @else blockCommentContent -} - -@detectDelim diff --git a/src/tokens.ts b/src/tokens.ts new file mode 100644 index 0000000..13bd6ef --- /dev/null +++ b/src/tokens.ts @@ -0,0 +1,100 @@ +import { ExternalTokenizer, ContextTracker, Stack } from "@lezer/lr" +import { + ignoredNewline, + newline, + LParen, + RParen, + LBrace, + RBrace, + LBracket, + RBracket, + Comma, + Ident, + FnIdent, + DollarIdent +} from "./parser.terms.js" + +type ContextData = { + ignoreNewlines: boolean + justHadComma: boolean +} + +class MyContext { + constructor(parent: MyContext | null, data: ContextData) { + this.parent = parent ?? this + this.data = data + } + + public ignoreNewlines(value: boolean): MyContext { + return new MyContext(this, { ...this.data, ignoreNewlines: value }) + } + + public justHadComma(value: boolean): MyContext { + return new MyContext(this.parent, { ...this.data, justHadComma: value }) + } + + parent: MyContext + data: ContextData +} + +const startContext = new MyContext(null, { ignoreNewlines: false, justHadComma: false }) + +export const context = new ContextTracker({ + start: startContext, + reduce(context, term, stack, input) { + return context + }, + shift(context, term, stack, input) { + context = context.justHadComma(term === Comma) + if (term === LParen || term == LBracket) { + return context.ignoreNewlines(true) + } + if (term === LBrace) { + return context.ignoreNewlines(false) + } + if (term === RParen || term == RBrace || term == RBracket) { + return context.parent + } + return context + } +}) + +function code(x: string): number { + return x.charCodeAt(0) +} + +export const newlines = new ExternalTokenizer((input, stack) => { + let ctx = stack.context.data; + if (input.next == code("\n")) { + let ignore = ctx.ignoreNewlines || ctx.justHadComma + input.acceptToken(ignore ? ignoredNewline : newline, 1) + return + } +}, { contextual: true }) + +export const identifiers = new ExternalTokenizer((input, stack) => { + const a = code("a"); + const z = code("z"); + const A = code("A"); + const Z = code("Z"); + const u = code("_"); + const n0 = code("0"); + const n9 = code("9"); + const first = (n: number) => (a <= n && n <= z) || (A <= n && n <= Z) || (n == u); + const rest = (n: number) => first(n) || (n0 <= n && n <= n9); + let token = Ident; + if (!first(input.peek(0))) { + if (input.peek(0) != code("$")) { + return; + } + token = DollarIdent; + } + let n = 1; + while (rest(input.peek(n))) { + n += 1; + } + if (input.peek(n) == code("(")) { + token = FnIdent; + } + input.acceptToken(token, n); +}) diff --git a/src/tql.grammar b/src/tql.grammar new file mode 100644 index 0000000..1b9beb4 --- /dev/null +++ b/src/tql.grammar @@ -0,0 +1,135 @@ +@context context from "./tokens.ts" + +@top Pipeline { _Statements? } + +_Statements { + newline* (Pipe newline*)? _Stmt ((newline | Pipe) _Stmt?)* +} + +_Stmt { + OpStmt | + AssignStmt | + (kw<"let"> | kw<"if"> | kw<"match">) Soup } + +OpStmt { OpName ~op_name ~op_name2 Soup } + +AssignStmt[@dynamicPrecedence=1] { + UnExpr ~op_name2 "=" Soup +} + +OpName { Ident ~op_name } // TODO: Why do we need this here? +// Entity { Ident ("'" Ident)* } + +// TODO: Expand this. +UnExpr { Ident ~op_name2 ("." (Ident | ".")*)? } + +Soup { _Any* } + +// Anything but { } | and newline. +_Any { + FnIdent | + Ident | + DollarIdent | + Scalar | + _Punct | + _Keyword | + RecordStart RecordRest | + PipeExpr | + String +} + +RecordStart[@dynamicPrecedence=2] { + "{" newline* Ident ~op_name ":" +} + +RecordRest { + (_Any | newline | Pipe)* "}" +} + +PipeExpr { + "{" _Statements "}" +} + +_Keyword { + kw<"and"> | + kw<"else"> | + kw<"false"> | + kw<"if"> | + kw<"in"> | + kw<"let"> | + kw<"match"> | + kw<"meta"> | + kw<"not"> | + kw<"null"> | + kw<"or"> | + kw<"this"> | + kw<"true"> +} + +_Punct { + "+" | + "-" | + "*" | + "/" | + "," | + "=" | + "." | + "'" | + ":" | + "!" | + "?" | + "<" | + ">" | + "[" | + "]" | + "(" | + ")" +} + +Pipe { "|" } + +@tokens { + space { ($[ \t\f] | "\\" $[\n\r])+ } + LineComment { "//" ![\n\r]* } + ","[@export=Comma] + "("[@export=LParen] ")"[@export=RParen] + "{"[@export=LBrace] "}"[@export=RBrace] + "["[@export=LBracket] "]"[@export=RBracket] + Scalar { $[0-9]($[0-9] | "." | "_")* $[a-zA-Z0-9_]* } + @precedence { LineComment "/" } + "+" "-" "*" "/" "," "=" "." "'" ":" "!" "?" "<" ">" + stringContent { ![\\\"]+ } + StringEsc { "\\" _ } // TODO: Complete it. +} + + +@skip {} { + String { + "\"" (stringContent | StringEsc)* "\"" + } +} + + +@skip { space | ignoredNewline | LineComment | BlockComment } + +@skip {} { + BlockComment { "/*" blockCommentContent* blockCommentEnd } +} + +@local tokens { + blockCommentEnd { "*/" } + @else blockCommentContent +} + +@external tokens newlines from "./tokens.ts" { + newline, + ignoredNewline +} + +@external tokens identifiers from "./tokens.ts" { + Ident, + FnIdent, + DollarIdent +} + +kw { @specialize[@name={term}] } diff --git a/src/syntax.grammar.d.ts b/src/tql.grammar.ts similarity index 100% rename from src/syntax.grammar.d.ts rename to src/tql.grammar.ts diff --git a/test/cases.txt b/test/cases.txt index d06f3b7..0e723af 100644 --- a/test/cases.txt +++ b/test/cases.txt @@ -1,30 +1,102 @@ -# One +# Simple -export | drop foo bar | head | drop qux | head 300 | serve +foo +==> +Pipeline(OpStmt(OpName(Ident),Soup)) + + +# Simple and 1 argument + +foo bar +==> +Pipeline(OpStmt(OpName(Ident),Soup(Ident))) + + +# Simple and 2 arguments + +foo bar, baz +==> +Pipeline(OpStmt(OpName(Ident),Soup(Ident,Punct(","),Ident))) + + + +# Operator Pipe +foo bar | foo baz ==> -Root(Operator(OperatorName(Word)),Pipe,Operator(OperatorName(Word),Word,Word),Pipe,Operator(OperatorName(Word)),Pipe,Operator(OperatorName(Word),Word),Pipe,Operator(OperatorName(Word),Integer),Pipe,Operator(OperatorName(Word))) +Pipeline(OpStmt(OpName(Ident),Soup(Ident)),Pipe,OpStmt(OpName(Ident),Soup(Ident))) -# Where 1 +# Operator Newline + +foo bar +foo baz +==> +Pipeline(OpStmt(OpName(Ident),Soup(Ident)),OpStmt(OpName(Ident),Soup(Ident))) -export | where src_ip == 1.2.3.4 | serve +# Simple assignment +foo = bar ==> -Root(Operator(OperatorName(Word)),Pipe,Operator(OperatorName(Word),Word,Punct,Ip),Pipe,Operator(OperatorName(Word))) +Pipeline(AssignStmt(UnExpr(Ident),"=",Soup(Ident))) -# Where 2 +# Newline in expression +foo ( + bar) +==> +Pipeline(OpStmt(OpName(Ident),Soup("(",Ident,")"))) -export | where id.orig_h == 1.2.3.4 && ts > 1 hour ago | to /tmp/haha.json +# Newline in expression then operator +foo ( + bar) +baz ==> -Root(Operator(OperatorName(Word)),Pipe,Operator(OperatorName(Word),Word,Punct,Word,Punct,Ip,Punct,Word,Punct,Integer,Word,Word),Pipe,Operator(OperatorName(Word),Punct,Word,Punct,Word)) +Pipeline(OpStmt(OpName(Ident),Soup("(",Ident,")")),OpStmt(OpName(Ident),Soup)) -# Shouldn't Parse Correctly +# Let Definition +let $foo = 42s +==> +Pipeline(LetStmt(Let,DollarIdent,"=",Soup(Scalar))) + + +# Newline after comma +foo bar, + baz, + qux +==> +Pipeline(OpStmt(OpName(Ident),Soup(Ident,Punct(","),Ident,Punct(","),Ident))) + + +# Nested pipeline +foo bar=baz { + qux +} +==> +Pipeline(OpStmt(OpName(Ident),Soup(Ident,Punct("="),Ident,PipeExpr(OpStmt(OpName(Ident),Soup))))) + + +# Record with no fields +foo {} +==> +Pipeline(OpStmt(OpName(Ident),Soup(RecordExpr("{","}")))) + + +# Record with one field +foo { foo: bar } +==> +Pipeline(OpStmt(OpName(Ident),Soup(RecordExpr("{",Ident,Soup(Ident),"}")))) + + +# Record with two fields +foo { foo: bar,baz:qux} +==> +Pipeline(OpStmt(OpName(Ident),Soup(RecordExpr("{",Ident,Soup(Ident,Punct(","),Ident,Punct,Ident),"}")))) -1232131 +# Backslash +test \ + foo=42 ==> -Root(⚠(Integer))