From 15ca4c040061d99ffa594e56e52a911d34913f6b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jannis=20Christopher=20K=C3=B6hl?= <mail@koehl.dev>
Date: Fri, 26 Jul 2024 17:26:20 +0200
Subject: [PATCH] Upgrade grammar to TQL2

---
 package.json                                |   4 +-
 src/index.ts                                |  22 +---
 src/syntax.grammar                          |  86 -------------
 src/tokens.ts                               | 100 +++++++++++++++
 src/tql.grammar                             | 135 ++++++++++++++++++++
 src/{syntax.grammar.d.ts => tql.grammar.ts} |   0
 test/cases.txt                              |  96 ++++++++++++--
 7 files changed, 328 insertions(+), 115 deletions(-)
 delete mode 100644 src/syntax.grammar
 create mode 100644 src/tokens.ts
 create mode 100644 src/tql.grammar
 rename src/{syntax.grammar.d.ts => tql.grammar.ts} (100%)

diff --git a/package.json b/package.json
index 063f8a2..0e56653 100644
--- a/package.json
+++ b/package.json
@@ -10,8 +10,8 @@
   "version": "0.3.2",
   "description": "Tenzir Query Language (TQL) support for CodeMirror",
   "scripts": {
-    "build": "lezer-generator src/syntax.grammar -o src/parser && rollup -c",
-    "build-debug": "lezer-generator src/syntax.grammar --names -o src/parser && rollup -c",
+    "build": "lezer-generator src/tql.grammar -o src/parser && rollup -c",
+    "build-debug": "lezer-generator src/tql.grammar --names -o src/parser && rollup -c",
     "test": "mocha test/test.js"
   },
   "type": "module",
diff --git a/src/index.ts b/src/index.ts
index 4ad631a..13d6a82 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -1,20 +1,19 @@
-import { parser } from "./syntax.grammar";
+import { parser } from "./tql.grammar";
 import { LRLanguage, LanguageSupport } from "@codemirror/language";
 import { styleTags, tags as t } from "@lezer/highlight";
-import { completeFromList } from "@codemirror/autocomplete";
+import { completeFromList, Completion } from "@codemirror/autocomplete";
 import { data } from "../output.js";
 
 export const TenzirQueryLang = LRLanguage.define({
   parser: parser.configure({
     props: [
       styleTags({
-        "Null Bool Number Ip String Time": t.literal,
-        "OperatorName!": t.name,
-        Punct: t.punctuation,
-        Type: t.typeName,
-        Pipe: t.separator,
+        "Scalar true false null DollarIdent": t.literal,
+        "String": t.string,
+        "StringEsc and else if in let match meta not or this": t.keyword,
+        "OpName! FnIdent": t.name,
+        "+ - \"*\" \"/\" = . ' : \"!\" < > \"?\" \"|\"": t.punctuation,
         "LineComment BlockComment": t.comment,
-        Meta: t.meta,
       }),
     ],
     // TODO: add folding later
@@ -29,13 +28,6 @@ type GeneretedCompletion = {
   docLink: string;
 };
 
-type Completion = {
-  label: string;
-  type: string;
-  detail: string;
-  info: () => Node;
-};
-
 const getCompletion = (completion: GeneretedCompletion): Completion => {
   return {
     label: completion.label,
diff --git a/src/syntax.grammar b/src/syntax.grammar
deleted file mode 100644
index c15acd0..0000000
--- a/src/syntax.grammar
+++ /dev/null
@@ -1,86 +0,0 @@
-// Note: This is rather incomplete and on a best-effort basis. Also, writing
-// such a grammar would be easier with lookahead, but this does not seem to be
-// supported by lezer.
-
-@top Root {
-  (Operator (Pipe Operator)*)?
-}
-Operator { OperatorName arg* }
-OperatorName { Word }
-arg {
-  Word |
-  Path |
-  Flag |
-  String |
-  Punct |
-  Time |
-  Number |
-  Ip |
-  Type |
-  Meta | 
-  Bool |
-  Null |
-  Parenthesis
-}
-
-@tokens {
-  Word { $[a-zA-Z0-9-_/$:]+ }
-  Path { "/" ((Word | ".")* "/")+ (Word | ".")+ }
-  Identifier { $[a-zA-Z]$[a-zA-Z0-9]* }
-  String {
-    "\"" (!["] | "\\\"")* "\"" |
-    "'" (!['] | "\\'")* "'" |
-    "/" (![/] | "\\/")* "/"
-  }
-  Flag { "--" Word | "-" $[a-zA-Z0-9] whitespace }
-  Punct { "==" | "||" | "&" | "=" | "!=" | "<" | "<=" | ">" | ">=" | "in" | "not" | "+" | "-" | "*" | "/" | "." | "%" | "," }
-  Parenthesis { $[()[\]{}] }
-  Type { ":" ($[a-zA-Z_]+ | @eof) }
-  Meta { "#" $[a-zA-Z_]+ }
-  Number { Integer | Float | Count }
-  Count { Integer @whitespace* ("Ki" | "Mi" | "Gi" | "Ti" | "Pi" | "Ei") }
-  Integer { ("-" | "+")? $[0-9]+ }
-  Float { ("-" | "+")? @digit+ "." @digit* }
-  Bool { "true" | "false" }
-  Null { "null" }
-  Ip {
-    @digit+ "." @digit+ "." @digit+ "." @digit+ ("/" @digit*)? |
-    $[0-9a-fA-F]* "::" ($[0-9a-fA-F]+ ("::" | "."))* $[0-9a-fA-F]+
-  }
-  Pipe { "|" }
-  Time {
-    "now" |
-    "@" (Integer | Float) |
-    $[1-2]$[0-9]$[0-9]$[0-9] "-"
-      $[0-1]?$[0-9] (
-        "-" Integer (
-          ("+" | "T" | " ")
-          (Integer (":" Integer)*)?
-        )?
-      )? |
-    (Integer | Float) @whitespace* TimeUnit (@whitespace+ "ago")? |
-    "in" @whitespace+ (Integer | Float) @whitespace* TimeUnit
-  }
-  TimeUnit {
-    "nanoseconds" | "nanosecond" | "nsecs" | "nsec" | "ns" | "microseconds" | "microsecond" | "usecs" | "usec" | "us" | "milliseconds" | "millisecond" | "msecs" | "msec" | "ms" | "seconds" | "second" | "secs" | "sec" | "s" | "minutes" | "minute" | "mins" | "min" | "m" | "hours" | "hour" | "hrs" | "h" | "days" | "day" | "d" | "weeks" | "week" | "w" | "years" | "year" | "y"
-  }
-
-  whitespace { @whitespace }
-  LineComment { ("//" | "# " | "#!" ) ![\n]* }
-
-  @precedence { "/*", LineComment, Time, Type, Ip, Number, Path, String, Flag, Punct, Null, Bool, Identifier, Word }
-}
-
-@skip { whitespace | BlockComment | LineComment }
-
-@skip {} {
-  BlockComment { "/*" (blockCommentContent | blockCommentNewline)* blockCommentEnd }
-}
-
-@local tokens {
-  blockCommentEnd { "*/" }
-  blockCommentNewline { "\n" }
-  @else blockCommentContent
-}
-
-@detectDelim
diff --git a/src/tokens.ts b/src/tokens.ts
new file mode 100644
index 0000000..13bd6ef
--- /dev/null
+++ b/src/tokens.ts
@@ -0,0 +1,100 @@
+import { ExternalTokenizer, ContextTracker, Stack } from "@lezer/lr"
+import {
+  ignoredNewline,
+  newline,
+  LParen,
+  RParen,
+  LBrace,
+  RBrace,
+  LBracket,
+  RBracket,
+  Comma,
+  Ident,
+  FnIdent,
+  DollarIdent
+} from "./parser.terms.js"
+
+type ContextData = { // Flags the `newlines` tokenizer reads when it sees a "\n".
+  ignoreNewlines: boolean // When true, "\n" is emitted as `ignoredNewline`.
+  justHadComma: boolean // True when the most recently shifted term was `Comma`.
+}
+
+class MyContext {
+  parent: MyContext
+  data: ContextData
+  // A null `parent` marks the root context, which then acts as its own parent.
+  constructor(parent: MyContext | null, data: ContextData) {
+    this.data = data
+    this.parent = parent ?? this
+  }
+  // Opens a nested level whose parent is `this`, with `ignoreNewlines` replaced.
+  public ignoreNewlines(value: boolean): MyContext {
+    return new MyContext(this, Object.assign({}, this.data, { ignoreNewlines: value }))
+  }
+  // Stays on the current level (same parent), with `justHadComma` replaced.
+  public justHadComma(value: boolean): MyContext {
+    return new MyContext(this.parent, Object.assign({}, this.data, { justHadComma: value }))
+  }
+}
+
+const startContext = new MyContext(null, { ignoreNewlines: false, justHadComma: false })
+
+export const context = new ContextTracker({ // Tracks bracket nesting and trailing commas for `newlines`.
+  start: startContext,
+  reduce(context) { // Reductions leave the context untouched.
+    return context
+  },
+  shift(context, term) {
+    context = context.justHadComma(term === Comma) // Re-evaluated on every shifted term.
+    if (term === LParen || term === LBracket) {
+      return context.ignoreNewlines(true) // Newlines are ignored inside "(" and "[".
+    }
+    if (term === LBrace) {
+      return context.ignoreNewlines(false) // Newlines are significant again inside "{".
+    }
+    if (term === RParen || term === RBrace || term === RBracket) {
+      return context.parent // Any closing bracket returns to the enclosing level.
+    }
+    return context
+  }
+})
+
+function code(x: string): number { // UTF-16 code unit of the first character of `x`.
+  return x.charCodeAt(0) // `charCodeAt` yields NaN when `x` is empty.
+}
+
+// On "\n": emits `ignoredNewline` if `ignoreNewlines` or `justHadComma` is set, else `newline`.
+export const newlines = new ExternalTokenizer((input, stack) => {
+  const { ignoreNewlines, justHadComma } = stack.context.data
+  if (input.next !== code("\n")) {
+    return
+  }
+  input.acceptToken((ignoreNewlines || justHadComma) ? ignoredNewline : newline, 1)
+}, { contextual: true })
+
+// Tokenizes identifiers: `Ident` by default, `DollarIdent` when the first
+// character is "$", and `FnIdent` (overriding both) when "(" follows directly.
+export const identifiers = new ExternalTokenizer((input) => {
+  const inRange = (n: number, lo: string, hi: string) => code(lo) <= n && n <= code(hi)
+  // [A-Za-z_] may start an identifier; digits are additionally allowed afterwards.
+  const isStart = (n: number) => inRange(n, "a", "z") || inRange(n, "A", "Z") || n === code("_")
+  const isPart = (n: number) => isStart(n) || inRange(n, "0", "9")
+
+  const head = input.peek(0)
+  const startsWithDollar = head === code("$")
+  if (!startsWithDollar && !isStart(head)) {
+    return
+  }
+
+  // The first character is classified above, so scanning resumes at offset 1.
+  let length = 1
+  while (isPart(input.peek(length))) {
+    length += 1
+  }
+  // An immediately following "(" turns the token into a function name.
+  if (input.peek(length) === code("(")) {
+    input.acceptToken(FnIdent, length)
+    return
+  }
+  input.acceptToken(startsWithDollar ? DollarIdent : Ident, length)
+})
diff --git a/src/tql.grammar b/src/tql.grammar
new file mode 100644
index 0000000..1b9beb4
--- /dev/null
+++ b/src/tql.grammar
@@ -0,0 +1,135 @@
+@context context from "./tokens.ts"
+
+@top Pipeline { _Statements? } // Empty input is a valid pipeline.
+
+_Statements { // One or more statements separated by newline or pipe tokens.
+  newline* (Pipe newline*)? _Stmt ((newline | Pipe) _Stmt?)* // Separators may lead, repeat, or trail.
+}
+
+_Stmt { // An operator call, an assignment, or let/if/match followed by a token soup.
+  OpStmt |
+  AssignStmt |
+  (kw<"let"> | kw<"if"> | kw<"match">) Soup }
+
+OpStmt { OpName ~op_name ~op_name2 Soup } // Operator name, then its arguments as a token soup.
+
+AssignStmt[@dynamicPrecedence=1] { // Precedence 1 favors this over OpStmt in ambiguous parses.
+  UnExpr ~op_name2 "=" Soup
+}
+
+OpName { Ident ~op_name } // TODO: Why do we need this here?
+// Entity { Ident ("'" Ident)* }
+
+// TODO: Expand this.
+UnExpr { Ident ~op_name2 ("." (Ident | ".")*)? } // Assignment target: an Ident, optionally "." plus more Idents/dots.
+
+Soup { _Any* } // Flat, possibly empty run of tokens; no expression structure is built.
+
+// Any token except a bare "{", "}", "|" or newline; braces only enter via records and PipeExpr.
+_Any {
+  FnIdent |
+  Ident |
+  DollarIdent |
+  Scalar |
+  _Punct |
+  _Keyword |
+  RecordStart RecordRest |
+  PipeExpr |
+  String
+}
+
+RecordStart[@dynamicPrecedence=2] { // Precedence 2 favors the record reading in ambiguous parses.
+  "{" newline* Ident ~op_name ":"
+}
+
+RecordRest { // Remaining record contents (tokens, newlines, pipes) and the closing "}".
+  (_Any | newline | Pipe)* "}"
+}
+
+PipeExpr { // A nested pipeline in braces; _Statements requires at least one statement.
+  "{" _Statements "}"
+}
+
+_Keyword { // Reserved words, each produced by specializing Ident through kw<>.
+  kw<"and"> |
+  kw<"else"> |
+  kw<"false"> |
+  kw<"if"> |
+  kw<"in"> |
+  kw<"let"> |
+  kw<"match"> |
+  kw<"meta"> |
+  kw<"not"> |
+  kw<"null"> |
+  kw<"or"> |
+  kw<"this"> |
+  kw<"true">
+}
+
+_Punct { // Punctuation accepted inside a Soup; "{", "}" and "|" are not part of this set.
+  "+" |
+  "-" |
+  "*" |
+  "/" |
+  "," |
+  "=" |
+  "." |
+  "'" |
+  ":" |
+  "!" |
+  "?" |
+  "<" |
+  ">" |
+  "[" |
+  "]" |
+  "(" |
+  ")"
+}
+
+Pipe { "|" } // The "|" statement separator used by _Statements.
+
+@tokens {
+  space { ($[ \t\f] | "\\" $[\n\r])+ } // Blanks plus backslash-newline line continuations.
+  LineComment { "//" ![\n\r]* }
+  ","[@export=Comma] // Exported term names are imported by the context tracker in tokens.ts.
+  "("[@export=LParen] ")"[@export=RParen]
+  "{"[@export=LBrace] "}"[@export=RBrace]
+  "["[@export=LBracket] "]"[@export=RBracket]
+  Scalar { $[0-9]($[0-9] | "." | "_")* $[a-zA-Z0-9_]* } // Digit-led literal with optional suffix, e.g. 42s.
+  @precedence { LineComment "/" } // "//" wins over the "/" punctuation token.
+  "+" "-" "*" "/" "," "=" "." "'" ":" "!" "?" "<" ">"
+  stringContent { ![\\\"]+ }
+  StringEsc { "\\" _ }  // Backslash plus any one character. TODO: Complete it.
+}
+
+
+@skip {} { // Nothing is skipped inside a string literal.
+  String {
+    "\"" (stringContent | StringEsc)* "\""
+  }
+}
+
+
+@skip { space | ignoredNewline | LineComment | BlockComment } // Skipped between tokens of the rules above.
+
+@skip {} { // Nothing is skipped inside a block comment.
+  BlockComment { "/*" blockCommentContent* blockCommentEnd }
+}
+
+@local tokens { // Token group used by BlockComment.
+  blockCommentEnd { "*/" }
+  @else blockCommentContent // Any input that is not "*/".
+}
+
+@external tokens newlines from "./tokens.ts" { // Picks one of the two per "\n" from the tracked context.
+  newline,
+  ignoredNewline
+}
+
+@external tokens identifiers from "./tokens.ts" { // Hand-written identifier scanner.
+  Ident,
+  FnIdent,
+  DollarIdent
+}
+
+kw<term> { @specialize[@name={term}]<Ident, term> } // Keyword: an Ident with exact text `term`, named after it.
diff --git a/src/syntax.grammar.d.ts b/src/tql.grammar.ts
similarity index 100%
rename from src/syntax.grammar.d.ts
rename to src/tql.grammar.ts
diff --git a/test/cases.txt b/test/cases.txt
index d06f3b7..0e723af 100644
--- a/test/cases.txt
+++ b/test/cases.txt
@@ -1,30 +1,102 @@
-# One
+# Simple
 
-export | drop foo bar | head | drop qux | head 300 | serve
+foo
+==>
+Pipeline(OpStmt(OpName(Ident),Soup))
+
+
+# Simple and 1 argument
+
+foo bar
+==>
+Pipeline(OpStmt(OpName(Ident),Soup(Ident)))
+
+
+# Simple and 2 arguments
+
+foo bar, baz
+==>
+Pipeline(OpStmt(OpName(Ident),Soup(Ident,Punct(","),Ident)))
+
+
+
+# Operator Pipe
 
+foo bar | foo baz
 ==>
-Root(Operator(OperatorName(Word)),Pipe,Operator(OperatorName(Word),Word,Word),Pipe,Operator(OperatorName(Word)),Pipe,Operator(OperatorName(Word),Word),Pipe,Operator(OperatorName(Word),Integer),Pipe,Operator(OperatorName(Word)))
+Pipeline(OpStmt(OpName(Ident),Soup(Ident)),Pipe,OpStmt(OpName(Ident),Soup(Ident)))
 
 
-# Where 1
+# Operator Newline
+
+foo bar
+foo baz
+==>
+Pipeline(OpStmt(OpName(Ident),Soup(Ident)),OpStmt(OpName(Ident),Soup(Ident)))
 
-export | where src_ip == 1.2.3.4 | serve
 
+# Simple assignment
+foo = bar
 ==>
-Root(Operator(OperatorName(Word)),Pipe,Operator(OperatorName(Word),Word,Punct,Ip),Pipe,Operator(OperatorName(Word)))
+Pipeline(AssignStmt(UnExpr(Ident),"=",Soup(Ident)))
 
 
-# Where 2
+# Newline in expression
+foo (
+  bar)
+==>
+Pipeline(OpStmt(OpName(Ident),Soup("(",Ident,")")))
 
-export | where id.orig_h == 1.2.3.4 && ts > 1 hour ago | to /tmp/haha.json
 
+# Newline in expression then operator
+foo (
+  bar)
+baz
 ==>
-Root(Operator(OperatorName(Word)),Pipe,Operator(OperatorName(Word),Word,Punct,Word,Punct,Ip,Punct,Word,Punct,Integer,Word,Word),Pipe,Operator(OperatorName(Word),Punct,Word,Punct,Word))
+Pipeline(OpStmt(OpName(Ident),Soup("(",Ident,")")),OpStmt(OpName(Ident),Soup))
 
 
-# Shouldn't Parse Correctly
+# Let Definition
+let $foo = 42s
+==>
+Pipeline(LetStmt(Let,DollarIdent,"=",Soup(Scalar)))
+
+
+# Newline after comma
+foo bar,
+  baz,
+  qux
+==>
+Pipeline(OpStmt(OpName(Ident),Soup(Ident,Punct(","),Ident,Punct(","),Ident)))
+
+
+# Nested pipeline
+foo bar=baz {
+  qux
+}
+==>
+Pipeline(OpStmt(OpName(Ident),Soup(Ident,Punct("="),Ident,PipeExpr(OpStmt(OpName(Ident),Soup)))))
+
+
+# Record with no fields
+foo {}
+==>
+Pipeline(OpStmt(OpName(Ident),Soup(RecordExpr("{","}"))))
+
+
+# Record with one field
+foo { foo: bar }
+==>
+Pipeline(OpStmt(OpName(Ident),Soup(RecordExpr("{",Ident,Soup(Ident),"}"))))
+
+
+# Record with two fields
+foo { foo: bar,baz:qux}
+==>
+Pipeline(OpStmt(OpName(Ident),Soup(RecordExpr("{",Ident,Soup(Ident,Punct(","),Ident,Punct,Ident),"}"))))
 
-1232131
 
+# Backslash
+test \
+  foo=42
 ==>
-Root(⚠(Integer))