Implement the executor (#30)
* The executor actually executes bash cells
* We create a lexer to parse the code cells into commands
* We call this lexer "bashish" because it's a very small subset of the
bash language; mainly just the ability to execute commands

* Fix a bug in the gRPC gateway annotations. We need to set body to "*";
otherwise the mappings aren't done correctly.
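
As a quick illustration of what the parser produces, here is a minimal sketch (the executor import path is an assumption, since the module path isn't visible on this page; the input and expected output mirror the simple-pipe test case below):

package main

import (
	"fmt"

	// Assumed import path; the module path isn't visible on this page.
	"github.com/jlewi/foyle/app/pkg/executor"
)

func main() {
	parser, err := executor.NewBashishParser()
	if err != nil {
		panic(err)
	}
	// "ls -la | wc -l" parses into two instructions; the first is marked
	// Piped so the executor knows to feed its output into the second.
	instructions, err := parser.Parse("ls -la | wc -l")
	if err != nil {
		panic(err)
	}
	for _, instruction := range instructions {
		fmt.Println(instruction.Command.Name, instruction.Command.Args, "piped:", instruction.Piped)
	}
}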
jlewi authored Apr 8, 2024
1 parent af93946 commit 69bd015
Showing 14 changed files with 3,480 additions and 45 deletions.
2 changes: 2 additions & 0 deletions app/go.mod
@@ -16,6 +16,7 @@ require (
github.com/pkg/errors v0.9.1
github.com/spf13/cobra v1.8.0
github.com/spf13/viper v1.18.2
github.com/timtadh/lexmachine v0.2.3
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.46.1
go.uber.org/zap v1.27.0
google.golang.org/grpc v1.62.1
@@ -111,6 +112,7 @@ require (
github.com/spf13/cast v1.6.0 // indirect
github.com/spf13/pflag v1.0.5 // indirect
github.com/subosito/gotenv v1.6.0 // indirect
github.com/timtadh/data-structures v0.6.1 // indirect
github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
github.com/ugorji/go/codec v1.2.12 // indirect
github.com/vbatts/tar-split v0.11.3 // indirect
5 changes: 5 additions & 0 deletions app/go.sum
@@ -329,6 +329,11 @@ github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsT
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/subosito/gotenv v1.6.0 h1:9NlTDc1FTs4qu0DDq7AEtTPNw6SVm7uBMsUCUjABIf8=
github.com/subosito/gotenv v1.6.0/go.mod h1:Dk4QP5c2W3ibzajGcXpNraDfq2IrhjMIvMSWPKKo0FU=
github.com/timtadh/data-structures v0.6.1 h1:76eDpwngj2rEi9r/qvdH6YL7wMXGsoFFzhEylo/IacA=
github.com/timtadh/data-structures v0.6.1/go.mod h1:uYUnI1cQi/5yMCc7s23I+x8Mn8BCMf4WgK+7/4QSEk4=
github.com/timtadh/getopt v1.0.0/go.mod h1:L3EL6YN2G0eIAhYBo9b7SB9d/kEQmdnwthIlMJfj210=
github.com/timtadh/lexmachine v0.2.3 h1:ZqlfHnfMcAygtbNM5Gv7jQf8hmM8LfVzDjfCrq235NQ=
github.com/timtadh/lexmachine v0.2.3/go.mod h1:oK1NW+93fQSIF6s+J6sXBFWsCPCFbNmrwKV1i0aqvW0=
github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS4MhqMhdFk5YI=
github.com/twitchyliquid64/golang-asm v0.15.1/go.mod h1:a1lVb/DtPvCB8fslRZhAngC2+aY1QWCk3Cedj/Gdt08=
github.com/ugorji/go/codec v1.2.12 h1:9LC83zGrHhuUA9l16C9AHXAqEV/2wBQ4nkvumAE65EE=
257 changes: 257 additions & 0 deletions app/pkg/executor/bashish.go
@@ -0,0 +1,257 @@
package executor

import (
"strings"

"github.com/go-cmd/cmd"
"github.com/go-logr/zapr"
"github.com/jlewi/hydros/pkg/util"
"github.com/pkg/errors"
"github.com/timtadh/lexmachine"
"github.com/timtadh/lexmachine/machines"
"go.uber.org/zap"
)

type TokenType string

const (
PipeToken TokenType = "PIPE"
QuoteToken TokenType = "QUOTE"
UnmatchedToken TokenType = "UNMATCHED"
TextToken TokenType = "TEXT"
WhiteSpaceToken TokenType = "WHITESPACE"
)

// BashishParser is a parser for the bashish language.
// Bashish is a very small subset of bash: basically shell commands plus the
// ability to do things like pipe the output of one command to another.
type BashishParser struct {
l *lexmachine.Lexer
}

// NewBashishParser creates a new parser for the bashish language.
func NewBashishParser() (*BashishParser, error) {
// We need to construct a lexer for the bashish language.
l := lexmachine.NewLexer()

// Here are a couple of important details about how the lexer works. Keep these in mind when constructing the rules.
//
// 1. The lexer prefers lower precedence matches that are longer. So be careful about having
// matches that are overly broad.
// 2. The lexer compiles the regular expressions to a DFA; it doesn't use GoLang's regexp library.
// As a result, the full regexp syntax is not supported. Notably, not all character classes are supported.
// For a list of supported classes see https://github.com/timtadh/lexmachine#built-in-character-classes.
// 3. Another major limitation is https://github.com/timtadh/lexmachine/issues/34: lexmachine can't expand
// character classes within character classes. As an example `[\w?=]` won't work. A workaround is to expand \w
// manually e.g. `[A-Za-z0-9_?=]`

l.Add([]byte(`\w+`), NewTokenAction(TextToken))
l.Add([]byte(`\s+`), NewTokenAction(WhiteSpaceToken))
l.Add([]byte(`['"]`), NewTokenAction(QuoteToken))
l.Add([]byte(`\|`), NewTokenAction(PipeToken))
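
// As an illustration of the rules above, the line `echo "hi" | wc` lexes to
// TEXT("echo") WHITESPACE QUOTE TEXT("hi") QUOTE WHITESPACE PIPE WHITESPACE
// TEXT("wc"). A character such as `-` matches none of these rules; toTokens
// below surfaces it as an UNMATCHED token rather than failing.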

// We rely on the toTokens function to turn unmatched characters into UnmatchedTokens.
if err := l.Compile(); err != nil {
return nil, errors.Wrapf(err, "Failed to compile the lexer")
}
return &BashishParser{
l: l,
}, nil
}

// Parse parses a multiline string into a sequence of commands.
func (p *BashishParser) Parse(doc string) ([]Instruction, error) {
lines := strings.Split(doc, "\n")

instructions := make([]Instruction, 0, 10)
for _, line := range lines {
line = strings.TrimSpace(line)
tokens, err := p.toTokens([]byte(line))
if err != nil {
return nil, err
}

iParser := instructionParser{
insideQuote: false,
fields: make([]string, 0, len(tokens)),
quoteChar: "",
newField: "",
}

newInstructions, err := iParser.parse(tokens)
if err != nil {
return nil, err
}
instructions = append(instructions, newInstructions...)
}
return instructions, nil
}

// toTokens turns the provided input into a stream of tokens.
func (p *BashishParser) toTokens(inBytes []byte) ([]*Token, error) {
scanner, err := p.l.Scanner(inBytes)
if err != nil {
return nil, errors.Wrapf(err, "failed to initialize the scanner")
}
tokens := make([]*Token, 0, 50)
log := zapr.NewLogger(zap.L())
for tok, err, eos := scanner.Next(); !eos; tok, err, eos = scanner.Next() {
unErr := &machines.UnconsumedInput{}
isUnconsumed := errors.As(err, &unErr)
if isUnconsumed {
// The scanner hit input that none of the rules match. Skip past it and
// emit the unmatched text as an UnmatchedToken.
scanner.TC = unErr.FailTC
text := unErr.Text[unErr.StartTC:unErr.FailTC]
log.V(util.Debug).Info("lexer returned unconsumed token", "text", string(text))

newToken := &Token{
TokenType: UnmatchedToken,
Lexeme: string(text),
Match: &machines.Match{
Bytes: text,
},
}
tokens = append(tokens, newToken)
continue
} else if err != nil {
return nil, err
}

token, ok := tok.(*Token)
if !ok {
return nil, errors.New("token isn't of type *Token")
}
tokens = append(tokens, token)
}
return tokens, nil
}

// instructionParser is a state machine for parsing a string of tokens.
type instructionParser struct {
insideQuote bool
fields []string
newField string
// To handle nested quotes we need to keep track of the quote character
quoteChar string
}

// parse parses a sequence of tokens into a sequence of instructions.
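// For example, given the lexer rules in NewBashishParser, the line
// echo "a b"
// arrives as TEXT("echo") WHITESPACE QUOTE TEXT("a") WHITESPACE TEXT("b") QUOTE.
// TEXT accumulates into newField; the WHITESPACE outside the quote flushes
// "echo" into fields; the opening QUOTE records the quote character; the
// whitespace inside the quote is preserved; and the closing QUOTE flushes
// "a b" as a single field.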
func (p *instructionParser) parse(tokens []*Token) ([]Instruction, error) {
instructions := make([]Instruction, 0, len(tokens))
for _, token := range tokens {
val := string(token.Match.Bytes)
switch token.TokenType {
case PipeToken:
if !p.insideQuote && (len(p.fields) > 0 || len(p.newField) > 0) {
// Flush any in-progress field so an argument isn't lost when the
// pipe isn't surrounded by whitespace, then complete the instruction.
if len(p.newField) > 0 {
p.fields = append(p.fields, p.newField)
p.newField = ""
}
i := Instruction{
Command: cmd.NewCmd(p.fields[0], p.fields[1:]...),
Piped: true,
}
instructions = append(instructions, i)
p.fields = make([]string, 0, len(tokens))
} else {
p.newField += val
}
case QuoteToken:
p.handleQuoteToken(val)
case TextToken:
p.newField += val
case UnmatchedToken:
p.newField += val
case WhiteSpaceToken:
if !p.insideQuote {
if len(p.newField) > 0 {
p.fields = append(p.fields, p.newField)
p.newField = ""
}
} else {
p.newField += val
}
default:
return nil, errors.Errorf("parse encoutered unknown token type %v", token.TokenType)
}
}

// Any remaining fields should be rolled up into a final instruction.
if len(p.newField) > 0 {
p.fields = append(p.fields, p.newField)
}
if len(p.fields) > 0 {
i := Instruction{
Command: cmd.NewCmd(p.fields[0], p.fields[1:]...),
Piped: false,
}
instructions = append(instructions, i)
}
return instructions, nil
}

func (p *instructionParser) handleQuoteToken(val string) {
lastChar := ""
if len(p.newField) > 0 {
lastChar = string(p.newField[len(p.newField)-1])
}

if lastChar == "\\" {
// Since backslash is an escape character we remove it and add the quote
p.newField = p.newField[:len(p.newField)-1]
p.newField += val
return
}
if p.insideQuote && p.quoteChar != val {
// We encountered a quote within a quote but it is a different quote character
// so we aren't closing the quotation. So just add it to the field
p.newField += val
return
}
// We emulate the shell behavior. In particular, we don't include the quotes in the field.
// For example, suppose we have the shell command
// echo "hello world"
// This is equal to []string{"echo", "hello world"} not
// []string{"echo", "\"hello world\""}

if p.insideQuote {
// Close the quote by adding the field
p.fields = append(p.fields, p.newField)
p.newField = ""
p.quoteChar = ""
p.insideQuote = false
} else {
// Start a quotation
p.quoteChar = val
p.insideQuote = true
}
}

type Token struct {
TokenType TokenType
Lexeme string
Match *machines.Match
}

func NewToken(tokenType TokenType, m *machines.Match) *Token {
return &Token{
TokenType: tokenType,
Lexeme: string(m.Bytes),
Match: m,
}
}

// NewTokenAction creates a lexmachine action for the given TokenType.
func NewTokenAction(t TokenType) lexmachine.Action {
return func(scan *lexmachine.Scanner, match *machines.Match) (interface{}, error) {
return NewToken(t, match), nil
}
}

// Instruction represents one instruction in the bashish language.
// This is typically a command that should be executed. In addition it contains information about
// how that command should be executed; e.g. if the output of this command should be piped to the next command.
type Instruction struct {
Command *cmd.Cmd

// Piped should be set to true if the output of this command should be piped to the next instruction.
Piped bool
}
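
The executor that consumes these instructions isn't shown in this excerpt. Below is a rough, hypothetical sketch of how the Piped flag could be interpreted when running a pipeline; it uses os/exec for simplicity rather than go-cmd, and the executor import path is again an assumption:

package main

import (
	"bytes"
	"fmt"
	"os/exec"

	// Assumed import path; the module path isn't visible on this page.
	"github.com/jlewi/foyle/app/pkg/executor"
)

// runInstructions runs each instruction in order. When an instruction is
// marked Piped, its stdout is buffered and fed to the next command's stdin.
func runInstructions(instructions []executor.Instruction) error {
	var prevOut []byte
	for _, instruction := range instructions {
		c := exec.Command(instruction.Command.Name, instruction.Command.Args...)
		if prevOut != nil {
			c.Stdin = bytes.NewReader(prevOut)
		}
		out, err := c.Output()
		if err != nil {
			return err
		}
		if instruction.Piped {
			prevOut = out // hand this output to the next command
		} else {
			prevOut = nil
			fmt.Print(string(out))
		}
	}
	return nil
}

func main() {
	parser, err := executor.NewBashishParser()
	if err != nil {
		panic(err)
	}
	instructions, err := parser.Parse("ls -la | wc -l")
	if err != nil {
		panic(err)
	}
	if err := runInstructions(instructions); err != nil {
		panic(err)
	}
}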
112 changes: 112 additions & 0 deletions app/pkg/executor/bashish_test.go
@@ -0,0 +1,112 @@
package executor

import (
"strings"
"testing"

"github.com/go-cmd/cmd"
"github.com/google/go-cmp/cmp"
)

func Test_BashishParser(t *testing.T) {
type testCase struct {
name string
lines []string
expected []Instruction
}

cases := []testCase{
{
name: "basic",
lines: []string{"kubectl get pods"},
expected: []Instruction{
{
Command: cmd.NewCmd("kubectl", "get", "pods"),
},
},
},
{
// This text mimics what you would get if you typed the command into a shell
name: "quoted",
lines: []string{"echo \"some text\""},
expected: []Instruction{
{
Command: cmd.NewCmd("echo", "some text"),
},
},
},
{
name: "simple-pipe",
lines: []string{"ls -la | wc -l"},
expected: []Instruction{
{
Command: cmd.NewCmd("ls", "-la"),
Piped: true,
},
{
Command: cmd.NewCmd("wc", "-l"),
},
},
},
{
name: "pipe-quoted",
lines: []string{`kubectl get pods --format=yaml | jq 'select(.conditions[]) | .status'`},
expected: []Instruction{
{
Command: cmd.NewCmd("kubectl", "get", "pods", "--format=yaml"),
Piped: true,
},
{
Command: cmd.NewCmd("jq", `select(.conditions[]) | .status`),
},
},
},
{
name: "nested-quotes",
lines: []string{`gcloud logging read "resource.labels.project_id=\"foyle-dev\" resource.type=\"k8s_container\" resource.labels.location=\"us-west1\" resource.labels.cluster_name=\"dev\"" --project=foyle-dev`},
expected: []Instruction{
{
Command: cmd.NewCmd("gcloud", "logging", "read", "resource.labels.project_id=\"foyle-dev\" resource.type=\"k8s_container\" resource.labels.location=\"us-west1\" resource.labels.cluster_name=\"dev\"", "--project=foyle-dev"),
},
},
},
}

parser, err := NewBashishParser()

if err != nil {
t.Fatalf("NewBashishParser() returned error %v", err)
}

for _, c := range cases {
t.Run(c.name, func(t *testing.T) {
doc := strings.Join(c.lines, "\n")
actual, err := parser.Parse(doc)
if err != nil {
t.Fatalf("unexpected parsing error %v", err)
}
if len(actual) != len(c.expected) {
t.Errorf("Expected %v instructions got %v", len(c.expected), len(actual))
}

for i, eInstruction := range c.expected {
if i >= len(actual) {
break
}

aInstruction := actual[i]

if aInstruction.Command.Name != eInstruction.Command.Name {
t.Errorf("Expected command.Name to be %v got %v", eInstruction.Command.Name, aInstruction.Command.Name)
}
if d := cmp.Diff(eInstruction.Command.Args, aInstruction.Command.Args); d != "" {
t.Fatalf("Unexpected args (-want +got): %v", d)
}

if aInstruction.Piped != eInstruction.Piped {
t.Errorf("Expected Piped to be %v got %v", eInstruction.Piped, aInstruction.Piped)
}
}
})
}
}