Prompt Tuning for Autocomplete (#184)

We want to tune the prompt to generate more sensible results in the case of Autocomplete. * With the updated prompt only 2 out of 26 examples in our eval set failed on the codeAfterMarkdown assertion and the ends with code cell assertion. * This should increase the likelihood that * AI responds with a code cell without extraneous exposition before or after the code cell * Response includes a single code cell; multiple cells can lead to a confusing UX. Related to #170
jlewi · Aug 5, 2024 · a6c0c58 · a6c0c58
1 parent be666aa
commit a6c0c58
Show file tree

Hide file tree

Showing 10 changed files with 326 additions and 50 deletions.
diff --git a/app/pkg/agent/prompt.go b/app/pkg/agent/prompt.go
@@ -9,12 +9,21 @@ const (
 	systemPrompt = `You are a helpful AI assistant for software developers. You are helping software engineers write markdown documents to deploy
 and operate software. Your job is to help users reason about problems and tasks and come up with the appropriate
 commands to accomplish them. You should never try to execute commands. You should always tell the user
-to execute the commands themselves. To help the user place the commands inside a code block with the language set to
+to execute the commands themselves. To help the user, place the commands inside a code block with the language set to
 bash. Users can then execute the commands inside VSCode notebooks. The output will then be appended to the document.
 You can then use that output to reason about the next steps.
 
 You are only helping users with tasks related to building, deploying, and operating software. You should interpret
 any questions or commands in that context.
+
+Keep these rules in mind when generating responses
+
+* If the document ends in a markdown cell, the response should start with a code cell.
+* The response should almost never have more than one code cell
+* If you want to suggest multiple commands put them all in a code cell
+* Once you suggest a code cell you should usually stop and wait for the user to execute the command
+* Only respond with a markdown cell if the previous cell contains the output of a command and you are explaining
+  the output
 `
 )
 

diff --git a/app/pkg/eval/assertions.go b/app/pkg/eval/assertions.go
@@ -2,12 +2,15 @@ package eval
 
 import (
 	"context"
+	"fmt"
 
 	"github.com/jlewi/foyle/protos/go/foyle/v1alpha1"
 )
 
 const (
 	CodeAfterMarkdownName = "AssertCodeAfterMarkdown" // returned by AssertCodeAfterMarkdown.Name
+	OneCodeCellName       = "AssertOneCodeCell"       // returned by AssertOneCodeCell.Name
+	EndsWithCodeCellName  = "AssertEndsWithCodeCell"  // returned by AssertEndsWithCodeCell.Name
 )
 
 // AssertCodeAfterMarkdown is an assertion that checks that if the prompt ends in a markdown cell then the response
@@ -50,3 +53,86 @@ func (a *AssertCodeAfterMarkdown) Assert(ctx context.Context, doc *v1alpha1.Doc,
 func (a *AssertCodeAfterMarkdown) Name() string {
 	return CodeAfterMarkdownName
 }
+
+// AssertOneCodeCell is an assertion that checks the response has exactly one code cell.
+// We don't want to suggest multiple code cells because that can be confusing. If one command depends on the
+// output of another we should generate one at a time.
+type AssertOneCodeCell struct {
+}
+
+// Assert counts the code cells in answer and passes only when there is exactly one.
+//
+// A nil or empty doc is reported as FAILED with an explanatory Detail. ctx and examples are not used.
+// The returned error is always nil.
+func (a *AssertOneCodeCell) Assert(ctx context.Context, doc *v1alpha1.Doc, examples []*v1alpha1.Example, answer []*v1alpha1.Block) (*v1alpha1.Assertion, error) {
+	assertion := &v1alpha1.Assertion{
+		Name: a.Name(),
+	}
+
+	// GetBlocks is nil-safe, so a nil doc is handled like an empty one instead of panicking.
+	if len(doc.GetBlocks()) == 0 {
+		assertion.Result = v1alpha1.AssertResult_FAILED
+		assertion.Detail = "Doc is empty"
+		return assertion, nil
+	}
+
+	numCode := 0
+	for _, b := range answer {
+		if b.GetKind() == v1alpha1.BlockKind_CODE {
+			numCode++
+		}
+	}
+
+	if numCode != 1 {
+		assertion.Result = v1alpha1.AssertResult_FAILED
+		assertion.Detail = fmt.Sprintf("Answer doesn't contain exactly one code cell; it has %d code cells", numCode)
+		return assertion, nil
+	}
+
+	assertion.Result = v1alpha1.AssertResult_PASSED
+	return assertion, nil
+}
+
+// Name returns OneCodeCellName, the name recorded on assertions produced by AssertOneCodeCell.
+func (a *AssertOneCodeCell) Name() string {
+	return OneCodeCellName
+}
+
+// AssertEndsWithCodeCell is an assertion that checks that if the response has a code cell then it ends with the
+// code cell. If we need the user to run a command we should just suggest the command and not additional output
+// after that.
+type AssertEndsWithCodeCell struct {
+}
+
+// Assert checks that the last block of answer is a code cell.
+//
+// The assertion is SKIPPED when doc is nil or has no blocks, or when answer contains no code cell at all.
+// Otherwise it is PASSED if the final block of answer is a code cell and FAILED if it is not.
+// ctx and examples are not used. The returned error is always nil.
+func (a *AssertEndsWithCodeCell) Assert(ctx context.Context, doc *v1alpha1.Doc, examples []*v1alpha1.Example, answer []*v1alpha1.Block) (*v1alpha1.Assertion, error) {
+	assertion := &v1alpha1.Assertion{
+		Name: a.Name(),
+	}
+
+	// GetBlocks is nil-safe, so a nil doc is handled like an empty one instead of panicking.
+	if len(doc.GetBlocks()) == 0 {
+		assertion.Result = v1alpha1.AssertResult_SKIPPED
+		assertion.Detail = "Doc is empty"
+		return assertion, nil
+	}
+
+	hasCodeCell := false
+	for _, b := range answer {
+		if b.GetKind() == v1alpha1.BlockKind_CODE {
+			hasCodeCell = true
+			break
+		}
+	}
+
+	if !hasCodeCell {
+		assertion.Result = v1alpha1.AssertResult_SKIPPED
+		assertion.Detail = "Answer doesn't contain a code cell"
+		return assertion, nil
+	}
+
+	// answer is non-empty here because the loop above found at least one code cell.
+	last := answer[len(answer)-1]
+
+	if last.GetKind() != v1alpha1.BlockKind_CODE {
+		assertion.Result = v1alpha1.AssertResult_FAILED
+		assertion.Detail = "Answer doesn't end with a code cell"
+		return assertion, nil
+	}
+
+	assertion.Result = v1alpha1.AssertResult_PASSED
+	return assertion, nil
+}
+
+// Name returns EndsWithCodeCellName, the name recorded on assertions produced by AssertEndsWithCodeCell.
+func (a *AssertEndsWithCodeCell) Name() string {
+	return EndsWithCodeCellName
+}
diff --git a/app/pkg/eval/assertions_test.go b/app/pkg/eval/assertions_test.go
@@ -82,3 +82,142 @@ func TestAssertCodeAfterMarkdown(t *testing.T) {
 		})
 	}
 }
+
+// TestAssertOneCodeCell verifies AssertOneCodeCell: an empty doc fails, exactly one code cell passes, and
+// zero or several code cells fail. Both the result and the assertion name are checked.
+func TestAssertOneCodeCell(t *testing.T) {
+	cases := []testCase{
+		{
+			// AssertOneCodeCell reports an empty doc as FAILED rather than SKIPPED.
+			name:     "Empty",
+			doc:      &v1alpha1.Doc{},
+			examples: []*v1alpha1.Example{},
+			answer:   []*v1alpha1.Block{},
+			expected: &v1alpha1.Assertion{
+				Name:   OneCodeCellName,
+				Result: v1alpha1.AssertResult_FAILED,
+			},
+		},
+		{
+			name: "Passed",
+			doc: &v1alpha1.Doc{
+				Blocks: []*v1alpha1.Block{
+					{
+						Kind: v1alpha1.BlockKind_MARKUP,
+					},
+				},
+			},
+			examples: []*v1alpha1.Example{},
+			answer: []*v1alpha1.Block{
+				{
+					Kind: v1alpha1.BlockKind_CODE,
+				},
+			},
+			expected: &v1alpha1.Assertion{
+				Name:   OneCodeCellName,
+				Result: v1alpha1.AssertResult_PASSED,
+			},
+		},
+		{
+			name: "Failed",
+			doc: &v1alpha1.Doc{
+				Blocks: []*v1alpha1.Block{
+					{
+						Kind: v1alpha1.BlockKind_MARKUP,
+					},
+				},
+			},
+			examples: []*v1alpha1.Example{},
+			answer: []*v1alpha1.Block{
+				{
+					Kind: v1alpha1.BlockKind_MARKUP,
+				},
+			},
+			expected: &v1alpha1.Assertion{
+				Name:   OneCodeCellName,
+				Result: v1alpha1.AssertResult_FAILED,
+			},
+		},
+		{
+			name: "FailedMultipleCodeCells",
+			doc: &v1alpha1.Doc{
+				Blocks: []*v1alpha1.Block{
+					{
+						Kind: v1alpha1.BlockKind_MARKUP,
+					},
+				},
+			},
+			examples: []*v1alpha1.Example{},
+			answer: []*v1alpha1.Block{
+				{
+					Kind: v1alpha1.BlockKind_CODE,
+				},
+				{
+					Kind: v1alpha1.BlockKind_CODE,
+				},
+			},
+			expected: &v1alpha1.Assertion{
+				Name:   OneCodeCellName,
+				Result: v1alpha1.AssertResult_FAILED,
+			},
+		},
+	}
+
+	for _, c := range cases {
+		a := &AssertOneCodeCell{}
+		t.Run(c.name, func(t *testing.T) {
+			got, err := a.Assert(context.Background(), c.doc, c.examples, c.answer)
+			if err != nil {
+				t.Fatalf("Error: %v", err)
+			}
+			if got.Name != c.expected.Name {
+				t.Fatalf("Expected name %v but got %v", c.expected.Name, got.Name)
+			}
+			if got.Result != c.expected.Result {
+				t.Fatalf("Expected %v but got %v", c.expected.Result, got.Result)
+			}
+		})
+	}
+}
+
+// TestAssertEndsWithCodeCell verifies AssertEndsWithCodeCell: an empty doc or an answer without any code cell
+// is skipped, an answer ending in a code cell passes, and an answer with trailing markup after a code cell
+// fails. Both the result and the assertion name are checked.
+func TestAssertEndsWithCodeCell(t *testing.T) {
+	cases := []testCase{
+		{
+			name:     "Empty",
+			doc:      &v1alpha1.Doc{},
+			examples: []*v1alpha1.Example{},
+			answer:   []*v1alpha1.Block{},
+			expected: &v1alpha1.Assertion{
+				Name:   EndsWithCodeCellName,
+				Result: v1alpha1.AssertResult_SKIPPED,
+			},
+		},
+		{
+			name: "SkippedNoCodeCell",
+			doc: &v1alpha1.Doc{
+				Blocks: []*v1alpha1.Block{
+					{
+						Kind: v1alpha1.BlockKind_MARKUP,
+					},
+				},
+			},
+			examples: []*v1alpha1.Example{},
+			answer: []*v1alpha1.Block{
+				{
+					Kind: v1alpha1.BlockKind_MARKUP,
+				},
+			},
+			expected: &v1alpha1.Assertion{
+				Name:   EndsWithCodeCellName,
+				Result: v1alpha1.AssertResult_SKIPPED,
+			},
+		},
+		{
+			name: "Passed",
+			doc: &v1alpha1.Doc{
+				Blocks: []*v1alpha1.Block{
+					{
+						Kind: v1alpha1.BlockKind_MARKUP,
+					},
+				},
+			},
+			examples: []*v1alpha1.Example{},
+			answer: []*v1alpha1.Block{
+				{
+					Kind: v1alpha1.BlockKind_CODE,
+				},
+			},
+			expected: &v1alpha1.Assertion{
+				Name:   EndsWithCodeCellName,
+				Result: v1alpha1.AssertResult_PASSED,
+			},
+		},
+		{
+			name: "Failed",
+			doc: &v1alpha1.Doc{
+				Blocks: []*v1alpha1.Block{
+					{
+						Kind: v1alpha1.BlockKind_MARKUP,
+					},
+				},
+			},
+			examples: []*v1alpha1.Example{},
+			answer: []*v1alpha1.Block{
+				{
+					Kind: v1alpha1.BlockKind_CODE,
+				},
+				{
+					Kind: v1alpha1.BlockKind_MARKUP,
+				},
+			},
+			expected: &v1alpha1.Assertion{
+				Name:   EndsWithCodeCellName,
+				Result: v1alpha1.AssertResult_FAILED,
+			},
+		},
+	}
+
+	for _, c := range cases {
+		a := &AssertEndsWithCodeCell{}
+		t.Run(c.name, func(t *testing.T) {
+			got, err := a.Assert(context.Background(), c.doc, c.examples, c.answer)
+			if err != nil {
+				t.Fatalf("Error: %v", err)
+			}
+			if got.Name != c.expected.Name {
+				t.Fatalf("Expected name %v but got %v", c.expected.Name, got.Name)
+			}
+			if got.Result != c.expected.Result {
+				t.Fatalf("Expected %v but got %v", c.expected.Result, got.Result)
+			}
+		})
+	}
+}
diff --git a/app/pkg/eval/assertor.go b/app/pkg/eval/assertor.go
@@ -32,6 +32,8 @@ func NewAssertRunner(config config.Config) (*AssertRunner, error) {
 	// Load the assertions
 	runner.assertions = make([]Assertion, 0, 10)
 	runner.assertions = append(runner.assertions, &AssertCodeAfterMarkdown{})
+	runner.assertions = append(runner.assertions, &AssertOneCodeCell{})
+	runner.assertions = append(runner.assertions, &AssertEndsWithCodeCell{})
 	return runner, nil
 }
 

diff --git a/app/pkg/eval/service.go b/app/pkg/eval/service.go
@@ -150,6 +150,10 @@ func toAssertionRow(result *v1alpha1.EvalResult) (*v1alpha1.AssertionRow, error)
 		switch a.Name {
 		case CodeAfterMarkdownName:
 			row.CodeAfterMarkdown = a.GetResult()
+		case OneCodeCellName:
+			row.OneCodeCell = a.GetResult()
+		case EndsWithCodeCellName:
+			row.EndsWithCodeCell = a.GetResult()
 		default:
 			log.Info("Unknown assertion", "name", a.Name)
 		}

diff --git a/developer_guides/eval.md b/developer_guides/eval.md
@@ -89,4 +89,11 @@ foyle apply ~/git_foyle/experiments/norag.yaml
 
 ```sh {"id":"01HZ38QWPZ565XH11CCKYCF1M7"}
 foyle apply ~/git_foyle/experiments/rag.yaml
-```
+```
+
+### Adding Level 1 Evals
+
+1. Define the Assertion in [eval/assertions.go](../app/pkg/eval/assertions.go)
+2. Update [NewAssertRunner in assertor.go](../app/pkg/eval/assertor.go) to include the new assertion
+3. Update [AssertionRow proto](../protos/foyle/v1alpha1/eval.proto) to include the new assertion
+4. Update [toAssertionRow](../app/pkg/eval/service.go) to include the new assertion in `AssertionRow`
diff --git a/developer_guides/runme.md b/developer_guides/runme.md
@@ -43,7 +43,7 @@ Now we can install the extension using the vscode binary
    * It also seemed like when I didn't bump the version I might have actually been using an old version of the extension
 
 ```bash {"id":"01HYZVG8KZKYSTFS4R1RJZDS7P"}
-/Applications/Visual\ Studio\ Code.app/Contents/Resources/app/bin/code --force --install-extension ~/git_vscode-runme/runme-3.5.9.vsix
+/Applications/Visual\ Studio\ Code.app/Contents/Resources/app/bin/code --force --install-extension ~/git_vscode-runme/runme-extension.vsix
 ```
 
 ```sh {"id":"01HY264KZTS4J9NHJASJT1GYJ7"}
@@ -60,10 +60,6 @@ Lets try installing and reinstalling it
 
 ```
 
-```bash {"id":"01HY75KYKE3SFAM5EXMDAVJDTQ"}
-echo "hello world"sdfds
-```
-
 ## Debugging the Runme Extension in vscode
 
 * It seems like you may need to run `yarn build` for changes to get picked up; running `F5` doesn't always seem to work

diff --git a/protos/foyle/v1alpha1/eval.proto b/protos/foyle/v1alpha1/eval.proto
@@ -81,6 +81,8 @@ message AssertionRow {
 
   // TODO(jeremy): How can we avoid having to add each assertion here
   AssertResult code_after_markdown = 5;
+  AssertResult one_code_cell = 6;
+  AssertResult ends_with_code_cell = 7;
 }
 
 message AssertionTableRequest {