Level 1 Evals. (#181)

Initial stab at level 1 Evals. This PR includes * Initial technote describing how we will implement level 1 Evals * Implementation of the code to run level 1 evals * An initial assertion that verifies that the AI responds with a code cell if the current cell is a markdown cell * A developer doc describing how to run level 1 evals This PR introduces some new patterns for how we do evaluation * Spin up a separate instance of the Agent to test any changes * This also gives our Agent isolated logs so that we don't interfere with the production instance * Evaluation should communicate with the Agent via RPC * To support this we added a Connect Handler for the Generate method * Analyze/Display the results using RunMe * We will add RPC methods to return the results as JSON in a format suitable for display * We can then use vscode renderers like Data Table Renderers to render them
jlewi · Aug 5, 2024 · be666aa · be666aa
1 parent f71718a
commit be666aa
Show file tree

Hide file tree

Showing 27 changed files with 1,668 additions and 108 deletions.
diff --git a/.gitignore b/.gitignore
@@ -12,4 +12,5 @@ dist/
 **/bin
 **/plugin
 **/cover.out
-**/*.wasm
+**/*.wasm
+experiments/runs/**
diff --git a/app/api/assert.go b/app/api/assert.go
@@ -0,0 +1,39 @@
+package api
+
+import "k8s.io/apimachinery/pkg/runtime/schema"
+
+var (
+	// AssertJobGVK is the group/version/kind identifying AssertJob resources. It is built from this
+	// package's Group and Version with the kind "AssertJob".
+	AssertJobGVK = schema.FromAPIVersionAndKind(Group+"/"+Version, "AssertJob")
+)
+
+// AssertJob is a struct that represents an assert job. This is a job that runs level one evaluations.
+type AssertJob struct {
+	// Metadata is the resource metadata for the job.
+	Metadata Metadata      `json:"metadata" yaml:"metadata"`
+	// Spec describes what the job should evaluate and where the results go.
+	Spec     AssertJobSpec `json:"spec"    yaml:"spec"`
+}
+
+// AssertJobSpec is the specification of an AssertJob: where the evaluation data comes from, which agent
+// generates the completions, and where the results are stored and published.
+type AssertJobSpec struct {
+	// Sources is a list of sources to get the data from
+	Sources []EvalSource `json:"sources" yaml:"sources"`
+
+	// AgentAddress is the address of the agent to use to generate completions
+	AgentAddress string `json:"agentAddress" yaml:"agentAddress"`
+
+	// DBDir is the directory for the pebble database that will store the results
+	DBDir string `json:"dbDir" yaml:"dbDir"`
+
+	// SheetID is the ID of the Google Sheet to update with the results.
+	SheetID string `json:"sheetID" yaml:"sheetID"`
+
+	// SheetName is the name of the sheet to update.
+	SheetName string `json:"sheetName" yaml:"sheetName"`
+}
+
+// EvalSource describes one source of evaluation data. MarkdownSource is the only kind of source
+// declared here.
+type EvalSource struct {
+	// MarkdownSource, when set, points at markdown files to use as evaluation data. It is omitted from
+	// the serialized form when nil.
+	MarkdownSource *MarkdownSource `json:"markdownSource,omitempty" yaml:"markdownSource,omitempty"`
+}
+
+// MarkdownSource identifies evaluation data that is stored as markdown files.
+type MarkdownSource struct {
+	// Path to the markdown files to use as evaluation data.
+	Path string `json:"path" yaml:"path"`
+}
diff --git a/app/cmd/apply.go b/app/cmd/apply.go
@@ -40,11 +40,8 @@ func NewApplyCmd() *cobra.Command {
 					return err
 				}
 
-				// DBs can only be opened in a single process.
-				if err := app.OpenDBs(); err != nil {
-					return err
-				}
-
+				// Since DBs can only be opened in a single process, each resource should decide whether it needs
+				// to open the DBs.
 				if err := app.SetupRegistry(); err != nil {
 					return err
 				}

diff --git a/app/pkg/application/app.go b/app/pkg/application/app.go
@@ -213,11 +213,19 @@ func (a *App) SetupRegistry() error {
 	}
 	a.Registry = &controllers.Registry{}
 
-	eval, err := eval.NewEvaluator(*a.Config)
+	evaluator, err := eval.NewEvaluator(*a.Config)
 	if err != nil {
 		return err
 	}
-	if err := a.Registry.Register(api.ExperimentGVK, eval); err != nil {
+	if err := a.Registry.Register(api.ExperimentGVK, evaluator); err != nil {
+		return err
+	}
+
+	assertor, err := eval.NewAssertRunner(*a.Config)
+	if err != nil {
+		return err
+	}
+	if err := a.Registry.Register(api.AssertJobGVK, assertor); err != nil {
 		return err
 	}
 	return nil

diff --git a/app/pkg/eval/assert.go b/app/pkg/eval/assert.go
@@ -0,0 +1,20 @@
+package eval
+
+import (
+	"context"
+
+	"github.com/jlewi/foyle/protos/go/foyle/v1alpha1"
+)
+
+// Assertion is an interface for evaluating AI generations.
+type Assertion interface {
+	// Assert evaluates answer, the blocks generated in response to doc, and returns the outcome as a
+	// v1alpha1.Assertion.
+	// NOTE(review): examples is presumably the set of examples supplied alongside doc when generating the
+	// answer — confirm against the caller.
+	Assert(ctx context.Context, doc *v1alpha1.Doc, examples []*v1alpha1.Example, answer []*v1alpha1.Block) (*v1alpha1.Assertion, error)
+	// Name returns the name of the assertion.
+	Name() string
+}
+
+// AssertResult is the outcome of an assertion, represented as a string.
+type AssertResult string
+
+// The values an AssertResult can take.
+const (
+	AssertPassed  AssertResult = "passed"
+	AssertFailed  AssertResult = "failed"
+	AssertSkipped AssertResult = "skipped"
+)
diff --git a/app/pkg/eval/assertions.go b/app/pkg/eval/assertions.go
@@ -0,0 +1,52 @@
+package eval
+
+import (
+	"context"
+
+	"github.com/jlewi/foyle/protos/go/foyle/v1alpha1"
+)
+
+const (
+	// CodeAfterMarkdownName is the name reported by the AssertCodeAfterMarkdown assertion.
+	CodeAfterMarkdownName = "AssertCodeAfterMarkdown"
+)
+
+// AssertCodeAfterMarkdown is an assertion that checks that if the prompt ends in a markdown cell then the response
+// starts with a code cell. The type has no fields, so it carries no state between calls.
+type AssertCodeAfterMarkdown struct {
+}
+
+// Assert checks that when doc ends in a markup cell the answer starts with a code cell.
+//
+// The Result recorded on the returned Assertion is:
+//   - SKIPPED when doc is nil, has no blocks, or its last block is not a markup block;
+//   - FAILED when answer is empty or its first block is not a code block (Detail says which);
+//   - PASSED otherwise.
+//
+// ctx and examples are not used by this assertion, and the returned error is always nil.
+func (a *AssertCodeAfterMarkdown) Assert(ctx context.Context, doc *v1alpha1.Doc, examples []*v1alpha1.Example, answer []*v1alpha1.Block) (*v1alpha1.Assertion, error) {
+	assertion := &v1alpha1.Assertion{
+		Name: a.Name(),
+	}
+
+	// Use the generated getter rather than the field: it is nil-safe, so a nil doc is treated like an
+	// empty document instead of causing a panic.
+	blocks := doc.GetBlocks()
+	if len(blocks) == 0 {
+		assertion.Result = v1alpha1.AssertResult_SKIPPED
+		return assertion, nil
+	}
+
+	// The assertion only applies when the prompt ends in a markup cell.
+	last := blocks[len(blocks)-1]
+	if last.GetKind() != v1alpha1.BlockKind_MARKUP {
+		assertion.Result = v1alpha1.AssertResult_SKIPPED
+		return assertion, nil
+	}
+
+	if len(answer) == 0 {
+		assertion.Result = v1alpha1.AssertResult_FAILED
+		assertion.Detail = "Answer is empty"
+		return assertion, nil
+	}
+
+	if answer[0].GetKind() != v1alpha1.BlockKind_CODE {
+		assertion.Result = v1alpha1.AssertResult_FAILED
+		assertion.Detail = "Answer doesn't start with a code cell"
+		return assertion, nil
+	}
+
+	assertion.Result = v1alpha1.AssertResult_PASSED
+	return assertion, nil
+}
+
+// Name returns the name of the assertion, CodeAfterMarkdownName.
+func (a *AssertCodeAfterMarkdown) Name() string {
+	return CodeAfterMarkdownName
+}
diff --git a/app/pkg/eval/assertions_test.go b/app/pkg/eval/assertions_test.go
@@ -0,0 +1,84 @@
+package eval
+
+import (
+	"context"
+	"testing"
+
+	"github.com/jlewi/foyle/protos/go/foyle/v1alpha1"
+)
+
+// testCase is one table entry for TestAssertCodeAfterMarkdown: the arguments passed to Assert and the
+// assertion expected back.
+type testCase struct {
+	// name is the subtest name passed to t.Run.
+	name     string
+	// doc, examples and answer are passed to Assert unchanged.
+	doc      *v1alpha1.Doc
+	examples []*v1alpha1.Example
+	answer   []*v1alpha1.Block
+	// expected is the assertion the test compares the result against.
+	expected *v1alpha1.Assertion
+}
+
+// TestAssertCodeAfterMarkdown runs AssertCodeAfterMarkdown over a table of documents and answers and
+// checks the name and result of the returned assertion. Each case has a unique name so that a failing
+// subtest can be identified from the test output.
+func TestAssertCodeAfterMarkdown(t *testing.T) {
+	cases := []testCase{
+		{
+			// A document without blocks cannot end in a markup cell, so the assertion is skipped.
+			name:     "Empty",
+			doc:      &v1alpha1.Doc{},
+			examples: []*v1alpha1.Example{},
+			answer:   []*v1alpha1.Block{},
+			expected: &v1alpha1.Assertion{
+				Name:   "AssertCodeAfterMarkdown",
+				Result: v1alpha1.AssertResult_SKIPPED,
+			},
+		},
+		{
+			// Markup cell followed by an answer that starts with a code cell.
+			name: "Passed",
+			doc: &v1alpha1.Doc{
+				Blocks: []*v1alpha1.Block{
+					{
+						Kind: v1alpha1.BlockKind_MARKUP,
+					},
+				},
+			},
+			examples: []*v1alpha1.Example{},
+			answer: []*v1alpha1.Block{
+				{
+					Kind: v1alpha1.BlockKind_CODE,
+				},
+			},
+			expected: &v1alpha1.Assertion{
+				Name:   "AssertCodeAfterMarkdown",
+				Result: v1alpha1.AssertResult_PASSED,
+			},
+		},
+		{
+			// Markup cell followed by an answer that starts with another markup cell.
+			name: "Failed",
+			doc: &v1alpha1.Doc{
+				Blocks: []*v1alpha1.Block{
+					{
+						Kind: v1alpha1.BlockKind_MARKUP,
+					},
+				},
+			},
+			examples: []*v1alpha1.Example{},
+			answer: []*v1alpha1.Block{
+				{
+					Kind: v1alpha1.BlockKind_MARKUP,
+				},
+			},
+			expected: &v1alpha1.Assertion{
+				Name:   "AssertCodeAfterMarkdown",
+				Result: v1alpha1.AssertResult_FAILED,
+			},
+		},
+		{
+			// Markup cell followed by no answer at all.
+			name: "EmptyAnswer",
+			doc: &v1alpha1.Doc{
+				Blocks: []*v1alpha1.Block{
+					{
+						Kind: v1alpha1.BlockKind_MARKUP,
+					},
+				},
+			},
+			examples: []*v1alpha1.Example{},
+			answer:   []*v1alpha1.Block{},
+			expected: &v1alpha1.Assertion{
+				Name:   "AssertCodeAfterMarkdown",
+				Result: v1alpha1.AssertResult_FAILED,
+			},
+		},
+		{
+			// The document ends in a code cell, so the assertion does not apply.
+			name: "DocEndsInCode",
+			doc: &v1alpha1.Doc{
+				Blocks: []*v1alpha1.Block{
+					{
+						Kind: v1alpha1.BlockKind_CODE,
+					},
+				},
+			},
+			examples: []*v1alpha1.Example{},
+			answer: []*v1alpha1.Block{
+				{
+					Kind: v1alpha1.BlockKind_CODE,
+				},
+			},
+			expected: &v1alpha1.Assertion{
+				Name:   "AssertCodeAfterMarkdown",
+				Result: v1alpha1.AssertResult_SKIPPED,
+			},
+		},
+	}
+
+	for _, c := range cases {
+		a := &AssertCodeAfterMarkdown{}
+		t.Run(c.name, func(t *testing.T) {
+			got, err := a.Assert(context.Background(), c.doc, c.examples, c.answer)
+			if err != nil {
+				t.Fatalf("Error: %v", err)
+			}
+			if got.Name != c.expected.Name {
+				t.Fatalf("Expected name %v but got %v", c.expected.Name, got.Name)
+			}
+			if got.Result != c.expected.Result {
+				t.Fatalf("Expected %v but got %v", c.expected.Result, got.Result)
+			}
+		})
+	}
+}