Skip to content

Commit

Permalink
Level 1 Evals. (#181)
Browse files Browse the repository at this point in the history
Initial stab at level 1 Evals. This PR includes

* Initial technote describing how we will implement level 1 Evals
* Implementation of the code to run level 1 evals
* An initial assertion that verifies that the AI responds with a code
cell if the current cell is a markdown cell
* A developer doc describing how to run level 1 evals

This PR introduces some new patterns for how we do evaluation

* Spin up a separate instance of the Agent to test any changes
* This also gives our Agent isolated logs so that we don't interfere
with the production instance
* Evaluation should communicate with the Agent via RPC
  * To support this we added a Connect Handler for the Generate method
* Analyze/Display the results using RunMe
* We will add RPC methods to return the results as JSON in a format
suitable for display
* We can then use vscode renderers like Data Table Renderers to render
them
  • Loading branch information
jlewi authored Aug 5, 2024
1 parent f71718a commit be666aa
Show file tree
Hide file tree
Showing 27 changed files with 1,668 additions and 108 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,5 @@ dist/
**/bin
**/plugin
**/cover.out
**/*.wasm
**/*.wasm
experiments/runs/**
39 changes: 39 additions & 0 deletions app/api/assert.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
package api

import "k8s.io/apimachinery/pkg/runtime/schema"

// AssertJobGVK is the group/version/kind for AssertJob resources. It is derived from this package's
// Group and Version with the kind "AssertJob".
var AssertJobGVK = schema.FromAPIVersionAndKind(Group+"/"+Version, "AssertJob")

// AssertJob is a struct that represents an assert job. This is a job that runs level one evaluations.
// It is serialized to JSON/YAML with "metadata" and "spec" keys.
type AssertJob struct {
// Metadata is the metadata of the job.
Metadata Metadata `json:"metadata" yaml:"metadata"`
// Spec is the configuration of the job; see AssertJobSpec for the individual settings.
Spec AssertJobSpec `json:"spec" yaml:"spec"`
}

// AssertJobSpec is the spec of an AssertJob. It names the sources of evaluation data, the agent used to
// generate completions, and where the results are stored and reported.
type AssertJobSpec struct {
// Sources is a list of sources to get the data from
Sources []EvalSource `json:"sources" yaml:"sources"`

// AgentAddress is the address of the agent to use to generate completions
AgentAddress string `json:"agentAddress" yaml:"agentAddress"`

// DBDir is the directory for the pebble database that will store the results
DBDir string `json:"dbDir" yaml:"dbDir"`

// SheetID is the ID of the Google Sheet to update with the results.
SheetID string `json:"sheetID" yaml:"sheetID"`

// SheetName is the name of the sheet to update.
SheetName string `json:"sheetName" yaml:"sheetName"`
}

// EvalSource describes a single source of evaluation data. MarkdownSource is the only kind of source
// declared on this struct.
type EvalSource struct {
// MarkdownSource, when non-nil, selects markdown files as the data source. It is omitted from the
// serialized form when unset.
MarkdownSource *MarkdownSource `json:"markdownSource,omitempty" yaml:"markdownSource,omitempty"`
}

// MarkdownSource is a source of evaluation data backed by markdown files.
type MarkdownSource struct {
// Path to the markdown files to use as evaluation data.
Path string `json:"path" yaml:"path"`
}
7 changes: 2 additions & 5 deletions app/cmd/apply.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,8 @@ func NewApplyCmd() *cobra.Command {
return err
}

// DBs can only be opened in a single process.
if err := app.OpenDBs(); err != nil {
return err
}

// Since DBs can only be opened in a single process, each resource should decide whether it needs
// to open the DBs.
if err := app.SetupRegistry(); err != nil {
return err
}
Expand Down
12 changes: 10 additions & 2 deletions app/pkg/application/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -213,11 +213,19 @@ func (a *App) SetupRegistry() error {
}
a.Registry = &controllers.Registry{}

eval, err := eval.NewEvaluator(*a.Config)
evaluator, err := eval.NewEvaluator(*a.Config)
if err != nil {
return err
}
if err := a.Registry.Register(api.ExperimentGVK, eval); err != nil {
if err := a.Registry.Register(api.ExperimentGVK, evaluator); err != nil {
return err
}

assertor, err := eval.NewAssertRunner(*a.Config)
if err != nil {
return err
}
if err := a.Registry.Register(api.AssertJobGVK, assertor); err != nil {
return err
}
return nil
Expand Down
20 changes: 20 additions & 0 deletions app/pkg/eval/assert.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
package eval

import (
"context"

"github.com/jlewi/foyle/protos/go/foyle/v1alpha1"
)

// Assertion is an interface for evaluating AI generations.
type Assertion interface {
// Assert evaluates answer against doc and returns the outcome as a v1alpha1.Assertion, or an error if
// the assertion could not be evaluated.
// NOTE(review): doc is presumably the prompt document and answer the generated blocks; the role of
// examples is not visible here — confirm against the caller.
Assert(ctx context.Context, doc *v1alpha1.Doc, examples []*v1alpha1.Example, answer []*v1alpha1.Block) (*v1alpha1.Assertion, error)
// Name returns the name of the assertion.
Name() string
}

// AssertResult is the string-valued outcome of an assertion.
type AssertResult string

// The set of AssertResult values.
const (
	// AssertPassed is the "passed" outcome.
	AssertPassed AssertResult = "passed"
	// AssertFailed is the "failed" outcome.
	AssertFailed AssertResult = "failed"
	// AssertSkipped is the "skipped" outcome.
	AssertSkipped AssertResult = "skipped"
)
52 changes: 52 additions & 0 deletions app/pkg/eval/assertions.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
package eval

import (
"context"

"github.com/jlewi/foyle/protos/go/foyle/v1alpha1"
)

// CodeAfterMarkdownName is the name under which the AssertCodeAfterMarkdown assertion reports its results.
const CodeAfterMarkdownName = "AssertCodeAfterMarkdown"

// AssertCodeAfterMarkdown is an assertion which verifies that, whenever the prompt's final cell is a
// markdown cell, the response begins with a code cell. The type has no fields.
type AssertCodeAfterMarkdown struct{}

// Assert checks that when doc ends in a markup (markdown) block, answer starts with a code block.
//
// The returned assertion always carries a.Name() and one of three results:
//   - SKIPPED when doc is nil, has no blocks, or its last block is not markup;
//   - FAILED, with Detail set, when answer is empty or its first block is not a code block;
//   - PASSED otherwise.
//
// ctx and examples are not used by this assertion. The returned error is always nil.
func (a *AssertCodeAfterMarkdown) Assert(ctx context.Context, doc *v1alpha1.Doc, examples []*v1alpha1.Example, answer []*v1alpha1.Block) (*v1alpha1.Assertion, error) {
	assertion := &v1alpha1.Assertion{
		Name: a.Name(),
	}

	// Go through the generated getter rather than the field: it is safe on a nil receiver, so a nil doc
	// is treated like an empty one instead of causing a nil pointer dereference.
	blocks := doc.GetBlocks()
	if len(blocks) == 0 {
		assertion.Result = v1alpha1.AssertResult_SKIPPED
		return assertion, nil
	}

	// The assertion only applies when the prompt ends in a markup block.
	last := blocks[len(blocks)-1]
	if last.GetKind() != v1alpha1.BlockKind_MARKUP {
		assertion.Result = v1alpha1.AssertResult_SKIPPED
		return assertion, nil
	}

	if len(answer) == 0 {
		assertion.Result = v1alpha1.AssertResult_FAILED
		assertion.Detail = "Answer is empty"
		return assertion, nil
	}

	if answer[0].GetKind() != v1alpha1.BlockKind_CODE {
		assertion.Result = v1alpha1.AssertResult_FAILED
		assertion.Detail = "Answer doesn't start with a code cell"
		return assertion, nil
	}

	assertion.Result = v1alpha1.AssertResult_PASSED
	return assertion, nil
}

// Name returns the name of the assertion, which is the constant CodeAfterMarkdownName.
func (a *AssertCodeAfterMarkdown) Name() string {
return CodeAfterMarkdownName
}
84 changes: 84 additions & 0 deletions app/pkg/eval/assertions_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
package eval

import (
"context"
"testing"

"github.com/jlewi/foyle/protos/go/foyle/v1alpha1"
)

// testCase is one entry in the table of cases used by TestAssertCodeAfterMarkdown.
type testCase struct {
// name is the subtest name passed to t.Run.
name string
// doc, examples and answer are the arguments passed to Assert.
doc *v1alpha1.Doc
examples []*v1alpha1.Example
answer []*v1alpha1.Block
// expected is the assertion that the value returned by Assert is compared against.
expected *v1alpha1.Assertion
}

// TestAssertCodeAfterMarkdown runs AssertCodeAfterMarkdown.Assert over a table of prompt/answer pairs and
// checks the name and result of the returned assertion. Each case runs as its own subtest, so case names
// must be unique for failures to be attributable.
func TestAssertCodeAfterMarkdown(t *testing.T) {
	cases := []testCase{
		{
			// A doc with no blocks cannot end in markup, so the assertion does not apply.
			name:     "Empty",
			doc:      &v1alpha1.Doc{},
			examples: []*v1alpha1.Example{},
			answer:   []*v1alpha1.Block{},
			expected: &v1alpha1.Assertion{
				Name:   "AssertCodeAfterMarkdown",
				Result: v1alpha1.AssertResult_SKIPPED,
			},
		},
		{
			// The doc ends in code rather than markup, so the assertion does not apply.
			name: "SkippedLastBlockIsCode",
			doc: &v1alpha1.Doc{
				Blocks: []*v1alpha1.Block{
					{
						Kind: v1alpha1.BlockKind_CODE,
					},
				},
			},
			examples: []*v1alpha1.Example{},
			answer: []*v1alpha1.Block{
				{
					Kind: v1alpha1.BlockKind_MARKUP,
				},
			},
			expected: &v1alpha1.Assertion{
				Name:   "AssertCodeAfterMarkdown",
				Result: v1alpha1.AssertResult_SKIPPED,
			},
		},
		{
			name: "Passed",
			doc: &v1alpha1.Doc{
				Blocks: []*v1alpha1.Block{
					{
						Kind: v1alpha1.BlockKind_MARKUP,
					},
				},
			},
			examples: []*v1alpha1.Example{},
			answer: []*v1alpha1.Block{
				{
					Kind: v1alpha1.BlockKind_CODE,
				},
			},
			expected: &v1alpha1.Assertion{
				Name:   "AssertCodeAfterMarkdown",
				Result: v1alpha1.AssertResult_PASSED,
			},
		},
		{
			// The answer starts with markup instead of code.
			name: "Failed",
			doc: &v1alpha1.Doc{
				Blocks: []*v1alpha1.Block{
					{
						Kind: v1alpha1.BlockKind_MARKUP,
					},
				},
			},
			examples: []*v1alpha1.Example{},
			answer: []*v1alpha1.Block{
				{
					Kind: v1alpha1.BlockKind_MARKUP,
				},
			},
			expected: &v1alpha1.Assertion{
				Name:   "AssertCodeAfterMarkdown",
				Result: v1alpha1.AssertResult_FAILED,
			},
		},
		{
			// The doc ends in markup but the answer contains no blocks at all.
			name: "FailedEmptyAnswer",
			doc: &v1alpha1.Doc{
				Blocks: []*v1alpha1.Block{
					{
						Kind: v1alpha1.BlockKind_MARKUP,
					},
				},
			},
			examples: []*v1alpha1.Example{},
			answer:   []*v1alpha1.Block{},
			expected: &v1alpha1.Assertion{
				Name:   "AssertCodeAfterMarkdown",
				Result: v1alpha1.AssertResult_FAILED,
			},
		},
	}

	// The assertion has no fields, so a single instance can be shared by every subtest.
	a := &AssertCodeAfterMarkdown{}
	for _, c := range cases {
		t.Run(c.name, func(t *testing.T) {
			got, err := a.Assert(context.Background(), c.doc, c.examples, c.answer)
			if err != nil {
				t.Fatalf("Error: %v", err)
			}
			if got.Name != c.expected.Name {
				t.Errorf("Expected name %v but got %v", c.expected.Name, got.Name)
			}
			if got.Result != c.expected.Result {
				t.Fatalf("Expected %v but got %v", c.expected.Result, got.Result)
			}
		})
	}
}
Loading

0 comments on commit be666aa

Please sign in to comment.