-
Notifications
You must be signed in to change notification settings - Fork 10
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Initial stab at level 1 Evals. This PR includes * Initial technote describing how we will implement level 1 Evals * Implementation of the code to run level 1 evals * An initial assertion that verifies that the AI responds with a code cell if the current cell is a markdown cell * A developer doc describing how to run level 1 evals This PR introduces some new patterns for how we do evaluation * Spin up a separate instance of the Agent to test any changes * This also gives our Agent isolated logs so that we don't interfere with the production instance * Evaluation should communicate with the Agent via RPC * To support this we added a Connect Handler for the Generate method * Analyze/Display the results using RunMe * We will add RPC methods to return the results as JSON in the format suitable for display * We can then use vscode renderers like Data Table Renderers to render them
- Loading branch information
Showing
27 changed files
with
1,668 additions
and
108 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,4 +12,5 @@ dist/ | |
**/bin | ||
**/plugin | ||
**/cover.out | ||
**/*.wasm | ||
**/*.wasm | ||
experiments/runs/** |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
package api | ||
|
||
import "k8s.io/apimachinery/pkg/runtime/schema" | ||
|
||
var ( | ||
AssertJobGVK = schema.FromAPIVersionAndKind(Group+"/"+Version, "AssertJob") | ||
) | ||
|
||
// AssertJob is a struct that represents an assert job. This is a job that runs level one evaluations. | ||
type AssertJob struct { | ||
Metadata Metadata `json:"metadata" yaml:"metadata"` | ||
Spec AssertJobSpec `json:"spec" yaml:"spec"` | ||
} | ||
|
||
type AssertJobSpec struct { | ||
// Sources is a list of sources to get the data from | ||
Sources []EvalSource `json:"sources" yaml:"sources"` | ||
|
||
// AgentAddress is the address of the agent to use to generate completions | ||
AgentAddress string `json:"agentAddress" yaml:"agentAddress"` | ||
|
||
// DBDir is the directory for the pebble database that will store the results | ||
DBDir string `json:"dbDir" yaml:"dbDir"` | ||
|
||
// SheetID is the ID of the Google Sheet to update with the results. | ||
SheetID string `json:"sheetID" yaml:"sheetID"` | ||
|
||
// SheetName is the name of the sheet to update. | ||
SheetName string `json:"sheetName" yaml:"sheetName"` | ||
} | ||
|
||
type EvalSource struct { | ||
MarkdownSource *MarkdownSource `json:"markdownSource,omitempty" yaml:"markdownSource,omitempty"` | ||
} | ||
|
||
// MarkdownSource is an evaluation data source backed by markdown files.
type MarkdownSource struct {
	// Path locates the markdown files used as evaluation data.
	Path string `json:"path" yaml:"path"`
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
package eval | ||
|
||
import ( | ||
"context" | ||
|
||
"github.com/jlewi/foyle/protos/go/foyle/v1alpha1" | ||
) | ||
|
||
// Assertion is an interface for evaluating AI generations. | ||
type Assertion interface { | ||
Assert(ctx context.Context, doc *v1alpha1.Doc, examples []*v1alpha1.Example, answer []*v1alpha1.Block) (*v1alpha1.Assertion, error) | ||
// Name returns the name of the assertion. | ||
Name() string | ||
} | ||
|
||
// AssertResult is the string form of an assertion outcome.
type AssertResult string

// Possible AssertResult values.
const (
	// AssertPassed marks an assertion that passed.
	AssertPassed AssertResult = "passed"
	// AssertFailed marks an assertion that failed.
	AssertFailed AssertResult = "failed"
	// AssertSkipped marks an assertion that was skipped.
	AssertSkipped AssertResult = "skipped"
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
package eval | ||
|
||
import ( | ||
"context" | ||
|
||
"github.com/jlewi/foyle/protos/go/foyle/v1alpha1" | ||
) | ||
|
||
// CodeAfterMarkdownName is the name reported by the AssertCodeAfterMarkdown assertion.
const CodeAfterMarkdownName = "AssertCodeAfterMarkdown"
|
||
// AssertCodeAfterMarkdown asserts that a response begins with a code cell
// whenever the prompt ends in a markdown cell. It carries no state.
type AssertCodeAfterMarkdown struct{}
|
||
func (a *AssertCodeAfterMarkdown) Assert(ctx context.Context, doc *v1alpha1.Doc, examples []*v1alpha1.Example, answer []*v1alpha1.Block) (*v1alpha1.Assertion, error) { | ||
assertion := &v1alpha1.Assertion{ | ||
Name: a.Name(), | ||
} | ||
|
||
if len(doc.Blocks) == 0 { | ||
assertion.Result = v1alpha1.AssertResult_SKIPPED | ||
return assertion, nil | ||
} | ||
|
||
last := doc.Blocks[len(doc.Blocks)-1] | ||
if last.GetKind() != v1alpha1.BlockKind_MARKUP { | ||
assertion.Result = v1alpha1.AssertResult_SKIPPED | ||
return assertion, nil | ||
} | ||
|
||
if len(answer) == 0 { | ||
assertion.Result = v1alpha1.AssertResult_FAILED | ||
assertion.Detail = "Answer is empty" | ||
return assertion, nil | ||
} | ||
|
||
if answer[0].GetKind() != v1alpha1.BlockKind_CODE { | ||
assertion.Result = v1alpha1.AssertResult_FAILED | ||
assertion.Detail = "Answer doesn't start with a code cell" | ||
return assertion, nil | ||
} | ||
|
||
assertion.Result = v1alpha1.AssertResult_PASSED | ||
return assertion, nil | ||
} | ||
|
||
func (a *AssertCodeAfterMarkdown) Name() string { | ||
return CodeAfterMarkdownName | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
package eval | ||
|
||
import ( | ||
"context" | ||
"testing" | ||
|
||
"github.com/jlewi/foyle/protos/go/foyle/v1alpha1" | ||
) | ||
|
||
type testCase struct { | ||
name string | ||
doc *v1alpha1.Doc | ||
examples []*v1alpha1.Example | ||
answer []*v1alpha1.Block | ||
expected *v1alpha1.Assertion | ||
} | ||
|
||
func TestAssertCodeAfterMarkdown(t *testing.T) { | ||
cases := []testCase{ | ||
{ | ||
name: "Empty", | ||
doc: &v1alpha1.Doc{}, | ||
examples: []*v1alpha1.Example{}, | ||
answer: []*v1alpha1.Block{}, | ||
expected: &v1alpha1.Assertion{ | ||
Name: "AssertCodeAfterMarkdown", | ||
Result: v1alpha1.AssertResult_SKIPPED, | ||
}, | ||
}, | ||
{ | ||
name: "Passed", | ||
doc: &v1alpha1.Doc{ | ||
Blocks: []*v1alpha1.Block{ | ||
{ | ||
Kind: v1alpha1.BlockKind_MARKUP, | ||
}, | ||
}, | ||
}, | ||
examples: []*v1alpha1.Example{}, | ||
answer: []*v1alpha1.Block{ | ||
{ | ||
Kind: v1alpha1.BlockKind_CODE, | ||
}, | ||
}, | ||
expected: &v1alpha1.Assertion{ | ||
Name: "AssertCodeAfterMarkdown", | ||
Result: v1alpha1.AssertResult_PASSED, | ||
}, | ||
}, | ||
{ | ||
name: "Passed", | ||
doc: &v1alpha1.Doc{ | ||
Blocks: []*v1alpha1.Block{ | ||
{ | ||
Kind: v1alpha1.BlockKind_MARKUP, | ||
}, | ||
}, | ||
}, | ||
examples: []*v1alpha1.Example{}, | ||
answer: []*v1alpha1.Block{ | ||
{ | ||
Kind: v1alpha1.BlockKind_MARKUP, | ||
}, | ||
}, | ||
expected: &v1alpha1.Assertion{ | ||
Name: "AssertCodeAfterMarkdown", | ||
Result: v1alpha1.AssertResult_FAILED, | ||
}, | ||
}, | ||
} | ||
|
||
for _, c := range cases { | ||
a := &AssertCodeAfterMarkdown{} | ||
t.Run(c.name, func(t *testing.T) { | ||
got, err := a.Assert(context.Background(), c.doc, c.examples, c.answer) | ||
if err != nil { | ||
t.Fatalf("Error: %v", err) | ||
} | ||
if got.Result != c.expected.Result { | ||
t.Fatalf("Expected %v but got %v", c.expected.Result, got.Result) | ||
} | ||
}) | ||
} | ||
} |
Oops, something went wrong.