From be666aa47b4ca10934bcfbcf66ba7e04f4007ff4 Mon Sep 17 00:00:00 2001
From: Jeremy Lewi <jeremy@lewi.us>
Date: Mon, 5 Aug 2024 14:02:56 -0700
Subject: [PATCH] Level 1 Evals. (#181)

Initial stab at level 1 Evals. This PR includes

* Initial technote describing how we will implement level 1 Evals
* Implementation of the code to run level 1 evals
* An initial assertion that verifies that the AI responds with a code
cell if the current cell is a markdown cell
* A developer doc describing how to run level 1 evals

This PR introduces some new patterns for how we do evaluation

* Spin up a separate instance of the Agent to test any changes
* This also gives our Agent isolated logs so that we don't interfere
with the production instance
* Evaluation should communicate with the Agent via RPC
  * To support this we added a Connect Handler for the Generate method
* Analyze/Display the results using RunMe
* We will add RPC methods to return the results as JSON in the format
suitable for display
* We can then use vscode renderers like Data Table Renderers to render
them
---
 .gitignore                                    |   3 +-
 app/api/assert.go                             |  39 ++
 app/cmd/apply.go                              |   7 +-
 app/pkg/application/app.go                    |  12 +-
 app/pkg/eval/assert.go                        |  20 +
 app/pkg/eval/assertions.go                    |  52 ++
 app/pkg/eval/assertions_test.go               |  84 +++
 app/pkg/eval/assertor.go                      | 180 +++++++
 app/pkg/eval/evaluator.go                     |  26 +-
 app/pkg/eval/reconcilers.go                   |  83 +++
 app/pkg/eval/service.go                       |  88 +++
 app/pkg/eval/service_test.go                  |  72 +++
 app/pkg/server/server.go                      |  88 +--
 developer_guides/create_pr.md                 |  45 ++
 developer_guides/eval.md                      |  79 +++
 .../en/docs/tech-notes/tn001_logging.md       |   3 +-
 .../en/docs/tech-notes/tn010_eval_lvl1.md     | 157 ++++++
 experiments/assertions.yaml                   |   9 +
 protos/foyle/v1alpha1/agent.proto             |   5 +-
 protos/foyle/v1alpha1/eval.proto              |  44 ++
 protos/foyle/v1alpha1/trainer.proto           |   1 +
 protos/go/foyle/logs/traces.zap.go            |   2 +-
 protos/go/foyle/v1alpha1/eval.pb.go           | 505 ++++++++++++++++--
 protos/go/foyle/v1alpha1/eval.zap.go          |  99 ++++
 protos/go/foyle/v1alpha1/eval_grpc.pb.go      |  36 ++
 protos/go/foyle/v1alpha1/trainer.pb.go        |   1 +
 .../v1alpha1/v1alpha1connect/eval.connect.go  |  36 +-
 27 files changed, 1668 insertions(+), 108 deletions(-)
 create mode 100644 app/api/assert.go
 create mode 100644 app/pkg/eval/assert.go
 create mode 100644 app/pkg/eval/assertions.go
 create mode 100644 app/pkg/eval/assertions_test.go
 create mode 100644 app/pkg/eval/assertor.go
 create mode 100644 app/pkg/eval/reconcilers.go
 create mode 100644 app/pkg/eval/service_test.go
 create mode 100644 developer_guides/create_pr.md
 create mode 100644 docs/content/en/docs/tech-notes/tn010_eval_lvl1.md
 create mode 100644 experiments/assertions.yaml

diff --git a/.gitignore b/.gitignore
index 59c08513..ed928182 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,4 +12,5 @@ dist/
 **/bin
 **/plugin
 **/cover.out
-**/*.wasm
\ No newline at end of file
+**/*.wasm
+experiments/runs/**
\ No newline at end of file
diff --git a/app/api/assert.go b/app/api/assert.go
new file mode 100644
index 00000000..28c7006f
--- /dev/null
+++ b/app/api/assert.go
@@ -0,0 +1,39 @@
+package api
+
+import "k8s.io/apimachinery/pkg/runtime/schema"
+
+var (
+	AssertJobGVK = schema.FromAPIVersionAndKind(Group+"/"+Version, "AssertJob")
+)
+
+// AssertJob is a struct that represents an assert job. This is a job that runs level one evaluations.
+type AssertJob struct {
+	Metadata Metadata      `json:"metadata" yaml:"metadata"`
+	Spec     AssertJobSpec `json:"spec"    yaml:"spec"`
+}
+
+type AssertJobSpec struct {
+	// Sources is a list of sources to get the data from
+	Sources []EvalSource `json:"sources" yaml:"sources"`
+
+	// AgentAddress is the address of the agent to use to generate completions
+	AgentAddress string `json:"agentAddress" yaml:"agentAddress"`
+
+	// DBDir is the directory for the pebble database that will store the results
+	DBDir string `json:"dbDir" yaml:"dbDir"`
+
+	// SheetID is the ID of the Google Sheet to update with the results.
+	SheetID string `json:"sheetID" yaml:"sheetID"`
+
+	// SheetName is the name of the sheet to update.
+	SheetName string `json:"sheetName" yaml:"sheetName"`
+}
+
+type EvalSource struct {
+	MarkdownSource *MarkdownSource `json:"markdownSource,omitempty" yaml:"markdownSource,omitempty"`
+}
+
+type MarkdownSource struct {
+	// Path to the markdown files to use as evaluation data.
+	Path string `json:"path" yaml:"path"`
+}
diff --git a/app/cmd/apply.go b/app/cmd/apply.go
index 626465da..2cbcc651 100644
--- a/app/cmd/apply.go
+++ b/app/cmd/apply.go
@@ -40,11 +40,8 @@ func NewApplyCmd() *cobra.Command {
 					return err
 				}
 
-				// DBs can only be opened in a single process.
-				if err := app.OpenDBs(); err != nil {
-					return err
-				}
-
+				// Since DBs can only be opened in a single process, each resource should decide whether it needs
+				// to open the DBs.
 				if err := app.SetupRegistry(); err != nil {
 					return err
 				}
diff --git a/app/pkg/application/app.go b/app/pkg/application/app.go
index 05dc86d2..ab71c237 100644
--- a/app/pkg/application/app.go
+++ b/app/pkg/application/app.go
@@ -213,11 +213,19 @@ func (a *App) SetupRegistry() error {
 	}
 	a.Registry = &controllers.Registry{}
 
-	eval, err := eval.NewEvaluator(*a.Config)
+	evaluator, err := eval.NewEvaluator(*a.Config)
 	if err != nil {
 		return err
 	}
-	if err := a.Registry.Register(api.ExperimentGVK, eval); err != nil {
+	if err := a.Registry.Register(api.ExperimentGVK, evaluator); err != nil {
+		return err
+	}
+
+	assertor, err := eval.NewAssertRunner(*a.Config)
+	if err != nil {
+		return err
+	}
+	if err := a.Registry.Register(api.AssertJobGVK, assertor); err != nil {
 		return err
 	}
 	return nil
diff --git a/app/pkg/eval/assert.go b/app/pkg/eval/assert.go
new file mode 100644
index 00000000..a4000cc7
--- /dev/null
+++ b/app/pkg/eval/assert.go
@@ -0,0 +1,20 @@
+package eval
+
+import (
+	"context"
+
+	"github.com/jlewi/foyle/protos/go/foyle/v1alpha1"
+)
+
+// Assertion is an interface for evaluating AI generations.
+type Assertion interface {
+	Assert(ctx context.Context, doc *v1alpha1.Doc, examples []*v1alpha1.Example, answer []*v1alpha1.Block) (*v1alpha1.Assertion, error)
+	// Name returns the name of the assertion.
+	Name() string
+}
+
+type AssertResult string
+
+const AssertPassed AssertResult = "passed"
+const AssertFailed AssertResult = "failed"
+const AssertSkipped AssertResult = "skipped"
diff --git a/app/pkg/eval/assertions.go b/app/pkg/eval/assertions.go
new file mode 100644
index 00000000..972fca81
--- /dev/null
+++ b/app/pkg/eval/assertions.go
@@ -0,0 +1,52 @@
+package eval
+
+import (
+	"context"
+
+	"github.com/jlewi/foyle/protos/go/foyle/v1alpha1"
+)
+
+const (
+	CodeAfterMarkdownName = "AssertCodeAfterMarkdown"
+)
+
+// AssertCodeAfterMarkdown is an assertion that checks that if the prompt ends in a markdown cell then the response
+// starts with a code cell.
+type AssertCodeAfterMarkdown struct {
+}
+
+// Assert reports PASSED when doc ends in a markup block and answer starts with a code block, FAILED when doc
+// ends in a markup block but answer is empty or starts with another kind of block, and SKIPPED otherwise.
+// A nil doc is handled like a doc without blocks. examples is not used and the returned error is always nil.
+func (a *AssertCodeAfterMarkdown) Assert(ctx context.Context, doc *v1alpha1.Doc, examples []*v1alpha1.Example, answer []*v1alpha1.Block) (*v1alpha1.Assertion, error) {
+	assertion := &v1alpha1.Assertion{
+		Name: a.Name(),
+	}
+
+	// Use the nil-safe getter so a nil doc is skipped instead of causing a nil pointer dereference.
+	blocks := doc.GetBlocks()
+	if len(blocks) == 0 {
+		assertion.Result = v1alpha1.AssertResult_SKIPPED
+		return assertion, nil
+	}
+	if blocks[len(blocks)-1].GetKind() != v1alpha1.BlockKind_MARKUP {
+		assertion.Result = v1alpha1.AssertResult_SKIPPED
+		return assertion, nil
+	}
+	if len(answer) == 0 {
+		assertion.Result = v1alpha1.AssertResult_FAILED
+		assertion.Detail = "Answer is empty"
+		return assertion, nil
+	}
+	if answer[0].GetKind() != v1alpha1.BlockKind_CODE {
+		assertion.Result = v1alpha1.AssertResult_FAILED
+		assertion.Detail = "Answer doesn't start with a code cell"
+		return assertion, nil
+	}
+	assertion.Result = v1alpha1.AssertResult_PASSED
+	return assertion, nil
+}
+
+func (a *AssertCodeAfterMarkdown) Name() string {
+	return CodeAfterMarkdownName
+}
diff --git a/app/pkg/eval/assertions_test.go b/app/pkg/eval/assertions_test.go
new file mode 100644
index 00000000..daef7484
--- /dev/null
+++ b/app/pkg/eval/assertions_test.go
@@ -0,0 +1,84 @@
+package eval
+
+import (
+	"context"
+	"testing"
+
+	"github.com/jlewi/foyle/protos/go/foyle/v1alpha1"
+)
+
+type testCase struct {
+	name     string
+	doc      *v1alpha1.Doc
+	examples []*v1alpha1.Example
+	answer   []*v1alpha1.Block
+	expected *v1alpha1.Assertion
+}
+
+// TestAssertCodeAfterMarkdown checks the Result reported for an empty doc, a passing answer and a failing one.
+func TestAssertCodeAfterMarkdown(t *testing.T) {
+	cases := []testCase{
+		{
+			name:     "Empty",
+			doc:      &v1alpha1.Doc{},
+			examples: []*v1alpha1.Example{},
+			answer:   []*v1alpha1.Block{},
+			expected: &v1alpha1.Assertion{
+				Name:   "AssertCodeAfterMarkdown",
+				Result: v1alpha1.AssertResult_SKIPPED,
+			},
+		},
+		{
+			name: "Passed",
+			doc: &v1alpha1.Doc{
+				Blocks: []*v1alpha1.Block{
+					{
+						Kind: v1alpha1.BlockKind_MARKUP,
+					},
+				},
+			},
+			examples: []*v1alpha1.Example{},
+			answer: []*v1alpha1.Block{
+				{
+					Kind: v1alpha1.BlockKind_CODE,
+				},
+			},
+			expected: &v1alpha1.Assertion{
+				Name:   "AssertCodeAfterMarkdown",
+				Result: v1alpha1.AssertResult_PASSED,
+			},
+		},
+		{
+			name: "Failed",
+			doc: &v1alpha1.Doc{
+				Blocks: []*v1alpha1.Block{
+					{
+						Kind: v1alpha1.BlockKind_MARKUP,
+					},
+				},
+			},
+			examples: []*v1alpha1.Example{},
+			answer: []*v1alpha1.Block{
+				{
+					Kind: v1alpha1.BlockKind_MARKUP,
+				},
+			},
+			expected: &v1alpha1.Assertion{
+				Name:   "AssertCodeAfterMarkdown",
+				Result: v1alpha1.AssertResult_FAILED,
+			},
+		},
+	}
+	for _, c := range cases {
+		a := &AssertCodeAfterMarkdown{}
+		t.Run(c.name, func(t *testing.T) {
+			got, err := a.Assert(context.Background(), c.doc, c.examples, c.answer)
+			if err != nil {
+				t.Fatalf("Error: %v", err)
+			}
+			if got.Result != c.expected.Result {
+				t.Fatalf("Expected %v but got %v", c.expected.Result, got.Result)
+			}
+		})
+	}
+}
diff --git a/app/pkg/eval/assertor.go b/app/pkg/eval/assertor.go
new file mode 100644
index 00000000..0a6b0e8b
--- /dev/null
+++ b/app/pkg/eval/assertor.go
@@ -0,0 +1,180 @@
+package eval
+
+import (
+	"context"
+	"crypto/tls"
+	"net"
+	"net/http"
+
+	"github.com/cockroachdb/pebble"
+	"github.com/jlewi/foyle/app/api"
+	"github.com/jlewi/foyle/app/pkg/config"
+	"github.com/jlewi/foyle/app/pkg/logs"
+	"github.com/jlewi/foyle/protos/go/foyle/v1alpha1"
+	"github.com/jlewi/foyle/protos/go/foyle/v1alpha1/v1alpha1connect"
+	"github.com/jlewi/monogo/helpers"
+	"github.com/pkg/errors"
+	"golang.org/x/net/http2"
+	"google.golang.org/protobuf/proto"
+	"sigs.k8s.io/kustomize/kyaml/yaml"
+)
+
+// AssertRunner runs assertions in batch mode
+type AssertRunner struct {
+	config config.Config
+
+	assertions []Assertion
+}
+
+// NewAssertRunner creates an AssertRunner for config with the built-in set of assertions registered.
+func NewAssertRunner(config config.Config) (*AssertRunner, error) {
+	// Load the assertions
+	assertions := make([]Assertion, 0, 10)
+	assertions = append(assertions, &AssertCodeAfterMarkdown{})
+
+	return &AssertRunner{config: config, assertions: assertions}, nil
+}
+
+// newHTTPClient returns an HTTP client that speaks HTTP/2 over plain TCP connections (no TLS).
+func newHTTPClient() *http.Client {
+	// N.B. We need to use HTTP2 if we want to support bidirectional streaming
+	return &http.Client{
+		Transport: &http2.Transport{
+			AllowHTTP: true,
+			DialTLSContext: func(ctx context.Context, network, addr string, _ *tls.Config) (net.Conn, error) {
+				// Plain TCP (the TLS config is ignored); unlike net.Dial the dialer honours cancellation of ctx.
+				return (&net.Dialer{}).DialContext(ctx, network, addr)
+			},
+		},
+	}
+}
+// newGenerateClient returns a connect client for the GenerateService.
+// Requests are sent to baseURL using the HTTP/2 client built by
+// newHTTPClient.
+func newGenerateClient(baseURL string) v1alpha1connect.GenerateServiceClient {
+	httpClient := newHTTPClient()
+
+	return v1alpha1connect.NewGenerateServiceClient(httpClient, baseURL)
+}
+
+// ReconcileNode decodes node into an api.AssertJob and reconciles the decoded job.
+func (r *AssertRunner) ReconcileNode(ctx context.Context, node *yaml.RNode) error {
+	var job api.AssertJob
+	if err := node.YNode().Decode(&job); err != nil {
+		return errors.Wrapf(err, "Failed to decode AssertJob")
+	}
+	return r.Reconcile(ctx, job)
+}
+
+// Reconcile runs the AssertJob: it loads the markdown sources that are not yet in the job's pebble database,
+// generates predictions for results that lack them via the agent at job.Spec.AgentAddress, and then runs
+// the runner's assertions on the results. It returns the first error encountered.
+func (r *AssertRunner) Reconcile(ctx context.Context, job api.AssertJob) error {
+	log := logs.FromContext(ctx).WithValues("job", job.Metadata.Name)
+
+	// Validate the whole spec first so an invalid job is rejected before any database is opened or modified.
+	if job.Spec.AgentAddress == "" {
+		return errors.New("AgentAddress is required")
+	}
+	if len(job.Spec.Sources) == 0 {
+		return errors.New("Sources must be specified")
+	}
+	for _, source := range job.Spec.Sources {
+		if source.MarkdownSource == nil {
+			return errors.New("Only MarkdownSource is supported")
+		}
+	}
+
+	log.Info("Opening database", "database", job.Spec.DBDir)
+	db, err := pebble.Open(job.Spec.DBDir, &pebble.Options{})
+	if err != nil {
+		return errors.Wrapf(err, "Failed to open database %s", job.Spec.DBDir)
+	}
+	defer helpers.DeferIgnoreError(db.Close)
+
+	client := newGenerateClient(job.Spec.AgentAddress)
+
+	// Process all the sources
+	for _, source := range job.Spec.Sources {
+		files, err := listEvalFiles(ctx, source.MarkdownSource.Path)
+		if err != nil {
+			return err
+		}
+		log.Info("Found eval files", "numFiles", len(files))
+
+		// Now iterate over the DB and figure out which files haven't been loaded into the db.
+		unloadedFiles, err := findUnloadedFiles(ctx, db, files)
+		if err != nil {
+			return err
+		}
+		log.Info("Found unloaded files", "numFiles", len(unloadedFiles))
+
+		// We need to load the evaluation data into the database.
+		if err := loadMarkdownFiles(ctx, db, unloadedFiles); err != nil {
+			return err
+		}
+	}
+
+	// Now generate predictions for any results that are missing them.
+	if err := reconcilePredictions(ctx, db, client); err != nil {
+		return err
+	}
+	return reconcileAssertions(ctx, r.assertions, db)
+}
+
+// reconcileAssertions runs every assertion in assertions that has not yet been recorded on each EvalResult
+// stored in db and writes the updated result back. An assertion that returns an error is logged and
+// left unrecorded, so no nil entry is stored and it is attempted again on the next reconcile.
+func reconcileAssertions(ctx context.Context, assertions []Assertion, db *pebble.DB) error {
+	olog := logs.FromContext(ctx)
+	iter, err := db.NewIterWithContext(ctx, nil)
+	if err != nil {
+		return err
+	}
+	defer iter.Close()
+
+	for iter.First(); iter.Valid(); iter.Next() {
+		key := iter.Key()
+		if key == nil {
+			break
+		}
+
+		log := olog.WithValues("id", string(key))
+		value, err := iter.ValueAndErr()
+		if err != nil {
+			return errors.Wrapf(err, "Failed to read value for key %s", string(key))
+		}
+
+		result := &v1alpha1.EvalResult{}
+		if err := proto.Unmarshal(value, result); err != nil {
+			return errors.Wrapf(err, "Failed to unmarshal value for key %s", string(key))
+		}
+
+		// Names of the assertions that already have a recorded outcome for this result.
+		actual := make(map[string]bool)
+		for _, a := range result.GetAssertions() {
+			actual[a.GetName()] = true
+		}
+
+		for _, a := range assertions {
+			if actual[a.Name()] {
+				continue
+			}
+
+			// Run the assertion; nil-safe getters avoid a nil dereference here when the result has no example.
+			newA, err := a.Assert(ctx, result.GetExample().GetQuery(), nil, result.GetActual())
+			if err != nil {
+				log.Error(err, "Failed to run assertion", "name", a.Name())
+				continue
+			}
+
+			result.Assertions = append(result.Assertions, newA)
+		}
+
+		// Persist the result together with any newly recorded assertions.
+		if err := updateResult(ctx, string(key), result, db); err != nil {
+			return err
+		}
+	}
+	return nil
+}
diff --git a/app/pkg/eval/evaluator.go b/app/pkg/eval/evaluator.go
index 35ef452e..68bc6bcf 100644
--- a/app/pkg/eval/evaluator.go
+++ b/app/pkg/eval/evaluator.go
@@ -44,6 +44,9 @@ type Evaluator struct {
 // The evaluator assumes that the analyzer is already running in the background and processing logs.
 // TODO(https://github.com/jlewi/foyle/issues/140): The evaluator may need to be updated now that we continuously
 // process logs in the background.
+//
+// TODO(jeremy): We should probably redo the Evaluator so that instead of setting up the Agent we just
+// communicate with the Agent via RPC.
 func NewEvaluator(cfg config.Config) (*Evaluator, error) {
 	parser, err := executor.NewBashishParser()
 
@@ -84,7 +87,7 @@ func (e *Evaluator) Reconcile(ctx context.Context, experiment api.Experiment) er
 	}
 
 	// List all the files
-	files, err := e.listEvalFiles(ctx, experiment.Spec.EvalDir)
+	files, err := listEvalFiles(ctx, experiment.Spec.EvalDir)
 	if err != nil {
 		return err
 	}
@@ -93,14 +96,14 @@ func (e *Evaluator) Reconcile(ctx context.Context, experiment api.Experiment) er
 
 	// Now iterate over the DB and figure out which files haven't  been loaded into the db.
 
-	unloadedFiles, err := e.findUnloadedFiles(ctx, db, files)
+	unloadedFiles, err := findUnloadedFiles(ctx, db, files)
 	if err != nil {
 		return err
 	}
 	log.Info("Found unloaded files", "numFiles", len(unloadedFiles))
 
 	// We need to load the evaluation data into the database.
-	if err := e.loadMarkdownFiles(ctx, db, unloadedFiles); err != nil {
+	if err := loadMarkdownFiles(ctx, db, unloadedFiles); err != nil {
 		return err
 	}
 
@@ -109,6 +112,8 @@ func (e *Evaluator) Reconcile(ctx context.Context, experiment api.Experiment) er
 		return err
 	}
 
+	// TODO(jeremy): We should get the traces via API because only one process can access the pebble DB at a time.
+	// And the agent needs access to the pebble DB traces.
 	tracesDB, err := pebble.Open(e.config.GetTracesDBDir(), &pebble.Options{})
 	if err != nil {
 		return err
@@ -166,6 +171,7 @@ func (e *Evaluator) setupAgent(ctx context.Context, agentConfig api.AgentConfig)
 	return agent, nil
 }
 
+// TODO(jeremy): We should use reconcilePredictions which uses the client to generate the predictions.
 func (e *Evaluator) reconcilePredictions(ctx context.Context, db *pebble.DB, agent *agent.Agent) error {
 	olog := logs.FromContext(ctx)
 	iter, err := db.NewIterWithContext(ctx, nil)
@@ -218,7 +224,7 @@ func (e *Evaluator) reconcilePredictions(ctx context.Context, db *pebble.DB, age
 			result.GenTraceId = resp.GetTraceId()
 
 			log.Info("Writing result to DB")
-			if err := e.updateResult(ctx, string(key), result, db); err != nil {
+			if err := updateResult(ctx, string(key), result, db); err != nil {
 				return errors.Wrapf(err, "Failed to write result to DB")
 			}
 		}
@@ -226,7 +232,7 @@ func (e *Evaluator) reconcilePredictions(ctx context.Context, db *pebble.DB, age
 	return nil
 }
 
-func (e *Evaluator) updateResult(ctx context.Context, id string, result *v1alpha1.EvalResult, db *pebble.DB) error {
+func updateResult(ctx context.Context, id string, result *v1alpha1.EvalResult, db *pebble.DB) error {
 	b, err := proto.Marshal(result)
 	if err != nil {
 		return errors.Wrapf(err, "Failed to marshal result")
@@ -269,7 +275,7 @@ func (e *Evaluator) reconcileDistance(ctx context.Context, db *pebble.DB) error
 
 		updateEvalResultDistance(ctx, e.parser, result)
 		log.Info("Updating distance", "distance", result.Distance)
-		if err := e.updateResult(ctx, string(key), result, db); err != nil {
+		if err := updateResult(ctx, string(key), result, db); err != nil {
 			log.Error(err, "Failed to update result")
 		}
 	}
@@ -340,7 +346,7 @@ func (e *Evaluator) reconcileBestRAGResult(ctx context.Context, db *pebble.DB, t
 		if result.BestRagResult == nil {
 			continue
 		}
-		if err := e.updateResult(ctx, string(key), result, db); err != nil {
+		if err := updateResult(ctx, string(key), result, db); err != nil {
 			log.Error(err, "Failed to update result")
 		}
 	}
@@ -527,7 +533,7 @@ func (e *Evaluator) updateGoogleSheet(ctx context.Context, experiment api.Experi
 	return nil
 }
 
-func (e *Evaluator) findUnloadedFiles(ctx context.Context, db *pebble.DB, files []string) ([]string, error) {
+func findUnloadedFiles(ctx context.Context, db *pebble.DB, files []string) ([]string, error) {
 	unprocessed := map[string]bool{}
 
 	iter, err := db.NewIterWithContext(ctx, nil)
@@ -571,7 +577,7 @@ func (e *Evaluator) findUnloadedFiles(ctx context.Context, db *pebble.DB, files
 }
 
 // listEvalFiles returns a list of the all the markdown files in the eval directory.
-func (e *Evaluator) listEvalFiles(ctx context.Context, evalDir string) ([]string, error) {
+func listEvalFiles(ctx context.Context, evalDir string) ([]string, error) {
 	examples := make([]string, 0, 100)
 	err := filepath.Walk(evalDir, func(path string, info os.FileInfo, err error) error {
 		if info.IsDir() {
@@ -591,7 +597,7 @@ func (e *Evaluator) listEvalFiles(ctx context.Context, evalDir string) ([]string
 
 // loadMarkdownFiles loads a bunch of markdown files representing evaluation data and converts them into example
 // protos.
-func (e *Evaluator) loadMarkdownFiles(ctx context.Context, db *pebble.DB, files []string) error {
+func loadMarkdownFiles(ctx context.Context, db *pebble.DB, files []string) error {
 	oLog := logs.FromContext(ctx)
 
 	allErrors := &helpers.ListOfErrors{}
diff --git a/app/pkg/eval/reconcilers.go b/app/pkg/eval/reconcilers.go
new file mode 100644
index 00000000..ba84cb5b
--- /dev/null
+++ b/app/pkg/eval/reconcilers.go
@@ -0,0 +1,83 @@
+package eval
+
+import (
+	"context"
+
+	"connectrpc.com/connect"
+	"github.com/cockroachdb/pebble"
+	"github.com/jlewi/foyle/app/pkg/logs"
+	"github.com/jlewi/foyle/protos/go/foyle/v1alpha1"
+	"github.com/jlewi/foyle/protos/go/foyle/v1alpha1/v1alpha1connect"
+	"github.com/pkg/errors"
+	"google.golang.org/protobuf/proto"
+)
+
+// reconcilePredictions calls the Generate RPC on client for every EvalResult in db that has no answer yet and
+// writes the updated result back to db. Database errors and Unavailable/Unimplemented responses abort the
+// run; any other generation error is stored on the result and the loop moves on to the next one.
+func reconcilePredictions(ctx context.Context, db *pebble.DB, client v1alpha1connect.GenerateServiceClient) error {
+	olog := logs.FromContext(ctx)
+	iter, err := db.NewIterWithContext(ctx, nil)
+	if err != nil {
+		return err
+	}
+	defer iter.Close()
+
+	for iter.First(); iter.Valid(); iter.Next() {
+		key := iter.Key()
+		if key == nil {
+			break
+		}
+
+		log := olog.WithValues("id", string(key))
+		value, err := iter.ValueAndErr()
+		if err != nil {
+			return errors.Wrapf(err, "Failed to read value for key %s", string(key))
+		}
+
+		result := &v1alpha1.EvalResult{}
+		if err := proto.Unmarshal(value, result); err != nil {
+			return errors.Wrapf(err, "Failed to unmarshal value for key %s", string(key))
+		}
+
+		if len(result.GetActual()) > 0 {
+			log.V(logs.Debug).Info("not generating a completion; already have answer", "path", result.ExampleFile)
+			// We have the answer so we don't need to generate it.
+			continue
+		}
+
+		// Generate the answer inside a closure so the span is ended as soon as the RPC returns.
+		resp, err := func() (*connect.Response[v1alpha1.GenerateResponse], error) {
+			newCtx, span := tracer().Start(ctx, "(*Evaluator).reconcilePredictions")
+			defer span.End()
+
+			req := connect.NewRequest(&v1alpha1.GenerateRequest{
+				Doc: result.GetExample().GetQuery(),
+			})
+			return client.Generate(newCtx, req)
+		}()
+		if err != nil {
+			// A permanent error aborts the run; CodeOf also sees connect errors that have been wrapped.
+			switch connect.CodeOf(err) {
+			case connect.CodeUnavailable, connect.CodeUnimplemented:
+				return errors.Wrap(err, "Unable to connect to the agent.")
+			}
+			// Persist the failure; without this write the error and status set below would be discarded.
+			result.Error = err.Error()
+			result.Status = v1alpha1.EvalResultStatus_ERROR
+			log.Error(err, "Failed to generate a completion")
+			if err := updateResult(ctx, string(key), result, db); err != nil {
+				return errors.Wrapf(err, "Failed to write result to DB")
+			}
+			continue
+		}
+
+		result.Actual = resp.Msg.GetBlocks()
+		result.GenTraceId = resp.Msg.GetTraceId()
+		log.Info("Writing result to DB")
+		if err := updateResult(ctx, string(key), result, db); err != nil {
+			return errors.Wrapf(err, "Failed to write result to DB")
+		}
+	}
+	return nil
+}
diff --git a/app/pkg/eval/service.go b/app/pkg/eval/service.go
index 464ff8e0..a5b9ec50 100644
--- a/app/pkg/eval/service.go
+++ b/app/pkg/eval/service.go
@@ -3,6 +3,10 @@ package eval
 import (
 	"context"
 
+	"github.com/go-logr/zapr"
+	"github.com/jlewi/foyle/app/pkg/docs"
+	"go.uber.org/zap"
+
 	"connectrpc.com/connect"
 	"github.com/cockroachdb/pebble"
 	"github.com/jlewi/foyle/app/pkg/logs"
@@ -69,3 +73,87 @@ func (s *EvalServer) List(
 	res.Header().Set("Eval-Version", "v1alpha1")
 	return res, nil
 }
+
+// AssertionTable returns one AssertionRow for every EvalResult stored in the pebble database named in the
+// request. Entries that can't be read, unmarshaled or converted are logged and left out of the response.
+func (s *EvalServer) AssertionTable(
+	ctx context.Context,
+	req *connect.Request[v1alpha1.AssertionTableRequest],
+) (*connect.Response[v1alpha1.AssertionTableResponse], error) {
+	log := logs.FromContext(ctx)
+	if req.Msg.GetDatabase() == "" {
+		err := connect.NewError(connect.CodeInvalidArgument, errors.New("Request is missing database"))
+		log.Error(err, "Invalid AssertionTableRequest")
+		return nil, err
+	}
+
+	db, err := pebble.Open(req.Msg.GetDatabase(), &pebble.Options{})
+	if err != nil {
+		log.Error(err, "Failed to open database")
+		return nil, connect.NewError(connect.CodeInternal, err)
+	}
+	defer helpers.DeferIgnoreError(db.Close)
+
+	iter, err := db.NewIterWithContext(ctx, nil)
+	if err != nil {
+		return nil, connect.NewError(connect.CodeInternal, err)
+	}
+	defer iter.Close()
+
+	results := &v1alpha1.AssertionTableResponse{
+		Rows: make([]*v1alpha1.AssertionRow, 0, 100),
+	}
+
+	for iter.First(); iter.Valid(); iter.Next() {
+		key := iter.Key()
+		if key == nil {
+			break
+		}
+
+		value, err := iter.ValueAndErr()
+		if err != nil {
+			log.Error(err, "Failed to read value for key", "key", string(key))
+			continue
+		}
+		result := &v1alpha1.EvalResult{}
+		if err := proto.Unmarshal(value, result); err != nil {
+			log.Error(err, "Failed to unmarshal value for", "key", string(key))
+			continue
+		}
+
+		row, err := toAssertionRow(result)
+		if err != nil {
+			// TODO(jeremy): Should we put this in the response
+			log.Error(err, "Failed to convert to assertion row", "key", string(key))
+			continue
+		}
+		results.Rows = append(results.Rows, row)
+	}
+
+	res := connect.NewResponse(results)
+	res.Header().Set("Eval-Version", "v1alpha1")
+	return res, nil
+}
+
+// toAssertionRow flattens an EvalResult into the row shown in the assertion table: the example id and file,
+// the query and the answer rendered as markdown, and the outcome of each known assertion.
+func toAssertionRow(result *v1alpha1.EvalResult) (*v1alpha1.AssertionRow, error) {
+	logger := zapr.NewLogger(zap.L())
+	row := &v1alpha1.AssertionRow{
+		Id:          result.Example.GetId(),
+		ExampleFile: result.GetExampleFile(),
+		DocMd:       docs.DocToMarkdown(result.GetExample().GetQuery()),
+		AnswerMd:    docs.BlocksToMarkdown(result.GetActual()),
+	}
+
+	// Copy each assertion outcome into its dedicated column; unrecognized assertions are only logged.
+	for _, assertion := range result.GetAssertions() {
+		if assertion.Name == CodeAfterMarkdownName {
+			row.CodeAfterMarkdown = assertion.GetResult()
+			continue
+		}
+		logger.Info("Unknown assertion", "name", assertion.Name)
+	}
+
+	return row, nil
+}
diff --git a/app/pkg/eval/service_test.go b/app/pkg/eval/service_test.go
new file mode 100644
index 00000000..e4e30bda
--- /dev/null
+++ b/app/pkg/eval/service_test.go
@@ -0,0 +1,72 @@
+package eval
+
+import (
+	"fmt"
+	"testing"
+
+	"github.com/jlewi/foyle/protos/go/foyle/v1alpha1"
+)
+
+func Test_ToAssertRow(t *testing.T) {
+	type testCase struct {
+		evalResult *v1alpha1.EvalResult
+		expected   *v1alpha1.AssertionRow
+	}
+
+	cases := []testCase{
+		{
+			evalResult: &v1alpha1.EvalResult{
+				Example: &v1alpha1.Example{
+					Id: "1234",
+					Query: &v1alpha1.Doc{
+						Blocks: []*v1alpha1.Block{
+							{
+								Kind:     v1alpha1.BlockKind_MARKUP,
+								Contents: "Hello world",
+							},
+						},
+					},
+				},
+				Actual: []*v1alpha1.Block{
+					{
+						Kind:     v1alpha1.BlockKind_MARKUP,
+						Contents: "word",
+					},
+				},
+				Assertions: []*v1alpha1.Assertion{
+					{
+						Name:   "AssertCodeAfterMarkdown",
+						Result: v1alpha1.AssertResult_PASSED,
+					},
+				},
+			},
+			expected: &v1alpha1.AssertionRow{
+				Id:                "1234",
+				DocMd:             "Hello world\n",
+				AnswerMd:          "word\n",
+				CodeAfterMarkdown: v1alpha1.AssertResult_PASSED,
+			},
+		},
+	}
+
+	for i, tc := range cases {
+		t.Run(fmt.Sprintf("Case %d", i), func(t *testing.T) {
+			actual, err := toAssertionRow(tc.evalResult)
+			if err != nil {
+				t.Errorf("Error converting %v; %v", tc.evalResult, err)
+			}
+			if actual.Id != tc.expected.Id {
+				t.Errorf("Unexpected Id; got %v, want %v", actual.Id, tc.expected.Id)
+			}
+			if actual.DocMd != tc.expected.DocMd {
+				t.Errorf("Unexpected DocMd; got %v, want %v", actual.DocMd, tc.expected.DocMd)
+			}
+			if actual.AnswerMd != tc.expected.AnswerMd {
+				t.Errorf("Unexpected AnswerMd; got %v, want %v", actual.AnswerMd, tc.expected.AnswerMd)
+			}
+			if actual.CodeAfterMarkdown != tc.expected.CodeAfterMarkdown {
+				t.Errorf("Unexpected CodeAfterMarkdown; got %v, want %v", actual.CodeAfterMarkdown, tc.expected.CodeAfterMarkdown)
+			}
+		})
+	}
+}
diff --git a/app/pkg/server/server.go b/app/pkg/server/server.go
index f4dd3f4b..35a1ccc2 100644
--- a/app/pkg/server/server.go
+++ b/app/pkg/server/server.go
@@ -6,6 +6,8 @@ import (
 	"fmt"
 	"time"
 
+	"github.com/gin-contrib/cors"
+
 	"connectrpc.com/connect"
 
 	"golang.org/x/net/http2"
@@ -34,7 +36,6 @@ import (
 	"syscall"
 
 	"connectrpc.com/otelconnect"
-	"github.com/gin-contrib/cors"
 	"github.com/gin-gonic/gin"
 	"github.com/go-logr/zapr"
 	"github.com/grpc-ecosystem/grpc-gateway/v2/runtime"
@@ -124,10 +125,55 @@ func (s *Server) createGinEngine() error {
 		c.JSON(http.StatusNotFound, gin.H{"message": "Not found", "path": c.Request.URL.Path})
 	})
 
+	// TODO(jeremy): We disabled setting up the vscode server because we weren't using it and it requires assets
+	// to be set up. Should we get rid of it? Provide a flag to enable it? I do think being able to run vscode
+	// in the browser is valuable, so it is a feature I'd like to add back at some point; see
+	// https://github.com/stateful/runme/issues/616
+	if false {
+		if err := s.serveVSCode(router); err != nil {
+			return err
+		}
+	}
+
+	// Add REST handlers for blocklogs
+	// TODO(jeremy): We should probably standardize on connect-rpc
+	apiPrefix := s.config.APIPrefix()
+	router.GET(apiPrefix+"/blocklogs/:id", s.logsCrud.GetBlockLog)
+
+	// Set  up the connect-rpc handlers for the EvalServer
+	otelInterceptor, err := otelconnect.NewInterceptor()
+	if err != nil {
+		return errors.Wrapf(err, "Failed to create otel interceptor")
+	}
+	path, handler := v1alpha1connect.NewEvalServiceHandler(&eval.EvalServer{}, connect.WithInterceptors(otelInterceptor))
+	log.Info("Setting up eval service", "path", path)
+	// Since we want to add the prefix apiPrefix we need to strip it before passing it to the connect-rpc handler
+	// Refer to https://connectrpc.com/docs/go/routing#prefixing-routes. Note that grpc-go clients don't
+	// support prefixes.
+	router.Any(apiPrefix+"/"+path+"*any", gin.WrapH(http.StripPrefix("/"+apiPrefix, handler)))
+
+	generatePath, generateHandler := v1alpha1connect.NewGenerateServiceHandler(s, connect.WithInterceptors(otelInterceptor))
+	log.Info("Setting up generate service", "path", apiPrefix+"/"+generatePath)
+	router.Any(apiPrefix+"/"+generatePath+"*any", gin.WrapH(http.StripPrefix("/"+apiPrefix, generateHandler)))
+
+	aiSvcPath, aiSvcHandler := v1alpha1connect.NewAIServiceHandler(s.agent, connect.WithInterceptors(otelInterceptor))
+	log.Info("Setting up AI service", "path", apiPrefix+"/"+aiSvcPath)
+	router.Any(apiPrefix+"/"+aiSvcPath+"*any", gin.WrapH(http.StripPrefix("/"+apiPrefix, aiSvcHandler)))
+
+	s.engine = router
+
+	// Setup the logs viewer
+	if err := s.setupViewerApp(router); err != nil {
+		return err
+	}
+	return nil
+}
+
+func (s *Server) serveVSCode(router *gin.Engine) error {
+	log := zapr.NewLogger(zap.L())
 	// Serve the static assets for vscode.
 	// There should be several directories located in ${ASSETS_DIR}/vscode
 	// The second argument to Static is the directory to act as the root for the static files.
-
 	vsCodeRPath := "/out"
 	extensionsMapping := staticMapping{
 		relativePath: extensionsRPath,
@@ -233,34 +279,6 @@ func (s *Server) createGinEngine() error {
 		corsMiddleWare := cors.New(corsConfig)
 		router.Use(corsMiddleWare)
 	}
-
-	// Add REST handlers for blocklogs
-	// TODO(jeremy): We should probably standardize on connect-rpc
-	apiPrefix := s.config.APIPrefix()
-	router.GET(apiPrefix+"/blocklogs/:id", s.logsCrud.GetBlockLog)
-
-	// Set  up the connect-rpc handlers for the EvalServer
-	otelInterceptor, err := otelconnect.NewInterceptor()
-	if err != nil {
-		return errors.Wrapf(err, "Failed to create otel interceptor")
-	}
-	path, handler := v1alpha1connect.NewEvalServiceHandler(&eval.EvalServer{}, connect.WithInterceptors(otelInterceptor))
-	log.Info("Setting up eval service", "path", path)
-	// Since we want to add the prefix apiPrefix we need to strip it before passing it to the connect-rpc handler
-	// Refer to https://connectrpc.com/docs/go/routing#prefixing-routes. Note that grpc-go clients don't
-	// support prefixes.
-	router.Any(apiPrefix+"/"+path+"*any", gin.WrapH(http.StripPrefix("/"+apiPrefix, handler)))
-
-	generatePath, generateHandler := v1alpha1connect.NewAIServiceHandler(s.agent, connect.WithInterceptors(otelInterceptor))
-	log.Info("Setting up generate service", "path", apiPrefix+"/"+generatePath)
-	router.Any(apiPrefix+"/"+generatePath+"*any", gin.WrapH(http.StripPrefix("/"+apiPrefix, generateHandler)))
-
-	s.engine = router
-
-	// Setup the logs viewer
-	if err := s.setupViewerApp(router); err != nil {
-		return err
-	}
 	return nil
 }
 
@@ -595,3 +613,13 @@ func (s *Server) healthCheck(ctx *gin.Context) {
 	}
 	ctx.JSON(code, d)
 }
+
+// Generate a completion request.
+// TODO(https://github.com/jlewi/foyle/issues/173). We should move this function into the Agent structure.
+// It's only here on the server because Agent already has a Generate method for the GRPC protocol. We can get
+// rid of that method once we get rid of GRPC and GRPCGateway and just use connect.
+func (s *Server) Generate(ctx context.Context, req *connect.Request[v1alpha1.GenerateRequest]) (*connect.Response[v1alpha1.GenerateResponse], error) {
+	resp, err := s.agent.Generate(ctx, req.Msg)
+	cResp := connect.NewResponse(resp)
+	return cResp, err
+}
diff --git a/developer_guides/create_pr.md b/developer_guides/create_pr.md
new file mode 100644
index 00000000..90fa3428
--- /dev/null
+++ b/developer_guides/create_pr.md
@@ -0,0 +1,45 @@
+# Create a PR
+
+* This is a recipe for creating PR descriptions using AI.
+* It's a work in progress
+* Current version is way too verbose - ends up describing low level changes in too much detail
+
+1. Find the mergepoint with the main branch
+
+```sh {"id":"01J4J3KN50MB5Z444F0BJAKQYR"}
+git fetch origin
+export FORKPOINT=$(git merge-base --fork-point origin/main)
+```
+
+2. Create a file with the log messages of all the changes up to this point
+
+```sh {"id":"01J4J3PJJJEKCV17HVPTT40C3R"}
+git log ${FORKPOINT}..HEAD > /tmp/commitlog
+```
+
+3. Use the llm tool to summarize the commit messages
+
+```sh {"id":"01J4J3RFE8GYFK8EZZZ7ZV1MZ2"}
+cat /tmp/commitlog  | llm "Here is the commit log for a bunch of messages. Please turn them into a detailed message suitable for the PR description for a PR containing all the changes. Do not include low value messages like 'fix lint' or 'fix tests'. Avoid superlatives and flowery language; just state what the change does and the reasoning behind it." > /tmp/commitsummary
+cat /tmp/commitsummary
+```
+
+```sh {"id":"01J4J3WZN61KKNQ514AJAHFBQE","interactive":"false"}
+cat /tmp/commitsummary
+```
+
+```sh {"id":"01J4J40733AKEC1A9ERV9CWZ5S","interactive":"false"}
+REPOROOT=$(git rev-parse --show-toplevel)
+cd ${REPOROOT}
+CHANGEDFILES=$(git diff --name-only origin/main | grep -v -E '^protos/.*\.go$')
+rm -f /tmp/changes.txt
+while IFS= read -r file; do    
+    git diff origin/main ${file} | llm "Here is the diff for a file. Create a bulleted list summarizing the changes. This list should be suitable for a git commit message or PR description" >> /tmp/changes.txt
+    #echo "" >> /tmp/changes.txt
+done <<< "${CHANGEDFILES}"
+
+```
+
+```sh {"id":"01J4J4R0FS9B93ADDB54HN4CNA","interactive":"false"}
+cat /tmp/changes.txt
+```
\ No newline at end of file
diff --git a/developer_guides/eval.md b/developer_guides/eval.md
index e869f613..4da372bd 100644
--- a/developer_guides/eval.md
+++ b/developer_guides/eval.md
@@ -1,5 +1,84 @@
 ## Running Evaluation
 
+## Running Level 1 Evaluation
+
+Level 1 evaluations are assertions that run on AI responses.
+
+To evaluate changes to the agent, first set up an instance of the agent with the changes you want.
+Be sure to configure it so that it stores logs and responses in a different directory than your production
+agent because you don't want the evaluation data to contaminate the learning process.
+
+```sh {"id":"01J4DJT0G24YH9K4F8YRTSZD8N"}
+export REPOROOT=$(git rev-parse --show-toplevel)
+export RUNDIR=${REPOROOT}/experiments/runs/$(date +%Y%m%d_%H%M%S)
+echo "Using run directory: ${RUNDIR}"
+```
+
+### Setup the configuration for the agent in this run
+
+```sh {"id":"01J4DKE3M85ETKNHFH4G0HT0M6"}
+mkdir -p ${RUNDIR}
+cp ~/.foyle/config.yaml ${RUNDIR}/config.yaml
+```
+
+* Adjust the ports used by the agent to avoid conflicts with the production agent
+
+```sh {"id":"01J4DKK0N36XN2HV4GQK7YRXCC"}
+yq e '.server.httpPort = 55080' -i ${RUNDIR}/config.yaml
+yq e '.server.grpcPort = 55090' -i ${RUNDIR}/config.yaml
+```
+
+* We make a copy of the training directory to a new directory for this evaluation run.
+
+```sh {"id":"01J4DKP9P59GCGNG6QXX6KR9AF"}
+cp -r ~/.foyle/training ${RUNDIR}/
+```
+
+```sh {"id":"01J4DKQXXB8P7CV7VS4YS5DHDD"}
+yq e ".learner.exampleDirs=[\"${RUNDIR}/training\"]" -i ${RUNDIR}/config.yaml
+```
+
+* Remove the RunMe directory for the extra log directory
+* We don't want to reprocess RunMe logs
+* Since we aren't actually using the Frontend there are no RunMe logs to process anyway
+
+```sh {"id":"01J4F79ZE8YAAKV252G2T7XD25"}
+yq e ".learner.logDirs=[]" -i ${RUNDIR}/config.yaml
+```
+
+* configure the assertions
+
+```sh {"id":"01J4F896JP8FZ3N8BGVPZ7VHJ4"}
+cp -f ${REPOROOT}/experiments/assertions.yaml ${RUNDIR}/assertions.yaml
+yq e ".spec.agentAddress=\"http://localhost:55080/api\"" -i ${RUNDIR}/assertions.yaml
+yq e ".spec.dbDir=\"${RUNDIR}/evalDB\"" -i ${RUNDIR}/assertions.yaml
+
+```
+
+### Run the agent
+
+* Start the agent containing the changes you want to evaluate
+
+```sh {"id":"01J4DM107F0GJWJKFV4P77TAQY"}
+cd ${REPOROOT}/app
+export CONFIGFILE=${RUNDIR}/config.yaml
+go run github.com/jlewi/foyle/app serve --config=${CONFIGFILE}
+```
+
+### Run evaluation driver
+
+```sh {"id":"01J4F8KQ7N5DE3JQRX33T60BB0"}
+cd ${REPOROOT}/app
+export CONFIGFILE=${RUNDIR}/config.yaml
+go run github.com/jlewi/foyle/app apply --config=${CONFIGFILE} ${RUNDIR}/assertions.yaml
+```
+
+### Analyze the results
+
+```sh {"id":"01J4HN72G5EY98MYPCZG7V02WZ","interactive":"false","mimeType":"application/json"}
+curl -s -H "Content-Type: application/json" http://localhost:55080/api/EvalService/AssertionTable -d "{\"database\":\"${RUNDIR}/evalDB\"}" | jq .rows
+```
+
 ## Run baseline experiment
 
 ```sh {"id":"01HZ38BC6WJF5RB9ZYTXBJE38M"}
diff --git a/docs/content/en/docs/tech-notes/tn001_logging.md b/docs/content/en/docs/tech-notes/tn001_logging.md
index e7c32348..2640e8f8 100644
--- a/docs/content/en/docs/tech-notes/tn001_logging.md
+++ b/docs/content/en/docs/tech-notes/tn001_logging.md
@@ -4,10 +4,9 @@ description: Design logging to support capturing human feedback.
 weight: 1
 author: "[Jeremy Lewi](https://lewi.us/about)"
 date: 2024-04-10
+status: Published
 ---
 
-* **Status**: Being Drafted
-
 ## Objective
 
 Design logging to support capturing human feedback.
diff --git a/docs/content/en/docs/tech-notes/tn010_eval_lvl1.md b/docs/content/en/docs/tech-notes/tn010_eval_lvl1.md
new file mode 100644
index 00000000..0e221ea6
--- /dev/null
+++ b/docs/content/en/docs/tech-notes/tn010_eval_lvl1.md
@@ -0,0 +1,157 @@
+---
+title: TN010 Level 1 Evaluation
+description: Level 1 Evaluation
+weight: 10
+author: "[Jeremy Lewi](https://lewi.us/about)"
+date: 2024-08-02
+status: Being Drafted
+---
+
+# Objective:
+
+Design level 1 evaluation to optimize responses for AutoComplete
+
+# TL;DR
+
+As we roll out [AutoComplete](../tn008_auto_insert_cells) we are observing
+that AI quality is an issue [jlewi/foyle#170](https://github.com/jlewi/foyle/issues/170).
+Examples of quality issues we are seeing are
+
+* AI suggests a markdown cell rather than a code cell when a user is editing a markdown cell
+* AI splits commands across multiple code cells rather than using a single code cell
+
+We can catch these issues using [Level 1 Evals](https://hamel.dev/blog/posts/evals/#level-1-unit-tests)
+which are basically assertions applied to the AI responses.
+
+To implement Level 1 Evals we can do the following
+
+1. We can generate an evaluation from logs of actual user interactions
+1. We can create scripts to run the assertions on the AI responses
+1. We can use RunMe to create playbooks to run evaluations and visualize the results as well as data
+
+## Background: 
+
+### Existing Eval Infrastructure
+
+[TN003](../tn003_learning_eval/) described how we could do evaluation given a golden dataset of examples.
+The [implementation](https://github.com/jlewi/foyle/tree/main/app/pkg/eval) was motivated by a number of factors.
+
+We want a resilient design because batch evaluation is a long-running, flaky process that depends on an external 
+service (i.e. the LLM provider). Since LLM invocations cost money we want to avoid needlessly recomputing 
+generations if nothing has changed. Therefore, we want to be able to checkpoint progress and resume from where we left off. 
+
+We want to run the same codepaths as in production. Importantly, we want to reuse logging and visualization tooling
+so we can inspect evaluation results and understand why the AI succeeded. 
+
+To run experiments, we need to modify the code and deploy a new instance of Foyle just for evaluation.
+We don't want the evaluation logs to be mixed with the production logs; in particular we don't want to learn from
+evaluation data.
+
+Evaluation was designed around a controller pattern. The controller figures out which computations need to be run
+for each data point and then runs them. Thus, once a data point is processed, rerunning evaluation becomes a no-op.
+For example, if an LLM generation has already been computed, we don't need to recompute it. Each data point
+was an instance of the [EvalResult Proto](https://github.com/jlewi/foyle/blob/f71718a50ce131a464d884f97bd0de18c24bafc5/protos/foyle/v1alpha1/eval.proto#L17). A Pebble Database was used to store the results. 
+
+Google Sheets was used for reporting of results.
+
+Experiments were defined using the [Experiment Resource](https://github.com/jlewi/foyle/blob/main/app/api/experiment.go) via [YAML files](https://github.com/jlewi/foyle/blob/f71718a50ce131a464d884f97bd0de18c24bafc5/experiments/rag.yaml). An experiment could
+be run using the Foyle CLI.
+
+## Defining Assertions
+
+As noted in [Your AI Product Needs Evals](https://hamel.dev/blog/posts/evals/#level-1-unit-tests) we'd like
+to design our assertions so they can be run online and offline. 
+
+We can use the following interface to define assertions
+
+```go
+type Assertion interface {  
+  Assert(ctx context.Context, doc *v1alpha1.Doc, examples []*v1alpha1.Example, answer []*v1alpha1.Block) (AssertResult, error)
+  // Name returns the name of the assertion.
+  Name() string
+}
+
+type AssertResult string
+AssertPassed AssertResult = "passed"
+AssertFailed AssertResult = "failed"
+AssertSkipped AssertResult = "skipped"
+```
+
+The `Assert` method takes a document, examples, and the AI response and returns a result indicating whether the assertion passed,
+failed, or was skipped. Context can be used to pass along the `traceId` of the actual request.
+
+### Online Evaluation
+
+For online execution, we can run the assertions asynchronously in a thread. We can log the assertions using existing logging patterns. This will allow us to fetch the assertion results as part of the trace. Reporting the results should not be the responsibility of each Assertion; we should handle that centrally. We will use OTEL to report the results as well; each 
+assertion will be added as an attribute to the trace. This will make it easy to monitor performance over time.
+
+## Batch Evaluation
+
+For quickly iterating on the AI, we need to be able to do offline, batch evaluation. Following
+the existing patterns for evaluation, we can define a new `AssertJob` resource to run the assertions.
+
+```yaml
+kind: AssertJob
+apiVersion: foyle.io/v1alpha1
+metadata:
+  name: "learning"
+spec:
+  sources:
+    - traceServer:
+        address: "http://localhost:8080"
+    - mdFiles:
+        path: "foyle/evalData"
+
+  # Pebble database used to store the results
+  dbDir: /Users/jlewi/foyle_experiments/20250530-1612/learning
+  agent: "http://localhost:108080"
+  sheetID: "1iJbkdUSxEkEX24xMH2NYpxqYcM_7-0koSRuANccDAb8"
+  sheetName: "WithRAG"  
+```
+
+The `sources` field specifies the sources for the evaluation dataset. There are two different kinds of sources. A traceServer
+specifies an instance of a Foyle Agent that makes its traces available via an API. This can be used to generate examples
+based on actual user interactions. The `Traces` need to be read via API and not directly from the pebble database because the pebble database is not designed for concurrent access. 
+
+The `mdFiles` field allows examples to be provided as markdown files. This will be used to create a handcrafted curated
+dataset of examples.
+
+The `agent` field specifies the address of the instance of Foyle to be evaluated. This instance should be configured to store its data in a different location. 
+
+The `sheetID` and `sheetName` fields specify the Google Sheet where the results will be stored.
+
+To perform the evaluation, we can implement a controller modeled on our existing [Evaluator](https://github.com/jlewi/foyle/blob/main/app/pkg/eval/evaluator.go).
+
+## Traces Service
+
+We need to introduce a Trace service to allow the evaluation to access traces.
+
+```proto
+
+service TracesService {
+  rpc ListTraces(ListTracesRequest) returns (ListTracesResponse) {
+  }
+}
+```
+
+We'll need to support filtering and pagination. The most obvious way to filter would be on time range.
+A crude way to support time based filtering would be as follows
+* Raw log entries are written in timestamp order
+* Use the raw logs to read log entries based on time range
+* Get the unique set of TraceIDs in that time range
+* Look up each trace in the traces database
+
+## Initial Assertions
+
+Here are some initial assertions we can define
+
+* If human is editing a markdown cell, suggestion should start with a code cell
+* The response should contain one code cell
+* Use regexes to check if interactive metadata is set correctly [jlewi/foyle#157](https://github.com/jlewi/foyle/issues/157)
+  * interactive should be false unless the command matches a regex for an interactive command e.g. "kubectl.*exec.*", "docker.*run.*" etc...
+* Ensure the AI doesn't generate any cells for empty input
+
+## Reference
+
+[Your AI Product Needs Evals](https://hamel.dev/blog/posts/evals/#level-1-unit-tests) Blog post describing the Level 1, 2, and 3 evals.
+
diff --git a/experiments/assertions.yaml b/experiments/assertions.yaml
new file mode 100644
index 00000000..4b85224f
--- /dev/null
+++ b/experiments/assertions.yaml
@@ -0,0 +1,9 @@
+apiVersion: foyle.io/v1alpha1
+kind: AssertJob
+metadata:
+  name: assertions
+spec:
+  agentAddress: http://localhost:55080/api
+  sources:
+    - markdownSource:
+      path: /Users/jlewi/git_foyle/data/eval
diff --git a/protos/foyle/v1alpha1/agent.proto b/protos/foyle/v1alpha1/agent.proto
index 8bed2155..0ca56766 100644
--- a/protos/foyle/v1alpha1/agent.proto
+++ b/protos/foyle/v1alpha1/agent.proto
@@ -54,8 +54,9 @@ service ExecuteService {
   }
 }
 
-// TODO(jeremy): This should move to https://github.com/stateful/runme/tree/main/pkg/api/proto/runme/ai/v1alpha1
-// This was to quickly hack so we could see what the generated client looks like for the connect protocol.
+// TODO(jeremy): Should we rename this? Maybe NotebookAIService? I think it makes sense to keep this
+// separate from the GenerateService because the GenerateService uses the Foyle protos, whereas this
+// uses the RunMe protos.
 service AIService {
   // StreamGenerate is a bidirectional streaming RPC for generating completions
   rpc StreamGenerate (stream StreamGenerateRequest) returns (stream StreamGenerateResponse) {}
diff --git a/protos/foyle/v1alpha1/eval.proto b/protos/foyle/v1alpha1/eval.proto
index 84261315..2548e001 100644
--- a/protos/foyle/v1alpha1/eval.proto
+++ b/protos/foyle/v1alpha1/eval.proto
@@ -13,6 +13,13 @@ enum EvalResultStatus {
   ERROR = 2;
 }
 
+enum AssertResult {
+  UNKNOWN_AssertResult = 0;
+  PASSED = 1;
+  FAILED = 2;
+  SKIPPED = 3;
+}
+
 // EvalResult represents an evaluation result
 message EvalResult {
   // Example is the answer and expected result
@@ -37,6 +44,16 @@ message EvalResult {
 
   // Best matching RAG result
   RAGResult best_rag_result = 9;
+
+  repeated Assertion assertions = 10;
+}
+
+message Assertion {
+  // Name of the assertion
+  string name = 1;
+  AssertResult result = 2;
+  // Human readable detail of the assertion. If there was an error this should contain the error message.
+  string detail = 3;
 }
 
 message EvalResultListRequest {
@@ -49,6 +66,33 @@ message EvalResultListResponse {
 }
 
 
+// AssertionRow represents a row in the assertion table.
+// It is intended for returning the results of assertions. In a way that makes it easy to view the assertions
+// in a table inside a RunMe notebook. So we need to flatten the data.
+message AssertionRow {
+  // id of the example
+  string id = 1;
+
+  string exampleFile = 2;
+
+  // Document markdown
+  string doc_md = 3;
+  string answer_md =4;
+
+  // TODO(jeremy): How can we avoid having to add each assertion here
+  AssertResult code_after_markdown = 5;
+}
+
+message AssertionTableRequest {
+  // The path of the database to fetch results for
+  string database = 1;
+}
+
+message AssertionTableResponse {
+  repeated AssertionRow rows = 1;
+}
+
 service EvalService {
   rpc List(EvalResultListRequest) returns (EvalResultListResponse) {}
+  rpc AssertionTable(AssertionTableRequest) returns (AssertionTableResponse) {}
 }
\ No newline at end of file
diff --git a/protos/foyle/v1alpha1/trainer.proto b/protos/foyle/v1alpha1/trainer.proto
index 514aa3fa..a2562099 100644
--- a/protos/foyle/v1alpha1/trainer.proto
+++ b/protos/foyle/v1alpha1/trainer.proto
@@ -7,6 +7,7 @@ import "google/protobuf/struct.proto";
 option go_package = "github.com/jlewi/foyle/protos/go/foyle/v1alpha1";
 
 // Example represents an example to be used in few shot learning
+// It is also used to represent examples during evaluation.
 message Example {
   string id = 1;
   repeated float embedding = 2;
diff --git a/protos/go/foyle/logs/traces.zap.go b/protos/go/foyle/logs/traces.zap.go
index 306d1de1..410f9a75 100644
--- a/protos/go/foyle/logs/traces.zap.go
+++ b/protos/go/foyle/logs/traces.zap.go
@@ -7,10 +7,10 @@ import (
 	fmt "fmt"
 	math "math"
 	proto "github.com/golang/protobuf/proto"
-	_ "google.golang.org/protobuf/types/known/timestamppb"
 	_ "github.com/jlewi/foyle/protos/go/foyle/v1alpha1"
 	_ "github.com/stateful/runme/v3/pkg/api/gen/proto/go/runme/runner/v1"
 	_ "google.golang.org/protobuf/types/known/structpb"
+	_ "google.golang.org/protobuf/types/known/timestamppb"
 	go_uber_org_zap_zapcore "go.uber.org/zap/zapcore"
 	github_com_golang_protobuf_ptypes "github.com/golang/protobuf/ptypes"
 )
diff --git a/protos/go/foyle/v1alpha1/eval.pb.go b/protos/go/foyle/v1alpha1/eval.pb.go
index 0888b54a..87dbd014 100644
--- a/protos/go/foyle/v1alpha1/eval.pb.go
+++ b/protos/go/foyle/v1alpha1/eval.pb.go
@@ -70,6 +70,58 @@ func (EvalResultStatus) EnumDescriptor() ([]byte, []int) {
 	return file_foyle_v1alpha1_eval_proto_rawDescGZIP(), []int{0}
 }
 
+type AssertResult int32
+
+const (
+	AssertResult_UNKNOWN_AssertResult AssertResult = 0
+	AssertResult_PASSED               AssertResult = 1
+	AssertResult_FAILED               AssertResult = 2
+	AssertResult_SKIPPED              AssertResult = 3
+)
+
+// Enum value maps for AssertResult.
+var (
+	AssertResult_name = map[int32]string{
+		0: "UNKNOWN_AssertResult",
+		1: "PASSED",
+		2: "FAILED",
+		3: "SKIPPED",
+	}
+	AssertResult_value = map[string]int32{
+		"UNKNOWN_AssertResult": 0,
+		"PASSED":               1,
+		"FAILED":               2,
+		"SKIPPED":              3,
+	}
+)
+
+func (x AssertResult) Enum() *AssertResult {
+	p := new(AssertResult)
+	*p = x
+	return p
+}
+
+func (x AssertResult) String() string {
+	return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x))
+}
+
+func (AssertResult) Descriptor() protoreflect.EnumDescriptor {
+	return file_foyle_v1alpha1_eval_proto_enumTypes[1].Descriptor()
+}
+
+func (AssertResult) Type() protoreflect.EnumType {
+	return &file_foyle_v1alpha1_eval_proto_enumTypes[1]
+}
+
+func (x AssertResult) Number() protoreflect.EnumNumber {
+	return protoreflect.EnumNumber(x)
+}
+
+// Deprecated: Use AssertResult.Descriptor instead.
+func (AssertResult) EnumDescriptor() ([]byte, []int) {
+	return file_foyle_v1alpha1_eval_proto_rawDescGZIP(), []int{1}
+}
+
 // EvalResult represents an evaluation result
 type EvalResult struct {
 	state         protoimpl.MessageState
@@ -91,7 +143,8 @@ type EvalResult struct {
 	// The ID of the generate trace
 	GenTraceId string `protobuf:"bytes,8,opt,name=gen_trace_id,json=genTraceId,proto3" json:"gen_trace_id,omitempty"`
 	// Best matching RAG result
-	BestRagResult *RAGResult `protobuf:"bytes,9,opt,name=best_rag_result,json=bestRagResult,proto3" json:"best_rag_result,omitempty"`
+	BestRagResult *RAGResult   `protobuf:"bytes,9,opt,name=best_rag_result,json=bestRagResult,proto3" json:"best_rag_result,omitempty"`
+	Assertions    []*Assertion `protobuf:"bytes,10,rep,name=assertions,proto3" json:"assertions,omitempty"`
 }
 
 func (x *EvalResult) Reset() {
@@ -189,6 +242,78 @@ func (x *EvalResult) GetBestRagResult() *RAGResult {
 	return nil
 }
 
+func (x *EvalResult) GetAssertions() []*Assertion {
+	if x != nil {
+		return x.Assertions
+	}
+	return nil
+}
+
+type Assertion struct {
+	state         protoimpl.MessageState
+	sizeCache     protoimpl.SizeCache
+	unknownFields protoimpl.UnknownFields
+
+	// Name of the assertion
+	Name   string       `protobuf:"bytes,1,opt,name=name,proto3" json:"name,omitempty"`
+	Result AssertResult `protobuf:"varint,2,opt,name=result,proto3,enum=AssertResult" json:"result,omitempty"`
+	// Human readable detail of the assertion. If there was an error this should contain the error message.
+	Detail string `protobuf:"bytes,3,opt,name=detail,proto3" json:"detail,omitempty"`
+}
+
+func (x *Assertion) Reset() {
+	*x = Assertion{}
+	if protoimpl.UnsafeEnabled {
+		mi := &file_foyle_v1alpha1_eval_proto_msgTypes[1]
+		ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
+		ms.StoreMessageInfo(mi)
+	}
+}
+
+func (x *Assertion) String() string {
+	return protoimpl.X.MessageStringOf(x)
+}
+
+func (*Assertion) ProtoMessage() {}
+
+func (x *Assertion) ProtoReflect() protoreflect.Message {
+	mi := &file_foyle_v1alpha1_eval_proto_msgTypes[1]
+	if protoimpl.UnsafeEnabled && x != nil {
+		ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
+		if ms.LoadMessageInfo() == nil {
+			ms.StoreMessageInfo(mi)
+		}
+		return ms
+	}
+	return mi.MessageOf(x)
+}
+
+// Deprecated: Use Assertion.ProtoReflect.Descriptor instead.
+func (*Assertion) Descriptor() ([]byte, []int) {
+	return file_foyle_v1alpha1_eval_proto_rawDescGZIP(), []int{1}
+}
+
+func (x *Assertion) GetName() string {
+	if x != nil {
+		return x.Name
+	}
+	return ""
+}
+
+func (x *Assertion) GetResult() AssertResult {
+	if x != nil {
+		return x.Result
+	}
+	return AssertResult_UNKNOWN_AssertResult
+}
+
+func (x *Assertion) GetDetail() string {
+	if x != nil {
+		return x.Detail
+	}
+	return ""
+}
+
 type EvalResultListRequest struct {
 	state         protoimpl.MessageState
 	sizeCache     protoimpl.SizeCache
@@ -201,7 +326,7 @@ type EvalResultListRequest struct {
 func (x *EvalResultListRequest) Reset() {
 	*x = EvalResultListRequest{}
 	if protoimpl.UnsafeEnabled {
-		mi := &file_foyle_v1alpha1_eval_proto_msgTypes[1]
+		mi := &file_foyle_v1alpha1_eval_proto_msgTypes[2]
 		ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
 		ms.StoreMessageInfo(mi)
 	}
@@ -214,7 +339,7 @@ func (x *EvalResultListRequest) String() string {
 func (*EvalResultListRequest) ProtoMessage() {}
 
 func (x *EvalResultListRequest) ProtoReflect() protoreflect.Message {
-	mi := &file_foyle_v1alpha1_eval_proto_msgTypes[1]
+	mi := &file_foyle_v1alpha1_eval_proto_msgTypes[2]
 	if protoimpl.UnsafeEnabled && x != nil {
 		ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
 		if ms.LoadMessageInfo() == nil {
@@ -227,7 +352,7 @@ func (x *EvalResultListRequest) ProtoReflect() protoreflect.Message {
 
 // Deprecated: Use EvalResultListRequest.ProtoReflect.Descriptor instead.
 func (*EvalResultListRequest) Descriptor() ([]byte, []int) {
-	return file_foyle_v1alpha1_eval_proto_rawDescGZIP(), []int{1}
+	return file_foyle_v1alpha1_eval_proto_rawDescGZIP(), []int{2}
 }
 
 func (x *EvalResultListRequest) GetDatabase() string {
@@ -248,7 +373,7 @@ type EvalResultListResponse struct {
 func (x *EvalResultListResponse) Reset() {
 	*x = EvalResultListResponse{}
 	if protoimpl.UnsafeEnabled {
-		mi := &file_foyle_v1alpha1_eval_proto_msgTypes[2]
+		mi := &file_foyle_v1alpha1_eval_proto_msgTypes[3]
 		ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
 		ms.StoreMessageInfo(mi)
 	}
@@ -261,7 +386,7 @@ func (x *EvalResultListResponse) String() string {
 func (*EvalResultListResponse) ProtoMessage() {}
 
 func (x *EvalResultListResponse) ProtoReflect() protoreflect.Message {
-	mi := &file_foyle_v1alpha1_eval_proto_msgTypes[2]
+	mi := &file_foyle_v1alpha1_eval_proto_msgTypes[3]
 	if protoimpl.UnsafeEnabled && x != nil {
 		ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
 		if ms.LoadMessageInfo() == nil {
@@ -274,7 +399,7 @@ func (x *EvalResultListResponse) ProtoReflect() protoreflect.Message {
 
 // Deprecated: Use EvalResultListResponse.ProtoReflect.Descriptor instead.
 func (*EvalResultListResponse) Descriptor() ([]byte, []int) {
-	return file_foyle_v1alpha1_eval_proto_rawDescGZIP(), []int{2}
+	return file_foyle_v1alpha1_eval_proto_rawDescGZIP(), []int{3}
 }
 
 func (x *EvalResultListResponse) GetItems() []*EvalResult {
@@ -284,6 +409,186 @@ func (x *EvalResultListResponse) GetItems() []*EvalResult {
 	return nil
 }
 
+// AssertionRow represents a row in the assertion table.
+// It is intended for returning the results of assertions. In a way that makes it easy to view the assertions
+// in a table inside a RunMe notebook. So we need to flatten the data.
+type AssertionRow struct {
+	state         protoimpl.MessageState
+	sizeCache     protoimpl.SizeCache
+	unknownFields protoimpl.UnknownFields
+
+	// id of the example
+	Id          string `protobuf:"bytes,1,opt,name=id,proto3" json:"id,omitempty"`
+	ExampleFile string `protobuf:"bytes,2,opt,name=exampleFile,proto3" json:"exampleFile,omitempty"`
+	// Document markdown
+	DocMd    string `protobuf:"bytes,3,opt,name=doc_md,json=docMd,proto3" json:"doc_md,omitempty"`
+	AnswerMd string `protobuf:"bytes,4,opt,name=answer_md,json=answerMd,proto3" json:"answer_md,omitempty"`
+	// TODO(jeremy): How can we avoid having to add each assertion here
+	CodeAfterMarkdown AssertResult `protobuf:"varint,5,opt,name=code_after_markdown,json=codeAfterMarkdown,proto3,enum=AssertResult" json:"code_after_markdown,omitempty"`
+}
+
+func (x *AssertionRow) Reset() {
+	*x = AssertionRow{}
+	if protoimpl.UnsafeEnabled {
+		mi := &file_foyle_v1alpha1_eval_proto_msgTypes[4]
+		ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
+		ms.StoreMessageInfo(mi)
+	}
+}
+
+func (x *AssertionRow) String() string {
+	return protoimpl.X.MessageStringOf(x)
+}
+
+func (*AssertionRow) ProtoMessage() {}
+
+func (x *AssertionRow) ProtoReflect() protoreflect.Message {
+	mi := &file_foyle_v1alpha1_eval_proto_msgTypes[4]
+	if protoimpl.UnsafeEnabled && x != nil {
+		ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
+		if ms.LoadMessageInfo() == nil {
+			ms.StoreMessageInfo(mi)
+		}
+		return ms
+	}
+	return mi.MessageOf(x)
+}
+
+// Deprecated: Use AssertionRow.ProtoReflect.Descriptor instead.
+func (*AssertionRow) Descriptor() ([]byte, []int) {
+	return file_foyle_v1alpha1_eval_proto_rawDescGZIP(), []int{4}
+}
+
+func (x *AssertionRow) GetId() string {
+	if x != nil {
+		return x.Id
+	}
+	return ""
+}
+
+func (x *AssertionRow) GetExampleFile() string {
+	if x != nil {
+		return x.ExampleFile
+	}
+	return ""
+}
+
+func (x *AssertionRow) GetDocMd() string {
+	if x != nil {
+		return x.DocMd
+	}
+	return ""
+}
+
+func (x *AssertionRow) GetAnswerMd() string {
+	if x != nil {
+		return x.AnswerMd
+	}
+	return ""
+}
+
+func (x *AssertionRow) GetCodeAfterMarkdown() AssertResult {
+	if x != nil {
+		return x.CodeAfterMarkdown
+	}
+	return AssertResult_UNKNOWN_AssertResult
+}
+
+type AssertionTableRequest struct {
+	state         protoimpl.MessageState
+	sizeCache     protoimpl.SizeCache
+	unknownFields protoimpl.UnknownFields
+
+	// The path of the database to fetch results for
+	Database string `protobuf:"bytes,1,opt,name=database,proto3" json:"database,omitempty"`
+}
+
+func (x *AssertionTableRequest) Reset() {
+	*x = AssertionTableRequest{}
+	if protoimpl.UnsafeEnabled {
+		mi := &file_foyle_v1alpha1_eval_proto_msgTypes[5]
+		ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
+		ms.StoreMessageInfo(mi)
+	}
+}
+
+func (x *AssertionTableRequest) String() string {
+	return protoimpl.X.MessageStringOf(x)
+}
+
+func (*AssertionTableRequest) ProtoMessage() {}
+
+func (x *AssertionTableRequest) ProtoReflect() protoreflect.Message {
+	mi := &file_foyle_v1alpha1_eval_proto_msgTypes[5]
+	if protoimpl.UnsafeEnabled && x != nil {
+		ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
+		if ms.LoadMessageInfo() == nil {
+			ms.StoreMessageInfo(mi)
+		}
+		return ms
+	}
+	return mi.MessageOf(x)
+}
+
+// Deprecated: Use AssertionTableRequest.ProtoReflect.Descriptor instead.
+func (*AssertionTableRequest) Descriptor() ([]byte, []int) {
+	return file_foyle_v1alpha1_eval_proto_rawDescGZIP(), []int{5}
+}
+
+func (x *AssertionTableRequest) GetDatabase() string {
+	if x != nil {
+		return x.Database
+	}
+	return ""
+}
+
+type AssertionTableResponse struct {
+	state         protoimpl.MessageState
+	sizeCache     protoimpl.SizeCache
+	unknownFields protoimpl.UnknownFields
+
+	Rows []*AssertionRow `protobuf:"bytes,1,rep,name=rows,proto3" json:"rows,omitempty"`
+}
+
+func (x *AssertionTableResponse) Reset() {
+	*x = AssertionTableResponse{}
+	if protoimpl.UnsafeEnabled {
+		mi := &file_foyle_v1alpha1_eval_proto_msgTypes[6]
+		ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
+		ms.StoreMessageInfo(mi)
+	}
+}
+
+func (x *AssertionTableResponse) String() string {
+	return protoimpl.X.MessageStringOf(x)
+}
+
+func (*AssertionTableResponse) ProtoMessage() {}
+
+func (x *AssertionTableResponse) ProtoReflect() protoreflect.Message {
+	mi := &file_foyle_v1alpha1_eval_proto_msgTypes[6]
+	if protoimpl.UnsafeEnabled && x != nil {
+		ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
+		if ms.LoadMessageInfo() == nil {
+			ms.StoreMessageInfo(mi)
+		}
+		return ms
+	}
+	return mi.MessageOf(x)
+}
+
+// Deprecated: Use AssertionTableResponse.ProtoReflect.Descriptor instead.
+func (*AssertionTableResponse) Descriptor() ([]byte, []int) {
+	return file_foyle_v1alpha1_eval_proto_rawDescGZIP(), []int{6}
+}
+
+func (x *AssertionTableResponse) GetRows() []*AssertionRow {
+	if x != nil {
+		return x.Rows
+	}
+	return nil
+}
+
 var File_foyle_v1alpha1_eval_proto protoreflect.FileDescriptor
 
 var file_foyle_v1alpha1_eval_proto_rawDesc = []byte{
@@ -294,7 +599,7 @@ var file_foyle_v1alpha1_eval_proto_rawDesc = []byte{
 	0x6c, 0x70, 0x68, 0x61, 0x31, 0x2f, 0x74, 0x72, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x2e, 0x70, 0x72,
 	0x6f, 0x74, 0x6f, 0x1a, 0x1c, 0x67, 0x6f, 0x6f, 0x67, 0x6c, 0x65, 0x2f, 0x70, 0x72, 0x6f, 0x74,
 	0x6f, 0x62, 0x75, 0x66, 0x2f, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x2e, 0x70, 0x72, 0x6f, 0x74,
-	0x6f, 0x22, 0xd7, 0x02, 0x0a, 0x0a, 0x45, 0x76, 0x61, 0x6c, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74,
+	0x6f, 0x22, 0x83, 0x03, 0x0a, 0x0a, 0x45, 0x76, 0x61, 0x6c, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74,
 	0x12, 0x22, 0x0a, 0x07, 0x65, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28,
 	0x0b, 0x32, 0x08, 0x2e, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x52, 0x07, 0x65, 0x78, 0x61,
 	0x6d, 0x70, 0x6c, 0x65, 0x12, 0x21, 0x0a, 0x0c, 0x65, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x5f,
@@ -315,28 +620,65 @@ var file_foyle_v1alpha1_eval_proto_rawDesc = []byte{
 	0x54, 0x72, 0x61, 0x63, 0x65, 0x49, 0x64, 0x12, 0x32, 0x0a, 0x0f, 0x62, 0x65, 0x73, 0x74, 0x5f,
 	0x72, 0x61, 0x67, 0x5f, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x18, 0x09, 0x20, 0x01, 0x28, 0x0b,
 	0x32, 0x0a, 0x2e, 0x52, 0x41, 0x47, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x52, 0x0d, 0x62, 0x65,
-	0x73, 0x74, 0x52, 0x61, 0x67, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x22, 0x33, 0x0a, 0x15, 0x45,
-	0x76, 0x61, 0x6c, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x4c, 0x69, 0x73, 0x74, 0x52, 0x65, 0x71,
-	0x75, 0x65, 0x73, 0x74, 0x12, 0x1a, 0x0a, 0x08, 0x64, 0x61, 0x74, 0x61, 0x62, 0x61, 0x73, 0x65,
-	0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x08, 0x64, 0x61, 0x74, 0x61, 0x62, 0x61, 0x73, 0x65,
-	0x22, 0x3b, 0x0a, 0x16, 0x45, 0x76, 0x61, 0x6c, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x4c, 0x69,
-	0x73, 0x74, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x21, 0x0a, 0x05, 0x69, 0x74,
-	0x65, 0x6d, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x0b, 0x2e, 0x45, 0x76, 0x61, 0x6c,
-	0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x52, 0x05, 0x69, 0x74, 0x65, 0x6d, 0x73, 0x2a, 0x47, 0x0a,
-	0x10, 0x45, 0x76, 0x61, 0x6c, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x53, 0x74, 0x61, 0x74, 0x75,
-	0x73, 0x12, 0x1e, 0x0a, 0x1a, 0x55, 0x4e, 0x4b, 0x4e, 0x4f, 0x57, 0x4e, 0x5f, 0x45, 0x56, 0x41,
-	0x4c, 0x5f, 0x52, 0x45, 0x53, 0x55, 0x4c, 0x54, 0x5f, 0x53, 0x54, 0x41, 0x54, 0x55, 0x53, 0x10,
-	0x00, 0x12, 0x08, 0x0a, 0x04, 0x44, 0x4f, 0x4e, 0x45, 0x10, 0x01, 0x12, 0x09, 0x0a, 0x05, 0x45,
-	0x52, 0x52, 0x4f, 0x52, 0x10, 0x02, 0x32, 0x48, 0x0a, 0x0b, 0x45, 0x76, 0x61, 0x6c, 0x53, 0x65,
-	0x72, 0x76, 0x69, 0x63, 0x65, 0x12, 0x39, 0x0a, 0x04, 0x4c, 0x69, 0x73, 0x74, 0x12, 0x16, 0x2e,
+	0x73, 0x74, 0x52, 0x61, 0x67, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x12, 0x2a, 0x0a, 0x0a, 0x61,
+	0x73, 0x73, 0x65, 0x72, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x18, 0x0a, 0x20, 0x03, 0x28, 0x0b, 0x32,
+	0x0a, 0x2e, 0x41, 0x73, 0x73, 0x65, 0x72, 0x74, 0x69, 0x6f, 0x6e, 0x52, 0x0a, 0x61, 0x73, 0x73,
+	0x65, 0x72, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x22, 0x5e, 0x0a, 0x09, 0x41, 0x73, 0x73, 0x65, 0x72,
+	0x74, 0x69, 0x6f, 0x6e, 0x12, 0x12, 0x0a, 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x18, 0x01, 0x20, 0x01,
+	0x28, 0x09, 0x52, 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x12, 0x25, 0x0a, 0x06, 0x72, 0x65, 0x73, 0x75,
+	0x6c, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x0d, 0x2e, 0x41, 0x73, 0x73, 0x65, 0x72,
+	0x74, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x52, 0x06, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x12,
+	0x16, 0x0a, 0x06, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x18, 0x03, 0x20, 0x01, 0x28, 0x09, 0x52,
+	0x06, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x22, 0x33, 0x0a, 0x15, 0x45, 0x76, 0x61, 0x6c, 0x52,
+	0x65, 0x73, 0x75, 0x6c, 0x74, 0x4c, 0x69, 0x73, 0x74, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74,
+	0x12, 0x1a, 0x0a, 0x08, 0x64, 0x61, 0x74, 0x61, 0x62, 0x61, 0x73, 0x65, 0x18, 0x01, 0x20, 0x01,
+	0x28, 0x09, 0x52, 0x08, 0x64, 0x61, 0x74, 0x61, 0x62, 0x61, 0x73, 0x65, 0x22, 0x3b, 0x0a, 0x16,
 	0x45, 0x76, 0x61, 0x6c, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x4c, 0x69, 0x73, 0x74, 0x52, 0x65,
-	0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x17, 0x2e, 0x45, 0x76, 0x61, 0x6c, 0x52, 0x65, 0x73, 0x75,
-	0x6c, 0x74, 0x4c, 0x69, 0x73, 0x74, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00,
-	0x42, 0x3e, 0x42, 0x09, 0x45, 0x76, 0x61, 0x6c, 0x50, 0x72, 0x6f, 0x74, 0x6f, 0x50, 0x01, 0x5a,
-	0x2f, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x6a, 0x6c, 0x65, 0x77,
-	0x69, 0x2f, 0x66, 0x6f, 0x79, 0x6c, 0x65, 0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x73, 0x2f, 0x67,
-	0x6f, 0x2f, 0x66, 0x6f, 0x79, 0x6c, 0x65, 0x2f, 0x76, 0x31, 0x61, 0x6c, 0x70, 0x68, 0x61, 0x31,
-	0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33,
+	0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x21, 0x0a, 0x05, 0x69, 0x74, 0x65, 0x6d, 0x73, 0x18,
+	0x01, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x0b, 0x2e, 0x45, 0x76, 0x61, 0x6c, 0x52, 0x65, 0x73, 0x75,
+	0x6c, 0x74, 0x52, 0x05, 0x69, 0x74, 0x65, 0x6d, 0x73, 0x22, 0xb3, 0x01, 0x0a, 0x0c, 0x41, 0x73,
+	0x73, 0x65, 0x72, 0x74, 0x69, 0x6f, 0x6e, 0x52, 0x6f, 0x77, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64,
+	0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, 0x20, 0x0a, 0x0b, 0x65, 0x78,
+	0x61, 0x6d, 0x70, 0x6c, 0x65, 0x46, 0x69, 0x6c, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52,
+	0x0b, 0x65, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x46, 0x69, 0x6c, 0x65, 0x12, 0x15, 0x0a, 0x06,
+	0x64, 0x6f, 0x63, 0x5f, 0x6d, 0x64, 0x18, 0x03, 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x64, 0x6f,
+	0x63, 0x4d, 0x64, 0x12, 0x1b, 0x0a, 0x09, 0x61, 0x6e, 0x73, 0x77, 0x65, 0x72, 0x5f, 0x6d, 0x64,
+	0x18, 0x04, 0x20, 0x01, 0x28, 0x09, 0x52, 0x08, 0x61, 0x6e, 0x73, 0x77, 0x65, 0x72, 0x4d, 0x64,
+	0x12, 0x3d, 0x0a, 0x13, 0x63, 0x6f, 0x64, 0x65, 0x5f, 0x61, 0x66, 0x74, 0x65, 0x72, 0x5f, 0x6d,
+	0x61, 0x72, 0x6b, 0x64, 0x6f, 0x77, 0x6e, 0x18, 0x05, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x0d, 0x2e,
+	0x41, 0x73, 0x73, 0x65, 0x72, 0x74, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x52, 0x11, 0x63, 0x6f,
+	0x64, 0x65, 0x41, 0x66, 0x74, 0x65, 0x72, 0x4d, 0x61, 0x72, 0x6b, 0x64, 0x6f, 0x77, 0x6e, 0x22,
+	0x33, 0x0a, 0x15, 0x41, 0x73, 0x73, 0x65, 0x72, 0x74, 0x69, 0x6f, 0x6e, 0x54, 0x61, 0x62, 0x6c,
+	0x65, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x1a, 0x0a, 0x08, 0x64, 0x61, 0x74, 0x61,
+	0x62, 0x61, 0x73, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x08, 0x64, 0x61, 0x74, 0x61,
+	0x62, 0x61, 0x73, 0x65, 0x22, 0x3b, 0x0a, 0x16, 0x41, 0x73, 0x73, 0x65, 0x72, 0x74, 0x69, 0x6f,
+	0x6e, 0x54, 0x61, 0x62, 0x6c, 0x65, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x21,
+	0x0a, 0x04, 0x72, 0x6f, 0x77, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x0d, 0x2e, 0x41,
+	0x73, 0x73, 0x65, 0x72, 0x74, 0x69, 0x6f, 0x6e, 0x52, 0x6f, 0x77, 0x52, 0x04, 0x72, 0x6f, 0x77,
+	0x73, 0x2a, 0x47, 0x0a, 0x10, 0x45, 0x76, 0x61, 0x6c, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x53,
+	0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x1e, 0x0a, 0x1a, 0x55, 0x4e, 0x4b, 0x4e, 0x4f, 0x57, 0x4e,
+	0x5f, 0x45, 0x56, 0x41, 0x4c, 0x5f, 0x52, 0x45, 0x53, 0x55, 0x4c, 0x54, 0x5f, 0x53, 0x54, 0x41,
+	0x54, 0x55, 0x53, 0x10, 0x00, 0x12, 0x08, 0x0a, 0x04, 0x44, 0x4f, 0x4e, 0x45, 0x10, 0x01, 0x12,
+	0x09, 0x0a, 0x05, 0x45, 0x52, 0x52, 0x4f, 0x52, 0x10, 0x02, 0x2a, 0x4d, 0x0a, 0x0c, 0x41, 0x73,
+	0x73, 0x65, 0x72, 0x74, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x12, 0x18, 0x0a, 0x14, 0x55, 0x4e,
+	0x4b, 0x4e, 0x4f, 0x57, 0x4e, 0x5f, 0x41, 0x73, 0x73, 0x65, 0x72, 0x74, 0x52, 0x65, 0x73, 0x75,
+	0x6c, 0x74, 0x10, 0x00, 0x12, 0x0a, 0x0a, 0x06, 0x50, 0x41, 0x53, 0x53, 0x45, 0x44, 0x10, 0x01,
+	0x12, 0x0a, 0x0a, 0x06, 0x46, 0x41, 0x49, 0x4c, 0x45, 0x44, 0x10, 0x02, 0x12, 0x0b, 0x0a, 0x07,
+	0x53, 0x4b, 0x49, 0x50, 0x50, 0x45, 0x44, 0x10, 0x03, 0x32, 0x8d, 0x01, 0x0a, 0x0b, 0x45, 0x76,
+	0x61, 0x6c, 0x53, 0x65, 0x72, 0x76, 0x69, 0x63, 0x65, 0x12, 0x39, 0x0a, 0x04, 0x4c, 0x69, 0x73,
+	0x74, 0x12, 0x16, 0x2e, 0x45, 0x76, 0x61, 0x6c, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x4c, 0x69,
+	0x73, 0x74, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x17, 0x2e, 0x45, 0x76, 0x61, 0x6c,
+	0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x4c, 0x69, 0x73, 0x74, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e,
+	0x73, 0x65, 0x22, 0x00, 0x12, 0x43, 0x0a, 0x0e, 0x41, 0x73, 0x73, 0x65, 0x72, 0x74, 0x69, 0x6f,
+	0x6e, 0x54, 0x61, 0x62, 0x6c, 0x65, 0x12, 0x16, 0x2e, 0x41, 0x73, 0x73, 0x65, 0x72, 0x74, 0x69,
+	0x6f, 0x6e, 0x54, 0x61, 0x62, 0x6c, 0x65, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x17,
+	0x2e, 0x41, 0x73, 0x73, 0x65, 0x72, 0x74, 0x69, 0x6f, 0x6e, 0x54, 0x61, 0x62, 0x6c, 0x65, 0x52,
+	0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00, 0x42, 0x3e, 0x42, 0x09, 0x45, 0x76, 0x61,
+	0x6c, 0x50, 0x72, 0x6f, 0x74, 0x6f, 0x50, 0x01, 0x5a, 0x2f, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62,
+	0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x6a, 0x6c, 0x65, 0x77, 0x69, 0x2f, 0x66, 0x6f, 0x79, 0x6c, 0x65,
+	0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x73, 0x2f, 0x67, 0x6f, 0x2f, 0x66, 0x6f, 0x79, 0x6c, 0x65,
+	0x2f, 0x76, 0x31, 0x61, 0x6c, 0x70, 0x68, 0x61, 0x31, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f,
+	0x33,
 }
 
 var (
@@ -351,30 +693,41 @@ func file_foyle_v1alpha1_eval_proto_rawDescGZIP() []byte {
 	return file_foyle_v1alpha1_eval_proto_rawDescData
 }
 
-var file_foyle_v1alpha1_eval_proto_enumTypes = make([]protoimpl.EnumInfo, 1)
-var file_foyle_v1alpha1_eval_proto_msgTypes = make([]protoimpl.MessageInfo, 3)
+var file_foyle_v1alpha1_eval_proto_enumTypes = make([]protoimpl.EnumInfo, 2)
+var file_foyle_v1alpha1_eval_proto_msgTypes = make([]protoimpl.MessageInfo, 7)
 var file_foyle_v1alpha1_eval_proto_goTypes = []interface{}{
 	(EvalResultStatus)(0),          // 0: EvalResultStatus
-	(*EvalResult)(nil),             // 1: EvalResult
-	(*EvalResultListRequest)(nil),  // 2: EvalResultListRequest
-	(*EvalResultListResponse)(nil), // 3: EvalResultListResponse
-	(*Example)(nil),                // 4: Example
-	(*Block)(nil),                  // 5: Block
-	(*RAGResult)(nil),              // 6: RAGResult
+	(AssertResult)(0),              // 1: AssertResult
+	(*EvalResult)(nil),             // 2: EvalResult
+	(*Assertion)(nil),              // 3: Assertion
+	(*EvalResultListRequest)(nil),  // 4: EvalResultListRequest
+	(*EvalResultListResponse)(nil), // 5: EvalResultListResponse
+	(*AssertionRow)(nil),           // 6: AssertionRow
+	(*AssertionTableRequest)(nil),  // 7: AssertionTableRequest
+	(*AssertionTableResponse)(nil), // 8: AssertionTableResponse
+	(*Example)(nil),                // 9: Example
+	(*Block)(nil),                  // 10: Block
+	(*RAGResult)(nil),              // 11: RAGResult
 }
 var file_foyle_v1alpha1_eval_proto_depIdxs = []int32{
-	4, // 0: EvalResult.example:type_name -> Example
-	5, // 1: EvalResult.actual:type_name -> Block
-	0, // 2: EvalResult.status:type_name -> EvalResultStatus
-	6, // 3: EvalResult.best_rag_result:type_name -> RAGResult
-	1, // 4: EvalResultListResponse.items:type_name -> EvalResult
-	2, // 5: EvalService.List:input_type -> EvalResultListRequest
-	3, // 6: EvalService.List:output_type -> EvalResultListResponse
-	6, // [6:7] is the sub-list for method output_type
-	5, // [5:6] is the sub-list for method input_type
-	5, // [5:5] is the sub-list for extension type_name
-	5, // [5:5] is the sub-list for extension extendee
-	0, // [0:5] is the sub-list for field type_name
+	9,  // 0: EvalResult.example:type_name -> Example
+	10, // 1: EvalResult.actual:type_name -> Block
+	0,  // 2: EvalResult.status:type_name -> EvalResultStatus
+	11, // 3: EvalResult.best_rag_result:type_name -> RAGResult
+	3,  // 4: EvalResult.assertions:type_name -> Assertion
+	1,  // 5: Assertion.result:type_name -> AssertResult
+	2,  // 6: EvalResultListResponse.items:type_name -> EvalResult
+	1,  // 7: AssertionRow.code_after_markdown:type_name -> AssertResult
+	6,  // 8: AssertionTableResponse.rows:type_name -> AssertionRow
+	4,  // 9: EvalService.List:input_type -> EvalResultListRequest
+	7,  // 10: EvalService.AssertionTable:input_type -> AssertionTableRequest
+	5,  // 11: EvalService.List:output_type -> EvalResultListResponse
+	8,  // 12: EvalService.AssertionTable:output_type -> AssertionTableResponse
+	11, // [11:13] is the sub-list for method output_type
+	9,  // [9:11] is the sub-list for method input_type
+	9,  // [9:9] is the sub-list for extension type_name
+	9,  // [9:9] is the sub-list for extension extendee
+	0,  // [0:9] is the sub-list for field type_name
 }
 
 func init() { file_foyle_v1alpha1_eval_proto_init() }
@@ -398,7 +751,7 @@ func file_foyle_v1alpha1_eval_proto_init() {
 			}
 		}
 		file_foyle_v1alpha1_eval_proto_msgTypes[1].Exporter = func(v interface{}, i int) interface{} {
-			switch v := v.(*EvalResultListRequest); i {
+			switch v := v.(*Assertion); i {
 			case 0:
 				return &v.state
 			case 1:
@@ -410,6 +763,18 @@ func file_foyle_v1alpha1_eval_proto_init() {
 			}
 		}
 		file_foyle_v1alpha1_eval_proto_msgTypes[2].Exporter = func(v interface{}, i int) interface{} {
+			switch v := v.(*EvalResultListRequest); i {
+			case 0:
+				return &v.state
+			case 1:
+				return &v.sizeCache
+			case 2:
+				return &v.unknownFields
+			default:
+				return nil
+			}
+		}
+		file_foyle_v1alpha1_eval_proto_msgTypes[3].Exporter = func(v interface{}, i int) interface{} {
 			switch v := v.(*EvalResultListResponse); i {
 			case 0:
 				return &v.state
@@ -421,14 +786,50 @@ func file_foyle_v1alpha1_eval_proto_init() {
 				return nil
 			}
 		}
+		file_foyle_v1alpha1_eval_proto_msgTypes[4].Exporter = func(v interface{}, i int) interface{} {
+			switch v := v.(*AssertionRow); i {
+			case 0:
+				return &v.state
+			case 1:
+				return &v.sizeCache
+			case 2:
+				return &v.unknownFields
+			default:
+				return nil
+			}
+		}
+		file_foyle_v1alpha1_eval_proto_msgTypes[5].Exporter = func(v interface{}, i int) interface{} {
+			switch v := v.(*AssertionTableRequest); i {
+			case 0:
+				return &v.state
+			case 1:
+				return &v.sizeCache
+			case 2:
+				return &v.unknownFields
+			default:
+				return nil
+			}
+		}
+		file_foyle_v1alpha1_eval_proto_msgTypes[6].Exporter = func(v interface{}, i int) interface{} {
+			switch v := v.(*AssertionTableResponse); i {
+			case 0:
+				return &v.state
+			case 1:
+				return &v.sizeCache
+			case 2:
+				return &v.unknownFields
+			default:
+				return nil
+			}
+		}
 	}
 	type x struct{}
 	out := protoimpl.TypeBuilder{
 		File: protoimpl.DescBuilder{
 			GoPackagePath: reflect.TypeOf(x{}).PkgPath(),
 			RawDescriptor: file_foyle_v1alpha1_eval_proto_rawDesc,
-			NumEnums:      1,
-			NumMessages:   3,
+			NumEnums:      2,
+			NumMessages:   7,
 			NumExtensions: 0,
 			NumServices:   1,
 		},
diff --git a/protos/go/foyle/v1alpha1/eval.zap.go b/protos/go/foyle/v1alpha1/eval.zap.go
index b005cc9f..2cc8840a 100644
--- a/protos/go/foyle/v1alpha1/eval.zap.go
+++ b/protos/go/foyle/v1alpha1/eval.zap.go
@@ -72,6 +72,40 @@ func (m *EvalResult) MarshalLogObject(enc go_uber_org_zap_zapcore.ObjectEncoder)
 		}
 	}
 
+	keyName = "assertions" // field assertions = 10
+	enc.AddArray(keyName, go_uber_org_zap_zapcore.ArrayMarshalerFunc(func(aenc go_uber_org_zap_zapcore.ArrayEncoder) error {
+		for _, rv := range m.Assertions {
+			_ = rv
+			if rv != nil {
+				var vv interface{} = rv
+				if marshaler, ok := vv.(go_uber_org_zap_zapcore.ObjectMarshaler); ok {
+					aenc.AppendObject(marshaler)
+				}
+			}
+		}
+		return nil
+	}))
+
+	return nil
+}
+
+func (m *Assertion) MarshalLogObject(enc go_uber_org_zap_zapcore.ObjectEncoder) error {
+	var keyName string
+	_ = keyName
+
+	if m == nil {
+		return nil
+	}
+
+	keyName = "name" // field name = 1
+	enc.AddString(keyName, m.Name)
+
+	keyName = "result" // field result = 2
+	enc.AddString(keyName, m.Result.String())
+
+	keyName = "detail" // field detail = 3
+	enc.AddString(keyName, m.Detail)
+
 	return nil
 }
 
@@ -113,3 +147,68 @@ func (m *EvalResultListResponse) MarshalLogObject(enc go_uber_org_zap_zapcore.Ob
 
 	return nil
 }
+
+func (m *AssertionRow) MarshalLogObject(enc go_uber_org_zap_zapcore.ObjectEncoder) error {
+	var keyName string
+	_ = keyName
+
+	if m == nil {
+		return nil
+	}
+
+	keyName = "id" // field id = 1
+	enc.AddString(keyName, m.Id)
+
+	keyName = "exampleFile" // field exampleFile = 2
+	enc.AddString(keyName, m.ExampleFile)
+
+	keyName = "doc_md" // field doc_md = 3
+	enc.AddString(keyName, m.DocMd)
+
+	keyName = "answer_md" // field answer_md = 4
+	enc.AddString(keyName, m.AnswerMd)
+
+	keyName = "code_after_markdown" // field code_after_markdown = 5
+	enc.AddString(keyName, m.CodeAfterMarkdown.String())
+
+	return nil
+}
+
+func (m *AssertionTableRequest) MarshalLogObject(enc go_uber_org_zap_zapcore.ObjectEncoder) error {
+	var keyName string
+	_ = keyName
+
+	if m == nil {
+		return nil
+	}
+
+	keyName = "database" // field database = 1
+	enc.AddString(keyName, m.Database)
+
+	return nil
+}
+
+func (m *AssertionTableResponse) MarshalLogObject(enc go_uber_org_zap_zapcore.ObjectEncoder) error {
+	var keyName string
+	_ = keyName
+
+	if m == nil {
+		return nil
+	}
+
+	keyName = "rows" // field rows = 1
+	enc.AddArray(keyName, go_uber_org_zap_zapcore.ArrayMarshalerFunc(func(aenc go_uber_org_zap_zapcore.ArrayEncoder) error {
+		for _, rv := range m.Rows {
+			_ = rv
+			if rv != nil {
+				var vv interface{} = rv
+				if marshaler, ok := vv.(go_uber_org_zap_zapcore.ObjectMarshaler); ok {
+					aenc.AppendObject(marshaler)
+				}
+			}
+		}
+		return nil
+	}))
+
+	return nil
+}
diff --git a/protos/go/foyle/v1alpha1/eval_grpc.pb.go b/protos/go/foyle/v1alpha1/eval_grpc.pb.go
index 13687545..f8ea47c6 100644
--- a/protos/go/foyle/v1alpha1/eval_grpc.pb.go
+++ b/protos/go/foyle/v1alpha1/eval_grpc.pb.go
@@ -23,6 +23,7 @@ const _ = grpc.SupportPackageIsVersion7
 // For semantics around ctx use and closing/ending streaming RPCs, please refer to https://pkg.go.dev/google.golang.org/grpc/?tab=doc#ClientConn.NewStream.
 type EvalServiceClient interface {
 	List(ctx context.Context, in *EvalResultListRequest, opts ...grpc.CallOption) (*EvalResultListResponse, error)
+	AssertionTable(ctx context.Context, in *AssertionTableRequest, opts ...grpc.CallOption) (*AssertionTableResponse, error)
 }
 
 type evalServiceClient struct {
@@ -42,11 +43,21 @@ func (c *evalServiceClient) List(ctx context.Context, in *EvalResultListRequest,
 	return out, nil
 }
 
+func (c *evalServiceClient) AssertionTable(ctx context.Context, in *AssertionTableRequest, opts ...grpc.CallOption) (*AssertionTableResponse, error) {
+	out := new(AssertionTableResponse)
+	err := c.cc.Invoke(ctx, "/EvalService/AssertionTable", in, out, opts...)
+	if err != nil {
+		return nil, err
+	}
+	return out, nil
+}
+
 // EvalServiceServer is the server API for EvalService service.
 // All implementations must embed UnimplementedEvalServiceServer
 // for forward compatibility
 type EvalServiceServer interface {
 	List(context.Context, *EvalResultListRequest) (*EvalResultListResponse, error)
+	AssertionTable(context.Context, *AssertionTableRequest) (*AssertionTableResponse, error)
 	mustEmbedUnimplementedEvalServiceServer()
 }
 
@@ -57,6 +68,9 @@ type UnimplementedEvalServiceServer struct {
 func (UnimplementedEvalServiceServer) List(context.Context, *EvalResultListRequest) (*EvalResultListResponse, error) {
 	return nil, status.Errorf(codes.Unimplemented, "method List not implemented")
 }
+func (UnimplementedEvalServiceServer) AssertionTable(context.Context, *AssertionTableRequest) (*AssertionTableResponse, error) {
+	return nil, status.Errorf(codes.Unimplemented, "method AssertionTable not implemented")
+}
 func (UnimplementedEvalServiceServer) mustEmbedUnimplementedEvalServiceServer() {}
 
 // UnsafeEvalServiceServer may be embedded to opt out of forward compatibility for this service.
@@ -88,6 +102,24 @@ func _EvalService_List_Handler(srv interface{}, ctx context.Context, dec func(in
 	return interceptor(ctx, in, info, handler)
 }
 
+func _EvalService_AssertionTable_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
+	in := new(AssertionTableRequest)
+	if err := dec(in); err != nil {
+		return nil, err
+	}
+	if interceptor == nil {
+		return srv.(EvalServiceServer).AssertionTable(ctx, in)
+	}
+	info := &grpc.UnaryServerInfo{
+		Server:     srv,
+		FullMethod: "/EvalService/AssertionTable",
+	}
+	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
+		return srv.(EvalServiceServer).AssertionTable(ctx, req.(*AssertionTableRequest))
+	}
+	return interceptor(ctx, in, info, handler)
+}
+
 // EvalService_ServiceDesc is the grpc.ServiceDesc for EvalService service.
 // It's only intended for direct use with grpc.RegisterService,
 // and not to be introspected or modified (even as a copy)
@@ -99,6 +131,10 @@ var EvalService_ServiceDesc = grpc.ServiceDesc{
 			MethodName: "List",
 			Handler:    _EvalService_List_Handler,
 		},
+		{
+			MethodName: "AssertionTable",
+			Handler:    _EvalService_AssertionTable_Handler,
+		},
 	},
 	Streams:  []grpc.StreamDesc{},
 	Metadata: "foyle/v1alpha1/eval.proto",
diff --git a/protos/go/foyle/v1alpha1/trainer.pb.go b/protos/go/foyle/v1alpha1/trainer.pb.go
index 9fb1daa8..b7153bb2 100644
--- a/protos/go/foyle/v1alpha1/trainer.pb.go
+++ b/protos/go/foyle/v1alpha1/trainer.pb.go
@@ -22,6 +22,7 @@ const (
 )
 
 // Example represents an example to be used in few shot learning
+// It is also used to represent examples during evaluation.
 type Example struct {
 	state         protoimpl.MessageState
 	sizeCache     protoimpl.SizeCache
diff --git a/protos/go/foyle/v1alpha1/v1alpha1connect/eval.connect.go b/protos/go/foyle/v1alpha1/v1alpha1connect/eval.connect.go
index 5cedccc5..579ec044 100644
--- a/protos/go/foyle/v1alpha1/v1alpha1connect/eval.connect.go
+++ b/protos/go/foyle/v1alpha1/v1alpha1connect/eval.connect.go
@@ -35,17 +35,22 @@ const (
 const (
 	// EvalServiceListProcedure is the fully-qualified name of the EvalService's List RPC.
 	EvalServiceListProcedure = "/EvalService/List"
+	// EvalServiceAssertionTableProcedure is the fully-qualified name of the EvalService's
+	// AssertionTable RPC.
+	EvalServiceAssertionTableProcedure = "/EvalService/AssertionTable"
 )
 
 // These variables are the protoreflect.Descriptor objects for the RPCs defined in this package.
 var (
-	evalServiceServiceDescriptor    = v1alpha1.File_foyle_v1alpha1_eval_proto.Services().ByName("EvalService")
-	evalServiceListMethodDescriptor = evalServiceServiceDescriptor.Methods().ByName("List")
+	evalServiceServiceDescriptor              = v1alpha1.File_foyle_v1alpha1_eval_proto.Services().ByName("EvalService")
+	evalServiceListMethodDescriptor           = evalServiceServiceDescriptor.Methods().ByName("List")
+	evalServiceAssertionTableMethodDescriptor = evalServiceServiceDescriptor.Methods().ByName("AssertionTable")
 )
 
 // EvalServiceClient is a client for the EvalService service.
 type EvalServiceClient interface {
 	List(context.Context, *connect.Request[v1alpha1.EvalResultListRequest]) (*connect.Response[v1alpha1.EvalResultListResponse], error)
+	AssertionTable(context.Context, *connect.Request[v1alpha1.AssertionTableRequest]) (*connect.Response[v1alpha1.AssertionTableResponse], error)
 }
 
 // NewEvalServiceClient constructs a client for the EvalService service. By default, it uses the
@@ -64,12 +69,19 @@ func NewEvalServiceClient(httpClient connect.HTTPClient, baseURL string, opts ..
 			connect.WithSchema(evalServiceListMethodDescriptor),
 			connect.WithClientOptions(opts...),
 		),
+		assertionTable: connect.NewClient[v1alpha1.AssertionTableRequest, v1alpha1.AssertionTableResponse](
+			httpClient,
+			baseURL+EvalServiceAssertionTableProcedure,
+			connect.WithSchema(evalServiceAssertionTableMethodDescriptor),
+			connect.WithClientOptions(opts...),
+		),
 	}
 }
 
 // evalServiceClient implements EvalServiceClient.
 type evalServiceClient struct {
-	list *connect.Client[v1alpha1.EvalResultListRequest, v1alpha1.EvalResultListResponse]
+	list           *connect.Client[v1alpha1.EvalResultListRequest, v1alpha1.EvalResultListResponse]
+	assertionTable *connect.Client[v1alpha1.AssertionTableRequest, v1alpha1.AssertionTableResponse]
 }
 
 // List calls EvalService.List.
@@ -77,9 +89,15 @@ func (c *evalServiceClient) List(ctx context.Context, req *connect.Request[v1alp
 	return c.list.CallUnary(ctx, req)
 }
 
+// AssertionTable calls EvalService.AssertionTable.
+func (c *evalServiceClient) AssertionTable(ctx context.Context, req *connect.Request[v1alpha1.AssertionTableRequest]) (*connect.Response[v1alpha1.AssertionTableResponse], error) {
+	return c.assertionTable.CallUnary(ctx, req)
+}
+
 // EvalServiceHandler is an implementation of the EvalService service.
 type EvalServiceHandler interface {
 	List(context.Context, *connect.Request[v1alpha1.EvalResultListRequest]) (*connect.Response[v1alpha1.EvalResultListResponse], error)
+	AssertionTable(context.Context, *connect.Request[v1alpha1.AssertionTableRequest]) (*connect.Response[v1alpha1.AssertionTableResponse], error)
 }
 
 // NewEvalServiceHandler builds an HTTP handler from the service implementation. It returns the path
@@ -94,10 +112,18 @@ func NewEvalServiceHandler(svc EvalServiceHandler, opts ...connect.HandlerOption
 		connect.WithSchema(evalServiceListMethodDescriptor),
 		connect.WithHandlerOptions(opts...),
 	)
+	evalServiceAssertionTableHandler := connect.NewUnaryHandler(
+		EvalServiceAssertionTableProcedure,
+		svc.AssertionTable,
+		connect.WithSchema(evalServiceAssertionTableMethodDescriptor),
+		connect.WithHandlerOptions(opts...),
+	)
 	return "/EvalService/", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 		switch r.URL.Path {
 		case EvalServiceListProcedure:
 			evalServiceListHandler.ServeHTTP(w, r)
+		case EvalServiceAssertionTableProcedure:
+			evalServiceAssertionTableHandler.ServeHTTP(w, r)
 		default:
 			http.NotFound(w, r)
 		}
@@ -110,3 +136,7 @@ type UnimplementedEvalServiceHandler struct{}
 func (UnimplementedEvalServiceHandler) List(context.Context, *connect.Request[v1alpha1.EvalResultListRequest]) (*connect.Response[v1alpha1.EvalResultListResponse], error) {
 	return nil, connect.NewError(connect.CodeUnimplemented, errors.New("EvalService.List is not implemented"))
 }
+
+func (UnimplementedEvalServiceHandler) AssertionTable(context.Context, *connect.Request[v1alpha1.AssertionTableRequest]) (*connect.Response[v1alpha1.AssertionTableResponse], error) {
+	return nil, connect.NewError(connect.CodeUnimplemented, errors.New("EvalService.AssertionTable is not implemented"))
+}