-
Notifications
You must be signed in to change notification settings - Fork 10
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
See tn003_learning_eval.md for a fuller description of how we are doing evaluation. This is a first pass at evaluation. * Implement a distance metric based on edit distance. * Implement the infrastructure to compute it. Add an apply command and use it to run different experiments. * This requires starting to move some of the Agent config into the API package because we want to reuse it in the Experiment type. Start an initial evaluation dataset. ### API Updates - Added `EvalResult` structure to represent the evaluation outcome, including the actual commands generated, the expected commands, and the evaluation distance. - Introduced `EvalResultStatus` to indicate the status of an evaluation, such as `DONE` or `ERROR`. ### Agent Updates - Updated the `Agent` service to support evaluation mode, allowing it to operate without impacting the learning process. ### Executor Updates - Enhanced the `Executor` service to handle execution in evaluation mode, ensuring that execution traces are marked accordingly. ### Evaluator Implementation - Implemented the `Evaluator` component responsible for orchestrating the evaluation process: loading evaluation examples, generating predictions with the Agent, calculating distances, and updating results. ### Google Sheets Integration - Added functionality to export evaluation results to Google Sheets, enabling easy review and analysis of Foyle's performance. ### CLI Tool Enhancements - Extended the CLI tool with commands for running evaluations. ### Miscellaneous - Added necessary protobuf definitions for new data structures related to evaluations. - Updated server setup to handle evaluation logic and integrate with the learning mechanism. - Provided sample evaluation datasets for initial testing and validation of the evaluation process.
- Loading branch information
Showing
32 changed files
with
2,555 additions
and
32 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
package api | ||
|
||
// AgentConfig holds the agent settings: the model used for completions, the | ||
// optional RAG configuration, and whether the agent runs in evaluation mode. | ||
type AgentConfig struct { | ||
// Model is the name of the model to use to generate completions | ||
Model string `json:"model" yaml:"model"` | ||
|
||
// RAG is the configuration for the RAG model. | ||
// It is a pointer tagged omitempty, so it is left out of the JSON/YAML when nil. | ||
RAG *RAGConfig `json:"rag,omitempty" yaml:"rag,omitempty"` | ||
|
||
// EvalMode is whether to run in evaluation mode or not. | ||
// In EvalMode logs are specially marked so requests won't be used for training. | ||
EvalMode bool `json:"evalMode" yaml:"evalMode"` | ||
} | ||
|
||
// RAGConfig configures the RAG model. | ||
// NOTE(review): RAG presumably stands for retrieval-augmented generation; confirm. | ||
type RAGConfig struct { | ||
// Enabled is whether to enable the RAG model or not | ||
Enabled bool `json:"enabled" yaml:"enabled"` | ||
// MaxResults is the maximum number of results to return. | ||
// NOTE(review): how a zero value is treated is not defined in this file; confirm with the consumer. | ||
MaxResults int `json:"maxResults" yaml:"maxResults"` | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
package api | ||
|
||
import "k8s.io/apimachinery/pkg/runtime/schema" | ||
|
||
var ( | ||
// ExperimentGVK is the group/version/kind of the Experiment resource. | ||
// The API version is Group + "/" + Version and the kind is "Experiment". | ||
ExperimentGVK = schema.FromAPIVersionAndKind(Group+"/"+Version, "Experiment") | ||
) | ||
|
||
// Experiment is a resource made up of Metadata and an ExperimentSpec. | ||
// Both fields are always serialized (neither tag uses omitempty). | ||
type Experiment struct { | ||
Metadata Metadata `json:"metadata" yaml:"metadata"` | ||
Spec ExperimentSpec `json:"spec" yaml:"spec"` | ||
} | ||
|
||
// ExperimentSpec describes an experiment: where the evaluation input lives, | ||
// where results are stored, which Google Sheet to update, and the agent configuration. | ||
type ExperimentSpec struct { | ||
// EvalDir is the directory containing the evaluation input | ||
EvalDir string `json:"evalDir" yaml:"evalDir"` | ||
|
||
// DBDir is the directory for the pebble database that will store the results | ||
DBDir string `json:"dbDir" yaml:"dbDir"` | ||
|
||
// SheetID is the ID of the Google Sheet to update with the results. | ||
SheetID string `json:"sheetID" yaml:"sheetID"` | ||
|
||
// SheetName is the name of the sheet to update. | ||
SheetName string `json:"sheetName" yaml:"sheetName"` | ||
|
||
// Agent is the configuration for the agent. | ||
// It is a pointer tagged omitempty, so it is left out of the JSON/YAML when nil. | ||
Agent *AgentConfig `json:"agent,omitempty" yaml:"agent,omitempty"` | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
package api | ||
|
||
const ( | ||
// Group is the API group name used for resources in this package. | ||
Group = "foyle.io" | ||
// Version is the API version used for resources in this package. | ||
Version = "v1alpha1" | ||
) | ||
|
||
// N.B. We need to redefine Metadata and not reuse the version in the K8s libraries | ||
// because we want it to have yaml tags so we can serialize with the YAML library. | ||
|
||
// Metadata holds the identifying information of a resource: its name, namespace, | ||
// labels, annotations, and resource version. | ||
// NOTE(review): the fields carry only yaml tags (no json tags), and Labels is the | ||
// only field without omitempty; confirm both are intentional. | ||
type Metadata struct { | ||
Name string `yaml:"name,omitempty"` | ||
Namespace string `yaml:"namespace,omitempty"` | ||
Labels map[string]string `yaml:"labels"` | ||
Annotations map[string]string `yaml:"annotations,omitempty"` | ||
// ResourceVersion is used for optimistic concurrency. | ||
// Ref: https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/api-conventions.md#metadata | ||
// This should be treated as an opaque value by clients. | ||
ResourceVersion string `yaml:"resourceVersion,omitempty"` | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
package cmd | ||
|
||
import ( | ||
"context" | ||
"fmt" | ||
"os" | ||
|
||
"github.com/go-logr/zapr" | ||
"github.com/jlewi/foyle/app/pkg/application" | ||
"github.com/pkg/errors" | ||
"github.com/spf13/cobra" | ||
"go.uber.org/zap" | ||
) | ||
|
||
// NewApplyCmd creates the apply command. | ||
// | ||
// The command takes one or more arguments, each a YAML file or a directory. | ||
// It creates a new application.App, loads its config, sets up logging and the | ||
// registry, and then passes the arguments to ApplyPaths. If any step fails the | ||
// error is printed and the process exits with status 1. | ||
func NewApplyCmd() *cobra.Command { | ||
// TODO(jeremy): We should update apply to support the image resource. | ||
applyCmd := &cobra.Command{ | ||
Use: "apply <resource.yaml> <resourceDir> <resource.yaml> ...", | ||
Short: "Apply the specified resource.", | ||
Run: func(cmd *cobra.Command, args []string) { | ||
// Run the work in a closure so every failure path returns an error | ||
// that is reported in one place below. | ||
err := func() error { | ||
log := zapr.NewLogger(zap.L()) | ||
if len(args) == 0 { | ||
// NOTE(review): the same message is logged here and returned, so it is | ||
// reported twice (once by the logger, once by the Printf below). | ||
log.Info("apply takes at least one argument which should be the file or directory YAML to apply.") | ||
return errors.New("apply takes at least one argument which should be the file or directory YAML to apply.") | ||
} | ||
logVersion() | ||
|
||
app := application.NewApp() | ||
if err := app.LoadConfig(cmd); err != nil { | ||
return err | ||
} | ||
// NOTE(review): the meaning of the false argument is defined by SetupLogging, | ||
// which is not visible here; confirm. | ||
if err := app.SetupLogging(false); err != nil { | ||
return err | ||
} | ||
|
||
if err := app.SetupRegistry(); err != nil { | ||
return err | ||
} | ||
|
||
return app.ApplyPaths(context.Background(), args) | ||
}() | ||
if err != nil { | ||
// NOTE(review): the error is written to stdout via fmt.Printf, not to stderr. | ||
// %+v is presumably used so errors from github.com/pkg/errors include their stack trace; confirm. | ||
fmt.Printf("Error running apply;\n %+v\n", err) | ||
os.Exit(1) | ||
} | ||
}, | ||
} | ||
|
||
return applyCmd | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.