Skip to content

Commit

Permalink
Merge pull request #524 from njhale/fix/smoke-flakes
Browse files: browse the repository at this point in the history
fix: smoke test flakes
  • Loading branch information
njhale authored Jun 21, 2024
2 parents c25c7be + 1500872 commit 8a283fd
Show file tree
Hide file tree
Showing 12 changed files with 2,021 additions and 3,264 deletions.
33 changes: 17 additions & 16 deletions pkg/tests/judge/judge.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,15 @@ import (
openai "github.com/gptscript-ai/chat-completion-client"
)

const instructions = `When given JSON objects that conform to the following JSONSchema:
const instructions = `"actual" is considered equivalent to "expected" if and only if the following rules are satisfied:
%s
Determine if "actual" is equal to "expected" based on the comparison constraints described by "criteria".
"actual" is considered equal to "expected" if and only if the all of the constraints described by "criteria" are satisfied.
When given JSON objects that conform to the following JSONSchema:
%s
Determine if "actual" is considered equivalent to "expected".
After making a determination, respond with a JSON object that conforms to the following JSONSchema:
Expand All @@ -28,7 +31,7 @@ After making a determination, respond with a JSON object that conforms to the fo
},
"reasoning": {
"type": "string",
"description": "The reasoning used to come to the determination, that points out all instances where the given criteria was violated"
"description": "The reasoning used to come to the determination"
}
},
"required": [
Expand All @@ -41,14 +44,13 @@ Your responses are concise and include only the json object described above.
`

type Judge[T any] struct {
client *openai.Client
instructions string
client *openai.Client
comparisonSchema string
}

type comparison[T any] struct {
Expected T `json:"expected"`
Actual T `json:"actual"`
Criteria string `json:"criteria"`
Expected T `json:"expected"`
Actual T `json:"actual"`
}

type ruling struct {
Expand All @@ -70,22 +72,21 @@ func New[T any](client *openai.Client) (*Judge[T], error) {
return nil, fmt.Errorf("failed to generate JSONSchema for %T: %w", new(T), err)
}

schemaJSON, err := json.MarshalIndent(schema, "", " ")
marshaled, err := json.MarshalIndent(schema, "", " ")
if err != nil {
return nil, fmt.Errorf("failed to marshal JSONSchema for %T: %w", new(T), err)
}

return &Judge[T]{
client: client,
instructions: fmt.Sprintf(instructions, schemaJSON),
client: client,
comparisonSchema: string(marshaled),
}, nil
}

func (j *Judge[T]) Equal(ctx context.Context, expected, actual T, criteria string) (equal bool, reasoning string, err error) {
comparisonJSON, err := json.MarshalIndent(&comparison[T]{
Expected: expected,
Actual: actual,
Criteria: criteria,
}, "", " ")
if err != nil {
return false, "", fmt.Errorf("failed to marshal judge testcase JSON: %w", err)
Expand All @@ -101,7 +102,7 @@ func (j *Judge[T]) Equal(ctx context.Context, expected, actual T, criteria strin
Messages: []openai.ChatCompletionMessage{
{
Role: openai.ChatMessageRoleSystem,
Content: j.instructions,
Content: fmt.Sprintf(instructions, criteria, j.comparisonSchema),
},
{
Role: openai.ChatMessageRoleUser,
Expand All @@ -111,11 +112,11 @@ func (j *Judge[T]) Equal(ctx context.Context, expected, actual T, criteria strin
}
response, err := j.client.CreateChatCompletion(ctx, request)
if err != nil {
return false, "", fmt.Errorf("failed to make judge chat completion request: %w", err)
return false, "", fmt.Errorf("failed to create chat completion request: %w", err)
}

if len(response.Choices) < 1 {
return false, "", fmt.Errorf("judge chat completion request returned no choices")
return false, "", fmt.Errorf("chat completion request returned no choices")
}

var equality ruling
Expand Down
9 changes: 7 additions & 2 deletions pkg/tests/smoke/smoke_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,13 @@ func TestSmoke(t *testing.T) {
ctx,
expectedEvents,
actualEvents,
`The field values of the elements of expected and actual must be roughly equivalent.
Ignore variations in timestamps, IDs, and verbiage when determining equivalence.`,
`
- disregard differences in timestamps, generated IDs, natural language verbiage, and event order
- omit callProgress events from the comparison
- the overall stream of events and set of tools called should roughly match
- arguments passed in tool calls should be roughly the same
- the final callFinish event should be semantically similar
`,
)
require.NoError(t, err, "error getting judge ruling on output")
require.True(t, equal, reasoning)
Expand Down
Loading

0 comments on commit 8a283fd

Please sign in to comment.