-
Notifications
You must be signed in to change notification settings - Fork 6
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Evals #37
Evals #37
Changes from 32 commits
5177c29
e61e401
148a73f
ae6d163
e075c71
a077ad8
122efad
43e5e50
a518e5e
8dfe444
2bad0fe
399d80a
139770e
633ebfe
75a2f87
a4351af
022b9ce
98d3777
1d2309c
2cb4a14
58c01ac
311fc4a
4a9df8a
171e427
64263de
766e23f
75779f8
fa8934f
a8922db
2f0cb7a
cab2349
dd39b3b
cfe8c4d
fe11d1b
7ec1087
5d27598
a5d9302
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
# Runs the Braintrust eval suite (`bun eval`) after the "Knip" workflow
# finishes and publishes the generated markdown report to the job summary.
name: Run Braintrust Evals

on:
  # NOTE(review): `completed` fires for failed and cancelled "Knip" runs too.
  # If evals should only follow a green run, gate the job on
  # `github.event.workflow_run.conclusion == 'success'` - confirm intent.
  workflow_run:
    workflows: ["Knip"]
    types:
      - completed

# NOTE(review): `write-all` is much broader than the steps visible here
# appear to need. Consider narrowing it once the scopes `bun eval` requires
# through GITHUB_TOKEN are confirmed. Declared once at workflow level; the
# identical job-level declaration was redundant.
permissions: write-all

jobs:
  eval:
    name: Run evals
    runs-on: ubuntu-latest

    steps:
      - uses: actions/setup-node@v4
        with:
          node-version: "20.10.0"

      # Pinned to a release tag instead of the mutable `master` branch so the
      # workflow does not silently pick up unreleased action changes.
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Setup Bun
        uses: oven-sh/setup-bun@v2

      - name: Install toolchain
        run: bun install --frozen-lockfile

      - name: Run Evals
        id: evals
        run: bun eval
        env:
          SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
          SUPABASE_KEY: ${{ secrets.SUPABASE_KEY }}
          VOYAGEAI_API_KEY: ${{ secrets.VOYAGEAI_API_KEY }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
          UBIQUITY_OS_APP_NAME: ${{ secrets.UBIQUITY_OS_APP_NAME }}
          BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      # Runs even when the eval step fails, so the report may not exist.
      # Say so explicitly instead of appending an empty summary.
      - name: Add Evals Report to Job Summary
        if: always()
        run: |
          if [ -f eval-results.md ]; then
            cat eval-results.md >> "$GITHUB_STEP_SUMMARY"
          else
            echo "eval-results.md was not generated." >> "$GITHUB_STEP_SUMMARY"
          fi
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,4 +15,5 @@ junit.xml | |
cypress/screenshots | ||
script.ts | ||
.wrangler | ||
test-dashboard.md | ||
test-dashboard.md | ||
eval-results.md |
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
{ | ||
"issueResponses": [ | ||
{ | ||
"scenario": "manifest.name mismatch", | ||
"issue": { | ||
"body": "Manifests need to be updated so the name matches the intended name, which is the name of the repo it lives in.\n\nAny mismatch in manifest.name and the plugin repo, and we will not be able to install those plugins. The config will look like this:\n\nThis is because the worker URL contains the repo name, and we use that to match against manifest.name.", | ||
"number": 27, | ||
"html_url": "https://github.com/ubiquity-os/ubiquity-os-plugin-installer/issues/27/", | ||
"question": "@ubosshivaditya could you please provide a summary of the issue ?" | ||
}, | ||
"expectedResponse": "The manifest.name should match the name of the repo it lives in. This is because the worker URL contains the repo name, and we use that to match against manifest.name.", | ||
"sender": { | ||
"login": "sshivaditya2019", | ||
"type": "User" | ||
}, | ||
"repository": { | ||
"name": "ubiquity-os-plugin-installer", | ||
"owner": { | ||
"login": "ubiquity-os", | ||
"type": "Organization" | ||
} | ||
} | ||
} | ||
] | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
import { SupabaseClient } from "@supabase/supabase-js"; | ||
import { createAdapters } from "../../src/adapters"; | ||
import { CommentSimilaritySearchResult } from "../../src/adapters/supabase/helpers/comment"; | ||
import { IssueSimilaritySearchResult } from "../../src/adapters/supabase/helpers/issues"; | ||
import { fetchRepoLanguageStats, fetchRepoDependencies } from "../../src/handlers/ground-truths/chat-bot"; | ||
import { findGroundTruths } from "../../src/handlers/ground-truths/find-ground-truths"; | ||
import { logger } from "../../src/helpers/errors"; | ||
import { formatChatHistory } from "../../src/helpers/format-chat-history"; | ||
import { recursivelyFetchLinkedIssues } from "../../src/helpers/issue-fetching"; | ||
import { Context } from "../../src/types"; | ||
import { VoyageAIClient } from "voyageai"; | ||
import OpenAI from "openai"; | ||
|
||
// Divider line that formattedHistory appends after each section of the prompt text.
// NOTE(review): the identifier is misspelled ("SEPERATOR" rather than "SEPARATOR");
// a rename has to update formattedHistory in the same change.
const SEPERATOR = "######################################################\n"; | ||
|
||
// Result of fetchContext, consumed by formattedHistory:
// - rerankedText: output of reranker.reRankResults over the similar comment/issue
//   plaintext, or an empty array when no similar text was found.
// - formattedChat: formatChatHistory output with falsy entries filtered out.
// - groundTruths: statements about the repository (fallback "No ... found" notes
//   and/or the findGroundTruths result).
export interface FetchContext { | ||
rerankedText: string[]; | ||
formattedChat: string[]; | ||
groundTruths: string[]; | ||
} | ||
|
||
// Service clients that initAdapters forwards to createAdapters
// (in the order supabase, voyage, openai).
export interface EvalClients { | ||
supabase: SupabaseClient; | ||
voyage: VoyageAIClient; | ||
openai: OpenAI; | ||
} | ||
|
||
/**
 * Builds the adapters for an eval run and wires them to `context`.
 *
 * The adapters are created from the supplied clients, stored on
 * `context.adapters`, and every adapter object exposing a `context` property
 * is pointed back at this same `context`. Note that `context` is mutated.
 *
 * @param context - Context to attach the adapters to.
 * @param clients - Supabase, Voyage and OpenAI clients handed to `createAdapters`.
 * @returns The same `context` instance that was passed in.
 */
// NOTE(review): a PR reviewer remarked on this declaration that it "shouldn't
// have passed linter settings"; the rest of the remark is truncated in this
// view, so the specific rule is unknown - re-run the project linter on this file.
export const initAdapters = (context: Context, clients: EvalClients): Context => {
  const { supabase, voyage, openai } = clients;
  const created = createAdapters(supabase, voyage, openai, context);
  context.adapters = created;

  // Re-point each adapter that carries a `context` at the shared context.
  for (const group of Object.values(created)) {
    for (const candidate of Object.values(group)) {
      if (candidate && typeof candidate === "object" && "context" in candidate) {
        candidate.context = context;
      }
    }
  }

  return context;
};
|
||
/**
 * Collects the context used to answer `question` for the repository in
 * `context.payload`.
 *
 * Steps, in order:
 * 1. Recursively fetch linked issues and format them into chat history.
 * 2. Look up similar comments and issues through the Supabase adapters and
 *    rerank their plaintext against the question.
 * 3. Fetch repository language and dependency data and derive ground truths.
 *
 * @param context - Plugin context providing config, adapters and the payload.
 * @param question - The question the similarity search and reranking run against.
 * @returns Reranked similar text, the non-empty chat history entries, and the
 *   ground truths.
 */
export async function fetchContext(context: Context, question: string): Promise<FetchContext> {
  const {
    config: { similarityThreshold },
    adapters: {
      supabase: { comment, issue },
      voyage: { reranker },
    },
  } = context;

  const { specAndBodies, streamlinedComments } = await recursivelyFetchLinkedIssues({
    context,
    owner: context.payload.repository.owner.login,
    repo: context.payload.repository.name,
  });
  const chatHistory = await formatChatHistory(context, streamlinedComments, specAndBodies);
  logger.info(chatHistory.join(""));

  // Use the db functions to find similar comments and issues. The third
  // argument is passed as an empty string, as in the original code.
  const [similarComments, similarIssues] = await Promise.all([
    comment.findSimilarComments(question, 1 - similarityThreshold, ""),
    issue.findSimilarIssues(question, 1 - similarityThreshold, ""),
  ]);

  // Combine the similar comments and issues into a single array.
  const similarText = [
    ...(similarComments?.map((similar: CommentSimilaritySearchResult) => similar.comment_plaintext) ?? []),
    ...(similarIssues?.map((similar: IssueSimilaritySearchResult) => similar.issue_plaintext) ?? []),
  ];

  // Drop empty chat entries.
  const formattedChat = chatHistory.filter((text) => text);

  // Rerank the similar text with voyageai; skip the call when there is nothing to rerank.
  const rerankedText = similarText.length > 0 ? await reranker.reRankResults(similarText, question) : [];

  // Gather structural data about the payload repository.
  const [languages, { dependencies, devDependencies }] = await Promise.all([fetchRepoLanguageStats(context), fetchRepoDependencies(context)]);

  const missingNotes: string[] = [];
  if (!languages.length) {
    missingNotes.push("No languages found in the repository");
  }
  if (!Reflect.ownKeys(dependencies).length) {
    missingNotes.push("No dependencies found in the repository");
  }
  if (!Reflect.ownKeys(devDependencies).length) {
    missingNotes.push("No devDependencies found in the repository");
  }

  // At most three notes can be collected, so the previous `> 3` check never
  // fired and findGroundTruths was unreachable. Derive ground truths whenever
  // at least one category has data; keep the "not found" notes only when all
  // three are empty.
  const hasRepositoryData = missingNotes.length < 3;
  const groundTruths: string[] = hasRepositoryData
    ? await findGroundTruths(context, "chat-bot", { languages, dependencies, devDependencies })
    : missingNotes;

  return {
    rerankedText,
    formattedChat,
    groundTruths,
  };
}
|
||
/**
 * Renders a fetched context as one text block with three sections - chat
 * history, reranked text and ground truths - in that order.
 *
 * Each section is its header line, then the section's entries concatenated
 * with no delimiter between them, then the separator line.
 *
 * @param fetchContext - The context produced by `fetchContext`.
 * @returns The concatenated, sectioned text.
 */
// NOTE(review): a PR reviewer asked whether this many "#" characters are really
// necessary and suggested that plain markdown syntax might be sufficient.
export function formattedHistory(fetchContext: FetchContext): string {
  const sections: Array<[string, string[]]> = [
    ["#################### Chat History ####################\n", fetchContext.formattedChat],
    ["#################### Reranked Text ####################\n", fetchContext.rerankedText],
    ["#################### Ground Truths ####################\n", fetchContext.groundTruths],
  ];

  return sections.reduce((rendered, [header, entries]) => {
    const body = entries.reduce((joined, entry) => joined + entry, "");
    return rendered + header + body + SEPERATOR;
  }, "");
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Will it be a problem to leave in `@ubosshivaditya`? Also, this seems like a random example; can you explain the context of this file further?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We can modify the app name to whatever is stored in the secrets; it doesn't matter, as the `askQuestion` function will be triggered either way. This file primarily contains solid baseline examples, including "gold star" responses to questions. We run the model with the same context and should expect similar performance.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We have the production and beta instances of the app, so I'm not sure about dealing with secrets to save the names. Think through how this will be configured and let me know what you think makes sense.
I guess it's "gold standard" I just messed up the terminology when I called it "gold star".
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it would be better if we could just hard-code the names and keep them consistent in the `workflow`. No, you were correct—it's called a "gold star response" [1]. "Gold standard" is a different approach, but not the one we're discussing here.
Footnotes
https://arxiv.org/html/2410.23214v1 ↩
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hard coding seems questionable for developers but generally yes I agree that it's easier to deal with vs secrets. @gentlementlegen please decide
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this is no longer relevant, as we are using the LLM command router.