Merge pull request #34 from Snowflake-Labs/evalanche2
Update
sfc-gh-bklein authored Oct 29, 2024
2 parents 1645378 + b9a055c commit ee7a2c5
Showing 4 changed files with 19 additions and 15 deletions.
4 changes: 2 additions & 2 deletions framework-evalanche/README.md
@@ -86,8 +86,8 @@ To run a reference dataset through your desired LLM pipelines on the data page,
will be passed to the stored procedure as:
```python
{
"QUESTION": "What is the capital of France?",
"QUESTION_TYPE": "Geography"
"TASK": "Tell a joke",
"PERSONA": "Pirate"
}
```
An appropriately crafted stored procedure could look like the below.
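The README's own example is collapsed in this view, so the following is only a minimal, hypothetical sketch of such a handler, assuming a Snowpark Python stored procedure that receives the `TASK` and `PERSONA` columns as arguments and calls Snowflake Cortex `COMPLETE`; the function name, model, and prompt wording are illustrative and not taken from the repository.
```python
from snowflake.snowpark import Session


def run_pipeline(session: Session, task: str, persona: str) -> str:
    # Hypothetical handler: build a prompt from the two reference-dataset
    # columns and send it to an LLM via Snowflake Cortex COMPLETE.
    prompt = f"Respond to the following task in the voice of a {persona}: {task}"
    row = session.sql(
        "SELECT SNOWFLAKE.CORTEX.COMPLETE(?, ?) AS RESPONSE",
        params=["llama3.1-8b", prompt],  # model choice is an assumption
    ).collect()[0]
    return row["RESPONSE"]
```
Registration with argument names matching the dataset columns (`TASK`, `PERSONA`) is assumed here so that each row's values map onto the call.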
3 changes: 2 additions & 1 deletion framework-evalanche/pages/data.py
@@ -147,7 +147,8 @@ def preview_merge_data() -> None:
st.error(f"Error: {e}")
if data is not None:
st.write(f"Limited to {limit} rows.")
-st.dataframe(data, hide_index=True, use_container_width=True)
+st.dataframe(data, hide_index=True, use_container_width=False)



def data_spec(key_name: str, instructions: str, height=200, join_key=True) -> None:
Binary file modified framework-evalanche/src.zip
Binary file not shown.
27 changes: 15 additions & 12 deletions framework-evalanche/src/custom_metrics.py
@@ -1,5 +1,5 @@
# Python 3.8 type hints
-from typing import Union
+from typing import Union, Optional

from src.metrics import Metric
from src.prompts import *
@@ -19,28 +19,30 @@
"""

# Custom Metrics
-# class CustomRelevancy(Metric):
+# class CustomAnswerRelevancy(Metric):
# def __init__(
# self,
# model: str = "llama3.1-8b"
# ):
# super().__init__(
# name="CustomRelevancy",
# description="Evaluates the correctness, relevance, and helpfulness of a response compared to a reference answer.",
# prompt=Relevance_prompt,
# description="""
# Evaluates the relevance of a response to a user question on a scale of 1-5.
# 5 indicates the scorer strongly agrees that the response is relevant and 1 indicates strong disagreement.""",
# prompt=AnswerRelevancy_prompt,
# required={
# "question": "User question",
# "answer_ref": "Expected answer to the question",
# "ai_response": "LLM-generated response to the question",
# },
# )
+# self.model = model

# def get_prompt(
-# self, question: str, answer_ref: str, ai_response: str
+# self, question: str, ai_response: str
# ) -> Union[str, None]:
# if self.prompt is not None:
# fstrings = {
# "question": question,
# "answer_ref": answer_ref,
# "ai_response": ai_response,
# }
# return self.prompt.format(**fstrings)
@@ -50,21 +52,22 @@
# def evaluate(
# self,
# question: str,
-# answer_ref: str,
# ai_response: str,
-# model: str = "llama3.1-8b",
+# model: Optional[str] = None,
# ):
# import re

-# prompt = self.get_prompt(question, answer_ref, ai_response)
+# model_to_use = model if model else self.model

-# response = run_async_sql_complete(self.session, model, prompt)
+# prompt = self.get_prompt(question, ai_response)

+# response = run_async_sql_complete(self.session, model_to_use, prompt)
# values = [str(i) for i in range(1, 11)]
# pattern = f"[{''.join(values)}]"
# match = re.search(pattern, response)

# return int(match.group(0)) if match else None

custom_metrics = [
-# CustomRelevancy()
+# CustomAnswerRelevancy()
]
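For orientation, here is a minimal sketch of what the commented-out template above would look like once activated and registered in `custom_metrics`. It assumes the `Metric` base class, `AnswerRelevancy_prompt`, `run_async_sql_complete`, and the framework-provided `self.session` behave exactly as the commented code implies; nothing is actually uncommented in this commit.
```python
from typing import Optional, Union

from src.metrics import Metric
from src.prompts import *  # assumed to expose AnswerRelevancy_prompt, per the template


class CustomAnswerRelevancy(Metric):
    def __init__(self, model: str = "llama3.1-8b"):
        super().__init__(
            name="CustomAnswerRelevancy",
            description="""
            Evaluates the relevance of a response to a user question on a scale of 1-5.
            5 indicates the scorer strongly agrees that the response is relevant and 1 indicates strong disagreement.""",
            prompt=AnswerRelevancy_prompt,
            required={
                "question": "User question",
                "ai_response": "LLM-generated response to the question",
            },
        )
        self.model = model

    def get_prompt(self, question: str, ai_response: str) -> Union[str, None]:
        if self.prompt is not None:
            # Fill the prompt template with the row's question and the LLM's response.
            return self.prompt.format(question=question, ai_response=ai_response)
        return None

    def evaluate(self, question: str, ai_response: str, model: Optional[str] = None):
        import re

        # Fall back to the model chosen at construction time if none is passed.
        model_to_use = model if model else self.model
        prompt = self.get_prompt(question, ai_response)
        # run_async_sql_complete is assumed to be in scope, as in the template.
        response = run_async_sql_complete(self.session, model_to_use, prompt)
        # Extract the first digit in the LLM's reply as the score, mirroring the template.
        match = re.search(r"[0-9]", response)
        return int(match.group(0)) if match else None


custom_metrics = [
    CustomAnswerRelevancy(),
]
```
The only departure from the template is that the regex is written directly as `[0-9]`; the template builds an equivalent character class from `range(1, 11)`.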
