diff --git a/framework-evalanche/home.py b/framework-evalanche/home.py index ef275ae..1dfba8c 100644 --- a/framework-evalanche/home.py +++ b/framework-evalanche/home.py @@ -110,9 +110,11 @@ def show_eval_details( label_visibility="collapsed", height=200, ) - for metric_name, assignments in evaluation["PARAM_ASSIGNMENTS"].items(): - with st.expander(f"Parameter Assignments for **{metric_name}**"): - st.write(assignments) + st.write("**Metrics**:") + for metric_name in evaluation["METRIC_NAMES"]: + with st.expander(f"{metric_name}"): + st.write(f"Model: {evaluation['MODELS'][metric_name]}") + st.write(evaluation["PARAM_ASSIGNMENTS"][metric_name]) button_container = row(5, vertical_align="center") if button_container.button("Run", use_container_width=True): click_func(evaluation) @@ -149,6 +151,7 @@ def run_saved_eval(evaluation: Dict[str, Any]) -> None: # wants to automate an already saved evaluation st.session_state["source_sql"] = evaluation["SOURCE_SQL"] st.session_state["param_selection"] = evaluation["PARAM_ASSIGNMENTS"] + st.session_state["model_selection"] = evaluation["MODELS"] st.switch_page("pages/results.py") @@ -171,6 +174,7 @@ def run_auto_eval(evaluation: Dict[str, Any]) -> None: metric for metric in metrics if metric.name in evaluation["METRIC_NAMES"] ] st.session_state["param_selection"] = evaluation["PARAM_ASSIGNMENTS"] + st.session_state["model_selection"] = evaluation["MODELS"] st.session_state["eval_funnel"] = "automated" try: result = st.session_state["session"].table( diff --git a/framework-evalanche/pages/data.py b/framework-evalanche/pages/data.py index ad6646d..be5d25e 100644 --- a/framework-evalanche/pages/data.py +++ b/framework-evalanche/pages/data.py @@ -15,6 +15,8 @@ fetch_columns, render_sidebar, table_data_selector, + select_model, + test_complete, ) from src.metric_utils import metric_runner from src.snowflake_utils import ( @@ -53,6 +55,17 @@ """ +def check_models(models: List[str]) -> None: + """Check if models are available in the Snowflake account region.""" + + for model in models: + available = test_complete(st.session_state["session"], + model) + if not available: + st.error(f"Model {model} not available in region. 
Please select another.") + st.stop() + + def run_sql(sql: str) -> Union[None, DataFrame]: """Run SQL query and return DataFrame or surfaces Streamlit error.""" @@ -286,7 +299,7 @@ def pipeline_runner_dialog() -> None: def configure_metrics() -> None: """Dialog to configure metric parameters/inputs to data source columns.""" - st.write("Select a column for each required parameter.") + st.write("Select a model and a column for each required parameter.") limit = 5 if st.session_state.get("single_source_data", None) is None: validate_data_inputs() @@ -306,9 +319,12 @@ def configure_metrics() -> None: except Exception as e: st.error(f"Error in pulling data: {e}") param_selection = {} # Track parameter-column assignments for each metric + model_selection = {} # Track model selection for each metric for metric in st.session_state["selected_metrics"]: st.divider() st.write(f"**{metric.name}**: {metric.description}") + model = select_model(default = metric.model, + keyname = metric.name) metric_params = ( OrderedDict() ) # Track each parameter assignment for a single metric @@ -322,8 +338,11 @@ def configure_metrics() -> None: help=desc, ) param_selection[metric.name] = metric_params + model_selection[metric.name] = model st.session_state["param_selection"] = param_selection + st.session_state["model_selection"] = model_selection if st.button("Run"): + check_models(st.session_state["model_selection"].values()) run_eval() @@ -356,6 +375,7 @@ def run_eval() -> None: st.session_state["metric_result_data"] = metric_runner( session=st.session_state["session"], metrics=st.session_state["selected_metrics"], + models=st.session_state["model_selection"], param_assignments=st.session_state["param_selection"], source_df=st.session_state["metric_result_data"], source_sql=None, diff --git a/framework-evalanche/pages/results.py b/framework-evalanche/pages/results.py index a3e6f4c..d7ed750 100644 --- a/framework-evalanche/pages/results.py +++ b/framework-evalanche/pages/results.py @@ -158,6 +158,7 @@ def save_eval() -> None: "METRIC_NAMES": [metric.name for metric in metrics], "DESCRIPTION": eval_description, # Not passed to object creation but just inserted into table "SOURCE_SQL": st.session_state["source_sql"], + "MODELS": st.session_state["model_selection"], "PARAM_ASSIGNMENTS": st.session_state["param_selection"], } @@ -171,6 +172,7 @@ def save_eval() -> None: eval_name=eval_metadata["EVAL_NAME"], metrics=metrics, source_sql=eval_metadata["SOURCE_SQL"], + models=eval_metadata["MODELS"], param_assignments=eval_metadata["PARAM_ASSIGNMENTS"], ) st.success( @@ -235,6 +237,7 @@ def automate_eval() -> None: "METRIC_NAMES": [metric.name for metric in metrics], "DESCRIPTION": eval_description, # Not passed to object creation but just inserted into table "SOURCE_SQL": st.session_state["source_sql"], + "MODELS": st.session_state["model_selection"], "PARAM_ASSIGNMENTS": st.session_state["param_selection"], } try: @@ -247,6 +250,7 @@ def automate_eval() -> None: warehouse=warehouse, eval_name=eval_metadata["EVAL_NAME"], metrics=metrics, + models=eval_metadata["MODELS"], source_sql=eval_metadata["SOURCE_SQL"], param_assignments=eval_metadata["PARAM_ASSIGNMENTS"], ) diff --git a/framework-evalanche/setup.sql b/framework-evalanche/setup.sql index 414a36c..2ba7989 100644 --- a/framework-evalanche/setup.sql +++ b/framework-evalanche/setup.sql @@ -15,6 +15,7 @@ CREATE TABLE IF NOT EXISTS CORTEX_ANALYST_UTILITIES.EVALUATION.SAVED_EVALUATIONS DESCRIPTION VARCHAR, METRIC_NAMES ARRAY, SOURCE_SQL VARCHAR, +MODELS VARIANT, 
PARAM_ASSIGNMENTS VARIANT, ASSOCIATED_OBJECTS VARIANT) COMMENT = '{"origin": "sf_sit", @@ -26,6 +27,7 @@ CREATE TABLE IF NOT EXISTS CORTEX_ANALYST_UTILITIES.EVALUATION.AUTO_EVALUATIONS DESCRIPTION VARCHAR, METRIC_NAMES ARRAY, SOURCE_SQL VARCHAR, +MODELS VARIANT, PARAM_ASSIGNMENTS VARIANT, ASSOCIATED_OBJECTS VARIANT) COMMENT = '{"origin": "sf_sit", diff --git a/framework-evalanche/src.zip b/framework-evalanche/src.zip index 96a947f..c7680a9 100644 Binary files a/framework-evalanche/src.zip and b/framework-evalanche/src.zip differ diff --git a/framework-evalanche/src/app_utils.py b/framework-evalanche/src/app_utils.py index dbc2428..401d301 100644 --- a/framework-evalanche/src/app_utils.py +++ b/framework-evalanche/src/app_utils.py @@ -22,6 +22,55 @@ } """ +models = [ + 'llama3.2-1b', + 'llama3.2-3b', + 'llama3.1-8b', + 'llama3.1-70b', + 'llama3.1-405b', + 'snowflake-arctic', + 'reka-core', + 'reka-flash', + 'mistral-large2', + 'mixtral-8x7b', + 'mistral-7b', + 'jamba-instruct', + 'jamba-1.5-mini', + 'jamba-1.5-large', + 'gemma-7b', +] + +def select_model( + keyname: str, + default: Optional[str] = None, + ) -> List[str]: + """Renders selectbox for model selection. + + Args: + default (string): Default model to select. + + Returns: + string: Selected model. + """ + + return st.selectbox("Select model", + models, + index=models.index(default) if default in models else None, + key=f"{keyname}_model_selector",) + + +def test_complete(session, model, prompt = "Repeat the word hello once and only once. Do not say anything else.") -> bool: + from snowflake.cortex import Complete + from snowflake.snowpark.exceptions import SnowparkSQLException + + """Returns True if selected model is supported in region and returns False otherwise.""" + try: + response = Complete(model, prompt, session = session) + return True + except SnowparkSQLException as e: + if 'unknown model' in str(e): + return False + def fetch_metrics() -> List[Metric]: """Combines metrics and custom metrics, if any, and returns list of metrics.""" @@ -287,7 +336,7 @@ def try_parse_json(value: str) -> Any: def fetch_evals( table_name: str, - json_cols=["METRIC_NAMES", "PARAM_ASSIGNMENTS", "ASSOCIATED_OBJECTS"], + json_cols=["METRIC_NAMES", "PARAM_ASSIGNMENTS", "MODELS", "ASSOCIATED_OBJECTS"], ) -> List[Optional[Dict[str, Optional[str]]]]: """ Returns evaluation metadata from tables. diff --git a/framework-evalanche/src/metric_utils.py b/framework-evalanche/src/metric_utils.py index e1e70cd..2941980 100644 --- a/framework-evalanche/src/metric_utils.py +++ b/framework-evalanche/src/metric_utils.py @@ -15,6 +15,7 @@ def run_metric( metrics: List[Metric], # List of metrics metric_result_data: DataFrame, + models: Dict[str, str], # Dictionary of metric model names params: Dict[str, Dict[str, str]], # Nested dictionary {metric_name: {key: value}} session: Session, ) -> DataFrame: @@ -31,10 +32,15 @@ def run_metric( input_name: column_name } ) + models is a dictionary with the following structure: + { + metric_name: model_name + ) Args: metrics (list[Metric]): Metric child classes metric_result_data (Dataframe): Snowpark dataframe with data to evaluate. + models (dict[str, str]): Dictionary of metric model names. params (dict[str, dict[str, str]]): Nested dictionary of metric parameter-column associations. session (Session): Snowpark session. 
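For orientation, here is a minimal sketch of how the new select_model and test_complete helpers added to src/app_utils.py above are meant to be combined on a Streamlit page. The metric classes and their default models come from src/metrics.py; the page wiring itself, and the assumption that a Snowpark session is already stored in st.session_state["session"], are illustrative rather than part of the patch.

# Illustrative sketch only, not part of the patch.
import streamlit as st

from src.app_utils import select_model, test_complete
from src.metrics import Correctness, SQLAccuracy

selected_metrics = [SQLAccuracy(), Correctness()]   # defaults: reka-flash, llama3.1-8b

# One selectbox per metric; keyname keeps each Streamlit widget key unique.
model_selection = {
    metric.name: select_model(keyname=metric.name, default=metric.model)
    for metric in selected_metrics
}

# Verify each choice against the account region before running an evaluation.
# test_complete issues a minimal Cortex Complete call and returns False for an
# unknown model (other SnowparkSQLExceptions currently fall through and return None).
session = st.session_state["session"]   # assumes the app has already created a Snowpark session
for metric_name, model in model_selection.items():
    if not test_complete(session, model):
        st.error(f"Model {model} not available in region. Please select another.")
        st.stop()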
@@ -60,7 +66,8 @@ def run_metric( *[ row[params[metric.name][key]] for key in params[metric.name] - ] + ], + model=models[metric.name] # Pass the correct model for each metric ) # Pass the correct params for each metric for metric in metrics }, @@ -76,6 +83,7 @@ def run_metric( def apply_metric( metrics: List[Metric], metric_result_data: DataFrame, + models: Dict[str, str], params: Dict[str, Dict[str, str]], session: Session, ): @@ -91,10 +99,15 @@ def apply_metric( input_name: column_name } ) + models is a dictionary with the following structure: + { + metric_name: model_name + ) Args: metrics (list[Metric]): Metric child classes metric_result_data (Dataframe): Snowpark dataframe with data to evaluate. + models (dict[str, str]): Dictionary of metric model names. params (dict[str, dict[str, str]]): Nested dictionary of metric parameter-column associations. session (Session): Snowpark session. @@ -102,7 +115,7 @@ def apply_metric( Dataframe: Snowpark dataframe of selected data with metric results. """ - metric_df = run_metric(metrics, metric_result_data, params, session) + metric_df = run_metric(metrics, metric_result_data, models, params, session) return metric_result_data.join(metric_df, on="ROW_ID", how="left") @@ -110,6 +123,7 @@ def apply_metric( def metric_runner( session: Session, metrics: List[Metric], + models: Dict[str, str], param_assignments: Dict[str, Dict[str, str]], source_sql: Optional[str] = None, source_df: Optional[DataFrame] = None, @@ -120,16 +134,21 @@ def metric_runner( Results are returned as a Snowpark dataframe. Either source_sql or source_df must be provided. - params is a nested dictionary with the following structure: + param_assignments is a nested dictionary with the following structure: { metric_name: { input_name: column_name } ) + models is a dictionary with the following structure: + { + metric_name: model_name + ) Args: session (Session): Snowpark session. metrics (list[Metric]): Metric child classes + models (dict[str, str]): Dictionary of metric model names. param_assignments (Dataframe): Snowpark dataframe with data to evaluate. source_sql (str, Optional): SQL to derive source data. source_df (Dataframe, Optional): Snowpark dataframe with source data. @@ -158,6 +177,7 @@ def metric_runner( df = apply_metric( metrics=metrics, metric_result_data=df, + models=models, params=param_assignments, session=session, ).drop("ROW_ID") @@ -196,6 +216,7 @@ def register_saved_eval_sproc( eval_name: str, metrics: List[Metric], source_sql: str, + models: Dict[str, str], param_assignments: Dict[str, Dict[str, str]], ) -> Dict[str, str]: """ @@ -208,6 +229,10 @@ def register_saved_eval_sproc( input_name: column_name } ) + models is a dictionary with the following structure: + { + metric_name: model_name + ) Returned ASSOCIATED_OBJECTS will contain the name of the stored procedure: { @@ -224,6 +249,7 @@ def register_saved_eval_sproc( Characters must follow Snowflake object naming rules. metrics (list[Metric]): Metric child classes source_sql (str, Optional): SQL to derive source data. + models (dict[str, str]): Dictionary of metric model names. param_assignments (Dataframe): Snowpark dataframe with data to evaluate. 
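To make the models and param_assignments shapes documented above concrete, the sketch below calls metric_runner with the new models argument. The metric names and parameter keys match src/metrics.py; the connection parameters, source SQL, and column names (QUESTION, GENERATED_SQL, EXPECTED_SQL, EXPECTED_ANSWER, AI_RESPONSE) are hypothetical.

# Illustrative sketch only, not part of the patch.
from snowflake.snowpark import Session

from src.metric_utils import metric_runner
from src.metrics import Correctness, SQLAccuracy

connection_parameters = {"account": "...", "user": "...", "authenticator": "externalbrowser"}
session = Session.builder.configs(connection_parameters).create()

selected_metrics = [SQLAccuracy(), Correctness()]

# {metric_name: model}, built from select_model() choices or the metric defaults.
models = {metric.name: metric.model for metric in selected_metrics}

# {metric_name: {parameter: source_column}}, as documented above.
param_assignments = {
    "SQL Accuracy": {
        "question": "QUESTION",
        "inference_sql": "GENERATED_SQL",
        "expected_sql": "EXPECTED_SQL",
    },
    "Correctness": {
        "question": "QUESTION",
        "answer_ref": "EXPECTED_ANSWER",
        "ai_response": "AI_RESPONSE",
    },
}

results_df = metric_runner(
    session=session,
    metrics=selected_metrics,
    models=models,                        # new argument introduced by this change
    param_assignments=param_assignments,
    source_sql="SELECT * FROM MY_DB.EVALUATION.EVAL_SOURCE",   # hypothetical source
    source_df=None,
)
results_df.show()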
@@ -269,6 +295,7 @@ def metric_sproc(session: Session): df = metric_runner( session=session, metrics=metrics, + models=models, source_sql=source_sql, source_df=None, param_assignments=param_assignments, @@ -287,6 +314,7 @@ def register_auto_eval_sproc( output_table_name: str, metrics: List[Metric], source_sql: str, + models: Dict[str, str], param_assignments: Dict[str, Dict[str, str]], ) -> None: """ @@ -302,6 +330,11 @@ def register_auto_eval_sproc( input_name: column_name } ) + models is a dictionary with the following structure: + { + metric_name: model_name + ) + Args: session (Session): Snowpark session. @@ -310,6 +343,7 @@ def register_auto_eval_sproc( output_table_name (str): Fully-qualified Snowflake stored table name. metrics (list[Metric]): Metric child classes source_sql (str, Optional): SQL to derive source data. + models (dict[str, str]): Dictionary of metric model names. param_assignments (Dataframe): Snowpark dataframe with data to evaluate. @@ -349,6 +383,7 @@ def metric_sproc(session: Session) -> None: df = metric_runner( session=session, metrics=metrics, + models=models, source_sql=source_sql, source_df=None, param_assignments=param_assignments, @@ -455,13 +490,14 @@ def create_eval_task( def automate_eval_objects( session: Session, - stage: str, # Fully-qualified? + stage: str, warehouse: str, database: str, schema: str, eval_name: str, # Just the name source_sql: str, metrics: List[Metric], + models: Dict[str, str], param_assignments: Dict[str, Dict[str, str]], ) -> Dict[str, str]: """ @@ -473,6 +509,10 @@ def automate_eval_objects( input_name: column_name } ) + models is a dictionary with the following structure: + { + metric_name: model_name + ) Returned ASSOCIATED_OBJECTS will contain the following: { @@ -494,6 +534,7 @@ def automate_eval_objects( Characters must follow Snowflake object naming rules. source_sql (str, Optional): SQL to derive source data. metrics (list[Metric]): Metric child classes + models (dict[str, str]): Dictionary of metric model names. param_assignments (Dataframe): Snowpark dataframe with data to evaluate. Returns: @@ -533,6 +574,7 @@ def set_name(object_type: str): ASSOCIATED_OBJECTS["TABLE"], metrics, metric_source_sql, # This is the SQL that captures Stream INSERTS + models, param_assignments, ) diff --git a/framework-evalanche/src/metrics.py b/framework-evalanche/src/metrics.py index 4f84cf1..88c67ea 100644 --- a/framework-evalanche/src/metrics.py +++ b/framework-evalanche/src/metrics.py @@ -1,13 +1,11 @@ from abc import ABC, abstractmethod # Python 3.8 type hints -from typing import Dict, Union +from typing import Dict, Union, Optional from src.prompts import * from src.snowflake_utils import run_async_sql_complete -# from src.custom_metrics import custom_metrics - class Metric(ABC): def __init__( @@ -38,9 +36,10 @@ def evaluate(self, *args): class SQLAccuracy(Metric): def __init__( self, + model: str = "reka-flash" ): super().__init__( - name="SQLAccuracy", + name="SQL Accuracy", description=""" Evaluates if 2 SQL queries return the same data given a user question. Results are True or False. 
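As a quick illustration of the per-metric default-model pattern the metrics gain in this change, the sketch below constructs SQLAccuracy with a non-default model and then overrides it again at call time. The sample question and SQL strings are invented, and attaching the Snowpark session directly to the metric instance is an assumption made so the example is self-contained.

# Illustrative sketch only, not part of the patch.
import streamlit as st

from src.metrics import SQLAccuracy

metric = SQLAccuracy(model="mistral-large2")      # override the reka-flash default
metric.session = st.session_state["session"]      # assumed: session attached before use

# Uses the instance default (mistral-large2):
is_match = metric.evaluate(
    question="How many orders were placed last week?",
    inference_sql="SELECT COUNT(*) FROM ORDERS WHERE ORDER_DATE >= DATEADD('day', -7, CURRENT_DATE)",
    expected_sql="SELECT COUNT(1) FROM ORDERS WHERE ORDER_DATE >= DATEADD('day', -7, CURRENT_DATE)",
)

# A call-time model wins over the instance default for this one evaluation:
is_match = metric.evaluate(
    question="How many orders were placed last week?",
    inference_sql="SELECT COUNT(*) FROM ORDERS",
    expected_sql="SELECT COUNT(1) FROM ORDERS",
    model="llama3.1-70b",
)
print(is_match)   # True or False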
@@ -52,6 +51,7 @@ def __init__( "expected_sql": "Ground truth SQL statement", }, ) + self.model = model def get_prompt( self, question: str, inference_sql: str, expected_sql: str @@ -76,11 +76,14 @@ def evaluate( question: str, inference_sql: str, expected_sql: str, - model: str = "reka-flash", + model: Optional[str] = None, ): + + model_to_use = model if model else self.model + prompt = self.get_prompt(question, inference_sql, expected_sql) - response = run_async_sql_complete(self.session, model, prompt) + response = run_async_sql_complete(self.session, model_to_use, prompt) if "true" in response.lower(): return True else: @@ -88,22 +91,25 @@ def evaluate( # Knowledge Retrieval/Answer Metrics -class Relevancy(Metric): +class Correctness(Metric): def __init__( self, + model: str = "llama3.1-8b" ): super().__init__( - name="Relevancy", + name="Correctness", description=""" -Evaluates the correctness, relevance, and helpfulness of a response compared to a reference answer on a scale of 1-5. -5 indicates the scorer strongly agrees that the response is correct, relevant, and helpful and 1 indicates strong disagreement.""", - prompt=Relevance_prompt, +Evaluates the correctness of a response compared to a reference answer on a scale of 1-5. +5 indicates the scorer strongly agrees that the response is correct and 1 indicates strong disagreement.""", + prompt=Correctness_prompt, required={ "question": "User question", "answer_ref": "Expected answer to the question", "ai_response": "LLM-generated response to the question", }, + ) + self.model = model def get_prompt( self, question: str, answer_ref: str, ai_response: str @@ -123,13 +129,15 @@ def evaluate( question: str, answer_ref: str, ai_response: str, - model: str = "llama3.1-8b", + model: Optional[str] = None, ): import re + model_to_use = model if model else self.model + prompt = self.get_prompt(question, answer_ref, ai_response) - response = run_async_sql_complete(self.session, model, prompt) + response = run_async_sql_complete(self.session, model_to_use, prompt) values = [str(i) for i in range(1, 11)] pattern = f"[{''.join(values)}]" match = re.search(pattern, response) @@ -140,6 +148,7 @@ def evaluate( class Comprehensiveness(Metric): def __init__( self, + model: str = "llama3.1-8b" ): super().__init__( name="Comprehensiveness", @@ -153,6 +162,7 @@ def __init__( "ai_response": "LLM-generated response to the question", }, ) + self.model = model def get_prompt( self, question: str, answer_ref: str, ai_response: str @@ -172,13 +182,15 @@ def evaluate( question: str, answer_ref: str, ai_response: str, - model: str = "llama3.1-8b", + model: Optional[str] = None, ): import re + model_to_use = model if model else self.model + prompt = self.get_prompt(question, answer_ref, ai_response) - response = run_async_sql_complete(self.session, model, prompt) + response = run_async_sql_complete(self.session, model_to_use, prompt) values = [str(i) for i in range(1, 11)] pattern = f"[{''.join(values)}]" match = re.search(pattern, response) @@ -186,30 +198,32 @@ def evaluate( return int(match.group(0)) if match else None -class ContentAccuracy(Metric): +class Hallucination(Metric): def __init__( self, + model: str = "llama3.1-8b" ): super().__init__( - name="Content Accuracy", + name="Hallucination", description=""" -Evaluates the accuracy of a response given relevant facts on a scale of 1-5. 
-5 indicates the scorer strongly agrees that the response is factually accurate and 1 indicates strong disagreement.""", - prompt=Comprehensiveness_prompt, +Evaluates the prevalance of hallucination in a response based on reference context on a scale of 1-5. +5 indicates the scorer strongly agrees that the response is hallucination-free and 1 indicates strong disagreement.""", + prompt=Hallucination_prompt, required={ "question": "User question", - "content": "Relevant facts or content", + "context": "Applicable knowledge base context", "ai_response": "LLM-generated response to the question", }, ) + self.model = model def get_prompt( - self, question: str, content: str, ai_response: str + self, question: str, context: str, ai_response: str ) -> Union[str, None]: if self.prompt is not None: fstrings = { "question": question, - "content": content, + "context": context, "ai_response": ai_response, } return self.prompt.format(**fstrings) @@ -217,17 +231,15 @@ def get_prompt( return None def evaluate( - self, - question: str, - content: str, - ai_response: str, - model: str = "llama3.1-8b", + self, question: str, context: str, ai_response: str, model: Optional[str] = None, ): import re - prompt = self.get_prompt(question, content, ai_response) + model_to_use = model if model else self.model + + prompt = self.get_prompt(question, context, ai_response) - response = run_async_sql_complete(self.session, model, prompt) + response = run_async_sql_complete(self.session, model_to_use, prompt) values = [str(i) for i in range(1, 11)] pattern = f"[{''.join(values)}]" match = re.search(pattern, response) @@ -235,30 +247,75 @@ def evaluate( return int(match.group(0)) if match else None -class Hallucination(Metric): +# Non-Reference Prompt Metrics +class ConversationCohesiveness(Metric): def __init__( self, + model: str = "llama3.1-8b" ): super().__init__( - name="Hallucination", + name="Conversation Cohesiveness", description=""" -Evaluates the prevalance of hallucination in a response based on reference context on a scale of 1-5. -5 indicates the scorer strongly agrees that the response is hallucination-free and 1 indicates strong disagreement.""", - prompt=Hallucination_prompt, +Evaluates the cohesivenss and adherence to topics of AI responses in conversation on a scale of 1-5. +5 indicates the scorer strongly agrees that the conversation is cohesive and stays on topic and 1 indicates strong disagreement.""", + prompt=ConversationCohesiveness_prompt, + required={ + "exchange": "Conversation between user and AI", + }, + ) + self.model = model + + def get_prompt(self, exchange: str) -> Union[str, None]: + if self.prompt is not None: + fstrings = { + "exchange": exchange, + } + return self.prompt.format(**fstrings) + else: + return None + + def evaluate( + self, + exchange: str, + model: Optional[str] = None, + ): + import re + + model_to_use = model if model else self.model + + prompt = self.get_prompt(exchange) + + response = run_async_sql_complete(self.session, model_to_use, prompt) + values = [str(i) for i in range(1, 11)] + pattern = f"[{''.join(values)}]" + match = re.search(pattern, response) + + return int(match.group(0)) if match else None + +class AnswerRelevancy(Metric): + def __init__( + self, + model: str = "llama3.1-8b" + ): + super().__init__( + name="Answer Relevancy", + description=""" +Evaluates the relevance of a response to a user question on a scale of 1-5. 
+5 indicates the scorer strongly agrees that the response is relevant and 1 indicates strong disagreement.""", + prompt=AnswerRelevancy_prompt, required={ "question": "User question", - "context": "Applicable knowledge base context", "ai_response": "LLM-generated response to the question", }, ) + self.model = model def get_prompt( - self, question: str, context: str, ai_response: str + self, question: str, ai_response: str ) -> Union[str, None]: if self.prompt is not None: fstrings = { "question": question, - "context": context, "ai_response": ai_response, } return self.prompt.format(**fstrings) @@ -266,40 +323,49 @@ def get_prompt( return None def evaluate( - self, question: str, context: str, ai_response: str, model: str = "llama3.1-8b" + self, + question: str, + ai_response: str, + model: Optional[str] = None, ): import re - prompt = self.get_prompt(question, context, ai_response) + model_to_use = model if model else self.model + + prompt = self.get_prompt(question, ai_response) - response = run_async_sql_complete(self.session, model, prompt) + response = run_async_sql_complete(self.session, model_to_use, prompt) values = [str(i) for i in range(1, 11)] pattern = f"[{''.join(values)}]" match = re.search(pattern, response) return int(match.group(0)) if match else None - - -# Single Input Basic Prompt Metrics -class ConversationCohesiveness(Metric): + +class ContextualRelevancy(Metric): def __init__( self, + model: str = "llama3.1-8b" ): super().__init__( - name="Conversation Cohesiveness", + name="Contextual Relevancy", description=""" -Evaluates the cohesivenss and adherence to topics of AI responses in conversation on a scale of 1-5. -5 indicates the scorer strongly agrees that the conversation is cohesive and stays on topic and 1 indicates strong disagreement.""", - prompt=ConversationCohesiveness_prompt, +Evaluates the contextual relevance of retrieved content in response to a user question on a scale of 1-5. 
+5 indicates the scorer strongly agrees that the response is contextually relevant and 1 indicates strong disagreement.""", + prompt=ContextualRelevancy_prompt, required={ - "exchange": "Conversation between user and AI", + "question": "User question", + "retrieved_content": "Retrieved content in response to the question", }, ) + self.model = model - def get_prompt(self, exchange: str) -> Union[str, None]: + def get_prompt( + self, question: str, retrieved_content: str + ) -> Union[str, None]: if self.prompt is not None: fstrings = { - "exchange": exchange, + "question": question, + "retrieved_content": retrieved_content, } return self.prompt.format(**fstrings) else: @@ -307,14 +373,17 @@ def get_prompt(self, exchange: str) -> Union[str, None]: def evaluate( self, - exchange: str, - model: str = "llama3.1-8b", + question: str, + retrieved_content: str, + model: Optional[str] = None, ): import re - prompt = self.get_prompt(exchange) + model_to_use = model if model else self.model + + prompt = self.get_prompt(question, retrieved_content) - response = run_async_sql_complete(self.session, model, prompt) + response = run_async_sql_complete(self.session, model_to_use, prompt) values = [str(i) for i in range(1, 11)] pattern = f"[{''.join(values)}]" match = re.search(pattern, response) @@ -331,36 +400,38 @@ def evaluate( ], } knowledge_base_retrieval_metrics = { - "section_name": "Knowledge Base Reference Metrics", + "section_name": "Knowledge-Based Reference Metrics", "caption": """Suggested metrics to evaluate the quality of knowledge-based responses given reference material.""", "metrics": [ - Relevancy(), + Correctness(), Comprehensiveness(), Hallucination(), - ContentAccuracy(), ], } -singe_input_metrics = { - "section_name": "General Single Input Metrics", - "caption": """Suggested metrics to evaluate qualities of standalone inputs.""", +non_knowledge_base_retrieval_metrics = { + "section_name": "Non-Knowledge-Based Reference Metrics", + "caption": """Suggested metrics to evaluate the quality of responses without reference material.""", "metrics": [ ConversationCohesiveness(), + AnswerRelevancy(), + ContextualRelevancy() ], } # All metrics metrics = [ SQLAccuracy(), - Relevancy(), + Correctness(), Comprehensiveness(), - ContentAccuracy(), Hallucination(), ConversationCohesiveness(), + AnswerRelevancy(), + ContextualRelevancy() ] # Display metrics on homepage by section metric_display = [ cortex_analyst_metrics, knowledge_base_retrieval_metrics, - singe_input_metrics, + non_knowledge_base_retrieval_metrics, ] diff --git a/framework-evalanche/src/prompts.py b/framework-evalanche/src/prompts.py index ba027c7..5c938fe 100644 --- a/framework-evalanche/src/prompts.py +++ b/framework-evalanche/src/prompts.py @@ -17,10 +17,10 @@ [The End of the Ground Truth Data] """ -Relevance_prompt = """Please act as an impartial judge and evaluate the quality of the response provided by the AI Assistant to the user question displayed below. -Your evaluation should consider CORRECTNESS, RELEVANCE, and HELPFULNESS. You will be given a reference answer and the AI Assistant's answer. +Correctness_prompt = """Please act as an impartial judge and evaluate the quality of the response provided by the AI Assistant to the user question displayed below. +Your evaluation should consider CORRECTNESS. You will be given a reference answer and the AI Assistant's answer. 
Your job is to rate the assistant's answer from 1 to 5, where 5 indicates you strongly agree that the response is -CORRECT, RELEVANT, and HELPFUL +CORRECT and 1 indicates you strongly disagree. Avoid any position biases and ensure that the order in which the content presented does not affect your evaluation. Be as objective as possible. Output your rating with just the number rating value. @@ -36,19 +36,6 @@ [The End of the AI Assistant's Answer] """ -ConversationCohesiveness_prompt = """Please act as an impartial judge and evaluate the quality of the response(s) provided by the AI Assistant to the user question(s). -Your evaluation should consider COHESIVENESS and the degree to which the AI Assistant stays on topic in conversation. You will be given both user queries and the AI Assistant response(s). -Your job is to rate the assistant's conversation from 1 to 5, where 5 indicates you strongly agree that the assistant's response(s) were -COHESIVE and ON TOPIC -and 1 indicates you strongly disagree. -Avoid any position biases and ensure that the order in which the content presented does not affect your evaluation. -Be as objective as possible. Output your rating with just the number rating value. - -[The Start of the User and AI Exchange] -{exchange} -[The End of the User and AI Exchange] -""" - Comprehensiveness_prompt = """Please act as an impartial judge and evaluate the quality of the response provided by the AI Assistant to the user question displayed below. Your evaluation should consider THOROUGHNESS and COMPREHENSIVENESS. You will be given a reference answer and the AI Assistant's answer. Your job is to rate the assistant's response from 1 to 5, where 5 indicates you strongly agree that the response is @@ -68,10 +55,10 @@ [The End of the AI Assistant's Answer] """ -ContentAccuracy_prompt = """Please act as an impartial judge and evaluate the quality of the response provided by the AI Assistant to the user question displayed below. -Your evaluation should consider FACTUAL ACCURACY. You will be given reference content and the AI Assistant's answer. +Hallucination_prompt = """Please act as an impartial judge and evaluate the prevalance of hallucination in the response provided by the AI Assistant to the user question displayed below. +Your evaluation should consider only context provided in the reference material. You will be given reference content and the AI Assistant's response. Your job is to rate the assistant's response from 1 to 5, where 5 indicates you strongly agree that the response is -FACTUALLY ACCURATE +HALLUCINATION-FREE and 1 indicates you strongly disagree. Avoid any position biases and ensure that the order in which the content presented does not affect your evaluation. Be as objective as possible. Output your rating with just the number rating value. @@ -79,29 +66,54 @@ {question} [Reference Material] -{content} +{context} [The Start of the AI Assistant's Response] {ai_response} [The End of the AI Assistant's Response] """ -Hallucination_prompt = """Please act as an impartial judge and evaluate the prevalance of hallucination in the response provided by the AI Assistant to the user question displayed below. -Your evaluation should consider only context provided in the reference material. You will be given reference content and the AI Assistant's response. +ConversationCohesiveness_prompt = """Please act as an impartial judge and evaluate the quality of the response(s) provided by the AI Assistant to the user question(s). 
+Your evaluation should consider COHESIVENESS and the degree to which the AI Assistant stays on topic in conversation. You will be given both user queries and the AI Assistant response(s). +Your job is to rate the assistant's conversation from 1 to 5, where 5 indicates you strongly agree that the assistant's response(s) were +COHESIVE and ON TOPIC +and 1 indicates you strongly disagree. +Avoid any position biases and ensure that the order in which the content presented does not affect your evaluation. +Be as objective as possible. Output your rating with just the number rating value. + +[The Start of the User and AI Exchange] +{exchange} +[The End of the User and AI Exchange] +""" + +AnswerRelevancy_prompt = """Please act as an impartial judge and evaluate the quality of the response provided by the AI Assistant to the user question displayed below. +Your evaluation should consider RELEVANCY. You will be given a user question and the AI Assistant's answer. Your job is to rate the assistant's response from 1 to 5, where 5 indicates you strongly agree that the response is -HALLUCINATION-FREE +RELEVANT and 1 indicates you strongly disagree. Avoid any position biases and ensure that the order in which the content presented does not affect your evaluation. Be as objective as possible. Output your rating with just the number rating value. [User Question] {question} -[Reference Material] -{context} - -[The Start of the AI Assistant's Response] +[The Start of the AI Assistant's Answer] {ai_response} -[The End of the AI Assistant's Response] +[The End of the AI Assistant's Answer] +""" + +ContextualRelevancy_prompt = """Please act as an impartial judge and evaluate the quality of the retrieved content provided by a content retrieval mechanism in response to the user question displayed below. +Your evaluation should consider CONTEXTUAL RELEVANCY. You will be given a user question and the retrieved content. +Your job is to rate the assistant's response from 1 to 5, where 5 indicates you strongly agree that the retrieved content is +RELEVANT +and 1 indicates you strongly disagree. +Avoid any position biases and ensure that the order in which the content presented does not affect your evaluation. +Be as objective as possible. Output your rating with just the number rating value. +[User Question] +{question} + +[The Start of the Retrieved Content] +{retrieved_content} +[The End of the Retrieved Content] """ Recommendation_prompt = """You're an AI Assistant tasked with helping an analyst improve their generative AI evaluation results that use LLM-as-a-judge.