Evalanche release 11-25-2024
sfc-gh-jsummer committed Nov 25, 2024
1 parent e232017 commit 11ae44b
Showing 10 changed files with 471 additions and 194 deletions.
15 changes: 12 additions & 3 deletions framework-evalanche/README.md
@@ -21,7 +21,7 @@ Please see TAGGING.md for details on object comments.
* [Running](#release)
* [Extras](#extras)
+ [Custom Metrics](#custom-metrics)
+ [Crafting a LLM Pipeline Stored Procedure](#crafting-a-llm-pipeline-stored-procedure)
+ [Generating Results to Evaluate](#generating-results-to-evaluate)

# Overview
Evalanche is a Streamlit in Snowflake (SiS) application that provides a single location to evaluate and compare generative AI use case outputs in a streamlined, on-demand, and automated fashion. Whether your goal is to measure the quality of RAG-based LLM solutions or the accuracy of SQL generation, Evalanche provides a scalable, customizable, and trackable way to do it.
@@ -95,7 +95,10 @@ CALL GENAI_UTILITIES.EVALUATION.DELETE_METRIC('Rudeness');

Lastly, please be aware that Streamlit in Snowflake now supports multiple Python versions. Custom metrics are only available in sessions running the same Python version they were created with. For example, if you create a custom metric while running the app with Python 3.11, that metric will only be available in subsequent sessions running Python 3.11.

## Crafting a LLM Pipeline Stored Procedure
## Generating Results to Evaluate
Evalanche primarily assumes you've already saved LLM outputs to table(s) in Snowflake for evaluation. If that's not the case, Evalanche supports two ways to generate outputs: a custom LLM pipeline or a Cortex Analyst runner. Both options are available from the data page (under "Need to Generate Results?") once you've selected your desired Metric(s).

### Crafting a Stored Procedure for your Custom LLM Pipeline
To run a reference dataset through your desired LLM pipeline on the data page, we must first encapsulate the pipeline logic in a Stored Procedure. To take advantage of this feature, the stored procedure must have a single VARIANT input and return a single value. When we execute the stored procedure, a single row from the reference dataset will be passed to it in the form of a Python dictionary. In other words, a row in the reference dataset that looks like:
```markdown
| TASK | PERSONA |
@@ -109,7 +112,7 @@ will be passed to the stored procedure as:
"PERSONA": "Pirate"
}
```
A appropriately crafted stored procedure could look like the below.
An appropriately crafted stored procedure could look like the below.
```sql
CREATE OR REPLACE PROCEDURE MY_PIPELINE(INPUT VARIANT)
RETURNS STRING
@@ -131,3 +134,9 @@ def run(session, INPUT):
prompt = prompt)
$$;
```
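
If you want to sanity-check the stored procedure before pointing the data page at it, you can call it directly with a hand-built VARIANT. This is a minimal sketch: the procedure name matches the example above, and the TASK/PERSONA values are placeholder inputs, not part of the app.
```sql
-- Hypothetical smoke test: pass one reference-dataset row as a VARIANT.
-- The TASK and PERSONA values below are illustrative placeholders.
CALL MY_PIPELINE(PARSE_JSON('{"TASK": "Summarize our return policy", "PERSONA": "Pirate"}'));
```
The returned value is what gets recorded as the pipeline output for that row.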

### Using the Cortex Analyst Runner
To run a gold or reference set of questions through Cortex Analyst, select the target semantic model and the table containing the reference questions. The SQL results will be written to a table for further evaluation with the Cortex Analyst-suggested metric.
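
Under the hood, this runner uses the `GENAI_UTILITIES.EVALUATION.CORTEX_ANALYST_SQL` stored procedure added in `setup/cli_setup.sql` (shown further down in this commit), which takes a question and a semantic model file path and returns the generated SQL text. A rough sketch of calling it directly, with a hypothetical question and stage path, might look like:
```sql
-- Illustrative only: the question and the stage path are hypothetical.
-- The procedure prepends '@' to the path, so pass it without a leading '@'.
CALL GENAI_UTILITIES.EVALUATION.CORTEX_ANALYST_SQL(
    'Which region had the highest total revenue last quarter?',
    'MY_DB.MY_SCHEMA.SEMANTIC_MODELS/revenue_model.yaml'
);
```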

# Feedback
Please add issues to GitHub or email Jason Summer ([email protected]).
347 changes: 224 additions & 123 deletions framework-evalanche/pages/data.py

Large diffs are not rendered by default.

119 changes: 77 additions & 42 deletions framework-evalanche/pages/results.py
@@ -21,8 +21,9 @@
SAVED_EVAL_TABLE,
STAGE_NAME,
add_row_id,
run_async_sql_to_dataframe,
)
from src.metrics import Metric
from src.metrics import Metric, SQLResultsAccuracy


def get_result_title() -> str:
@@ -268,7 +269,7 @@ def give_recommendation_instruction() -> None:
)


def get_metric_cols(current_df: DataFrame) -> list:
def get_metric_cols(current_df: Union[DataFrame, pd.DataFrame]) -> list:
"""Returns list of columns in dataframe that contain metric values.
Some metric names have spaces and Snowpark keeps them in lower case with double quotes.
@@ -279,6 +280,7 @@ def get_metric_cols(current_df: DataFrame) -> list:
return [c_name for c_name in df_columns if c_name.upper() in (m_name.upper() for m_name in metric_names)]



def show_metric() -> None:
"""Renders metric KPIs based on selected metrics."""

@@ -297,12 +299,13 @@ def show_metric() -> None:
Please create a new evaluation or select an existing one from the homepage.""")
st.stop()

if st.session_state.get("metric_result_data", None) is not None:
df = st.session_state["metric_result_data"]
if st.session_state.get("result_data", None) is not None:
df = st.session_state["result_data"]
metric_names = [metric.get_column() for metric in st.session_state["metrics_in_results"]]
kpi_row = row(6, vertical_align="top")
# Placing entire dataframe in memory seems to be more stable than iterating over columns and averaging in snowpark
metric_values = df.select(*metric_names).to_pandas()
# metric_values = df.select(*metric_names).to_pandas()
metric_values = df[metric_names]

for metric_name, metric_value in metric_values.mean().to_dict().items():
kpi_row.metric(label=metric_name, value=round(metric_value, 2))
@@ -390,21 +393,44 @@ def update_record(table_update_inputs: Dict[str, str], selected_metric_name: str
st.session_state["result_data"] = df


# metrics = fetch_metrics(st.session_state["session"], STAGE_NAME)
def show_cortex_analyst_sql_results(metric: Metric, prompt_inputs: Dict[str, str]) -> None:
"""Displays data retrieved from SQL used in Cortex Analyst metrics.
Shows results for generated_sql and expected_sql in the prompt_inputs dictionary.
Only shows results if metric matches the name property of SQLResultsAccuracy.
Args:
metric (Metric): Metric associated with the selected record; results are shown only for SQLResultsAccuracy.
prompt_inputs (dict[str, str]): Dictionary of prompt inputs for the metric.
"""

if type(metric).__name__ == type(SQLResultsAccuracy()).__name__:
with st.expander("Retrieved Data", expanded=False):
st.caption("Results limited to 100 rows.")
for key in ["generated_sql", "expected_sql"]:
st.write(f"{key.upper()} Result")
if key in prompt_inputs:
try:
inference_data = run_async_sql_to_dataframe(metric.session, prompt_inputs[key])
st.dataframe(inference_data,
hide_index = True,)
except Exception as e:
st.write(f"Error: {e}")
else:
st.write("No data returned")

@st.experimental_dialog("Review Record", width="large")
def review_record() -> None:
"""Render dialog box to review a metric result record."""


st.write("Analyze and explore the selected record. Model selection will be used for analysis and metric rerunning. Updates can be saved to viewed results.")
if st.session_state["selected_dict"] is None or len(st.session_state["selected_dict"]) == 0:
st.write("Please select a record to review.")
elif len(st.session_state["selected_dict"]) > 1:
st.write("Please select only one record to review at a time.")
else:
# Only first record is selected for analysis
selected_record = st.session_state["selected_dict"][0]
# metrics = fetch_metrics(st.session_state["session"], STAGE_NAME)
metric_cols = get_metric_cols(st.session_state.get("metric_result_data", None))

metric_col, model_col = st.columns(2)
@@ -435,7 +461,10 @@ def review_record() -> None:
for key, value in st.session_state["param_selection"][
matching_metric.name
].items():
entered_value = st.text_area(value, selected_record[value])
entered_value = st.text_area(value,
selected_record[value],
key = value)

prompt_inputs[key] = entered_value
table_update_inputs[value] = entered_value
metric_col, comment_col = st.columns((1, 4))
@@ -452,16 +481,29 @@ def review_record() -> None:
on_click = rerun_metric, args = (prompt_inputs, matching_metric),
use_container_width=True,)
save = bottom_selection.button("Save", disabled = selected_metric_name is None,
use_container_width=True,)
use_container_width=True,
help = "Save changes to record in current view.")

# Unsaved changes in the dialog may linger if user navigates away and returns.
# Here we provide a reset button to clear out any unsaved changes.
reset = bottom_selection.button("Reset", disabled = selected_metric_name is None,
use_container_width=True,
help = "Reset all unsaved changed to selected record.")

if st.session_state.get('analysis', None) is not None:
st.write(f"**Analysis:** {st.session_state['analysis']}")


# If evaluating SQL, show SQL results of current inputs
show_cortex_analyst_sql_results(matching_metric, prompt_inputs)

if save:
update_record(table_update_inputs,
selected_metric_name,
selected_record['ROW_ID'])
st.rerun()
if reset:
st.rerun()


def show_dataframe_results() -> Optional[pd.DataFrame]:
"""
@@ -477,15 +519,8 @@ def show_dataframe_results() -> Optional[pd.DataFrame]:
pandas Dataframe
"""

if st.session_state.get("metric_result_data", None) is not None:
if st.session_state.get('result_data', None) is None:
st.session_state["result_data"] = add_row_id(st.session_state["metric_result_data"])\
.withColumn("REVIEW", F.lit(False))\
.withColumn("COMMENT", F.lit(None)).to_pandas()

# Store available metrics in session state
st.session_state["metrics"] = fetch_metrics(st.session_state["session"], STAGE_NAME)


if st.session_state.get('result_data', None) is not None:
df_selection = st.data_editor(
st.session_state["result_data"],
hide_index=True,
@@ -498,7 +533,6 @@ def show_dataframe_results() -> Optional[pd.DataFrame]:

return df_selection
else:
st.session_state["result_data"] = None
return None


@@ -509,18 +543,14 @@ def trend_avg_metrics() -> None:
"""

if (
st.session_state.get("metric_result_data", None) is not None
st.session_state.get("result_data", None) is not None
and st.session_state.get("metrics_in_results", None) is not None
):
metric_cols = get_metric_cols(st.session_state.get("metric_result_data", None))
metric_cols = get_metric_cols(st.session_state.get("result_data", None))

# We cast to variant in case the metric is a boolean
df = st.session_state["result_data"].groupby('METRIC_DATETIME')[metric_cols].mean()

# METRIC_DATETIME is batched for every run so there should be many rows per metric calculation set
df = (
st.session_state["metric_result_data"]
.group_by("METRIC_DATETIME")
.agg(*[F.avg(F.to_variant(col)).alias(col) for col in metric_cols])
)
st.write("Average Metric Scores over Time")
st.line_chart(
df,
@@ -536,12 +566,12 @@ def trend_count_metrics() -> None:
"""

if (
st.session_state.get("metric_result_data", None) is not None
st.session_state.get("result_data", None) is not None
and st.session_state.get("metrics_in_results", None) is not None
):
metric_cols = get_metric_cols(st.session_state.get("metric_result_data", None))
metric_cols = get_metric_cols(st.session_state.get("result_data", None))

df = st.session_state["metric_result_data"]
df = st.session_state["result_data"]
st.write("Metric Scores over Time")
st.bar_chart(
df,
@@ -557,20 +587,16 @@ def bar_chart_metrics() -> None:
"""

if (
st.session_state.get("metric_result_data", None) is not None
st.session_state.get("result_data", None) is not None
and len(st.session_state.get("metrics_in_results", []))>0
):
metric_cols = get_metric_cols(st.session_state.get("metric_result_data", None))
metric_cols = get_metric_cols(st.session_state.get("result_data", None))

df = st.session_state["metric_result_data"]
chart_df = (
df.select(metric_cols)
.unpivot("SCORE", "METRIC", metric_cols)
.group_by("METRIC", "SCORE")
.count()
)
df = pd.melt(st.session_state["result_data"],
value_vars=metric_cols, var_name = 'METRIC', value_name = 'SCORE')\
.groupby(['METRIC', 'SCORE']).size().reset_index(name='COUNT')
st.write("Score Counts by Metric")
st.bar_chart(chart_df, x="SCORE", y="COUNT", color="METRIC")
st.bar_chart(df, x="SCORE", y="COUNT", color="METRIC")


def get_trendable_column() -> Union[None, str]:
@@ -617,6 +643,15 @@ def show_results():

from src.app_utils import fetch_warehouses

if st.session_state.get("metric_result_data", None) is not None:
if st.session_state.get('result_data', None) is None:
st.session_state["result_data"] = add_row_id(st.session_state["metric_result_data"])\
.withColumn("REVIEW", F.lit(False))\
.withColumn("COMMENT", F.lit(None)).to_pandas()

# Store available metrics in session state
st.session_state["metrics"] = fetch_metrics(st.session_state["session"], STAGE_NAME)

show_metric()
if st.session_state["eval_funnel"] is not None:
top_row = row(5, vertical_align="top")
51 changes: 50 additions & 1 deletion framework-evalanche/setup/cli_setup.sql
@@ -1,5 +1,5 @@
SET major = 2;
SET minor = 0;
SET minor = 1;
SET COMMENT = concat('{"origin": "sf_sit",
"name": "evalanche",
"version": {"major": ',$major,', "minor": ',$minor,'}}');
@@ -77,6 +77,55 @@ def run(session, metric_name):
return f"An error occurred: {e}"
$$;
-- Cortex Analyst runner
CREATE OR REPLACE PROCEDURE GENAI_UTILITIES.EVALUATION.CORTEX_ANALYST_SQL(prompt STRING, semantic_file_path STRING)
RETURNS STRING
LANGUAGE PYTHON
PACKAGES = ('snowflake-snowpark-python')
RUNTIME_VERSION = '3.9'
HANDLER = 'process_message'
as
$$
import _snowflake
import json
def send_message(messages, semantic_file_path):
"""Calls the REST API and returns the response."""
request_body = {
"messages": messages,
"semantic_model_file": f"@{semantic_file_path}",
}
resp = _snowflake.send_snow_api_request(
"POST",
f"/api/v2/cortex/analyst/message",
{},
{},
request_body,
{},
30000,
)
if resp["status"] < 400:
response_content = json.loads(resp["content"])
return response_content
else:
raise Exception(
f"Failed request with status {resp['status']}: {resp}"
)
def process_message(session, prompt, semantic_file_path):
"""Processes a message and adds the response to the chat."""
messages = []
messages.append(
{"role": "user", "content": [{"type": "text", "text": prompt}]}
)
response = send_message(messages, semantic_file_path)
for item in response["message"]["content"]:
if item["type"] == "sql":
return item.get("statement", None)
else:
return None
$$;
-- Create Streamlit
CREATE OR REPLACE STREAMLIT GENAI_UTILITIES.EVALUATION.EVALUATION_APP
ROOT_LOCATION = '@GENAI_UTILITIES.EVALUATION.STREAMLIT_STAGE'