diff --git a/.github/workflows/e2e_tests.yml b/.github/workflows/e2e_tests.yml index 1f3c1b38..1d07681c 100644 --- a/.github/workflows/e2e_tests.yml +++ b/.github/workflows/e2e_tests.yml @@ -48,7 +48,7 @@ jobs: RECIPES_MODEL_MAX_TOKENS: ${{ secrets.RECIPES_MODEL_MAX_TOKENS }} IMAGE_HOST: ${{ secrets.IMAGE_HOST }} - RECIPE_SERVER_API: ${{ secrets.RECIPE_SERVER_API }} + RECIPE_SERVER_API: ${{ secrets.RECIPE_SERVER_API_FROM_GH_HOST }} CHAINLIT_AUTH_SECRET: ${{ secrets.CHAINLIT_AUTH_SECRET }} USER_LOGIN: ${{ secrets.USER_LOGIN }} @@ -124,7 +124,7 @@ jobs: #python3 call_assistant.py --chat_history '[{"author": "user","content": "Hi!"}, {"author": "user","content": "What is the total population of Mali"}]' #python3 call_assistant.py --chat_history '[{"author": "user","content": "plot a line chart of fatalities by month for Chad using HDX data as an image"}]' # This runs a few, with the script kill, like promptflow, but prints all debug. Good for testing. - # python3 call_assistant_debug.py + python3 call_assistant_debug.py echo "Starting Promptflow batch run using data.jsonl ..." pf run create --flow . --data ./data.jsonl --stream --column-mapping query='${data.query}' context='${data.context}' chat_history='${data.chat_history}' --name base_run diff --git a/.gitignore b/.gitignore index da017ea2..fa58168a 100644 --- a/.gitignore +++ b/.gitignore @@ -25,4 +25,6 @@ management/work/ server/robocorp/actions_plugins/recipe-server/utils data server/fastapi/recipes/ +assistants/chat_ui/files/file_search/custom +assistants/chat_ui/files/code_interpreter/custom diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c77ef912..a57ccf6d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -18,5 +18,5 @@ repos: rev: 1.7.0 hooks: - id: interrogate - args: [--fail-under=65, --verbose] + args: [--fail-under=50, --verbose] exclude: __init__.py diff --git a/README.md b/README.md index 0a7a8340..b545f372 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,12 @@ This repo contains a docker-compose environment that will run the following comp - Assistant Settings - Set to your LLM deployment accordingly 2. `cd data && python3 download_demo_data.py && cd ..` 3. `docker compose up -d --build` -4. Go to [http://localhost:8000/](http://localhost:8000/) +4. `docker compose exec chat python create_update_assistant.py` +5. Update `.env` file and set ASSISTANTS_ID to the value returned from the previous step +6. `docker compose up -d` +7. Go to [http://localhost:8000/](http://localhost:8000/) + + ## Using Recipes @@ -56,9 +61,20 @@ We are in a phase of research to identify and improve recipes, but for now the s ## Additional Features +### Adding your own files for the assistant to analyze + +The assistant can be configured to analyze your own files, either by searching them or by using them when analyzing data on-the-fly. To add your own files, place them in one of the following folders: + +`./assistants/chat_ui/files/file_search/custom` : The assistant will search these files +`./assistants/chat_ui/files/code_interpreter/custom` : The assistant can use these files when generating and running code + +Once you have put your files in the above folders, you can update your assistant by running ... + +`docker compose exec chat python create_update_assistant.py` +
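As a concrete illustration of the steps above (the copied file names are hypothetical placeholders; the folder paths and update command are the ones documented in this section):

```
# Example only: add a PDF for file search and a CSV for the code interpreter,
# then re-run the update script so the assistant picks up the new files.
cp situation_report.pdf ./assistants/chat_ui/files/file_search/custom/
cp field_survey.csv ./assistants/chat_ui/files/code_interpreter/custom/
docker compose exec chat python create_update_assistant.py
```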
### Analysis on Ingested Data -To run the ingestion module for ingested datasets, so assistants and plugins can analysis data on-the-fly as an experimental feature: +By default, the repo comes with some demo HAPI data. To run the ingestion module for ingested datasets yourself, so assistants and plugins can analyze data on-the-fly as an experimental feature: 1. `docker exec -it haa-ingestion /bin/bash` 2. `python3 ingest.py` @@ -171,9 +187,9 @@ To activate: 1. Go to [http://localhost:8091/](http://localhost:8091/) 2. Click on 'Build' -3. Click 'Skills' on left, top right click '...' and import the skill in `./assets` -4. Click 'Agents' on left, top right click '...' and import the skill in `./assets` -5. Click 'Workflows' on left, top right click '...' and import the skill in `./assets` +3. Click 'Skills' on left, top right click '...' and import the skill in `./assistants/recipes_agents/autogen_team/` +4. Click 'Agents' on left, top right click '...' and import the agent in `assistants/recipes_agents/autogen_team/` +5. Click 'Workflows' on left, top right click '...' and import the workflow in `assistants/recipes_agents/autogen_team/` 6. Go to playground and start a new session, select the 'Recipes data Analysis' workflow 7. Ask 'What is the total population of Mali?' diff --git a/assistants/chat_ui/create_update_assistant.py b/assistants/chat_ui/create_update_assistant.py new file mode 100644 index 00000000..0e4ab1c6 --- /dev/null +++ b/assistants/chat_ui/create_update_assistant.py @@ -0,0 +1,284 @@ +import asyncio +import datetime +import glob +import json +import os +import sys +import zipfile + +import pandas as pd +import requests +from dotenv import load_dotenv +from jinja2 import Environment, FileSystemLoader +from openai import AzureOpenAI, OpenAI + +from utils.db import get_data_info + +load_dotenv("../../.env") + +SUPPORTED_ASSISTANT_FILE_TYPES = ["csv", "json", "xlsx", "txt", "pdf", "docx", "pptx"] + +bot_name = os.environ.get("ASSISTANTS_BOT_NAME") +environment = Environment(loader=FileSystemLoader("templates/")) + +# Needed to get common fields standard_names +INTEGRATION_CONFIG = "ingestion.config" +SYSTEM_PROMPT = "instructions.txt" + + +def setup_openai(): + """ + Setup the OpenAI client. + + Returns: + tuple: A tuple containing the OpenAI client, assistant ID, and model. + """ + api_key = os.environ.get("ASSISTANTS_API_KEY") + assistant_id = os.environ.get("ASSISTANTS_ID") + model = os.environ.get("ASSISTANTS_MODEL") + api_type = os.environ.get("ASSISTANTS_API_TYPE") + api_endpoint = os.environ.get("ASSISTANTS_BASE_URL") + api_version = os.environ.get("ASSISTANTS_API_VERSION") + + if api_type == "openai": + print("Using OpenAI API") + client = OpenAI(api_key=api_key) + elif api_type == "azure": + print("Using Azure API") + print(f"Endpoint: {api_endpoint}") + client = AzureOpenAI( + api_key=api_key, + api_version=api_version, + azure_endpoint=api_endpoint, + default_headers={"OpenAI-Beta": "assistants=v2"}, + ) + else: + print("API type not supported") + sys.exit(1) + + return client, assistant_id, model + + +def get_common_field_standard_names(): + """ + Get the standard names of common fields from the integration configuration file. + + Returns: + dict: A dictionary of standard names of common fields. + """ + with open(INTEGRATION_CONFIG) as f: + print(f"Reading {INTEGRATION_CONFIG}") + config = json.load(f) + return config["standard_names"] + + +def get_local_files(assistant_file_type): + """ + Get local files of a specific type. + + Args: + assistant_file_type (str): The type of file to get, "file_search" or "code_interpreter" + + Returns: + list: A list of file paths.
+ """ + + if assistant_file_type not in ["file_search", "code_interpreter"]: + print(f"Assistant file type {assistant_file_type} is not supported") + sys.exit(1) + + file_paths = [] + for file_type in SUPPORTED_ASSISTANT_FILE_TYPES: + dir = f"./files/{assistant_file_type}" + files = glob.glob(f"{dir}/**/*.{file_type}", recursive=True) + file_paths = file_paths + files + + print( + f"Found {len(file_paths)} files - {file_paths} - of type {assistant_file_type}" + ) + + return file_paths + + +def upload_files_for_code_interpreter(client): + """ + Uploads files for the code interpretor section of the assistant. + + Args: + client (OpenAI): The OpenAI client. + assistant_id (str): The assistant ID. + """ + file_ids = [] + + file_paths = get_local_files("code_interpreter") + + for file_path in file_paths: + print(f"Uploading {file_path} to code interpreter ...") + file = client.files.create(file=open(file_path, "rb"), purpose="assistants") + file_ids.append(file.id) + + return file_ids + + +def upload_files_to_vector_store(vector_store_name, client): + """ + Uploads files to the vector store. + + Args: + vestor_store_name(str): The name of the vector store. + client (OpenAI): The OpenAI client. + """ + + file_paths = get_local_files("file_search") + + # No files to upload + if len(file_paths) == 0: + print("No files found to upload to vector store") + return None + + print(f"Uploading {file_paths} to vector store {vector_store_name} ...") + + # Create a vector store caled "Financial Statements" + vector_store = client.beta.vector_stores.create(name=vector_store_name) + + # Ready the files for upload to OpenAI + file_streams = [open(path, "rb") for path in file_paths] + + # Use the upload and poll SDK helper to upload the files, add them to the vector store, + # and poll the status of the file batch for completion. + file_batch = client.beta.vector_stores.file_batches.upload_and_poll( + vector_store_id=vector_store.id, files=file_streams + ) + + # You can print the status and the file counts of the batch to see the result of this operation. + print(file_batch.status) + print(file_batch.file_counts) + + return vector_store.id + + +def get_manually_defined_functions(): + """ + Get a list of manually defined functions. + + Returns: + list: A list of dictionaries representing the manually defined functions in openai format + """ + functions = [ + { + "type": "function", + "function": { + "name": "call_execute_query_api", + "description": "Execute Query", + "parameters": { + "properties": { + "sql": { + "type": "string", + "title": "SQL", + "description": "The SQL query to be executed. Only read queries are allowed.", + } + }, + "type": "object", + "required": ["sql"], + }, + }, + } + ] + + return functions + + +def create_update_assistant(): + """ + Creates or updates a humanitarian response assistant. + + To force creation of a new assistant, be sure that ASSITANT_ID is not set in the .env file. 
+ + """ + + # Client setup + client, assistant_id, model = setup_openai() + + # Information on common names + standard_names = get_common_field_standard_names() + + # Get database information for system prompt + data_info = get_data_info() + print(data_info) + + # Load code examples + template = environment.get_template("openai_assistant_prompt.jinja2") + instructions = template.render( + data_info=data_info, + country_code_field=standard_names["country_code_field"], + admin1_code_field=standard_names["admin1_code_field"], + admin2_code_field=standard_names["admin2_code_field"], + admin3_code_field=standard_names["admin3_code_field"], + ) + + # Save for debugging + with open(SYSTEM_PROMPT, "w") as f: + f.write(instructions) + + # Upload any local files needed by assistant for file_search (RAG) + vector_store_id = upload_files_to_vector_store("local_files_vectore_store", client) + + # Upload any files that will be used for code_interpretor + code_interpreter_file_ids = upload_files_for_code_interpreter(client) + + params = { + "name": bot_name, + "instructions": instructions, + "model": model, + "temperature": 0.1, + } + + tool_resources = {} + + # Set tools + tools = [{"type": "code_interpreter"}] + + # Add file search if we have data + if vector_store_id is not None: + tools.append({"type": "file_search"}) + params["tool_resources"] = { + "file_search": {"vector_store_ids": [vector_store_id]} + } + tool_resources["file_search"] = {"vector_store_ids": [vector_store_id]} + + if len(code_interpreter_file_ids) > 0: + tool_resources["code_interpreter"] = {"file_ids": code_interpreter_file_ids} + + # Add manually defined functions + tools = tools + get_manually_defined_functions() + + params["tools"] = tools + + # Add in tool files as needed + if "code_interpreter" in tool_resources or "file_search" in tool_resources: + params["tool_resources"] = tool_resources + + # If we were provided an ID in .env, pass it in to update existing assistant + if assistant_id is not None: + params["assistant_id"] = assistant_id + + print(json.dumps(params, indent=4)) + + if assistant_id is None: + print( + f"Calling assistant API for ID: {assistant_id}, name: {bot_name} and model {model} ..." + ) + assistant = client.beta.assistants.create(**params) + print("Assistant created!! Here is the assistant ID:\n") + print(assistant.id) + print("\nNow update ASSISTANTS_ID in your .env file with this ID") + else: + print( + f"Calling assistant API for ID: {assistant_id}, name: {bot_name} and model {model} ..." 
+ ) + assistant = client.beta.assistants.update(**params) + print("Assistant updated!!") + + +if __name__ == "__main__": + create_update_assistant() diff --git a/ui/chat-chainlit-flow/actions.py b/assistants/chat_ui/files/code_interpreter/core/.gitkeep old mode 100755 new mode 100644 similarity index 100% rename from ui/chat-chainlit-flow/actions.py rename to assistants/chat_ui/files/code_interpreter/core/.gitkeep diff --git a/assistants/chat_ui/files/code_interpreter/core/test.txt b/assistants/chat_ui/files/code_interpreter/core/test.txt new file mode 100644 index 00000000..9daeafb9 --- /dev/null +++ b/assistants/chat_ui/files/code_interpreter/core/test.txt @@ -0,0 +1 @@ +test diff --git a/assistants/chat_ui/files/file_search/core/HDIP FAQs (External) .pdf b/assistants/chat_ui/files/file_search/core/HDIP FAQs (External) .pdf new file mode 100644 index 00000000..0ef7f584 Binary files /dev/null and b/assistants/chat_ui/files/file_search/core/HDIP FAQs (External) .pdf differ diff --git a/assistants/openai_assistants/create_update_assistant.py b/assistants/openai_assistants/create_update_assistant.py deleted file mode 100644 index f163f2ee..00000000 --- a/assistants/openai_assistants/create_update_assistant.py +++ /dev/null @@ -1,255 +0,0 @@ -import asyncio -import datetime -import glob -import json -import os -import sys -import zipfile - -import pandas as pd -import requests -from dotenv import load_dotenv -from jinja2 import Environment, FileSystemLoader -from openai import AzureOpenAI, OpenAI - -load_dotenv("../../.env") - -api_key = os.environ.get("ASSISTANTS_API_KEY") -assistant_id = os.environ.get("ASSISTANTS_ID") -model = os.environ.get("ASSISTANTS_MODEL") -api_type = os.environ.get("ASSISTANTS_API_TYPE") -api_endpoint = os.environ.get("ASSISTANTS_BASE_URL") -api_version = os.environ.get("ASSISTANTS_API_VERSION") -bot_name = os.environ.get("ASSISTANTS_BOT_NAME") -environment = Environment(loader=FileSystemLoader("templates/")) - -file_to_func_map_loc = "./file_to_func_map.json" -data_files_location = "../../ingestion/api" - -# Needed to get common fields standard_names -INTEGRATION_CONFIG = "../../ingestion/ingestion.config" -SYSTEM_PROMPT = "instructions.txt" - -if api_type == "openai": - print("Using OpenAI API") - client = OpenAI(api_key=api_key) -elif api_type == "azure": - print("Using Azure API") - print(f"Endpoint: {api_endpoint}") - client = AzureOpenAI( - api_key=api_key, - api_version=api_version, - azure_endpoint=api_endpoint, - default_headers={"OpenAI-Beta": "assistants=v2"}, - ) -else: - print("API type not supported") - sys.exit(1) - - -def get_common_field_standard_names(): - """ - Get the standard names of common fields from the integration configuration file. - - Returns: - list: A list of standard names of common fields. - """ - with open(INTEGRATION_CONFIG) as f: - print(f"Reading {INTEGRATION_CONFIG}") - config = json.load(f) - return config["standard_names"] - - -def get_manually_defined_functions(): - """ - Get a list of manually defined functions. - - Returns: - list: A list of dictionaries representing the manually defined functions. - """ - # functions = [ - # { - # "function": { - # "name": "get_info_about_datasets", - # "parameters": {}, - # "description": """ - # Get a JSON object containing information about the datasets you have access to. - # This includes which types of data, the countries they include and columns within each datafiles. 
- # Use this function for questions about the data you have - # """, - # } - # } - # ] - functions = [] - if len(functions) > 0: - functions_openai_fmt = [] - for f in functions: - f = { - "type": "function", - "function": f["function"], - } - functions_openai_fmt.append(f) - return functions_openai_fmt - - -def upload_files_to_openai(standard_names): - """ - Uploads files to OpenAI and returns a prompt string and a list of file IDs. - - Args: - standard_names (dict): A dictionary containing common field standard_names. - - Returns: - file_prompt (str): A string containing information about the uploaded files. - file_ids (list): A list of file IDs generated by OpenAI. - """ - - files = [] - files += glob.glob(f"{data_files_location}/**/*.csv", recursive=True) - files += glob.glob(f"{data_files_location}/**/*geoBoundaries*.zip", recursive=True) - file_prompt = "" - file_ids = [] - - # sort files with csv first, then zip - files = sorted(files, key=lambda x: x.split(".")[-1]) - - datafiles = [] - for f in files: - print(f) - countries = "" - first_line = "" - # Get column standard_names from first line - if f.endswith(".csv"): - df = pd.read_csv(f) - first_line = list(df.columns) - if standard_names["country_code_field"] in first_line: - countries = list(df[standard_names["country_code_field"]].unique()) - - print(f"Uploading {f} ...") - file = client.files.create(file=open(f, "rb"), purpose="assistants") - - r = {} - if f.endswith(".csv"): - file_loc = f"/mnt/data/{file.id}" - r["file_location"] = file_loc - r["_original_file_name"] = f.split("/")[-1] - metadata_file = f.replace(".csv", "_meta.json") - r["description"] = "This is CSV data" - - # If we have a metadata file, use that - if os.path.exists(metadata_file): - with open(metadata_file) as mf: - metadata = json.load(mf) - description = "" - for f in ["tags", "summary", "description"]: - if f in metadata["get"]: - description += str(metadata["get"][f]) + "\n" - r["description"] = description - - r["columns"] = first_line - r["countries"] = countries - elif "geoBoundaries" in f: - r["zip_file_location_with_shapefiles"] = f"/mnt/data/{file.id}" - r["_original_file_name"] = f - r["description"] = ( - "This file contains administrative boundary data for countries and admin level as specified" - ) - r["admin_level"] = f.split("geoBoundaries-")[1][0:4] - # Intentionall removed some columns here for clarity - r["columns"] = [ - "Shape_Leng", - "Shape_Area", - f"{standard_names['admin0_code_field']}", - f"{standard_names['admin1_code_field']}", - f"{standard_names['admin2_code_field']}", - f"{standard_names['admin3_code_field']}", - "ADM1_REF", - "date", - "validOn", - "validTo", - "geometry", - ] - - with zipfile.ZipFile(f, "r") as zip_ref: - shape_files = [] - files_in_zip = zip_ref.namelist() - for zf in files_in_zip: - if zf.endswith(".shp"): - r2 = {} - r2["shape_file"] = zf - r2["country"] = zf[0:3].upper() - shape_files.append(r2) - - r["shapefiles"] = shape_files - - datafiles.append(r) - print(json.dumps(datafiles, indent=4)) - - file_ids.append(file.id) - - file_prompt = json.dumps(datafiles, indent=4) - - return file_prompt, file_ids - - -def create_update_assistant(): - """ - Creates or updates a humanitarian response assistant. - - To force creation of a new assistant, be sure that ASSITANT_ID is not set in the .env file. 
- - """ - - standard_names = get_common_field_standard_names() - files_prompt, file_ids = upload_files_to_openai(standard_names) - - # Load code examples - template = environment.get_template("sample_code.jinja") - sample_code = template.render(admin1_code_name=standard_names["country_code_field"]) - - # Populate system prompt - template = environment.get_template("assistant_instructions.jinja") - instructions = template.render( - admin0_code_field=standard_names["admin0_code_field"], - admin1_code_field=standard_names["admin1_code_field"], - admin2_code_field=standard_names["admin2_code_field"], - admin3_code_field=standard_names["admin3_code_field"], - sample_code=sample_code, - files_prompt=files_prompt, - ) - - # Save for debugging - with open(SYSTEM_PROMPT, "w") as f: - f.write(instructions) - - tools = [{"type": "code_interpreter"}] - - # Find if agent exists. v1 needs a try/except for this, TODO upgrade to v2 API - try: - print( - f"Updating existing assistant {assistant_id} {bot_name} and model {model} ..." - ) - assistant = client.beta.assistants.update( - assistant_id, - name=bot_name, - instructions=instructions, - tools=tools, - model=model, - file_ids=file_ids, - ) - except Exception: - print(f"Creating assistant with model {model} ...") - assistant = client.beta.assistants.create( - name=bot_name, - instructions=instructions, - tools=tools, - model=model, - file_ids=file_ids, - ) - print("Assistant created!! Here is the assistant ID:") - print(assistant.id) - print("Now save the ID in your .env file so next time it's updated") - - -if __name__ == "__main__": - create_update_assistant() diff --git a/assistants/openai_assistants/templates/assistant_instructions.jinja2 b/assistants/openai_assistants/templates/assistant_instructions.jinja2 deleted file mode 100644 index ea1cb68a..00000000 --- a/assistants/openai_assistants/templates/assistant_instructions.jinja2 +++ /dev/null @@ -1,60 +0,0 @@ - - "You are a helpful humanitarian response analyst. You answer data-related questions using only the data sources provided in your functions" - - "You only answer questions about humanitarian data, nothing else" - - "Never, ever use sample data, always use real data from the files or functions provided" - - "When plotting numerical scales don't use scientific notation, use thousands, millions, billions etc" - - "Here is the mapping column for locations between tabular datasets and shapefiles: - administrative levels 0 : {{ admin0_code_field }} - administrative levels 1 : {{ admin1_code_field }} - administrative levels 2 : {{ admin2_code_field }} - administrative levels 3 : {{ admin3_code_field }}" - - "You have been provided files to analyze, these are found '/mnt/data/'." - - "You do not need to add a suffix like '.csv' or .zip' when reading the files provided" - - "You do not output your analysis plan, just the answer" - - "If asked what data you have, list the data you have but don't provide file standard_names or IDs. Do provide the type of data though, eg population" - - "Add tabular data is from the humanitarian data exchange (HDX) new HAPI API" - - "ALWAYS filter tabular data by code variables, not standard_names. 
So for example {{ admin0_code_field }} for country, {{ admin1_code_field }} for admin level 1 etc" - - "Gender columns are set to 'm' or 'f' if set" - - "When generating code, define all files and folders as variables at the top of your code, then reference in code below" - - "Always make sure the variable for the folder name to extract zip files is different to variable for the location of the zip file" - - "ALWAYS Import the following modules in generated code: pandas, geopandas, matplotlib.pyplot, zipfile, os" - - "If asked to display a table, use the 'display' command in python" - - "Always display generated images inline, NEVER give a link to the image or map" - - "If you generate code, run it" - - "If a dataset has admin standard_names in it, no need to merge with administrative data" - - - -=============== - -These are the data files you have access to: - -{{ files_prompt }} - - -Boundary shape files needed for maps can be found in the provided zip files of format geoBoundaries-adm1-countries_a-z.zip -The file standard_names indicate what country and admin level they relate too, eg 'ukr_admbnda_adm1.shp' where 'ukr' is Ukraine and adm1 indicates admin level 1The unzipped shapefiles have country code in the first 3 letters of their name, eg ukr_admbnda_adm1.shp (the date part can change depending on country) -Only use boundary zip files if you have been explicitly asked to plot on a map. No need to use for other plots -When merging shapefiles with HDX datafiles, use columns {{ admin0_code_field }} for admin 0, {{ admin1_code_field }} for admin level 1 and {{ admin2_code_field }} for admin level 2 - -======= SAMPLE CODE ======== - -{{ sample_code }} \ No newline at end of file diff --git a/assistants/openai_assistants/templates/sample_code.jinja2 b/assistants/openai_assistants/templates/sample_code.jinja2 deleted file mode 100644 index b140c561..00000000 --- a/assistants/openai_assistants/templates/sample_code.jinja2 +++ /dev/null @@ -1,60 +0,0 @@ -EXAMPLE PYTHON CODE TO USE: - -1. Example of plotting Admin 1 population data on a map - -To plot data on a map, you need to follow these steps ... - -1. Read the HDX data from the provided file. -2. Filter the data for the task, eg by country, state, date, gender, etc -3. Unzip the boundaries for the admin level requested from the provided zip file. -4. Find the country's shapefile for admin level in the unzipped folder. -5. Load shapefile using GeoPandas. -6. Group the HDX data by admin code (eg admin1_code) to sum up the total per admin level -7. Merge the HDX data with the GeoPandas dataframe using admin1_code,and corresponding ADM PCODE field in the shapefile -8. Plot the map showing the data by admin level - -The following example shows how to read HDX data, and the provided shapefiles, and combine them to plot a map. -You would change the names of files, admin level etc depending on what you were asked. 
- -``` -import pandas as pd -import geopandas as gpd -import matplotlib.pyplot as plt -import zipfile -import os - -# Load the Mali population data -population_df = pd.read_csv('/mnt/data/file-jSXieGAgEX0roYaN8yMy1IyM') - -# Filter the population data for Mali -mali_population_df = population_df[population_df['location_name'] == 'Mali'] - -# Unzipping the admin level 1 boundaries -zip_file = '/mnt/data/file-WGDAzLoP0a5SqDKEuf4x7aSe' -zip_file_extract_folder = '/mnt/data/geoBoundaries' -shape_file = 'mli_admbnda_adm1.shp' - -with zipfile.ZipFile(zip_file, 'r') as zip_ref: - zip_ref.extractall(zip_file_extract_folder) - -# Load Mali's shapefile -mali_gdf = gpd.read_file(f"{zip_file_extract_folder}/{shape_file}") - -# Group the population by admin1_code and sum up to get the total population per admin1 -mali_population_by_admin1 = mali_population_df.groupby('{{ admin1_code_name }}')['population'].sum().reset_index() - -# Merge the population data with the geopandas dataframe using admin1_code -mali_gdf_merged = mali_gdf.merge(mali_population_by_admin1, left_on='{{ admin1_code_name }}', right_on='{{ admin1_code_name }}') - -# Plotting the map -fig, ax = plt.subplots(1, 1, figsize=(10, 10)) -mali_gdf_merged.plot(column='population', ax=ax, legend=True, - legend_kwds={'label': "Population by Admin1", - 'orientation': "horizontal"}) -ax.set_title('Population by Admin1 in Mali') - -# Remove axes for clarity -ax.set_axis_off() - -plt.show() -``` diff --git a/assistants/recipes_agents/autogen_team/agent_recipes_data_analysis_assistant.json b/assistants/recipes_agents/autogen_team/agent_recipes_data_analysis_assistant.json new file mode 100644 index 00000000..38ad24a1 --- /dev/null +++ b/assistants/recipes_agents/autogen_team/agent_recipes_data_analysis_assistant.json @@ -0,0 +1 @@ +{"type":"assistant","config":{"name":"recipes_data_analysis_assistant","llm_config":{"config_list":[{"model":"gpt-4-1106-preview"}],"temperature":0.1,"cache_seed":null,"timeout":600,"max_tokens":null,"extra_body":null},"human_input_mode":"NEVER","max_consecutive_auto_reply":8,"system_message":"You are a helpful AI assistant that generates and runs code to answer questions about humanitarian response. \n\nIMPORTANT: You ONLY use the skills you have been provided to get data. \n\nWhen you first start run this query to see what tables and columns you have access to: `select table_name, api_name, summary, columns from table_metadata`\n\nadm0_code are 3-letter country ISO codes\n\nadm1 fields are for states within a country\n\nIf you create a plot, you MUST output an image file.\n\nLink Shapefiles to other data using adm1_code\n\nUnless the user is asking for data changes over time, add the following clause to all queries to get the latest data ...\n\n`group by\n\treference_period_start\nhaving\n reference_period_start = MAX(reference_period_start)`\n\nWhen generating code, ALWAYS put the task in a function with parameters so that it can be reused.\n\nThe shapefile in the database will need to be converted to a geoseries for plotting, here is an example:\n\n` ``\n# Convert the data into a DataFrame\ndf = pd.DataFrame(rows, columns=[\"adm1_code\", \"population\", \"geometry\"])\n\n# Convert the 'geometry' column into a GeoSeries\ndf['geometry'] = df['geometry'].apply(lambda x: wkb.loads(x, hex=True))\n\n# Convert the DataFrame into a GeoDataFrame\ngdf = gpd.GeoDataFrame(df, geometry='geometry')\n```\n\nSolve tasks using your coding and language skills. 
In the following cases, suggest python code (in a python coding block) or shell script (in a sh coding block) for the user to execute. 1. When you need to collect info, use the code to output the info you need, for example, browse or search the web, download/read a file, print the content of a webpage or a file, get the current date/time, check the operating system. After sufficient info is printed and the task is ready to be solved based on your language skill, you can solve the task by yourself. 2. When you need to perform some task with code, use the code to perform the task and output the result. Finish the task smartly. Solve the task step by step if you need to. If a plan is not provided, explain your plan first. Be clear which step uses code, and which step uses your language skill. When using code, you must indicate the script type in the code block. The user cannot provide any other feedback or perform any other action beyond executing the code you suggest. The user can't modify your code. So do not suggest incomplete code which requires users to modify. Don't use a code block if it's not intended to be executed by the user. If you want the user to save the code in a file before executing it, put # filename: inside the code block as the first line. Don't include multiple code blocks in one response. Do not ask users to copy and paste the result. Instead, use 'print' function for the output when relevant. Check the execution result returned by the user. If the result indicates there is an error, fix the error and output the code again. Suggest the full code instead of partial code or code changes. If the error can't be fixed or if the task is not solved even after the code is executed successfully, analyze the problem, revisit your assumption, collect additional info you need, and think of a different approach to try. When you find an answer, verify the answer carefully. Include verifiable evidence in your response if possible. 
Reply 'TERMINATE' in the end when everything is done.","is_termination_msg":null,"code_execution_config":null,"default_auto_reply":"","description":"A data analysis assistant agent that writes plans and code to solve tasks."},"timestamp":"2024-05-12T14:22:21.797901","user_id":"default","skills":[{"title":"query_data_db","content":"\n ## This is a skill to execute database queires in the data databse,\n ## For answering questions about humanitarian response.\n\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport psycopg2\nimport os\nfrom dotenv import load_dotenv\n\nload_dotenv()\n\ndef get_connection():\n \"\"\"\n This function gets a connection to the database\n \"\"\"\n host = os.getenv(\"POSTGRES_DATA_HOST\")\n port = os.getenv(\"POSTGRES_DATA_PORT\")\n database = os.getenv(\"POSTGRES_DATA_DB\")\n user = os.getenv(\"POSTGRES_DATA_USER\")\n password = os.getenv(\"POSTGRES_DATA_PASSWORD\")\n\n conn = psycopg2.connect(\n dbname=database,\n user=user,\n password=password,\n host=host,\n port=port\n )\n return conn\n\ndef execute_query(query):\n \"\"\"\n This skill executes a query in the data database.\n\n To find out what tables and columns are available, you can run \"select table_name, api_name, summary, columns from table_metadata\" \n\n \"\"\"\n conn = get_connection()\n cur = conn.cursor()\n\n # Execute the query\n cur.execute(query)\n\n # Fetch all the returned rows\n rows = cur.fetchall()\n\n # Close the cursor and connection\n cur.close()\n conn.close()\n\n return rows\n","file_name":null,"description":null,"timestamp":"2024-05-12T14:21:38.809485","user_id":"default"}]} \ No newline at end of file diff --git a/assistants/recipes_agents/autogen_team/skill_recipes_query_data_db.json b/assistants/recipes_agents/autogen_team/skill_recipes_query_data_db.json new file mode 100644 index 00000000..4d2b64ba --- /dev/null +++ b/assistants/recipes_agents/autogen_team/skill_recipes_query_data_db.json @@ -0,0 +1 @@ +{"title":"recipes_query_data_db","content":"\n ## This is a skill to execute database queires in the data databse,\n ## For answering questions about humanitarian response.\n\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport psycopg2\nimport os\nfrom dotenv import load_dotenv\n\nload_dotenv()\n\ndef get_connection():\n \"\"\"\n This function gets a connection to the database\n \"\"\"\n host = os.getenv(\"POSTGRES_DATA_HOST\")\n port = os.getenv(\"POSTGRES_DATA_PORT\")\n database = os.getenv(\"POSTGRES_DATA_DB\")\n user = os.getenv(\"POSTGRES_DATA_USER\")\n password = os.getenv(\"POSTGRES_DATA_PASSWORD\")\n\n conn = psycopg2.connect(\n dbname=database,\n user=user,\n password=password,\n host=host,\n port=port\n )\n return conn\n\ndef execute_query(query):\n \"\"\"\n This skill executes a query in the data database.\n\n To find out what tables and columns are available, you can run \"select table_name, api_name, summary, columns from table_metadata\" \n\n \"\"\"\n conn = get_connection()\n cur = conn.cursor()\n\n # Execute the query\n cur.execute(query)\n\n # Fetch all the returned rows\n rows = cur.fetchall()\n\n # Close the cursor and connection\n cur.close()\n conn.close()\n\n return rows\n","file_name":null,"description":null,"timestamp":"2024-05-12T14:21:38.809485","user_id":"default"} \ No newline at end of file diff --git a/assistants/recipes_agents/autogen_team/workflow_Recipes Data Analysis Workflow.json b/assistants/recipes_agents/autogen_team/workflow_Recipes Data Analysis Workflow.json new file mode 100644 index 00000000..475511d9 --- /dev/null +++ 
b/assistants/recipes_agents/autogen_team/workflow_Recipes Data Analysis Workflow.json @@ -0,0 +1 @@ +{"name":"Recipes Data Analysis Workflow","description":"This workflow is used for doing data analysis using the database and API provided in your skills","sender":{"type":"userproxy","config":{"name":"userproxy","llm_config":false,"human_input_mode":"NEVER","max_consecutive_auto_reply":10,"system_message":"You are a helpful assistant.","is_termination_msg":null,"code_execution_config":{"work_dir":null,"use_docker":false},"default_auto_reply":"TERMINATE","description":"A user proxy agent that executes code."},"timestamp":"2024-05-12T14:22:21.798398","user_id":"default","skills":null},"receiver":{"type":"assistant","config":{"name":"recipes_data_analysis_assistant","llm_config":{"config_list":[{"model":"gpt-4-1106-preview"}],"temperature":0.1,"cache_seed":null,"timeout":600,"max_tokens":null,"extra_body":null},"human_input_mode":"NEVER","max_consecutive_auto_reply":8,"system_message":"You are a helpful AI assistant that generates and runs code to answer questions about humanitarian response. \n\nIMPORTANT: You ONLY use the skills you have been provided to get data. \n\nWhen you first start run this query to see what tables and columns you have access to: `select table_name, api_name, summary, columns from table_metadata`\n\nadm0_code are 3-letter country ISO codes\n\nadm1 fields are for states within a country\n\n\nSolve tasks using your coding and language skills. In the following cases, suggest python code (in a python coding block) or shell script (in a sh coding block) for the user to execute. 1. When you need to collect info, use the code to output the info you need, for example, browse or search the web, download/read a file, print the content of a webpage or a file, get the current date/time, check the operating system. After sufficient info is printed and the task is ready to be solved based on your language skill, you can solve the task by yourself. 2. When you need to perform some task with code, use the code to perform the task and output the result. Finish the task smartly. Solve the task step by step if you need to. If a plan is not provided, explain your plan first. Be clear which step uses code, and which step uses your language skill. When using code, you must indicate the script type in the code block. The user cannot provide any other feedback or perform any other action beyond executing the code you suggest. The user can't modify your code. So do not suggest incomplete code which requires users to modify. Don't use a code block if it's not intended to be executed by the user. If you want the user to save the code in a file before executing it, put # filename: inside the code block as the first line. Don't include multiple code blocks in one response. Do not ask users to copy and paste the result. Instead, use 'print' function for the output when relevant. Check the execution result returned by the user. If the result indicates there is an error, fix the error and output the code again. Suggest the full code instead of partial code or code changes. If the error can't be fixed or if the task is not solved even after the code is executed successfully, analyze the problem, revisit your assumption, collect additional info you need, and think of a different approach to try. When you find an answer, verify the answer carefully. Include verifiable evidence in your response if possible. 
Reply 'TERMINATE' in the end when everything is done.","is_termination_msg":null,"code_execution_config":null,"default_auto_reply":"","description":"A data analysis assistant agent that writes plans and code to solve tasks."},"timestamp":"2024-05-12T14:22:21.797901","user_id":"default","skills":[{"title":"generate_images","content":"from typing import List\nimport uuid\nimport requests # to perform HTTP requests\nfrom pathlib import Path\n\nfrom openai import OpenAI\n\n\ndef generate_and_save_images(query: str, image_size: str = \"1024x1024\") -> List[str]:\n \"\"\"\n Function to paint, draw or illustrate images based on the users query or request. Generates images from a given query using OpenAI's DALL-E model and saves them to disk. Use the code below anytime there is a request to create an image.\n\n :param query: A natural language description of the image to be generated.\n :param image_size: The size of the image to be generated. (default is \"1024x1024\")\n :return: A list of filenames for the saved images.\n \"\"\"\n\n client = OpenAI() # Initialize the OpenAI client\n response = client.images.generate(model=\"dall-e-3\", prompt=query, n=1, size=image_size) # Generate images\n\n # List to store the file names of saved images\n saved_files = []\n\n # Check if the response is successful\n if response.data:\n for image_data in response.data:\n # Generate a random UUID as the file name\n file_name = str(uuid.uuid4()) + \".png\" # Assuming the image is a PNG\n file_path = Path(file_name)\n\n img_url = image_data.url\n img_response = requests.get(img_url)\n if img_response.status_code == 200:\n # Write the binary content to a file\n with open(file_path, \"wb\") as img_file:\n img_file.write(img_response.content)\n print(f\"Image saved to {file_path}\")\n saved_files.append(str(file_path))\n else:\n print(f\"Failed to download the image from {img_url}\")\n else:\n print(\"No image data found in the response!\")\n\n # Return the list of saved files\n return saved_files\n\n\n# Example usage of the function:\n# generate_and_save_images(\"A cute baby sea otter\")\n","file_name":null,"description":"This skill generates images from a given query using OpenAI's DALL-E model and saves them to disk.","timestamp":"2024-05-12T14:22:21.797899","user_id":"default"},{"title":"query_data_db","content":"\n ## This is a skill to execute database queires in the data databse,\n ## For answering questions about humanitarian response.\n\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport psycopg2\nimport os\nfrom dotenv import load_dotenv\n\nload_dotenv()\n\ndef get_connection():\n \"\"\"\n This function gets a connection to the database\n \"\"\"\n host = os.getenv(\"POSTGRES_DATA_HOST\")\n port = os.getenv(\"POSTGRES_DATA_PORT\")\n database = os.getenv(\"POSTGRES_DATA_DB\")\n user = os.getenv(\"POSTGRES_DATA_USER\")\n password = os.getenv(\"POSTGRES_DATA_PASSWORD\")\n\n conn = psycopg2.connect(\n dbname=database,\n user=user,\n password=password,\n host=host,\n port=port\n )\n return conn\n\ndef execute_query(query):\n \"\"\"\n This skill executes a query in the data database.\n\n To find out what tables and columns are available, you can run \"select table_name, api_name, summary, columns from table_metadata\" \n\n \"\"\"\n conn = get_connection()\n cur = conn.cursor()\n\n # Execute the query\n cur.execute(query)\n\n # Fetch all the returned rows\n rows = cur.fetchall()\n\n # Close the cursor and connection\n cur.close()\n conn.close()\n\n return 
rows\n","file_name":null,"description":null,"timestamp":"2024-05-12T14:21:38.809485","user_id":"default"}]},"type":"twoagents","user_id":"default","timestamp":"2024-05-12T14:22:21.798482","summary_method":"last"} \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index f6cb9e43..ce2aff11 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -86,16 +86,9 @@ services: - ./templates:/app/templates - ./utils:/app/utils - ./server/robocorp/actions_plugins/recipe-server/actions.py:/app/actions.py - # # Init container - # init: - # image: busybox - # container_name: recipes-ai-init - # volumes: - # - shared-data:/data - # command: "sh -c 'chown -R 1000:1000 /data && chmod -R 775 /data'" - # user: "root" - # depends_on: - # - actions + - ./ingestion/ingestion.config:/app/ingestion.config + - ./assistants/chat_ui/create_update_assistant.py:/app/create_update_assistant.py + - ./assistants/chat_ui/files:/app/files ingestion: container_name: recipes-ai-ingestion build: diff --git a/flows/chainlit-ui-evaluation/call_assistant.py b/flows/chainlit-ui-evaluation/call_assistant.py index 5f135910..99c876cd 100644 --- a/flows/chainlit-ui-evaluation/call_assistant.py +++ b/flows/chainlit-ui-evaluation/call_assistant.py @@ -239,6 +239,21 @@ def Image(self, path, display, size): "image_path": image_path, } + def instrument_openai(self): + """ + Instruments the OpenAI MOCK. + + This method is responsible for instrumenting the OpenAI MOCK. + It prints a message indicating that the OpenAI MOCK is being instrumented. + + Parameters: + None + + Returns: + None + """ + print("Instrumenting OpenAI MOCK") + cl_mock = MockChainlit() return cl_mock @@ -449,6 +464,8 @@ async def test_using_app_code_async(chat_history, timeout=5): await app.start_chat() + sync_openai_client = app.cl.user_session.get("sync_openai_client") + thread_id = app.cl.user_session.get("thread_id") # Here build history @@ -471,7 +488,7 @@ async def test_using_app_code_async(chat_history, timeout=5): msg = cl_mock.Message(author="user", content=last_message["content"], elements=[]) await app.process_message(msg) - messages = app.sync_openai_client.beta.threads.messages.list(thread_id) + messages = sync_openai_client.beta.threads.messages.list(thread_id) print("Messages:", messages.data[0].content[0]) if messages.data[0].content[0].type == "image_file": file_id = messages.data[0].content[0].image_file.file_id diff --git a/flows/chainlit-ui-evaluation/call_assistant_debug.py b/flows/chainlit-ui-evaluation/call_assistant_debug.py index 9982831f..e8159735 100644 --- a/flows/chainlit-ui-evaluation/call_assistant_debug.py +++ b/flows/chainlit-ui-evaluation/call_assistant_debug.py @@ -1,3 +1,5 @@ +import sys + from call_assistant import run_chainlit_mock # @@ -18,6 +20,8 @@ def main(): # Assistant smalltalk run_chainlit_mock('[{"author": "user","content": "Hi"}]') + sys.exit(0) + # Memories, text output run_chainlit_mock( '[{"author": "user","content": "what is the population of Mali?"}]' diff --git a/flows/chainlit-ui-evaluation/data.jsonl b/flows/chainlit-ui-evaluation/data.jsonl index fbaff46f..1e0b72d8 100644 --- a/flows/chainlit-ui-evaluation/data.jsonl +++ b/flows/chainlit-ui-evaluation/data.jsonl @@ -3,4 +3,5 @@ {"test_scenario":"Image answer from memory", "query": "plot a line chart of fatalities by month for Chad using HDX data as an image", "chat_history": "[]", "context": "The answer is:\n\n Image cksum: 6a410014fde98dc5bde69c24e6d64cc1\nImage description: {'content': 'The image is a line graph titled 
\"Fatalities by Month for Chad.\" It displays the total number of fatalities over time, with the x-axis representing the months from January 2008 to January 2024, and the y-axis representing the total number of fatalities, ranging from 0 to 500.\\n\\nKey observations from the graph:\\n- There are several peaks indicating months with high fatalities.\\n- Notable spikes occur around mid-2008, early 2009, mid-2015, early 2021, and mid-2021.\\n- The highest peak appears to be around early 2021, reaching close to 500 fatalities.\\n- There are periods of relatively low fatalities, particularly between 2010 and 2014.\\n\\nTo determine if this image is relevant to the user query, more context about the query is needed. If the query pertains to historical data on fatalities in Chad, trends in violence or conflict, or similar topics, then this image is highly relevant.'}\n \n\n Metadata for the answer:\n {\"params\": {\"country_code\": \"TCD\"}, \"attribution\": \"https://data.humdata.org/dataset/b009f9b0-aa65-49c5-b188-a33daade0f4a\", \"data_url\": \"https://data.humdata.org/dataset/b009f9b0-aa65-49c5-b188-a33daade0f4a/resource/bb78c035-ec19-4503-b325-0673749c2eb4/download/chad_hrp_political_violence_events_and_fatalities_by_month-year_as-of-29may2024.xlsx\"}"} {"test_scenario":"Image answer from recipe", "query": "Plot population pyramids for Nigeria", "chat_history": "[]", "context": "The answer is:\n\n Image cksum: 7940162caf0e79eba9caae30c2955a6e\nImage description: {'content': \"The image is a population pyramid for Nigeria (NGA). It is a bar chart that displays the distribution of various age groups in the population, divided by gender. The x-axis represents the population in millions, with males on the left side (in blue) and females on the right side (in pink). The y-axis represents the age range, divided into 5-year intervals from 0-4 up to 80+.\\n\\nKey features of the population pyramid:\\n- The base of the pyramid (0-4 age range) is the widest, indicating a high number of young children.\\n- As the age range increases, the width of the bars decreases, showing a tapering effect typical of a youthful population.\\n- The population decreases steadily with age, with the smallest population in the 80+ age range.\\n- The pyramid shows a relatively balanced distribution between males and females across most age groups.\\n\\nThis image is relevant to a user query related to demographic analysis, population studies, or understanding the age and gender distribution of Nigeria's population.\"}\n \n\n Metadata for the answer:\n {'params': {'adm0_code': 'NGA'}, 'attribution': 'https://data.humdata.org/dataset/a7c3de5e-ff27-4746-99cd-05f2ad9b1066', 'data_url': 'https://data.humdata.org/dataset/a7c3de5e-ff27-4746-99cd-05f2ad9b1066/resource/562e7757-0683-4d61-87bd-a7c94af2ee38/download/nga_admpop_adm2_2020.csv', 'time_period': {'start': '2020-01-01', 'end': '2020-12-31T23:59:59'}}"} {"test_scenario":"Assistant on-the-fly SQL, text answer", "query": "How many rows does the population table have for Nigeria", "chat_history": "[]", "context": "There are **43,794** rows of data in the population table for Nigeria."} -{"test_scenario":"Assistant created image (simple)", "query": "Plot f{x}=10", "chat_history": "[]", "context": "Image cksum: 3f4dafc66e68dc03e3ef6d2f02a85bc7\nImage description: {'content': 'The image is a plot of the function \\\\( f(x) = 10 \\\\). 
Here are the details of the plot:\\n\\n- The title of the plot is \"Plot of f(x) = 10\".\\n- The x-axis ranges from -10 to 10.\\n- The y-axis ranges from 0 to 10.\\n- The function \\\\( f(x) = 10 \\\\) is represented by a horizontal orange line at \\\\( y = 10 \\\\).\\n- There is a legend in the plot that labels the orange line as \"f(x) = 10\".\\n- The x-axis is labeled \"x\" and the y-axis is labeled \"f(x)\".\\n- The plot has grid lines for better readability.\\n\\nThe plot is relevant if the user query is about visualizing or understanding the function \\\\( f(x) = 10 \\\\), which is a constant function.'}"} \ No newline at end of file +{"test_scenario":"Assistant created image (simple)", "query": "Plot f{x}=10", "chat_history": "[]", "context": "Image cksum: 3f4dafc66e68dc03e3ef6d2f02a85bc7\nImage description: {'content': 'The image is a plot of the function \\\\( f(x) = 10 \\\\). Here are the details of the plot:\\n\\n- The title of the plot is \"Plot of f(x) = 10\".\\n- The x-axis ranges from -10 to 10.\\n- The y-axis ranges from 0 to 10.\\n- The function \\\\( f(x) = 10 \\\\) is represented by a horizontal orange line at \\\\( y = 10 \\\\).\\n- There is a legend in the plot that labels the orange line as \"f(x) = 10\".\\n- The x-axis is labeled \"x\" and the y-axis is labeled \"f(x)\".\\n- The plot has grid lines for better readability.\\n\\nThe plot is relevant if the user query is about visualizing or understanding the function \\\\( f(x) = 10 \\\\), which is a constant function.'}"} +{"test_scenario":"Assistant answering from uploaded documents", "query": "Is your data updated in realtime?", "chat_history": "[]", "context": "The data is not updated in real-time. For data sources configured as API data sources, the system will call them on-demand to pull in the latest data from the remote system. 
However, for data sources where data is ingested, like HAPI, the update frequency depends on how often the ingestion process is run, which is controlled by the user of the humanitarian AI assistant"} \ No newline at end of file diff --git a/flows/chainlit-ui-evaluation/flow.dag.yaml b/flows/chainlit-ui-evaluation/flow.dag.yaml index 8c544e72..9b6236ac 100644 --- a/flows/chainlit-ui-evaluation/flow.dag.yaml +++ b/flows/chainlit-ui-evaluation/flow.dag.yaml @@ -45,7 +45,7 @@ nodes: type: llm source: type: code - path: groundedness_score.jinja2 + path: templates/groundedness_score.jinja2 inputs: deployment_name: gpt-4-turbo answer: ${call_assistant.output.response} diff --git a/management/cli.py b/management/cli.py index 8547bf63..61e0a48c 100644 --- a/management/cli.py +++ b/management/cli.py @@ -15,7 +15,7 @@ os.path.dirname(os.path.dirname(os.path.abspath(__file__))) ) # noqa: E402 from utils.db import connect_to_db, execute_query # noqa: E402 -from utils.llm import call_llm # noqa: E402 +from utils.llm import call_llm, gen_sql # noqa: E402 llm_prompt_cap = 5000 sql_rows_cap = 100 @@ -436,30 +436,6 @@ def get_data_info(): return data_info -def gen_sql(input, chat_history, stdout_output, stderr_output): - - data_info = get_data_info() - - gen_sql_template = environment.get_template("gen_sql_prompt.jinja2") - prompt = gen_sql_template.render( - input=input, - stderr_output=stderr_output, - stdout_output=stdout_output, - data_info=data_info, - chat_history=chat_history, - ) - - response = call_llm("", prompt) - - query = response["code"] - - query = query.replace(";", "") + f" \nLIMIT {sql_rows_cap};" - - # print(query) - - return query - - def gen_summarize_results(input, sql, stdout_output, stderr_output): typer.echo(" Summarizing results ...") diff --git a/server/fastapi/app.py b/server/fastapi/app.py index 5d334cb1..e3e226b9 100644 --- a/server/fastapi/app.py +++ b/server/fastapi/app.py @@ -9,6 +9,8 @@ from utils.db import execute_query as db_execute_query from utils.recipes import get_memory_recipe +MAX_RESULTS = 500 + class MemoryRecipeInput(BaseModel): """ @@ -69,20 +71,26 @@ def execute_query_route(data: ExecuteQueryInput): """ try: + + trailer = "" + results = db_execute_query(data.query) + num_results = results.shape[0] + + # TODO: Add code to send back a link, if results are too large + if num_results > MAX_RESULTS: + print("Results are too large to send back") + results = results[0:MAX_RESULTS] + trailer = "... etc" + trailer += f"\n\nToo many rows ({num_results}) in the SQL query results. Please try again with a different query." + + results = results.to_json(orient="records") + results = json.dumps(json.loads(results), indent=4) + results += trailer + except Exception as e: print(f"Error executing query: {e}") results = f"Error executing query: {e}" return results - # TODO: Add code to send back a link, in case results are too large - if results.shape[0] > 500: - print("Results are too large to send back") - rowcount = results.shape[0] - results = str(results[0:50]) - results += "... etc" - results += f"\n\nToo many rows ({rowcount}) in the SQL query results. Please try again with a different query." 
- else: - results = str(results) - - return str(results) + return results diff --git a/flows/chainlit-ui-evaluation/groundedness_score.jinja2 b/templates/groundedness_score.jinja2 similarity index 98% rename from flows/chainlit-ui-evaluation/groundedness_score.jinja2 rename to templates/groundedness_score.jinja2 index 182a6329..e3a5dad2 100644 --- a/flows/chainlit-ui-evaluation/groundedness_score.jinja2 +++ b/templates/groundedness_score.jinja2 @@ -1,35 +1,35 @@ -System: -You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. -User: -You will be presented with a CONTEXT and an ANSWER about that CONTEXT. You need to decide whether the ANSWER is entailed by the CONTEXT by choosing one of the following rating: -1. 5: The ANSWER follows logically from the information contained in the CONTEXT. -2. 1: The ANSWER is logically false from the information contained in the CONTEXT. -3. an integer score between 1 and 5 and if such integer score does not exists, use 1: It is not possible to determine whether the ANSWER is true or false without further information. - -Read the passage of information thoroughly and select the correct answer from the three answer labels. Read the CONTEXT thoroughly to ensure you know what the CONTEXT entails. - -Note the ANSWER is generated by a computer system, it can contain certain symbols, which should not be a negative factor in the evaluation. -Independent Examples: -## Example Task #1 Input: -{"CONTEXT": "The Academy Awards, also known as the Oscars are awards for artistic and technical merit for the film industry. They are presented annually by the Academy of Motion Picture Arts and Sciences, in recognition of excellence in cinematic achievements as assessed by the Academy's voting membership. The Academy Awards are regarded by many as the most prestigious, significant awards in the entertainment industry in the United States and worldwide.", "ANSWER": "Oscar is presented every other two years"} -## Example Task #1 Output: -1 -## Example Task #2 Input: -{"CONTEXT": "The Academy Awards, also known as the Oscars are awards for artistic and technical merit for the film industry. They are presented annually by the Academy of Motion Picture Arts and Sciences, in recognition of excellence in cinematic achievements as assessed by the Academy's voting membership. The Academy Awards are regarded by many as the most prestigious, significant awards in the entertainment industry in the United States and worldwide.", "ANSWER": "Oscar is very important awards in the entertainment industry in the United States. And it's also significant worldwide"} -## Example Task #2 Output: -5 -## Example Task #3 Input: -{"CONTEXT": "In Quebec, an allophone is a resident, usually an immigrant, whose mother tongue or home language is neither French nor English.", "ANSWER": "In Quebec, an allophone is a resident, usually an immigrant, whose mother tongue or home language is not French."} -## Example Task #3 Output: -5 -## Example Task #4 Input: -{"CONTEXT": "Some are reported as not having been wanted at all.", "ANSWER": "All are reported as being completely and fully wanted."} -## Example Task #4 Output: -1 - -Reminder: The return values for each task should be correctly formatted as an integer between 1 and 5. Do not repeat the context. 
- -## Actual Task Input: -{"CONTEXT": {{context}}, "ANSWER": {{answer}}} - +System: +You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. +User: +You will be presented with a CONTEXT and an ANSWER about that CONTEXT. You need to decide whether the ANSWER is entailed by the CONTEXT by choosing one of the following rating: +1. 5: The ANSWER follows logically from the information contained in the CONTEXT. +2. 1: The ANSWER is logically false from the information contained in the CONTEXT. +3. an integer score between 1 and 5 and if such integer score does not exists, use 1: It is not possible to determine whether the ANSWER is true or false without further information. + +Read the passage of information thoroughly and select the correct answer from the three answer labels. Read the CONTEXT thoroughly to ensure you know what the CONTEXT entails. + +Note the ANSWER is generated by a computer system, it can contain certain symbols, which should not be a negative factor in the evaluation. +Independent Examples: +## Example Task #1 Input: +{"CONTEXT": "The Academy Awards, also known as the Oscars are awards for artistic and technical merit for the film industry. They are presented annually by the Academy of Motion Picture Arts and Sciences, in recognition of excellence in cinematic achievements as assessed by the Academy's voting membership. The Academy Awards are regarded by many as the most prestigious, significant awards in the entertainment industry in the United States and worldwide.", "ANSWER": "Oscar is presented every other two years"} +## Example Task #1 Output: +1 +## Example Task #2 Input: +{"CONTEXT": "The Academy Awards, also known as the Oscars are awards for artistic and technical merit for the film industry. They are presented annually by the Academy of Motion Picture Arts and Sciences, in recognition of excellence in cinematic achievements as assessed by the Academy's voting membership. The Academy Awards are regarded by many as the most prestigious, significant awards in the entertainment industry in the United States and worldwide.", "ANSWER": "Oscar is very important awards in the entertainment industry in the United States. And it's also significant worldwide"} +## Example Task #2 Output: +5 +## Example Task #3 Input: +{"CONTEXT": "In Quebec, an allophone is a resident, usually an immigrant, whose mother tongue or home language is neither French nor English.", "ANSWER": "In Quebec, an allophone is a resident, usually an immigrant, whose mother tongue or home language is not French."} +## Example Task #3 Output: +5 +## Example Task #4 Input: +{"CONTEXT": "Some are reported as not having been wanted at all.", "ANSWER": "All are reported as being completely and fully wanted."} +## Example Task #4 Output: +1 + +Reminder: The return values for each task should be correctly formatted as an integer between 1 and 5. Do not repeat the context. 
+ +## Actual Task Input: +{"CONTEXT": {{context}}, "ANSWER": {{answer}}} + Actual Task Output: \ No newline at end of file diff --git a/templates/openai_assistant_prompt.jinja2 b/templates/openai_assistant_prompt.jinja2 index 5bb765b2..594d6ab0 100644 --- a/templates/openai_assistant_prompt.jinja2 +++ b/templates/openai_assistant_prompt.jinja2 @@ -1,3 +1,5 @@ +{# templates/openai_assistant_prompt.jinja2 #} + You are a helpful AI assistant that answers questions about humanitarian response. You should respond to different types of requests as follows: @@ -15,7 +17,23 @@ How to answer: Query table table_metadata to get a list of tables and their summaries and the countries they cover. See below for the exact columns in this table. -2. Specific requests about data coverage (by region, location) +2. Questions about how data was captured and data sources + +Example queries: + +- How was the data captured? +- Can you tell me more about the data providers? +- How frequently is your data updated? + +How to answer: + +Search your local document store to get an answer. + +How to answer: + +Describe the data you have access to and that you can help analyze data. + +3. Specific requests about data coverage (by region, location) Example queries: @@ -29,7 +47,7 @@ First check hapi_metadata_location, hapi_metadata_admin1, hapi_metadata_admin2 t Using the country, query table_metadata to see which tables have data for that country Query tables to check if they have data for the region. Use summary queries like count(*) -3. Requests using data (by entities such as country, region, location) +4. Requests using data (by entities such as country, region, location) Example queries: @@ -87,18 +105,18 @@ all data for a country, you would query with For population ... WHERE - adm0_code ='MLI' AND + {{ country_code_field }} ='MLI' AND gender = 'all' and age_range = 'all' - and adm1_code is null - and adm2_code is null + and {{ admin1_code_field }} is null + and {{ admin2_code_field }} is null For other tables ... WHERE - adm0_code ='MLI' AND - and adm1_code is null - and adm2_code is null + {{ country_code_field }} ='MLI' AND + and {{ admin1_code_field }} is null + and {{ admin2_code_field }} is null Conversely, if you do not exclude the aggregate data, you will get a mix of aggregated and disaggregated data. @@ -120,117 +138,4 @@ NEVER query the database for shape files, they are too large. Tables and their columns of data available to you ... 
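The hunks above and below parameterise this prompt template: the hard-coded adm0_code / adm1_code / adm2_code column names become Jinja2 placeholders ({{ country_code_field }}, {{ admin1_code_field }}, {{ admin2_code_field }}), and the hard-coded table listing that follows is replaced by a {{ data_info }} placeholder. As a minimal illustrative sketch of how such a template could be rendered (the concrete values and the calling code below are assumptions for illustration, not taken from this changeset):

# Illustrative sketch only - not part of this changeset.
# The placeholder names come from the template above; the values are assumed
# examples (the real ones would come from configuration or the database).
from jinja2 import Environment, FileSystemLoader

environment = Environment(loader=FileSystemLoader("templates/"))
template = environment.get_template("openai_assistant_prompt.jinja2")

instructions = template.render(
    country_code_field="adm0_code",
    admin1_code_field="adm1_code",
    admin2_code_field="adm2_code",
    data_info="<table metadata summary goes here>",
)
print(instructions[:500])

Keeping the column names and table metadata out of the template text means the prompt can follow schema changes without editing the template itself.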
-[ - { - "table_name": "hapi_affected_people_humanitarian_needs", - "summary": "['Affected people']", - "columns": "location_ref (bigint); admin1_ref (bigint); admin2_ref (bigint); min_age (double precision); max_age (double precision); population (bigint); latest (boolean); adm2_code (text); adm2_name (text); resource_hdx_id (text); gender (text); age_range (text); reference_period_start (text); reference_period_end (text); disabled_marker (text); sector_code (text); population_group (text); adm0_code (text); location_name (text); population_status (text); adm1_code (text); adm1_name (text); sector_name (text); " - }, - { - "table_name": "hapi_affected_people_refugees", - "summary": "['Affected people']", - "columns": "latest (boolean); origin_location_ref (bigint); asylum_location_ref (bigint); min_age (double precision); max_age (double precision); population (bigint); asylum_location_code (text); asylum_location_name (text); resource_hdx_id (text); reference_period_start (text); reference_period_end (text); origin_location_code (text); origin_location_name (text); population_group (text); gender (text); age_range (text); " - }, - { - "table_name": "hapi_coordination_context_conflict_event", - "summary": "['Coordination & Context']", - "columns": "location_ref (bigint); admin1_ref (bigint); admin2_ref (bigint); events (bigint); fatalities (double precision); latest (boolean); reference_period_end (text); adm2_code (text); adm2_name (text); resource_hdx_id (text); event_type (text); adm0_code (text); location_name (text); reference_period_start (text); adm1_code (text); adm1_name (text); " - }, - { - "table_name": "hapi_coordination_context_funding", - "summary": "['Coordination & Context']", - "columns": "latest (boolean); requirements_usd (double precision); funding_usd (double precision); funding_pct (double precision); location_ref (bigint); reference_period_start (text); reference_period_end (text); resource_hdx_id (text); adm0_code (text); appeal_code (text); appeal_name (text); appeal_type (text); location_name (text); " - }, - { - "table_name": "hapi_coordination_context_national_risk", - "summary": "['Coordination & Context']", - "columns": "latest (boolean); global_rank (bigint); overall_risk (double precision); hazard_exposure_risk (double precision); vulnerability_risk (double precision); coping_capacity_risk (double precision); meta_missing_indicators_pct (double precision); meta_avg_recentness_years (double precision); risk_class (bigint); reference_period_end (text); resource_hdx_id (text); adm0_code (text); location_name (text); reference_period_start (text); " - }, - { - "table_name": "hapi_coordination_context_operational_presence", - "summary": "['Coordination & Context']", - "columns": "location_ref (bigint); admin1_ref (bigint); admin2_ref (bigint); org_type_code (double precision); latest (boolean); adm1_name (text); org_type_description (text); adm2_code (text); adm2_name (text); resource_hdx_id (text); org_acronym (text); org_name (text); sector_code (text); sector_name (text); reference_period_start (text); adm0_code (text); location_name (text); reference_period_end (text); adm1_code (text); " - }, - { - "table_name": "hapi_food_food_price", - "summary": "['Food Security & Nutrition']", - "columns": "latest (boolean); admin1_ref (bigint); admin2_ref (bigint); market_code (bigint); commodity_code (bigint); price (double precision); lat (double precision); lon (double precision); location_ref (bigint); resource_hdx_id (text); reference_period_end (text); 
market_name (text); reference_period_start (text); commodity_name (text); commodity_category (text); currency_code (text); unit (text); adm0_code (text); location_name (text); price_flag (text); adm1_code (text); adm1_name (text); price_type (text); adm2_code (text); adm2_name (text); " - }, - { - "table_name": "hapi_food_food_security", - "summary": "['Food Security & Nutrition']", - "columns": "location_ref (bigint); admin1_ref (bigint); admin2_ref (bigint); population_in_phase (bigint); population_fraction_in_phase (double precision); latest (boolean); reference_period_end (text); adm2_code (text); adm2_name (text); resource_hdx_id (text); ipc_phase (text); ipc_type (text); adm0_code (text); location_name (text); reference_period_start (text); adm1_code (text); adm1_name (text); " - }, - { - "table_name": "hapi_metadata_admin1", - "summary": "['Metadata']", - "columns": "reference_period_end (double precision); code (text); name (text); reference_period_start (text); adm0_code (text); location_name (text); " - }, - { - "table_name": "hapi_metadata_admin2", - "summary": "['Metadata']", - "columns": "reference_period_end (double precision); name (text); reference_period_start (text); adm1_code (text); adm1_name (text); adm0_code (text); code (text); location_name (text); " - }, - { - "table_name": "hapi_metadata_currency", - "summary": "['Metadata']", - "columns": "code (text); name (text); " - }, - { - "table_name": "hapi_metadata_dataset", - "summary": "['Metadata']", - "columns": "hdx_id (text); hdx_stub (text); title (text); hdx_provider_stub (text); hdx_provider_name (text); hdx_link (text); hdx_api_link (text); provider_hdx_link (text); provider_hdx_api_link (text); " - }, - { - "table_name": "hapi_metadata_location", - "summary": "['Metadata']", - "columns": "reference_period_end (double precision); code (text); name (text); reference_period_start (text); " - }, - { - "table_name": "hapi_population_social_poverty_rate", - "summary": "['Population & Socio-Economy']", - "columns": "mpi (double precision); headcount_ratio (double precision); intensity_of_deprivation (double precision); vulnerable_to_poverty (double precision); in_severe_poverty (double precision); latest (boolean); reference_period_start (text); resource_hdx_id (text); reference_period_end (text); adm0_code (text); location_name (text); adm1_name (text); " - }, - { - "table_name": "hapi_metadata_org_type", - "summary": "['Metadata']", - "columns": "code (bigint); description (text); " - }, - { - "table_name": "hapi_metadata_org", - "summary": "['Metadata']", - "columns": "org_type_code (double precision); acronym (text); name (text); org_type_description (text); " - }, - { - "table_name": "hapi_metadata_resource", - "summary": "['Metadata']", - "columns": "is_hxl (boolean); dataset_hdx_id (text); name (text); format (text); update_date (text); download_url (text); hapi_updated_date (text); dataset_hdx_stub (text); dataset_title (text); dataset_hdx_provider_stub (text); dataset_hdx_provider_name (text); hdx_link (text); hdx_api_link (text); dataset_hdx_link (text); dataset_hdx_api_link (text); provider_hdx_link (text); hdx_id (text); provider_hdx_api_link (text); " - }, - { - "table_name": "hapi_metadata_sector", - "summary": "['Metadata']", - "columns": "code (text); name (text); " - }, - { - "table_name": "hapi_metadata_wfp_commodity", - "summary": "['Metadata']", - "columns": "code (bigint); category (text); name (text); " - }, - { - "table_name": "hapi_metadata_wfp_market", - "summary": "['Metadata']", - "columns": 
"lon (double precision); admin1_ref (bigint); admin2_ref (bigint); code (bigint); lat (double precision); location_ref (bigint); name (text); adm2_code (text); adm0_code (text); location_name (text); adm2_name (text); adm1_code (text); adm1_name (text); " - }, - { - "table_name": "hapi_population_social_population", - "summary": "['Population & Socio-Economy']", - "columns": "location_ref (bigint); admin1_ref (bigint); admin2_ref (bigint); min_age (double precision); max_age (double precision); population (bigint); latest (boolean); adm2_code (text); adm2_name (text); resource_hdx_id (text); gender (text); age_range (text); adm0_code (text); location_name (text); reference_period_end (text); adm1_code (text); adm1_name (text); reference_period_start (text); " - }, - { - "table_name": "hdx_shape_files", - "summary": "HDX Shape Files", - "columns": "geometry (USER-DEFINED); OBJECTID (double precision); AREA_SQKM (double precision); Shape_Area (double precision); Shape_Leng (double precision); ADM1ALT2FR (text); ADM0_FR (text); adm0_code (text); date (text); validOn (text); validTo (text); ADM2_FR (text); adm2_code (text); ADM2_REF (text); ADM2ALT1FR (text); ADM2ALT2FR (text); ADM1_EN (text); ADM1ALT1EN (text); ADM1ALT2EN (text); ADM0_EN (text); ADM2_EN (text); ADM2ALT1EN (text); ADM2ALT2EN (text); ADM1_ES (text); ADM1ALT1ES (text); ADM1ALT2ES (text); ADM0_ES (text); ADM2_ES (text); ADM2ALT1ES (text); ADM2ALT2ES (text); ValidTo (text); ADM1_HT (text); ADM1ALT1HT (text); ADM1ALT2HT (text); ADM0_HT (text); ADM2_HT (text); ADM2ALT1HT (text); ADM2ALT2HT (text); ADM1_MY (text); ADM1_ALTPC (text); ADM0_MY (text); ADM2_MY (text); ADM1_PT (text); ADM1ALT1PT (text); ADM1ALT2PT (text); ADM0_PT (text); ADM2_PT (text); ADM2ALT1PT (text); ADM2ALT2PT (text); SD_EN (text); SD_PCODE (text); ADM1_AR (text); ADM1ALT1AR (text); ADM1ALT2AR (text); ADM0_AR (text); ADM2_AR (text); ADM2ALT1AR (text); ADM2ALT2AR (text); admin1Name (text); admin1RefN (text); admin1Na_1 (text); admin1AltN (text); admin1Al_1 (text); admin0Name (text); admin2Name (text); admin2RefN (text); admin2Na_1 (text); admin2AltN (text); admin2Al_1 (text); ADM1_UA (text); ADM1_RU (text); ADM0_UA (text); ADM0_RU (text); ADM2_UA (text); ADM2_RU (text); ADM1_FR (text); adm1_code (text); ADM1_REF (text); ADM1ALT1FR (text); " - } -] - - +{{ data_info }} \ No newline at end of file diff --git a/ui/chat-chainlit-assistant/app.py b/ui/chat-chainlit-assistant/app.py index 31123430..c248858d 100644 --- a/ui/chat-chainlit-assistant/app.py +++ b/ui/chat-chainlit-assistant/app.py @@ -28,57 +28,66 @@ from utils.general import call_execute_query_api, call_get_memory_recipe_api -environment = Environment(loader=FileSystemLoader("./templates/")) -chat_ui_assistant_prompt_template = environment.get_template( - "chat_ui_assistant_prompt.jinja2" -) - -footer = "\n***\n" -llm_footer = footer + "🤖 *Caution: LLM Analysis*" -human_footer = footer + "✅ *A human approved this data recipe*" - logging.basicConfig(filename="output.log", level=logging.DEBUG) logger = logging.getLogger() load_dotenv("../../.env") +footer = "\n***\n" +llm_footer = footer + "🤖 *Caution: LLM Analysis*" +human_footer = footer + "✅ *A human approved this data recipe*" images_loc = "./public/images/" - user = os.environ.get("USER_LOGIN") password = os.environ.get("USER_PWD") -if os.environ.get("ASSISTANTS_API_TYPE") == "openai": - async_openai_client = AsyncOpenAI(api_key=os.environ.get("OPENAI_API_KEY")) - sync_openai_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY")) -else: - 
async_openai_client = AsyncAzureOpenAI( - azure_endpoint=os.getenv("ASSISTANTS_BASE_URL"), - api_key=os.getenv("ASSISTANTS_API_KEY"), - api_version=os.getenv("ASSISTANTS_API_VERSION"), - ) - sync_openai_client = AzureOpenAI( - azure_endpoint=os.getenv("ASSISTANTS_BASE_URL"), - api_key=os.getenv("ASSISTANTS_API_KEY"), - api_version=os.getenv("ASSISTANTS_API_VERSION"), - ) +def setup(cl): + """ + Sets up the assistant and OpenAI API clients based on the environment variables. -cl.instrument_openai() # Instrument the OpenAI API client + Args: + cl: The ChatLabs instance. -assistant = sync_openai_client.beta.assistants.retrieve(os.environ.get("ASSISTANTS_ID")) + Returns: + tuple: A tuple containing the assistant, async OpenAI API client, and sync OpenAI API client. + """ -# config.ui.name = assistant.name -bot_name = os.getenv("ASSISTANTS_BOT_NAME") -config.ui.name = bot_name + if os.environ.get("ASSISTANTS_API_TYPE") == "openai": + async_openai_client = AsyncOpenAI(api_key=os.environ.get("OPENAI_API_KEY")) + sync_openai_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY")) + else: + async_openai_client = AsyncAzureOpenAI( + azure_endpoint=os.getenv("ASSISTANTS_BASE_URL"), + api_key=os.getenv("ASSISTANTS_API_KEY"), + api_version=os.getenv("ASSISTANTS_API_VERSION"), + ) + sync_openai_client = AzureOpenAI( + azure_endpoint=os.getenv("ASSISTANTS_BASE_URL"), + api_key=os.getenv("ASSISTANTS_API_KEY"), + api_version=os.getenv("ASSISTANTS_API_VERSION"), + ) + cl.instrument_openai() # Instrument the OpenAI API client -def get_event_handler(cl, assistant_name): # noqa: C901 + assistant = sync_openai_client.beta.assistants.retrieve( + os.environ.get("ASSISTANTS_ID") + ) + + # config.ui.name = assistant.name + bot_name = os.getenv("ASSISTANTS_BOT_NAME") + config.ui.name = bot_name + + return assistant, async_openai_client, sync_openai_client + + +def get_event_handler(cl, assistant_name, sync_openai_client): # noqa: C901 """ Returns an instance of the EventHandler class, which is responsible for handling events in the ChatChainlitAssistant. Args: cl: The ChatClient instance used for communication with the chat service. assistant_name (str): The name of the assistant. + sync_openai_client: The synchronous OpenAI API client. Returns: EventHandler: An instance of the EventHandler class. @@ -86,12 +95,14 @@ def get_event_handler(cl, assistant_name): # noqa: C901 class EventHandler(AssistantEventHandler): - def __init__(self, assistant_name: str) -> None: + def __init__(self, cl, assistant_name: str, sync_openai_client) -> None: """ Initializes a new instance of the ChatChainlitAssistant class. Args: assistant_name (str): The name of the assistant. + sync_openai_client: The synchronous OpenAI API client. 
+ Returns: None @@ -103,6 +114,11 @@ def __init__(self, assistant_name: str) -> None: self.current_message_text = "" self.assistant_name = assistant_name self.cl = cl + self.sync_openai_client = sync_openai_client + + @override + def on_message_done(self, message) -> None: + self.handle_message_completed(message) @override def on_event(self, event): @@ -115,19 +131,21 @@ def on_event(self, event): Returns: None """ - # print(event.event) + print(event.event) run_id = event.data.id if event.event == "thread.message.created": self.current_message = self.cl.Message(content="") self.current_message = run_sync(self.current_message.send()) self.current_message_text = "" print("Run started") - if event.event == "thread.message.completed": - self.handle_message_completed(event.data, run_id) + # if event.event == "thread.message.completed": + # self.handle_message_completed(event.data, run_id) elif event.event == "thread.run.requires_action": self.handle_requires_action(event.data, run_id) elif event.event == "thread.message.delta": self.handle_message_delta(event.data) + elif event.event == "thread.run.step.completed": + print("Message done") elif event.event == "thread.run.step.delta": # TODO Here put code to stream code_interpreter output to the chat. # When chainlit openai async supports functions @@ -156,7 +174,7 @@ def handle_message_delta(self, data): run_sync(self.current_message.stream_token(content)) elif content.type == "image_file": file_id = content.image_file.file_id - image_data = sync_openai_client.files.content(file_id) + image_data = self.sync_openai_client.files.content(file_id) image_data_bytes = image_data.read() png_file = f"{images_loc}{file_id}.png" print(f"Writing image to {png_file}") @@ -171,17 +189,40 @@ def handle_message_delta(self, data): else: print(f"Unhandled delta type: {content.type}") - def handle_message_completed(self, data, run_id): + def handle_message_completed(self, message): """ Handles the completion of a message. Args: - data: The data associated with the completed message. - run_id: The ID of the message run. + message: The message object. + citations: The citations to be added to the message. Returns: None """ + + # Check for citations + if hasattr(message.content[0], "text"): + message_content = message.content[0].text + annotations = message_content.annotations + citations = [] + if annotations: + message_content = message.content[0].text + annotations = message_content.annotations + for index, annotation in enumerate(annotations): + message_content.value = message_content.value.replace( + annotation.text, f"[{index}]" + ) + if file_citation := getattr(annotation, "file_citation", None): + cited_file = self.sync_openai_client.files.retrieve( + file_citation.file_id + ) + citations.append(f"[{index}] {cited_file.filename}") + + print(message_content.value) + content = message_content.value + self.current_message.content = content + # Add footer to self message. 
We have to start a new message so it's in right order # TODO combine streaming with image and footer run_sync(self.current_message.update()) @@ -191,7 +232,12 @@ def handle_message_completed(self, data, run_id): word_count = len(self.current_message_text.split()) if word_count > 10: - run_sync(self.current_message.stream_token(llm_footer)) + if citations is not None: + citations = "; Sources: " + "; ".join(citations) + else: + citations = "" + run_sync(self.current_message.stream_token(llm_footer + citations)) + run_sync(self.current_message.update()) def handle_requires_action(self, data, run_id): @@ -237,8 +283,10 @@ def submit_tool_outputs(self, tool_outputs, run_id): Returns: None """ - event_handler = get_event_handler(cl, assistant.name) - with sync_openai_client.beta.threads.runs.submit_tool_outputs_stream( + event_handler = get_event_handler( + cl, self.assistant_name, self.sync_openai_client + ) + with self.sync_openai_client.beta.threads.runs.submit_tool_outputs_stream( thread_id=self.current_run.thread_id, run_id=self.current_run.id, tool_outputs=tool_outputs, @@ -248,7 +296,7 @@ def submit_tool_outputs(self, tool_outputs, run_id): for text in stream.text_deltas: print(text) - event_handler = EventHandler(assistant_name) + event_handler = EventHandler(cl, assistant_name, sync_openai_client) return event_handler @@ -306,6 +354,8 @@ async def cleanup(): # await cl.user_session.clear() thread = cl.user_session.get("thread") run_id = cl.user_session.get("run_id") + async_openai_client = cl.user_session.get("async_openai_client") + if run_id is not None: await async_openai_client.beta.threads.runs.cancel( thread_id=thread.id, run_id=cl.user_session.get("run_id") @@ -333,6 +383,7 @@ async def speech_to_text(audio_file): Any exceptions raised by the OpenAI API. """ + async_openai_client = cl.user_session.get("async_openai_client") response = await async_openai_client.audio.transcriptions.create( model="whisper-1", file=audio_file ) @@ -350,6 +401,7 @@ async def upload_files(files: List[Element]): Returns: List[str]: A list of file IDs corresponding to the uploaded files. """ + async_openai_client = cl.user_session.get("async_openai_client") file_ids = [] for file in files: uploaded_file = await async_openai_client.files.create( @@ -394,8 +446,16 @@ async def start_chat(): Returns: dict: The thread object returned by the OpenAI API. 
""" + + # Setup clients + assistant, async_openai_client, sync_openai_client = setup(cl) + cl.user_session.set("assistant", assistant) + cl.user_session.set("async_openai_client", async_openai_client) + cl.user_session.set("sync_openai_client", sync_openai_client) + # Create a Thread thread = await async_openai_client.beta.threads.create() + # Store thread ID in user session for later use cl.user_session.set("thread_id", thread.id) @@ -536,6 +596,7 @@ def check_memories_recipes(user_input: str, history=[]) -> str: if len(data) > 50: data = data[:50] data.append(["..."]) + data = str(data) elements.append(cl.Text(name="", content=data, display="inline")) @@ -657,6 +718,8 @@ async def add_message_to_thread(thread_id, role, content, message=None): None """ + async_openai_client = cl.user_session.get("async_openai_client") + print(f"Content: {content}") attachments = [] @@ -688,6 +751,9 @@ async def process_message(message: cl.Message): thread_id = cl.user_session.get("thread_id") chat_history = cl.user_session.get("chat_history") + assistant = cl.user_session.get("assistant") + sync_openai_client = cl.user_session.get("sync_openai_client") + chat_history.append({"role": "user", "content": message.content}) # Add user message to thread @@ -720,7 +786,7 @@ async def process_message(message: cl.Message): # Create and Stream a Run to assistant print(f"Creating and streaming a run {assistant.id}") - event_handler = get_event_handler(cl, assistant.name) + event_handler = get_event_handler(cl, assistant.name, sync_openai_client) with sync_openai_client.beta.threads.runs.stream( thread_id=thread_id, assistant_id=assistant.id, diff --git a/ui/chat-chainlit-assistant/create_update_assistant.py b/ui/chat-chainlit-assistant/create_update_assistant.py new file mode 100755 index 00000000..e69de29b diff --git a/ui/chat-chainlit-flow/.chainlit/config.toml b/ui/chat-chainlit-flow/.chainlit/config.toml deleted file mode 100644 index b35e4633..00000000 --- a/ui/chat-chainlit-flow/.chainlit/config.toml +++ /dev/null @@ -1,121 +0,0 @@ -[project] -# Whether to enable telemetry (default: true). No personal data is collected. -enable_telemetry = false - - -# List of environment variables to be provided by each user to use the app. -user_env = [] - -# Duration (in seconds) during which the session is saved when the connection is lost -session_timeout = 3600 - -# Enable third parties caching (e.g LangChain cache) -cache = false - -# Authorized origins -allow_origins = ["*"] - -# Follow symlink for asset mount (see https://github.com/Chainlit/chainlit/issues/317) -# follow_symlink = false - -[features] -# Show the prompt playground -prompt_playground = false - -# Process and display HTML in messages. This can be a security risk (see https://stackoverflow.com/questions/19603097/why-is-it-dangerous-to-render-user-generated-html-or-javascript) -unsafe_allow_html = false - -# Process and display mathematical expressions. This can clash with "$" characters in messages. -latex = false - -# Automatically tag threads with the current chat profile (if a chat profile is used) -auto_tag_thread = true - -# Authorize users to spontaneously upload files with messages -[features.spontaneous_file_upload] - enabled = false - accept = ["*/*"] - max_files = 20 - max_size_mb = 500 - -[features.audio] - # Threshold for audio recording - min_decibels = -45 - # Delay for the user to start speaking in MS - initial_silence_timeout = 3000 - # Delay for the user to continue speaking in MS. 
If the user stops speaking for this duration, the recording will stop. - silence_timeout = 1500 - # Above this duration (MS), the recording will forcefully stop. - max_duration = 15000 - # Duration of the audio chunks in MS - chunk_duration = 1000 - # Sample rate of the audio - sample_rate = 44100 - -[UI] -# Name of the app and chatbot. -name = "Humanitarian AI Assistant" - -# Show the readme while the thread is empty. -show_readme_as_default = true - -# Description of the app and chatbot. This is used for HTML tags. -# description = "" - -# Large size content are by default collapsed for a cleaner ui -default_collapse_content = false - -# The default value for the expand messages settings. -default_expand_messages = false - -# Hide the chain of thought details from the user in the UI. -hide_cot = true - -# Link to your github repo. This will add a github button in the UI's header. -# github = "" - -# Specify a CSS file that can be used to customize the user interface. -# The CSS file can be served from the public directory or via an external link. -custom_css = "/public/elastic.css" - -# Specify a Javascript file that can be used to customize the user interface. -# The Javascript file can be served from the public directory. -# custom_js = "/public/test.js" - -# Specify a custom font url. -# custom_font = "https://fonts.googleapis.com/css2?family=Inter:wght@400;500;700&display=swap" - -# Specify a custom meta image url. -# custom_meta_image_url = "https://chainlit-cloud.s3.eu-west-3.amazonaws.com/logo/chainlit_banner.png" - -# Specify a custom build directory for the frontend. -# This can be used to customize the frontend code. -# Be careful: If this is a relative path, it should not start with a slash. -# custom_build = "./public/build" - -[UI.theme] - #layout = "wide" - #font_family = "Inter, sans-serif" -# Override default MUI light theme. (Check theme.ts) -[UI.theme.light] - #background = "#FAFAFA" - #paper = "#FFFFFF" - - [UI.theme.light.primary] - #main = "#F80061" - #dark = "#980039" - #light = "#FFE7EB" - -# Override default MUI dark theme. 
(Check theme.ts) -[UI.theme.dark] - #background = "#FAFAFA" - #paper = "#FFFFFF" - - [UI.theme.dark.primary] - #main = "#F80061" - #dark = "#980039" - #light = "#FFE7EB" - - -[meta] -generated_by = "1.1.202" diff --git a/ui/chat-chainlit-flow/.chainlit/style.css b/ui/chat-chainlit-flow/.chainlit/style.css deleted file mode 100644 index 23e7012a..00000000 --- a/ui/chat-chainlit-flow/.chainlit/style.css +++ /dev/null @@ -1,3 +0,0 @@ -a[href*='https://github.com/Chainlit/chainlit'] { - visibility: hidden; -} \ No newline at end of file diff --git a/ui/chat-chainlit-flow/.chainlit/translations/en-US.json b/ui/chat-chainlit-flow/.chainlit/translations/en-US.json deleted file mode 100644 index 0bca7207..00000000 --- a/ui/chat-chainlit-flow/.chainlit/translations/en-US.json +++ /dev/null @@ -1,231 +0,0 @@ -{ - "components": { - "atoms": { - "buttons": { - "userButton": { - "menu": { - "settings": "Settings", - "settingsKey": "S", - "APIKeys": "API Keys", - "logout": "Logout" - } - } - } - }, - "molecules": { - "newChatButton": { - "newChat": "New Chat" - }, - "tasklist": { - "TaskList": { - "title": "\ud83d\uddd2\ufe0f Task List", - "loading": "Loading...", - "error": "An error occured" - } - }, - "attachments": { - "cancelUpload": "Cancel upload", - "removeAttachment": "Remove attachment" - }, - "newChatDialog": { - "createNewChat": "Create new chat?", - "clearChat": "This will clear the current messages and start a new chat.", - "cancel": "Cancel", - "confirm": "Confirm" - }, - "settingsModal": { - "settings": "Settings", - "expandMessages": "Expand Messages", - "hideChainOfThought": "Hide Chain of Thought", - "darkMode": "Dark Mode" - }, - "detailsButton": { - "using": "Using", - "running": "Running", - "took_one": "Took {{count}} step", - "took_other": "Took {{count}} steps" - }, - "auth": { - "authLogin": { - "title": "Login to access the app.", - "form": { - "email": "Email address", - "password": "Password", - "noAccount": "Don't have an account?", - "alreadyHaveAccount": "Already have an account?", - "signup": "Sign Up", - "signin": "Sign In", - "or": "OR", - "continue": "Continue", - "forgotPassword": "Forgot password?", - "passwordMustContain": "Your password must contain:", - "emailRequired": "email is a required field", - "passwordRequired": "password is a required field" - }, - "error": { - "default": "Unable to sign in.", - "signin": "Try signing in with a different account.", - "oauthsignin": "Try signing in with a different account.", - "redirect_uri_mismatch": "The redirect URI is not matching the oauth app configuration.", - "oauthcallbackerror": "Try signing in with a different account.", - "oauthcreateaccount": "Try signing in with a different account.", - "emailcreateaccount": "Try signing in with a different account.", - "callback": "Try signing in with a different account.", - "oauthaccountnotlinked": "To confirm your identity, sign in with the same account you used originally.", - "emailsignin": "The e-mail could not be sent.", - "emailverify": "Please verify your email, a new email has been sent.", - "credentialssignin": "Sign in failed. Check the details you provided are correct.", - "sessionrequired": "Please sign in to access this page." - } - }, - "authVerifyEmail": { - "almostThere": "You're almost there! 
We've sent an email to ", - "verifyEmailLink": "Please click on the link in that email to complete your signup.", - "didNotReceive": "Can't find the email?", - "resendEmail": "Resend email", - "goBack": "Go Back", - "emailSent": "Email sent successfully.", - "verifyEmail": "Verify your email address" - }, - "providerButton": { - "continue": "Continue with {{provider}}", - "signup": "Sign up with {{provider}}" - }, - "authResetPassword": { - "newPasswordRequired": "New password is a required field", - "passwordsMustMatch": "Passwords must match", - "confirmPasswordRequired": "Confirm password is a required field", - "newPassword": "New password", - "confirmPassword": "Confirm password", - "resetPassword": "Reset Password" - }, - "authForgotPassword": { - "email": "Email address", - "emailRequired": "email is a required field", - "emailSent": "Please check the email address {{email}} for instructions to reset your password.", - "enterEmail": "Enter your email address and we will send you instructions to reset your password.", - "resendEmail": "Resend email", - "continue": "Continue", - "goBack": "Go Back" - } - } - }, - "organisms": { - "chat": { - "history": { - "index": { - "showHistory": "Show history", - "lastInputs": "Last Inputs", - "noInputs": "Such empty...", - "loading": "Loading..." - } - }, - "inputBox": { - "input": { - "placeholder": "Type your message here..." - }, - "speechButton": { - "start": "Start recording", - "stop": "Stop recording" - }, - "SubmitButton": { - "sendMessage": "Send message", - "stopTask": "Stop Task" - }, - "UploadButton": { - "attachFiles": "Attach files" - }, - "waterMark": { - "text": "Built with" - } - }, - "Messages": { - "index": { - "running": "Running", - "executedSuccessfully": "executed successfully", - "failed": "failed", - "feedbackUpdated": "Feedback updated", - "updating": "Updating" - } - }, - "dropScreen": { - "dropYourFilesHere": "Drop your files here" - }, - "index": { - "failedToUpload": "Failed to upload", - "cancelledUploadOf": "Cancelled upload of", - "couldNotReachServer": "Could not reach the server", - "continuingChat": "Continuing previous chat" - }, - "settings": { - "settingsPanel": "Settings panel", - "reset": "Reset", - "cancel": "Cancel", - "confirm": "Confirm" - } - }, - "threadHistory": { - "sidebar": { - "filters": { - "FeedbackSelect": { - "feedbackAll": "Feedback: All", - "feedbackPositive": "Feedback: Positive", - "feedbackNegative": "Feedback: Negative" - }, - "SearchBar": { - "search": "Search" - } - }, - "DeleteThreadButton": { - "confirmMessage": "This will delete the thread as well as it's messages and elements.", - "cancel": "Cancel", - "confirm": "Confirm", - "deletingChat": "Deleting chat", - "chatDeleted": "Chat deleted" - }, - "index": { - "pastChats": "Past Chats" - }, - "ThreadList": { - "empty": "Empty...", - "today": "Today", - "yesterday": "Yesterday", - "previous7days": "Previous 7 days", - "previous30days": "Previous 30 days" - }, - "TriggerButton": { - "closeSidebar": "Close sidebar", - "openSidebar": "Open sidebar" - } - }, - "Thread": { - "backToChat": "Go back to chat", - "chatCreatedOn": "This chat was created on" - } - }, - "header": { - "chat": "Chat", - "readme": "Readme" - } - } - }, - "hooks": { - "useLLMProviders": { - "failedToFetchProviders": "Failed to fetch providers:" - } - }, - "pages": { - "Design": {}, - "Env": { - "savedSuccessfully": "Saved successfully", - "requiredApiKeys": "Required API Keys", - "requiredApiKeysInfo": "To use this app, the following API keys are required. 
The keys are stored on your device's local storage." - }, - "Page": { - "notPartOfProject": "You are not part of this project." - }, - "ResumeButton": { - "resumeChat": "Resume Chat" - } - } -} \ No newline at end of file diff --git a/ui/chat-chainlit-flow/Dockerfile b/ui/chat-chainlit-flow/Dockerfile deleted file mode 100644 index 40fd57ca..00000000 --- a/ui/chat-chainlit-flow/Dockerfile +++ /dev/null @@ -1,17 +0,0 @@ -FROM python:3.11.4 - -COPY ui/chat-chainlit-flow /app -COPY requirements.txt /app -WORKDIR /app - -RUN pip install --upgrade pip -RUN pip install -r requirements.txt - - -COPY ./utils/ ./utils -COPY ./templates ./templates -# -RUN mkdir recepes -RUN mkdir recepes/images - -CMD ["chainlit", "run", "app.py", "--port", "8000", "--watch"] diff --git a/ui/chat-chainlit-flow/app.py b/ui/chat-chainlit-flow/app.py deleted file mode 100644 index 3e7d4a69..00000000 --- a/ui/chat-chainlit-flow/app.py +++ /dev/null @@ -1,156 +0,0 @@ -import json -import logging -import os -import sys - -import chainlit as cl -import pandas as pd -import requests -from dotenv import load_dotenv - -from utils.general import call_execute_query_api, call_get_memory_recipe_api -from utils.llm import gen_sql, gen_summarize_results - -logging.basicConfig(filename="output.log", level=logging.DEBUG) -logger = logging.getLogger() - -load_dotenv("../../.env") - - -def print(*tup): - logger.info(" ".join(str(x) for x in tup)) - - -async def ask_data(input, chat_history, active_message): - """ - Asynchronously processes the input data and chat history to generate an output. - - Args: - input: The input data. - chat_history: The chat history. - - Returns: - The generated output. - - Raises: - Exception: If there is an error during the execution of the query. - """ - - output = "" - - # Default to memory/recipe - mode = "memory_recipe" - - chat_history = cl.user_session.get("chat_history") - if len(chat_history) > 3: - chat_history = chat_history[-3:] - - # Loop 3 times to retry errors - for i in range(10): - try: - if mode == "memory_recipe": - output = await call_get_memory_recipe_api( - input, history=str(chat_history), generate_intent="true" - ) - # To do, make this a variable in recipes module - if "Sorry, no recipe or found" in str(output): - mode = "execute_query" - sql = "" - - if mode == "execute_query": - # active_message.content = "Hmm. I didn't find any recipes, let me query the database ..." - # await active_message.update() - # await active_message.send() - - sql = await gen_sql(input, str(chat_history), output) - print(sql) - output = await call_execute_query_api(sql) - - # Hack for the demo - if "error" in str(output): - print("Error in output, trying again ...") - else: - output = await gen_summarize_results(input, sql, output) - - # print(output) - # break - except Exception as e: - print(e) - if i == 2: - print("Failed to execute query") - break - - return output - - -@cl.step(type="tool") -async def tool(message: str, active_message: cl.Message): - """ - This function represents a tool step in the data recipe chat chainlit. - - Parameters: - message (str): The message to be passed to the ask_data function. - - Returns: - The result obtained from the ask_data function. - """ - result = await ask_data(message, [], active_message) - return result - - -@cl.on_chat_start -async def start_chat(): - """ - Starts a chat session by creating a new thread, setting the thread in the user session, - and sending an introductory message from bot. 
- """ - - cl.user_session.set("messages", []) - - -async def add_message_to_history(message, role): - """ - Adds a message to the chat history. - - Args: - message: The message to be added to the chat history. - role: The role of the message (bot/user) - - Returns: - None. - """ - - if cl.user_session.get("chat_history") is None: - cl.user_session.set("chat_history", []) - - chat_history = cl.user_session.get("chat_history") + [ - {"role": role, "content": message}, - ] - cl.user_session.set("chat_history", chat_history) - - -@cl.on_message # this function will be called every time a user inputs a message in the UI -async def main(message: cl.Message): - """ - This function is called every time a user inputs a message in the UI. - It sends back an intermediate response from the tool, followed by the final answer. - - Args: - message: The user's message. - - Returns: - None. - """ - - await add_message_to_history(message.content, "user") - - final_answer = await cl.Message(content="").send() - - # Call the tool - final_answer.content = await tool(message.content, final_answer) - - # print(final_answer.content) - - await add_message_to_history(final_answer.content[0:1000], "bot") - - await final_answer.update() diff --git a/ui/chat-chainlit-flow/chainlit.md b/ui/chat-chainlit-flow/chainlit.md deleted file mode 100644 index c45e28aa..00000000 --- a/ui/chat-chainlit-flow/chainlit.md +++ /dev/null @@ -1,2 +0,0 @@ - -Hi. I'm your humanitarian AI assistant. \ No newline at end of file diff --git a/ui/chat-chainlit-flow/public/elastic.css b/ui/chat-chainlit-flow/public/elastic.css deleted file mode 100644 index 363aa608..00000000 --- a/ui/chat-chainlit-flow/public/elastic.css +++ /dev/null @@ -1,5 +0,0 @@ -a[href*='https://github.com/Chainlit/chainlit'] { - visibility: hidden; -} - - diff --git a/ui/chat-chainlit-flow/public/logo_dark.png b/ui/chat-chainlit-flow/public/logo_dark.png deleted file mode 100644 index cfce562e..00000000 Binary files a/ui/chat-chainlit-flow/public/logo_dark.png and /dev/null differ diff --git a/ui/chat-chainlit-flow/public/logo_light.png b/ui/chat-chainlit-flow/public/logo_light.png deleted file mode 100644 index db9e4a68..00000000 Binary files a/ui/chat-chainlit-flow/public/logo_light.png and /dev/null differ diff --git a/utils/db.py b/utils/db.py index 12e65b58..b91bf795 100644 --- a/utils/db.py +++ b/utils/db.py @@ -103,7 +103,7 @@ def connect_to_db(instance="recipe"): return conn -async def get_data_info(): +def get_data_info(): """ Get data info from the database. 
@@ -126,6 +126,6 @@ async def get_data_info():
         -- countries is not null
     """
-    data_info = await call_execute_query_api(query)
+    data_info = call_execute_query_api(query)
     return data_info
diff --git a/utils/general.py b/utils/general.py
index 0a41fb1a..44918e09 100755
--- a/utils/general.py
+++ b/utils/general.py
@@ -101,6 +101,13 @@ def make_api_request(url, payload):
     response = requests.post(url, headers=headers, json=payload)
     print(f"API Response Status Code: {response.status_code}")
     response = response.content
+
+    try:
+        response = json.loads(response)
+        # response = json.dumps(response, indent=4)
+    except json.JSONDecodeError:
+        print("Error decoding JSON response")
+        pass
     print(f"API Response {response}")
     return response
@@ -118,6 +125,9 @@ def call_execute_query_api(sql):
     """
     data = {"query": f"{sql}"}
     print(f"Calling execute query API {execute_query_url} with {sql} ...")
+
+    make_api_request(execute_query_url, data)
+
     return make_api_request(execute_query_url, data)
@@ -146,7 +156,9 @@ def call_get_memory_recipe_api(user_input, history, generate_intent="true"):
     if isinstance(result, bytes):
         result = result.decode("utf-8")
+    if isinstance(result, str):
+        result = json.loads(result)
+    print("IN API CALL", result)
-    result = json.loads(result)
     return result
diff --git a/utils/llm.py b/utils/llm.py
index d2bb9dc7..d5fda88d 100644
--- a/utils/llm.py
+++ b/utils/llm.py
@@ -175,7 +175,7 @@ def call_llm(instructions, prompt, image=None):
         response = None
-async def gen_sql(input, chat_history, output):
+def gen_sql(input, chat_history, output):
     """
     Generate SQL query based on input, chat history, and output.
@@ -194,7 +194,7 @@
     global data_info
     if data_info is None:
-        data_info = await get_data_info()
+        data_info = get_data_info()
     prompt = sql_prompt_template.render(
         input=input,
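The utils/general.py and utils/llm.py changes above move the query helpers to synchronous calls and decode API responses (raw bytes or JSON strings) into Python objects before returning them. A minimal sketch of that decoding pattern follows; the helper name decode_api_result is illustrative, not a function in the repo:

# Illustrative sketch only - not part of this changeset.
import json


def decode_api_result(result):
    """Best-effort decode of an API response into Python data."""
    if isinstance(result, bytes):
        result = result.decode("utf-8")
    if isinstance(result, str):
        try:
            result = json.loads(result)
        except json.JSONDecodeError:
            # Not valid JSON, so keep the raw string.
            pass
    return result


# All three calls normalise to the same dict.
print(decode_api_result(b'{"rows": 3}'))
print(decode_api_result('{"rows": 3}'))
print(decode_api_result({"rows": 3}))

Centralising the decode step keeps callers such as call_execute_query_api and call_get_memory_recipe_api from each re-implementing the bytes/str/JSON handling.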