
Commit

Merge branch 'main' of github.com:datakind/humanitarian_ai_assistant
Matthew Harris committed May 15, 2024
2 parents c1f94df + 42c77a1 commit 04efeab
Showing 11 changed files with 238 additions and 61 deletions.
2 changes: 1 addition & 1 deletion .env.example
@@ -63,7 +63,7 @@ IMAGE_HOST=http://localhost:3080/images
# Deployment to Azure #
#==================================================#
AZURE_CONTAINER_REGISTRY=
AZURE_CONTAINER_REGISTRY_USERNAME=d
AZURE_CONTAINER_REGISTRY_REPO=

#==================================================#
# API Settings #
@@ -1,10 +1,6 @@
name: Run checks on recipes ai repo

on:
  push:
    branches:
      - main
  workflow_dispatch:
on: [push, pull_request]

jobs:
  build:
4 changes: 3 additions & 1 deletion README.md
@@ -165,7 +165,7 @@ We will add more details here soon, for now, here are some notes on Azure ...

## Deploying to Azure

A deployment script './deployment/deploy_azure.py' is provided to deploy to an Azure Multicontainer web app that you have set up with [these instructions](https://learn.microsoft.com/en-us/azure/app-service/tutorial-multi-container-app). Note: This is for demo purposes only, as Multicontainer web apps are still in Public Preview.
A deployment script './deployment/deploy_azure.py' is provided to deploy to an Azure Multicontainer web app that you have set up with [these instructions](https://learn.microsoft.com/en-us/azure/app-service/tutorial-multi-container-app). The script is run from the top directory. Note: This is for demo purposes only, as Multicontainer web apps are still in Public Preview.

To run the deployment ...

@@ -181,6 +181,8 @@ Note:

:warning: *This is very much a work in progress; deployment will be automated with fewer compose files soon*

You will need to set key environment variables; see your local `.env` for examples. The exceptions are the tokens needed for authentication; do not use the defaults for these. You can generate them on [this page](https://www.librechat.ai/toolkit/creds_generator).
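If you prefer to generate these tokens locally rather than via that page, a minimal Python sketch is below. The variable names and hex lengths assume LibreChat's usual defaults; check your `.env.example` before pasting.

```
import secrets

# Prints values to paste into your local .env. Names and lengths assume
# LibreChat's defaults (CREDS_KEY, CREDS_IV, JWT_SECRET, JWT_REFRESH_SECRET).
for name, n_bytes in [
    ("CREDS_KEY", 32),
    ("CREDS_IV", 16),
    ("JWT_SECRET", 32),
    ("JWT_REFRESH_SECRET", 32),
]:
    print(f"{name}={secrets.token_hex(n_bytes)}")
```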

## Databases

When running in Azure it is useful to use remote databases, at least for the MongoDB instance, so that user logins are retained with each release. For example, a database can be configured by following [these instructions](https://docs.librechat.ai/install/configuration/mongodb.html). If doing this, the mongo DB section can be removed from docker-compose-azure.yml in Azure, and any instance of the Mongo URL used by other containers updated with the cloud connection string accordingly.
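Before pointing the containers at a remote instance, the connection string can be sanity-checked locally. A hedged sketch, assuming `pymongo` is installed and the string is stored in a `MONGO_URI` variable:

```
import os

from dotenv import load_dotenv
from pymongo import MongoClient

load_dotenv()

# MONGO_URI is an assumed variable name; use whatever your compose files pass
# to the LibreChat container for its database connection.
client = MongoClient(os.getenv("MONGO_URI"), serverSelectionTimeoutMS=5000)
client.admin.command("ping")  # Raises ServerSelectionTimeoutError if unreachable
print("Remote MongoDB connection OK")
```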
24 changes: 18 additions & 6 deletions deployment/deploy_azure.py
@@ -9,14 +9,28 @@
#

import os
import sys

import docker
from dotenv import load_dotenv

client = docker.from_env()
load_dotenv()

container_registry = os.getenv("AZURE_CONTAINER_REGISTRY")
repo = os.getenv("AZURE_CONTAINER_REGISTRY_REPO")

# Script is run from top directory
docker_compose_file = "docker-compose-deploy.yml"
azure_platform = "linux/amd64"

if sys.platform == "darwin":
    print("Running on Mac")
    client = docker.DockerClient(
        base_url="unix:///Users/matthewharris/.docker/run/docker.sock "
    )
else:
    client = docker.from_env()


def run_cmd(cmd):
    """
@@ -52,25 +66,23 @@ def deploy():
    should be defined before calling this function.
    """
    tags = {
        "humanitarian_ai_assistant-api": [f"{container_registry}/{repo}", "api"],
        "data-recipes-ai-api": [f"{container_registry}/{repo}", "api"],
        "getmeili/meilisearch:v1.7.3": [f"{container_registry}/{repo}", "meilisearch"],
        "ghcr.io/danny-avila/librechat-rag-api-dev-lite:latest": [
            f"{container_registry}/{repo}",
            "rag_api",
        ],
        "ankane/pgvector:latest": [f"{container_registry}/{repo}", "docsdb"],
        "humanitarian_ai_assistant-actions": [
        "data-recipes-ai-actions": [
            f"{container_registry}/{repo}",
            "actions",
        ],
        "busybox": [f"{container_registry}/{repo}", "init"],
        "humanitarian_ai_assistant-code-interpreter": [
        "data-recipes-ai-code-interpreter": [
            f"{container_registry}/{repo}",
            "code-interpreter",
        ],
    }
    docker_compose_file = "docker-compose-deploy.yml"
    azure_platform = "linux/amd64"

    run_cmd("az login")
    run_cmd(f"az acr login --name {container_registry}")
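The rest of `deploy()` is collapsed above. For context, the retag-and-push step that the `tags` mapping feeds typically looks something like the hedged sketch below using the docker SDK; `retag_and_push` is an illustrative name and flow, not the commit's own code.

```
import docker

client = docker.from_env()


def retag_and_push(tags):
    # Retag each local image for the Azure Container Registry, then push it
    for local_name, (acr_repo, acr_tag) in tags.items():
        image = client.images.get(local_name)  # Raises ImageNotFound if missing
        image.tag(acr_repo, tag=acr_tag)
        for line in client.images.push(acr_repo, tag=acr_tag, stream=True, decode=True):
            print(line.get("status", ""))
```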
@@ -5,9 +5,6 @@ services:
  api:
    platform: linux/amd64
    container_name: haa-libre-chat
    # Have to build a dockerfile as Azure multicontainer apps don't support bind mounts
    #image: ghcr.io/danny-avila/librechat:v0.7.0
    #image: ghcr.io/danny-avila/librechat-dev:latest
    build:
      context: .
      dockerfile: ./ui/recipes-chat/Dockerfile
@@ -19,6 +16,8 @@
    depends_on:
      #- mongodb
      - rag-api
    env_file:
      - .env
    restart: always
    user: "${UID}:${GID}"
    extra_hosts:
3 changes: 0 additions & 3 deletions docker-compose.yml
@@ -3,9 +3,6 @@ version: "3.4"
services:
  api:
    container_name: haa-libre-chat
    # Have to build a dockerfile as Azure multicontainer apps don't support bind mounts
    #image: ghcr.io/danny-avila/librechat:v0.7.0
    #image: ghcr.io/danny-avila/librechat-dev:latest
    build:
      context: .
      dockerfile: ./ui/recipes-chat/Dockerfile
63 changes: 63 additions & 0 deletions ingestion/api/hapi_utils.py
@@ -0,0 +1,63 @@
import sys

import pandas as pd


def filter_hapi_df(df, admin0_code_field):
    """
    Filter a pandas DataFrame by removing columns where all values are null and removing rows where any value is null.
    Hack to get around the fact HDX mixes total values in with disaggregated values in the API
    Args:
        df (pandas.DataFrame): The DataFrame to be filtered.
        admin0_code_field (str): The name of the column containing the admin0 code.
    Returns:
        pandas.DataFrame: The filtered DataFrame.
    """
    df_orig = df.copy()

    if df.shape[0] == 0:
        return df_orig

    dfs = []
    if admin0_code_field in df.columns:
        for country in df[admin0_code_field].unique():
            df2 = df.copy()
            df2 = df2[df2[admin0_code_field] == country]

            # Remove any columns where all null
            df2 = df2.dropna(axis=1, how="all")

            # Remove any rows where one of the values is null
            df2 = df2.dropna(axis=0, how="any")

            dfs.append(df.iloc[df2.index])

        df = pd.concat(dfs)

    return df


def post_process_data(df, standard_names):
    """
    Post-processes the data by filtering and renaming columns.
    Args:
        df (pandas.DataFrame): The DataFrame to be post-processed.
    Returns:
        pandas.DataFrame: The post-processed DataFrame.
    """
    # aggregate and disaggregated data in the same tables, where the hierarchy differs by country
    df = filter_hapi_df(df, standard_names["admin0_code_field"])

    # Add a flag to indicate latest dataset by HDX ID, useful for LLM queries
    if "resource_hdx_id" in df.columns:
        df["latest"] = 0
        df["reference_period_start"] = pd.to_datetime(df["reference_period_start"])
        df["latest"] = df.groupby("dataset_hdx_stub")[
            "reference_period_start"
        ].transform(lambda x: x == x.max())

    return df
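To see the per-country filtering in action, a small hedged example follows; the import path assumes the script runs from the `ingestion/` directory, as `ingest.py` does.

```
import pandas as pd

from api.hapi_utils import filter_hapi_df

# Row 1 is a country-level total (null adm1_name), so the any-null rule drops it
df = pd.DataFrame(
    {
        "adm0_code": ["MLI", "MLI", "TCD"],
        "adm1_name": ["Gao", None, "Lac"],
        "population": [100, 250, 75],
    }
)

filtered = filter_hapi_df(df, "adm0_code")
print(filtered)  # Keeps the Gao and Lac rows; the MLI total row is removed
```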
51 changes: 11 additions & 40 deletions ingestion/ingest.py
@@ -246,10 +246,17 @@ def process_openapi_data(api_name, files_dir, field_map, standard_names):
        filename = f"{files_dir}/{f}"
        df = pd.read_csv(filename)
        df = map_field_names(df, field_map)
        # TODO: This is a temporary workaround to account for HAPI having
        # aggregate and disaggregated data in the same tables, where the hierarchy differs by country
        if api_name == "hapi":
            df = filter_hdx_df(df, standard_names["admin0_code_field"])

        # Import API-specific processing functions
        import_str = f"from api.{api_name}_utils import post_process_data"
        print(f"Processing {filename} with {import_str}")
        exec(import_str)
        post_process_str = "post_process_data(df, standard_names)"
        print("Post processing with", post_process_str)
        print("    Before shape", df.shape)
        df = eval(post_process_str)
        print("    After shape", df.shape)

        df.to_csv(filename, index=False)
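The `exec`/`eval` pattern above resolves the API-specific module at runtime; an equivalent, illustrative `importlib` sketch keeps the call visible to linters (the helper name is hypothetical):

```
import importlib


def load_post_processor(api_name):
    # Resolves e.g. api.hapi_utils.post_process_data without exec/eval
    module = importlib.import_module(f"api.{api_name}_utils")
    return module.post_process_data


# Usage, mirroring the loop above:
# df = load_post_processor(api_name)(df, standard_names)
```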


@@ -368,42 +375,6 @@ def map_field_names(df, field_map):
    return df


def filter_hdx_df(df, admin0_code_field):
    """
    Filter a pandas DataFrame by removing columns where all values are null and removing rows where any value is null.
    Hack to get around the fact HDX mixes total values in with disaggregated values in the API
    Args:
        df (pandas.DataFrame): The DataFrame to be filtered.
        admin0_code_field (str): The name of the column containing the admin0 code.
    Returns:
        pandas.DataFrame: The filtered DataFrame.
    """
    df_orig = df.copy()

    if df.shape[0] == 0:
        return df_orig

    dfs = []
    if admin0_code_field in df.columns:
        for country in df[admin0_code_field].unique():
            df2 = df.copy()
            df2 = df2[df2[admin0_code_field] == country]

            # Remove any columns where all null
            df2 = df2.dropna(axis=1, how="all")

            # Remove any rows where one of the values is null
            df2 = df2.dropna(axis=0, how="any")

            dfs.append(df.iloc[df2.index])

        df = pd.concat(dfs)

    return df


def main():
    apis, field_map, standard_names = read_integration_config(INTEGRATION_CONFIG)
    conn = connect_to_db()
137 changes: 137 additions & 0 deletions recipes-creation/copilot_prompt.txt
@@ -0,0 +1,137 @@
Using the database table list below, and the columns provided in each table, generate
Python that summarizes the following:

"Count of Organizations which are active on the ground in Mali, by sector"

Coding tips ...

The shapefile in the database will need to be converted to a GeoSeries for plotting; here is an example:

```
# Convert the data into a DataFrame
df = pd.DataFrame(rows, columns=["adm1_code", "population", "geometry"])

# Convert the 'geometry' column into a GeoSeries
df['geometry'] = df['geometry'].apply(lambda x: wkb.loads(x, hex=True))

# Convert the DataFrame into a GeoDataFrame
gdf = gpd.GeoDataFrame(df, geometry='geometry')
```
Also, please always save any images to a .png file.

Always specify a clear title on any graphs or maps.
Always add annotations, labels and units on any graphs/maps
You can use any kind of visualization

IMPORTANT: Generate reusable code by putting it in a function with arguments, and provide an example of how to call it.

Always print any SQL statements and the size of the results returned

Database connection details are in the following environment variables (saved in the .env file) ...

POSTGRES_DATA_HOST
POSTGRES_DATA_PORT
POSTGRES_DATA_DB
POSTGRES_DATA_USER
POSTGRES_DATA_PASSWORD

Use the Python dotenv module to load these environment variables.

In SQL queries with more than one table, always use table aliases to avoid ambiguous columns

Make note of column types; if you are asked to plot the count of something, SUM will not work

Always use country codes instead of names where possible

Tables and their columns ...

{
"select table_name, summary, columns from table_metadata\n": [
{
"table_name" : "hapi_admin1",
"summary" : "['Locations and Administrative Divisions']",
"columns" : "code (text); name (text); adm0_code (text); location_name (text); "
},
{
"table_name" : "hapi_admin2",
"summary" : "['Locations and Administrative Divisions']",
"columns" : "code (text); name (text); adm1_code (text); adm1_name (text); adm0_code (text); location_name (text); "
},
{
"table_name" : "hapi_age_range",
"summary" : "['Age and Gender Disaggregations']",
"columns" : "age_min (bigint); age_max (double precision); code (text); "
},
{
"table_name" : "hapi_dataset",
"summary" : "['HDX Metadata']",
"columns" : "hdx_id (text); hdx_stub (text); title (text); hdx_provider_stub (text); hdx_provider_name (text); hdx_link (text); hdx_api_link (text); "
},
{
"table_name" : "hapi_3w",
"summary" : "['3W Operational Presence']",
"columns" : "reference_period_end (double precision); dataset_hdx_stub (text); resource_hdx_id (text); org_acronym (text); org_name (text); sector_name (text); adm0_code (text); location_name (text); reference_period_start (text); adm1_code (text); adm1_name (text); adm2_code (text); sector_code (text); adm2_name (text); "
},
{
"table_name" : "hapi_gender",
"summary" : "['Age and Gender Disaggregations']",
"columns" : "code (text); description (text); "
},
{
"table_name" : "hapi_location",
"summary" : "['Locations and Administrative Divisions']",
"columns" : "code (text); name (text); "
},
{
"table_name" : "hapi_org",
"summary" : "['Humanitarian Organizations and Sectors']",
"columns" : "org_type_code (double precision); acronym (text); name (text); org_type_description (text); "
},
{
"table_name" : "hapi_org_type",
"summary" : "['Humanitarian Organizations and Sectors']",
"columns" : "code (bigint); description (text); "
},
{
"table_name" : "hapi_population_group",
"summary" : "['Population Groups and Statuses']",
"columns" : "code (text); description (text); "
},
{
"table_name" : "hapi_population_status",
"summary" : "['Population Groups and Statuses']",
"columns" : "code (text); description (text); "
},
{
"table_name" : "hapi_resource",
"summary" : "['HDX Metadata']",
"columns" : "is_hxl (boolean); name (text); format (text); update_date (text); download_url (text); dataset_hdx_id (text); dataset_hdx_stub (text); dataset_title (text); dataset_hdx_provider_stub (text); dataset_hdx_provider_name (text); hdx_link (text); hdx_api_link (text); dataset_hdx_link (text); hdx_id (text); dataset_hdx_api_link (text); "
},
{
"table_name" : "hapi_food_security",
"summary" : "['Food Security']",
"columns" : "population_in_phase (bigint); population_fraction_in_phase (double precision); ipc_phase_code (text); ipc_phase_name (text); ipc_type_code (text); reference_period_start (text); reference_period_end (text); dataset_hdx_stub (text); dataset_hdx_provider_stub (text); resource_hdx_id (text); adm0_code (text); location_name (text); adm1_code (text); adm1_name (text); adm2_code (text); adm2_name (text); "
},
{
"table_name" : "hapi_humanitarian_needs",
"summary" : "['Humanitarian Needs']",
"columns" : "population (bigint); age_range_code (text); disabled_marker (text); sector_code (text); sector_name (text); population_status_code (text); population_group_code (text); reference_period_start (text); reference_period_end (text); dataset_hdx_stub (text); dataset_hdx_provider_stub (text); resource_hdx_id (text); adm0_code (text); location_name (text); adm1_code (text); adm1_name (text); adm2_code (text); gender_code (text); adm2_name (text); "
},
{
"table_name" : "hapi_national_risk",
"summary" : "['National Risk']",
"columns" : "risk_class (bigint); global_rank (bigint); overall_risk (double precision); hazard_exposure_risk (double precision); vulnerability_risk (double precision); coping_capacity_risk (double precision); meta_missing_indicators_pct (double precision); meta_avg_recentness_years (double precision); reference_period_start (text); reference_period_end (text); dataset_hdx_stub (text); dataset_hdx_provider_stub (text); resource_hdx_id (text); adm0_code (text); location_name (text); "
},
{
"table_name" : "hapi_population",
"summary" : "['Baseline Population']",
"columns" : "population (bigint); age_range_code (text); reference_period_start (text); reference_period_end (text); dataset_hdx_stub (text); resource_hdx_id (text); adm0_code (text); location_name (text); adm1_code (text); adm1_name (text); adm2_code (text); gender_code (text); adm2_name (text); "
},
{
"table_name" : "hdx_shape_files",
"summary" : "HDX Shape Files",
"columns" : "geometry (USER-DEFINED); OBJECTID (double precision); AREA_SQKM (double precision); Shape_Area (double precision); Shape_Leng (double precision); ADM1ALT2FR (text); ADM0_FR (text); adm0_code (text); date (text); validOn (text); validTo (text); ADM2_FR (text); adm2_code (text); ADM2_REF (text); ADM2ALT1FR (text); ADM2ALT2FR (text); ADM1_EN (text); ADM1ALT1EN (text); ADM1ALT2EN (text); ADM0_EN (text); ADM2_EN (text); ADM2ALT1EN (text); ADM2ALT2EN (text); ADM1_ES (text); ADM1ALT1ES (text); ADM1ALT2ES (text); ADM0_ES (text); ADM2_ES (text); ADM2ALT1ES (text); ADM2ALT2ES (text); ValidTo (text); ADM1_HT (text); ADM1ALT1HT (text); ADM1ALT2HT (text); ADM0_HT (text); ADM2_HT (text); ADM2ALT1HT (text); ADM2ALT2HT (text); ADM1_MY (text); ADM1_ALTPC (text); ADM0_MY (text); ADM2_MY (text); ADM1_PT (text); ADM1ALT1PT (text); ADM1ALT2PT (text); ADM0_PT (text); ADM2_PT (text); ADM2ALT1PT (text); ADM2ALT2PT (text); SD_EN (text); SD_PCODE (text); ADM1_AR (text); ADM1ALT1AR (text); ADM1ALT2AR (text); ADM0_AR (text); ADM2_AR (text); ADM2ALT1AR (text); ADM2ALT2AR (text); admin1Name (text); admin1RefN (text); admin1Na_1 (text); admin1AltN (text); admin1Al_1 (text); admin0Name (text); admin2Name (text); admin2RefN (text); admin2Na_1 (text); admin2AltN (text); admin2Al_1 (text); ADM1_UA (text); ADM1_RU (text); ADM0_UA (text); ADM0_RU (text); ADM2_UA (text); ADM2_RU (text); ADM1_FR (text); adm1_code (text); ADM1_REF (text); ADM1ALT1FR (text); "
}
]}
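For reference, the kind of code this prompt asks a copilot to produce might look like the hedged sketch below (psycopg2 and pandas assumed; `MLI` assumed to be Mali's adm0_code; the helper name is illustrative):

```
import os

import pandas as pd
import psycopg2
from dotenv import load_dotenv

load_dotenv()

conn = psycopg2.connect(
    host=os.getenv("POSTGRES_DATA_HOST"),
    port=os.getenv("POSTGRES_DATA_PORT"),
    dbname=os.getenv("POSTGRES_DATA_DB"),
    user=os.getenv("POSTGRES_DATA_USER"),
    password=os.getenv("POSTGRES_DATA_PASSWORD"),
)


def orgs_by_sector(adm0_code):
    # Count distinct organizations active in a country, grouped by sector
    sql = """
        SELECT t.sector_name, COUNT(DISTINCT t.org_name) AS org_count
        FROM hapi_3w t
        WHERE t.adm0_code = %s
        GROUP BY t.sector_name
        ORDER BY org_count DESC
    """
    print(sql)
    df = pd.read_sql(sql, conn, params=(adm0_code,))
    print(f"Returned {len(df)} rows")
    return df


print(orgs_by_sector("MLI"))
```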

