
Commit

Merge branch 'main' of github.com:datakind/humanitarian_ai_assistant
Matthew Harris committed May 15, 2024
2 parents c1f94df + 42c77a1 commit 04efeab
Showing 11 changed files with 238 additions and 61 deletions.
2 changes: 1 addition & 1 deletion .env.example
@@ -63,7 +63,7 @@ IMAGE_HOST=http://localhost:3080/images
# Deployment to Azure #
#==================================================#
AZURE_CONTAINER_REGISTRY=
AZURE_CONTAINER_REGISTRY_USERNAME=d
AZURE_CONTAINER_REGISTRY_REPO=

#==================================================#
# API Settings #
@@ -1,10 +1,6 @@
name: Run checks on recipes ai repo

on:
  push:
    branches:
      - main
  workflow_dispatch:
on: [push, pull_request]

jobs:
  build:
4 changes: 3 additions & 1 deletion README.md
@@ -165,7 +165,7 @@ We will add more details here soon, for now, here are some notes on Azure ...

## Deploying to Azure

A deployment script './deployment/deploy_azure.py' is provided to deploy to an Azure Multicontainer web app that you have set up with [these instructions](https://learn.microsoft.com/en-us/azure/app-service/tutorial-multi-container-app). Note: This is for demo purposes only, as Multicontainer web apps are still in Public Preview.
A deployment script './deployment/deploy_azure.py' is provided to deploy to an Azure Multicontainer web app that you have set up with [these instructions](https://learn.microsoft.com/en-us/azure/app-service/tutorial-multi-container-app). The script is run from the top directory. Note: This is for demo purposes only, as Multicontainer web apps are still in Public Preview.

To run the deployment ...

@@ -181,6 +181,8 @@ Note:

:warning: *This is very much a work in progress; deployment will be automated with fewer compose files soon*

You will need to set key environment variables; see your local `.env` for examples. The exceptions are the tokens needed for authentication; do not use the defaults for these. You can generate them on [this page](https://www.librechat.ai/toolkit/creds_generator).
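If you prefer to generate these tokens locally rather than via that page, a minimal Python sketch is below. The variable names and hex lengths assume LibreChat's usual defaults; check your `.env.example` before pasting.

```
import secrets

# Prints values to paste into your local .env. Names and lengths assume
# LibreChat's defaults (CREDS_KEY, CREDS_IV, JWT_SECRET, JWT_REFRESH_SECRET).
for name, n_bytes in [
    ("CREDS_KEY", 32),
    ("CREDS_IV", 16),
    ("JWT_SECRET", 32),
    ("JWT_REFRESH_SECRET", 32),
]:
    print(f"{name}={secrets.token_hex(n_bytes)}")
```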

## Databases

When running in Azure it is useful to use remote databases, at least for the MongoDB instance, so that user logins are retained with each release. For example, a database can be configured by following [these instructions](https://docs.librechat.ai/install/configuration/mongodb.html). If doing this, the mongo DB section can be removed from docker-compose-azure.yml in Azure, and any instance of the Mongo URL used by other containers updated with the cloud connection string accordingly.
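Before pointing the containers at a remote instance, the connection string can be sanity-checked locally. A hedged sketch, assuming `pymongo` is installed and the string is stored in a `MONGO_URI` variable:

```
import os

from dotenv import load_dotenv
from pymongo import MongoClient

load_dotenv()

# MONGO_URI is an assumed variable name; use whatever your compose files pass
# to the LibreChat container for its database connection.
client = MongoClient(os.getenv("MONGO_URI"), serverSelectionTimeoutMS=5000)
client.admin.command("ping")  # Raises ServerSelectionTimeoutError if unreachable
print("Remote MongoDB connection OK")
```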
24 changes: 18 additions & 6 deletions deployment/deploy_azure.py
@@ -9,14 +9,28 @@
#

import os
import sys

import docker
from dotenv import load_dotenv

client = docker.from_env()
load_dotenv()

container_registry = os.getenv("AZURE_CONTAINER_REGISTRY")
repo = os.getenv("AZURE_CONTAINER_REGISTRY_REPO")

# Script is run from top directory
docker_compose_file = "docker-compose-deploy.yml"
azure_platform = "linux/amd64"

if sys.platform == "darwin":
    print("Running on Mac")
    client = docker.DockerClient(
        base_url="unix:///Users/matthewharris/.docker/run/docker.sock "
    )
else:
    client = docker.from_env()


def run_cmd(cmd):
    """
@@ -52,25 +66,23 @@ def deploy():
    should be defined before calling this function.
    """
    tags = {
        "humanitarian_ai_assistant-api": [f"{container_registry}/{repo}", "api"],
        "data-recipes-ai-api": [f"{container_registry}/{repo}", "api"],
        "getmeili/meilisearch:v1.7.3": [f"{container_registry}/{repo}", "meilisearch"],
        "ghcr.io/danny-avila/librechat-rag-api-dev-lite:latest": [
            f"{container_registry}/{repo}",
            "rag_api",
        ],
        "ankane/pgvector:latest": [f"{container_registry}/{repo}", "docsdb"],
        "humanitarian_ai_assistant-actions": [
        "data-recipes-ai-actions": [
            f"{container_registry}/{repo}",
            "actions",
        ],
        "busybox": [f"{container_registry}/{repo}", "init"],
        "humanitarian_ai_assistant-code-interpreter": [
        "data-recipes-ai-code-interpreter": [
            f"{container_registry}/{repo}",
            "code-interpreter",
        ],
    }
    docker_compose_file = "docker-compose-deploy.yml"
    azure_platform = "linux/amd64"

    run_cmd("az login")
    run_cmd(f"az acr login --name {container_registry}")
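The rest of `deploy()` is collapsed above. For context, the retag-and-push step that the `tags` mapping feeds typically looks something like the hedged sketch below using the docker SDK; `retag_and_push` is an illustrative name and flow, not the commit's own code.

```
import docker

client = docker.from_env()


def retag_and_push(tags):
    # Retag each local image for the Azure Container Registry, then push it
    for local_name, (acr_repo, acr_tag) in tags.items():
        image = client.images.get(local_name)  # Raises ImageNotFound if missing
        image.tag(acr_repo, tag=acr_tag)
        for line in client.images.push(acr_repo, tag=acr_tag, stream=True, decode=True):
            print(line.get("status", ""))
```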
@@ -5,9 +5,6 @@ services:
  api:
    platform: linux/amd64
    container_name: haa-libre-chat
    # Have to build a dockerfile as Azure multicontainer apps don't support bind mounts
    #image: ghcr.io/danny-avila/librechat:v0.7.0
    #image: ghcr.io/danny-avila/librechat-dev:latest
    build:
      context: .
      dockerfile: ./ui/recipes-chat/Dockerfile
@@ -19,6 +16,8 @@
    depends_on:
      #- mongodb
      - rag-api
    env_file:
      - .env
    restart: always
    user: "${UID}:${GID}"
    extra_hosts:
3 changes: 0 additions & 3 deletions docker-compose.yml
@@ -3,9 +3,6 @@ version: "3.4"
services:
  api:
    container_name: haa-libre-chat
    # Have to build a dockerfile as Azure multicontainer apps don't support bind mounts
    #image: ghcr.io/danny-avila/librechat:v0.7.0
    #image: ghcr.io/danny-avila/librechat-dev:latest
    build:
      context: .
      dockerfile: ./ui/recipes-chat/Dockerfile
63 changes: 63 additions & 0 deletions ingestion/api/hapi_utils.py
@@ -0,0 +1,63 @@
import sys

import pandas as pd


def filter_hapi_df(df, admin0_code_field):
    """
    Filter a pandas DataFrame by removing columns where all values are null and removing rows where any value is null.
    Hack to get around the fact HDX mixes total values in with disaggregated values in the API
    Args:
        df (pandas.DataFrame): The DataFrame to be filtered.
        admin0_code_field (str): The name of the column containing the admin0 code.
    Returns:
        pandas.DataFrame: The filtered DataFrame.
    """
    df_orig = df.copy()

    if df.shape[0] == 0:
        return df_orig

    dfs = []
    if admin0_code_field in df.columns:
        for country in df[admin0_code_field].unique():
            df2 = df.copy()
            df2 = df2[df2[admin0_code_field] == country]

            # Remove any columns where all null
            df2 = df2.dropna(axis=1, how="all")

            # Remove any rows where one of the values is null
            df2 = df2.dropna(axis=0, how="any")

            dfs.append(df.iloc[df2.index])

        df = pd.concat(dfs)

    return df


def post_process_data(df, standard_names):
    """
    Post-processes the data by filtering and renaming columns.
    Args:
        df (pandas.DataFrame): The DataFrame to be post-processed.
    Returns:
        pandas.DataFrame: The post-processed DataFrame.
    """
    # aggregate and disaggregated data in the same tables, where the hierarchy differs by country
    df = filter_hapi_df(df, standard_names["admin0_code_field"])

    # Add a flag to indicate latest dataset by HDX ID, useful for LLM queries
    if "resource_hdx_id" in df.columns:
        df["latest"] = 0
        df["reference_period_start"] = pd.to_datetime(df["reference_period_start"])
        df["latest"] = df.groupby("dataset_hdx_stub")[
            "reference_period_start"
        ].transform(lambda x: x == x.max())

    return df
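To see the per-country filtering in action, a small hedged example follows; the import path assumes the script runs from the `ingestion/` directory, as `ingest.py` does.

```
import pandas as pd

from api.hapi_utils import filter_hapi_df

# Row 1 is a country-level total (null adm1_name), so the any-null rule drops it
df = pd.DataFrame(
    {
        "adm0_code": ["MLI", "MLI", "TCD"],
        "adm1_name": ["Gao", None, "Lac"],
        "population": [100, 250, 75],
    }
)

filtered = filter_hapi_df(df, "adm0_code")
print(filtered)  # Keeps the Gao and Lac rows; the MLI total row is removed
```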
51 changes: 11 additions & 40 deletions ingestion/ingest.py
@@ -246,10 +246,17 @@ def process_openapi_data(api_name, files_dir, field_map, standard_names):
        filename = f"{files_dir}/{f}"
        df = pd.read_csv(filename)
        df = map_field_names(df, field_map)
        # TODO: This is a temporary workaround to account for HAPI having
        # aggregate and disaggregated data in the same tables, where the hierarchy differs by country
        if api_name == "hapi":
            df = filter_hdx_df(df, standard_names["admin0_code_field"])

        # Import API-specific processing functions
        import_str = f"from api.{api_name}_utils import post_process_data"
        print(f"Processing {filename} with {import_str}")
        exec(import_str)
        post_process_str = "post_process_data(df, standard_names)"
        print("Post processing with", post_process_str)
        print("    Before shape", df.shape)
        df = eval(post_process_str)
        print("    After shape", df.shape)

        df.to_csv(filename, index=False)
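The `exec`/`eval` pattern above resolves the API-specific module at runtime; an equivalent, illustrative `importlib` sketch keeps the call visible to linters (the helper name is hypothetical):

```
import importlib


def load_post_processor(api_name):
    # Resolves e.g. api.hapi_utils.post_process_data without exec/eval
    module = importlib.import_module(f"api.{api_name}_utils")
    return module.post_process_data


# Usage, mirroring the loop above:
# df = load_post_processor(api_name)(df, standard_names)
```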


@@ -368,42 +375,6 @@ def map_field_names(df, field_map):
    return df


def filter_hdx_df(df, admin0_code_field):
    """
    Filter a pandas DataFrame by removing columns where all values are null and removing rows where any value is null.
    Hack to get around the fact HDX mixes total values in with disaggregated values in the API
    Args:
        df (pandas.DataFrame): The DataFrame to be filtered.
        admin0_code_field (str): The name of the column containing the admin0 code.
    Returns:
        pandas.DataFrame: The filtered DataFrame.
    """
    df_orig = df.copy()

    if df.shape[0] == 0:
        return df_orig

    dfs = []
    if admin0_code_field in df.columns:
        for country in df[admin0_code_field].unique():
            df2 = df.copy()
            df2 = df2[df2[admin0_code_field] == country]

            # Remove any columns where all null
            df2 = df2.dropna(axis=1, how="all")

            # Remove any rows where one of the values is null
            df2 = df2.dropna(axis=0, how="any")

            dfs.append(df.iloc[df2.index])

        df = pd.concat(dfs)

    return df


def main():
    apis, field_map, standard_names = read_integration_config(INTEGRATION_CONFIG)
    conn = connect_to_db()
137 changes: 137 additions & 0 deletions recipes-creation/copilot_prompt.txt
@@ -0,0 +1,137 @@
Using the database table list below, and the columns provided in each table, generate
Python that summarizes the following:

"Count of Organizations which are active on the ground in Mali, by sector"

Coding tips ...

The shapefile in the database will need to be converted to a GeoSeries for plotting; here is an example:

```
# Convert the data into a DataFrame
df = pd.DataFrame(rows, columns=["adm1_code", "population", "geometry"])

# Convert the 'geometry' column into a GeoSeries
df['geometry'] = df['geometry'].apply(lambda x: wkb.loads(x, hex=True))

# Convert the DataFrame into a GeoDataFrame
gdf = gpd.GeoDataFrame(df, geometry='geometry')
```
Also, please always save any images to a .png file.

Always specify a clear title on any graphs or maps.
Always add annotations, labels and units on any graphs/maps
You can use any kind of visualization

IMPORTANT: Generate reusable code by putting it in a function with arguments, and provide an example of how to call it.

Always print any SQL statements and the size of the results returned

Database connection details are in the following environment variables (saved in the .env file) ...

POSTGRES_DATA_HOST
POSTGRES_DATA_PORT
POSTGRES_DATA_DB
POSTGRES_DATA_USER
POSTGRES_DATA_PASSWORD

Use the Python dotenv module to load these environment variables.

In SQL queries with more than one table, always use table aliases to avoid ambiguous columns

Make note of column types; if you are asked to plot the count of something, SUM will not work

Always use country codes instead of names where possible

Tables and their columns ...

{
"select table_name, summary, columns from table_metadata\n": [
{
"table_name" : "hapi_admin1",
"summary" : "['Locations and Administrative Divisions']",
"columns" : "code (text); name (text); adm0_code (text); location_name (text); "
},
{
"table_name" : "hapi_admin2",
"summary" : "['Locations and Administrative Divisions']",
"columns" : "code (text); name (text); adm1_code (text); adm1_name (text); adm0_code (text); location_name (text); "
},
{
"table_name" : "hapi_age_range",
"summary" : "['Age and Gender Disaggregations']",
"columns" : "age_min (bigint); age_max (double precision); code (text); "
},
{
"table_name" : "hapi_dataset",
"summary" : "['HDX Metadata']",
"columns" : "hdx_id (text); hdx_stub (text); title (text); hdx_provider_stub (text); hdx_provider_name (text); hdx_link (text); hdx_api_link (text); "
},
{
"table_name" : "hapi_3w",
"summary" : "['3W Operational Presence']",
"columns" : "reference_period_end (double precision); dataset_hdx_stub (text); resource_hdx_id (text); org_acronym (text); org_name (text); sector_name (text); adm0_code (text); location_name (text); reference_period_start (text); adm1_code (text); adm1_name (text); adm2_code (text); sector_code (text); adm2_name (text); "
},
{
"table_name" : "hapi_gender",
"summary" : "['Age and Gender Disaggregations']",
"columns" : "code (text); description (text); "
},
{
"table_name" : "hapi_location",
"summary" : "['Locations and Administrative Divisions']",
"columns" : "code (text); name (text); "
},
{
"table_name" : "hapi_org",
"summary" : "['Humanitarian Organizations and Sectors']",
"columns" : "org_type_code (double precision); acronym (text); name (text); org_type_description (text); "
},
{
"table_name" : "hapi_org_type",
"summary" : "['Humanitarian Organizations and Sectors']",
"columns" : "code (bigint); description (text); "
},
{
"table_name" : "hapi_population_group",
"summary" : "['Population Groups and Statuses']",
"columns" : "code (text); description (text); "
},
{
"table_name" : "hapi_population_status",
"summary" : "['Population Groups and Statuses']",
"columns" : "code (text); description (text); "
},
{
"table_name" : "hapi_resource",
"summary" : "['HDX Metadata']",
"columns" : "is_hxl (boolean); name (text); format (text); update_date (text); download_url (text); dataset_hdx_id (text); dataset_hdx_stub (text); dataset_title (text); dataset_hdx_provider_stub (text); dataset_hdx_provider_name (text); hdx_link (text); hdx_api_link (text); dataset_hdx_link (text); hdx_id (text); dataset_hdx_api_link (text); "
},
{
"table_name" : "hapi_food_security",
"summary" : "['Food Security']",
"columns" : "population_in_phase (bigint); population_fraction_in_phase (double precision); ipc_phase_code (text); ipc_phase_name (text); ipc_type_code (text); reference_period_start (text); reference_period_end (text); dataset_hdx_stub (text); dataset_hdx_provider_stub (text); resource_hdx_id (text); adm0_code (text); location_name (text); adm1_code (text); adm1_name (text); adm2_code (text); adm2_name (text); "
},
{
"table_name" : "hapi_humanitarian_needs",
"summary" : "['Humanitarian Needs']",
"columns" : "population (bigint); age_range_code (text); disabled_marker (text); sector_code (text); sector_name (text); population_status_code (text); population_group_code (text); reference_period_start (text); reference_period_end (text); dataset_hdx_stub (text); dataset_hdx_provider_stub (text); resource_hdx_id (text); adm0_code (text); location_name (text); adm1_code (text); adm1_name (text); adm2_code (text); gender_code (text); adm2_name (text); "
},
{
"table_name" : "hapi_national_risk",
"summary" : "['National Risk']",
"columns" : "risk_class (bigint); global_rank (bigint); overall_risk (double precision); hazard_exposure_risk (double precision); vulnerability_risk (double precision); coping_capacity_risk (double precision); meta_missing_indicators_pct (double precision); meta_avg_recentness_years (double precision); reference_period_start (text); reference_period_end (text); dataset_hdx_stub (text); dataset_hdx_provider_stub (text); resource_hdx_id (text); adm0_code (text); location_name (text); "
},
{
"table_name" : "hapi_population",
"summary" : "['Baseline Population']",
"columns" : "population (bigint); age_range_code (text); reference_period_start (text); reference_period_end (text); dataset_hdx_stub (text); resource_hdx_id (text); adm0_code (text); location_name (text); adm1_code (text); adm1_name (text); adm2_code (text); gender_code (text); adm2_name (text); "
},
{
"table_name" : "hdx_shape_files",
"summary" : "HDX Shape Files",
"columns" : "geometry (USER-DEFINED); OBJECTID (double precision); AREA_SQKM (double precision); Shape_Area (double precision); Shape_Leng (double precision); ADM1ALT2FR (text); ADM0_FR (text); adm0_code (text); date (text); validOn (text); validTo (text); ADM2_FR (text); adm2_code (text); ADM2_REF (text); ADM2ALT1FR (text); ADM2ALT2FR (text); ADM1_EN (text); ADM1ALT1EN (text); ADM1ALT2EN (text); ADM0_EN (text); ADM2_EN (text); ADM2ALT1EN (text); ADM2ALT2EN (text); ADM1_ES (text); ADM1ALT1ES (text); ADM1ALT2ES (text); ADM0_ES (text); ADM2_ES (text); ADM2ALT1ES (text); ADM2ALT2ES (text); ValidTo (text); ADM1_HT (text); ADM1ALT1HT (text); ADM1ALT2HT (text); ADM0_HT (text); ADM2_HT (text); ADM2ALT1HT (text); ADM2ALT2HT (text); ADM1_MY (text); ADM1_ALTPC (text); ADM0_MY (text); ADM2_MY (text); ADM1_PT (text); ADM1ALT1PT (text); ADM1ALT2PT (text); ADM0_PT (text); ADM2_PT (text); ADM2ALT1PT (text); ADM2ALT2PT (text); SD_EN (text); SD_PCODE (text); ADM1_AR (text); ADM1ALT1AR (text); ADM1ALT2AR (text); ADM0_AR (text); ADM2_AR (text); ADM2ALT1AR (text); ADM2ALT2AR (text); admin1Name (text); admin1RefN (text); admin1Na_1 (text); admin1AltN (text); admin1Al_1 (text); admin0Name (text); admin2Name (text); admin2RefN (text); admin2Na_1 (text); admin2AltN (text); admin2Al_1 (text); ADM1_UA (text); ADM1_RU (text); ADM0_UA (text); ADM0_RU (text); ADM2_UA (text); ADM2_RU (text); ADM1_FR (text); adm1_code (text); ADM1_REF (text); ADM1ALT1FR (text); "
}
]}
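For reference, the kind of code this prompt asks a copilot to produce might look like the hedged sketch below (psycopg2 and pandas assumed; `MLI` assumed to be Mali's adm0_code; the helper name is illustrative):

```
import os

import pandas as pd
import psycopg2
from dotenv import load_dotenv

load_dotenv()

conn = psycopg2.connect(
    host=os.getenv("POSTGRES_DATA_HOST"),
    port=os.getenv("POSTGRES_DATA_PORT"),
    dbname=os.getenv("POSTGRES_DATA_DB"),
    user=os.getenv("POSTGRES_DATA_USER"),
    password=os.getenv("POSTGRES_DATA_PASSWORD"),
)


def orgs_by_sector(adm0_code):
    # Count distinct organizations active in a country, grouped by sector
    sql = """
        SELECT t.sector_name, COUNT(DISTINCT t.org_name) AS org_count
        FROM hapi_3w t
        WHERE t.adm0_code = %s
        GROUP BY t.sector_name
        ORDER BY org_count DESC
    """
    print(sql)
    df = pd.read_sql(sql, conn, params=(adm0_code,))
    print(f"Returned {len(df)} rows")
    return df


print(orgs_by_sector("MLI"))
```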

