From c5c2225a8852e80656d645b2ac1385a32ede519a Mon Sep 17 00:00:00 2001
From: Matthew Harris
Date: Mon, 13 May 2024 10:37:07 -0400
Subject: [PATCH 01/10] Cycling API Keys

---
 ui/recipes-chat/librechat.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ui/recipes-chat/librechat.yaml b/ui/recipes-chat/librechat.yaml
index a6834a20..6be20eb5 100644
--- a/ui/recipes-chat/librechat.yaml
+++ b/ui/recipes-chat/librechat.yaml
@@ -40,7 +40,7 @@ endpoints:
       assistants: true
       groups:
       - group: "region-eastus2"
-        apiKey: 21e38dd24f114ef7b7ea8cd96112603a
+        apiKey: ${AZURE_API_KEY_ENV}
         instanceName: "dkopenai2"
         version: "2024-02-15-preview"
         assistants: true

From cd712a0903d6af2fe9092e235c7482993a94f485 Mon Sep 17 00:00:00 2001
From: Matthew Harris
Date: Mon, 13 May 2024 10:39:17 -0400
Subject: [PATCH 02/10] Cycling API Keys

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 3e4f4645..fe763c68 100644
--- a/README.md
+++ b/README.md
@@ -149,7 +149,7 @@ We will add more details here soon, for now, here are some notes on Azure ...

 ## Deploying to Azure

-A deployment script './deployment/deploy_azure.py' is provided to deploy to an Azure Multicontainer web app you have set up with [these instructions](https://learn.microsoft.com/en-us/azure/app-service/tutorial-multi-container-app). Note: This is for demo purposes only, as Multicontainer web app are still in Public Preview.
+A deployment script './deployment/deploy_azure.py' is provided to deploy to an Azure Multicontainer web app you have set up with [these instructions](https://learn.microsoft.com/en-us/azure/app-service/tutorial-multi-container-app). The script is run from the top directory. Note: This is for demo purposes only, as Multicontainer web apps are still in Public Preview.

 To run the deployment ...
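For context on the key cycling above: `${AZURE_API_KEY_ENV}` is resolved from the environment at startup, so the key lives in the local `.env` file rather than in version control. A minimal sketch of reading the same variable from Python with `python-dotenv`, the pattern the next patch adopts in `deploy_azure.py` (the error message is illustrative):

```python
# Sketch: resolve the Azure key from the environment instead of hardcoding it.
# Assumes AZURE_API_KEY_ENV is defined in a local .env file.
import os

from dotenv import load_dotenv

load_dotenv()  # read key=value pairs from ./.env into the process environment

api_key = os.getenv("AZURE_API_KEY_ENV")
if not api_key:
    raise RuntimeError("AZURE_API_KEY_ENV is not set; add it to your .env file")
```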
From 2fb2a80697dd677c7c5d0129c126d42f1f6c85b0 Mon Sep 17 00:00:00 2001
From: Matthew Harris
Date: Mon, 13 May 2024 11:03:05 -0400
Subject: [PATCH 03/10] Tweaks to deploy

---
 .env.example               |  2 +-
 deployment/deploy_azure.py | 18 +++++++++++++++---
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/.env.example b/.env.example
index 095d1878..bddf1346 100644
--- a/.env.example
+++ b/.env.example
@@ -63,7 +63,7 @@ IMAGE_HOST=http://localhost:3080/images
 #               Deployment to Azure                #
 #==================================================#
 AZURE_CONTAINER_REGISTRY=
-AZURE_CONTAINER_REGISTRY_USERNAME=d
+AZURE_CONTAINER_REGISTRY_REPO=

 #==================================================#
 #                   API Settings                   #

diff --git a/deployment/deploy_azure.py b/deployment/deploy_azure.py
index 56331941..f05954a3 100644
--- a/deployment/deploy_azure.py
+++ b/deployment/deploy_azure.py
@@ -9,14 +9,28 @@
 #

 import os
+import sys

 import docker
+from dotenv import load_dotenv

-client = docker.from_env()
+load_dotenv()

 container_registry = os.getenv("AZURE_CONTAINER_REGISTRY")
 repo = os.getenv("AZURE_CONTAINER_REGISTRY_REPO")

+# Script is run from top directory
+docker_compose_file = "./deployment/docker-compose-deploy.yml"
+azure_platform = "linux/amd64"
+
+if sys.platform == "darwin":
+    print("Running on Mac")
+    client = docker.DockerClient(
+        base_url="unix:///Users/matthewharris/.docker/run/docker.sock"
+    )
+else:
+    client = docker.from_env()
+

 def run_cmd(cmd):
     """
@@ -69,8 +83,6 @@ def deploy():
             "code-interpreter",
         ],
     }
-    docker_compose_file = "docker-compose-deploy.yml"
-    azure_platform = "linux/amd64"

     run_cmd("az login")
     run_cmd(f"az acr login --name {container_registry}")

From bf957808c7548029d10c584c6f01ae089b915562 Mon Sep 17 00:00:00 2001
From: Matthew Harris
Date: Mon, 13 May 2024 11:49:04 -0400
Subject: [PATCH 04/10] A few notes on credentials generation for Azure deployment

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index fe763c68..191035f1 100644
--- a/README.md
+++ b/README.md
@@ -165,6 +165,8 @@ Note:

 :warning: *This is very much a work in progress, deployment will be automated with fewer compose files soon*

+You will need to set key environment variables; see your local `.env` for examples. The exceptions are the tokens needed for authentication: do not use the defaults for these. You can generate them on [this page](https://www.librechat.ai/toolkit/creds_generator).
+
 ## Databases

 When running in Azure it is useful to use remote databases, at least for the mongodb instance so that user logins are retained with each release. For example, a database can be configured by following [these instructions](https://docs.librechat.ai/install/configuration/mongodb.html). If doing this, then docker-compose-azure.yml in Azure can have the mongo DB section removed, and any instance of the Mongo URL used by other containers updated with the cloud connection string accordingly.
\ No newline at end of file From 7537dec318689db189184d54ad033c30bf0f3f48 Mon Sep 17 00:00:00 2001 From: Matthew Harris Date: Mon, 13 May 2024 15:34:28 -0400 Subject: [PATCH 05/10] Deployment script names were misaligned after housekeeping move --- deployment/deploy_azure.py | 8 +- docker-compose-deploy.yml | 196 +++++++++++++++++++++++++++++++++++++ ui/recipes-chat/Dockerfile | 6 +- 3 files changed, 203 insertions(+), 7 deletions(-) create mode 100644 docker-compose-deploy.yml diff --git a/deployment/deploy_azure.py b/deployment/deploy_azure.py index f05954a3..84125b41 100644 --- a/deployment/deploy_azure.py +++ b/deployment/deploy_azure.py @@ -20,7 +20,7 @@ repo = os.getenv("AZURE_CONTAINER_REGISTRY_REPO") # Script is run from top directory -docker_compose_file = "./deployment/docker-compose-deploy.yml" +docker_compose_file = "docker-compose-deploy.yml" azure_platform = "linux/amd64" if sys.platform == "darwin": @@ -66,19 +66,19 @@ def deploy(): should be defined before calling this function. """ tags = { - "humanitarian_ai_assistant-api": [f"{container_registry}/{repo}", "api"], + "data-recipes-ai-api": [f"{container_registry}/{repo}", "api"], "getmeili/meilisearch:v1.7.3": [f"{container_registry}/{repo}", "meilisearch"], "ghcr.io/danny-avila/librechat-rag-api-dev-lite:latest": [ f"{container_registry}/{repo}", "rag_api", ], "ankane/pgvector:latest": [f"{container_registry}/{repo}", "docsdb"], - "humanitarian_ai_assistant-actions": [ + "data-recipes-ai-actions": [ f"{container_registry}/{repo}", "actions", ], "busybox": [f"{container_registry}/{repo}", "init"], - "humanitarian_ai_assistant-code-interpreter": [ + "data-recipes-ai-code-interpreter": [ f"{container_registry}/{repo}", "code-interpreter", ], diff --git a/docker-compose-deploy.yml b/docker-compose-deploy.yml new file mode 100644 index 00000000..c9df0e30 --- /dev/null +++ b/docker-compose-deploy.yml @@ -0,0 +1,196 @@ +version: "3.4" + + +services: + api: + platform: linux/amd64 + container_name: haa-libre-chat + # Have to build a dockerfile as Azure multicontainer apps don't support bind mounts + #image: ghcr.io/danny-avila/librechat:v0.7.0 + #image: ghcr.io/danny-avila/librechat-dev:latest + build: + context: . 
+ dockerfile: ./ui/recipes-chat/Dockerfile + ports: + #- "${PORT}:${PORT}" + - 8080:8080 + # DK Added + - 3080:3080 + depends_on: + #- mongodb + - rag-api + restart: always + user: "${UID}:${GID}" + extra_hosts: + - "host.docker.internal:host-gateway" + environment: + HOST: 0.0.0.0 + MONGO_URI: mongodb://mongodb:27017/LibreChat + MEILI_HOST: http://meilisearch:7700 + RAG_PORT: ${RAG_PORT:-8000} + RAG_API_URL: http://rag-api:${RAG_PORT:-8000} + volumes: + - shared-data:/app/client/public/images + #- type: bind + # source: ./.env + # target: /app/.env + #- ./ui/recipes-chat/images:/app/client/public/images + #- ./ui/recipes-chat/logs:/app/api/logs + #- ./ui/recipes-chat/tools:/app/api/app/clients/tools + #- type: bind + # source: ./ui/recipes-chat/librechat.yaml + # target: /app/librechat.yaml + #mongodb: + # platform: linux/amd64 + # container_name: haa-libre-chat-mongodb + # image: mongo:4.4.6 + # restart: always + # user: "${UID}:${GID}" + # #volumes: + # # - ./ui/recipes-chat/data-node:/data/db + # command: mongod --noauth + meilisearch: + platform: linux/amd64 + container_name: haa-libre-chat-meilisearch + image: getmeili/meilisearch:v1.7.3 + restart: always + user: "${UID}:${GID}" + environment: + MEILI_HOST: http://meilisearch:7700 + MEILI_NO_ANALYTICS: true + #volumes: + # - ./ui/recipes-chat/meili_data_v1.7:/meili_data + rag-api: + platform: linux/amd64 + image: ghcr.io/danny-avila/librechat-rag-api-dev-lite:latest + container_name: haa-libre-chat-rag-api + environment: + DB_HOST: docsdb + POSTGRES_DB: docs + POSTGRES_USER: ${POSTGRES_DATA_USER} + POSTGRES_PASSWORD: ${POSTGRES_DATA_PASSWORD} + RAG_PORT: 8000 + RAG_AZURE_OPENAI_API_KEY: ${AZURE_API_KEY_ENV} + EMBEDDINGS_PROVIDER: azure + EMBEDDINGS_MODEL: text-embedding-ada-002 + AZURE_OPENAI_ENDPOINT: https://dkopenai2.openai.azure.com/ + DEBUG_RAG_API: "True" + restart: always + depends_on: + - docsdb + env_file: + - .env + # Using a persistent postgres DB so we can access it + #datadb: + # platform: linux/amd64 + # image: postgis/postgis:12-3.4 + # container_name: haa-datadb + # environment: + # POSTGRES_DB: ${POSTGRES_DATA_DB} + # POSTGRES_USER: ${POSTGRES_DATA_USER} + # POSTGRES_PASSWORD: ${POSTGRES_DATA_PASSWORD} + # restart: always + # ports: + # - 5433:5432 + # #volumes: + # # - ./ui/recipes-chat/datadb:/var/lib/postgresql/data + # env_file: + # - .env + docsdb: + platform: linux/amd64 + image: ankane/pgvector:latest + container_name: haa-docsdb + environment: + POSTGRES_DB: docs + POSTGRES_USER: ${POSTGRES_DATA_USER} + POSTGRES_PASSWORD: ${POSTGRES_DATA_PASSWORD} + restart: always + #ports: + # - 5434:5432 + # #volumes: + # # - ./ui/recipes-chat/docsdb:/var/lib/postgresql/data + env_file: + - .env + #recipedb: + # platform: linux/amd64 + # image: ankane/pgvector:latest + # container_name: haa-datarecipesdb + # environment: + # POSTGRES_DB: ${POSTGRES_RECIPE_DB} + # POSTGRES_USER: ${POSTGRES_RECIPE_USER} + # POSTGRES_PASSWORD: ${POSTGRES_RECIPE_PASSWORD} + # restart: always + # ports: + # - 5435:5432 + # #volumes: + # # - ./actions/actions_plugins/recipe-server/db/:/docker-entrypoint-initdb.d + # # - ./ui/recipes-chat/recipesdb:/var/lib/postgresql/data + # env_file: + # - .env + actions: + platform: linux/amd64 + container_name: haa-robo-actions + user: "1000:1000" + build: + context: . + dockerfile: ./actions/Dockerfile + args: + # Assuming Azure deployment uses hosted postgres rather than Docker. 
+ DATA_DB_CONN_STRING: ${REMOTE_DB_CONN_STRING} + ports: + # API + - 3001:8080 + # Action server portal + - 4001:8087 + environment: + OPENAI_API_TYPE: azure + OPENAI_API_ENDPOINT: https://dkopenai2.openai.azure.com/ + OPENAI_API_VERSION_MEMORY: 2024-02-15-preview + BASE_URL_MEMORY: https://dkopenai2.openai.azure.com/ + MODEL_MEMORY: gpt-4-turbo + OPENAI_TEXT_COMPLETION_DEPLOYMENT_NAME: text-embedding-ada-002 + env_file: + - .env + volumes: + - shared-data:/action-server/actions/actions_plugins/recipe-server/images + # Init container + init: + image: busybox + platform: linux/amd64 + container_name: haa-init + volumes: + - shared-data:/data + command: "sh -c 'chown -R 1000:1000 /data && chmod -R 775 /data'" + user: "root" + depends_on: + - actions + #ingestion: + # platform: linux/amd64 + # container_name: haa-ingestion + # build: + # context: . + # dockerfile: ./ingestion/Dockerfile + # #depends_on: + # # - datadb + # restart: always + # env_file: + # - .env + # volumes: + # - type: bind + # source: ./ingestion + # target: /app + code-interpreter: + #image: ghcr.io/iamgreggarcia/codesphera:latest + platform: linux/amd64 + container_name: haa-code-interpreter + build: + context: . + dockerfile: ./code-interpreter/Dockerfile + ports: + - "3333:3333" + #volumes: + # - ./code-interpreter/static:/app/static + +volumes: + pgdata2: + shared-data: \ No newline at end of file diff --git a/ui/recipes-chat/Dockerfile b/ui/recipes-chat/Dockerfile index f35fe33e..ec8c78e5 100644 --- a/ui/recipes-chat/Dockerfile +++ b/ui/recipes-chat/Dockerfile @@ -1,7 +1,7 @@ #FROM ghcr.io/danny-avila/librechat -FROM ghcr.io/danny-avila/librechat-dev:latest -#FROM ghcr.io/danny-avila/librechat:v0.7.0 -RUN echo "Hi!" +#FROM ghcr.io/danny-avila/librechat-dev:latest +FROM ghcr.io/danny-avila/librechat:v0.7.0 +RUN echo "Hi! " COPY .env /app/.env COPY ./ui/recipes-chat/librechat.yaml /app/librechat.yaml COPY ./ui/recipes-chat/tools /app/api/app/clients/tools From 5f95acc8b2b65379ef4299844cf933d90f69ad55 Mon Sep 17 00:00:00 2001 From: Matthew Harris Date: Mon, 13 May 2024 16:19:02 -0400 Subject: [PATCH 06/10] Renaming repo affected deployment --- deployment/docker-compose-deploy.yml | 196 --------------------------- docker-compose-deploy.yml | 5 +- docker-compose.yml | 3 - ui/recipes-chat/Dockerfile | 4 +- 4 files changed, 4 insertions(+), 204 deletions(-) delete mode 100644 deployment/docker-compose-deploy.yml diff --git a/deployment/docker-compose-deploy.yml b/deployment/docker-compose-deploy.yml deleted file mode 100644 index c9df0e30..00000000 --- a/deployment/docker-compose-deploy.yml +++ /dev/null @@ -1,196 +0,0 @@ -version: "3.4" - - -services: - api: - platform: linux/amd64 - container_name: haa-libre-chat - # Have to build a dockerfile as Azure multicontainer apps don't support bind mounts - #image: ghcr.io/danny-avila/librechat:v0.7.0 - #image: ghcr.io/danny-avila/librechat-dev:latest - build: - context: . 
- dockerfile: ./ui/recipes-chat/Dockerfile - ports: - #- "${PORT}:${PORT}" - - 8080:8080 - # DK Added - - 3080:3080 - depends_on: - #- mongodb - - rag-api - restart: always - user: "${UID}:${GID}" - extra_hosts: - - "host.docker.internal:host-gateway" - environment: - HOST: 0.0.0.0 - MONGO_URI: mongodb://mongodb:27017/LibreChat - MEILI_HOST: http://meilisearch:7700 - RAG_PORT: ${RAG_PORT:-8000} - RAG_API_URL: http://rag-api:${RAG_PORT:-8000} - volumes: - - shared-data:/app/client/public/images - #- type: bind - # source: ./.env - # target: /app/.env - #- ./ui/recipes-chat/images:/app/client/public/images - #- ./ui/recipes-chat/logs:/app/api/logs - #- ./ui/recipes-chat/tools:/app/api/app/clients/tools - #- type: bind - # source: ./ui/recipes-chat/librechat.yaml - # target: /app/librechat.yaml - #mongodb: - # platform: linux/amd64 - # container_name: haa-libre-chat-mongodb - # image: mongo:4.4.6 - # restart: always - # user: "${UID}:${GID}" - # #volumes: - # # - ./ui/recipes-chat/data-node:/data/db - # command: mongod --noauth - meilisearch: - platform: linux/amd64 - container_name: haa-libre-chat-meilisearch - image: getmeili/meilisearch:v1.7.3 - restart: always - user: "${UID}:${GID}" - environment: - MEILI_HOST: http://meilisearch:7700 - MEILI_NO_ANALYTICS: true - #volumes: - # - ./ui/recipes-chat/meili_data_v1.7:/meili_data - rag-api: - platform: linux/amd64 - image: ghcr.io/danny-avila/librechat-rag-api-dev-lite:latest - container_name: haa-libre-chat-rag-api - environment: - DB_HOST: docsdb - POSTGRES_DB: docs - POSTGRES_USER: ${POSTGRES_DATA_USER} - POSTGRES_PASSWORD: ${POSTGRES_DATA_PASSWORD} - RAG_PORT: 8000 - RAG_AZURE_OPENAI_API_KEY: ${AZURE_API_KEY_ENV} - EMBEDDINGS_PROVIDER: azure - EMBEDDINGS_MODEL: text-embedding-ada-002 - AZURE_OPENAI_ENDPOINT: https://dkopenai2.openai.azure.com/ - DEBUG_RAG_API: "True" - restart: always - depends_on: - - docsdb - env_file: - - .env - # Using a persistent postgres DB so we can access it - #datadb: - # platform: linux/amd64 - # image: postgis/postgis:12-3.4 - # container_name: haa-datadb - # environment: - # POSTGRES_DB: ${POSTGRES_DATA_DB} - # POSTGRES_USER: ${POSTGRES_DATA_USER} - # POSTGRES_PASSWORD: ${POSTGRES_DATA_PASSWORD} - # restart: always - # ports: - # - 5433:5432 - # #volumes: - # # - ./ui/recipes-chat/datadb:/var/lib/postgresql/data - # env_file: - # - .env - docsdb: - platform: linux/amd64 - image: ankane/pgvector:latest - container_name: haa-docsdb - environment: - POSTGRES_DB: docs - POSTGRES_USER: ${POSTGRES_DATA_USER} - POSTGRES_PASSWORD: ${POSTGRES_DATA_PASSWORD} - restart: always - #ports: - # - 5434:5432 - # #volumes: - # # - ./ui/recipes-chat/docsdb:/var/lib/postgresql/data - env_file: - - .env - #recipedb: - # platform: linux/amd64 - # image: ankane/pgvector:latest - # container_name: haa-datarecipesdb - # environment: - # POSTGRES_DB: ${POSTGRES_RECIPE_DB} - # POSTGRES_USER: ${POSTGRES_RECIPE_USER} - # POSTGRES_PASSWORD: ${POSTGRES_RECIPE_PASSWORD} - # restart: always - # ports: - # - 5435:5432 - # #volumes: - # # - ./actions/actions_plugins/recipe-server/db/:/docker-entrypoint-initdb.d - # # - ./ui/recipes-chat/recipesdb:/var/lib/postgresql/data - # env_file: - # - .env - actions: - platform: linux/amd64 - container_name: haa-robo-actions - user: "1000:1000" - build: - context: . - dockerfile: ./actions/Dockerfile - args: - # Assuming Azure deployment uses hosted postgres rather than Docker. 
- DATA_DB_CONN_STRING: ${REMOTE_DB_CONN_STRING} - ports: - # API - - 3001:8080 - # Action server portal - - 4001:8087 - environment: - OPENAI_API_TYPE: azure - OPENAI_API_ENDPOINT: https://dkopenai2.openai.azure.com/ - OPENAI_API_VERSION_MEMORY: 2024-02-15-preview - BASE_URL_MEMORY: https://dkopenai2.openai.azure.com/ - MODEL_MEMORY: gpt-4-turbo - OPENAI_TEXT_COMPLETION_DEPLOYMENT_NAME: text-embedding-ada-002 - env_file: - - .env - volumes: - - shared-data:/action-server/actions/actions_plugins/recipe-server/images - # Init container - init: - image: busybox - platform: linux/amd64 - container_name: haa-init - volumes: - - shared-data:/data - command: "sh -c 'chown -R 1000:1000 /data && chmod -R 775 /data'" - user: "root" - depends_on: - - actions - #ingestion: - # platform: linux/amd64 - # container_name: haa-ingestion - # build: - # context: . - # dockerfile: ./ingestion/Dockerfile - # #depends_on: - # # - datadb - # restart: always - # env_file: - # - .env - # volumes: - # - type: bind - # source: ./ingestion - # target: /app - code-interpreter: - #image: ghcr.io/iamgreggarcia/codesphera:latest - platform: linux/amd64 - container_name: haa-code-interpreter - build: - context: . - dockerfile: ./code-interpreter/Dockerfile - ports: - - "3333:3333" - #volumes: - # - ./code-interpreter/static:/app/static - -volumes: - pgdata2: - shared-data: \ No newline at end of file diff --git a/docker-compose-deploy.yml b/docker-compose-deploy.yml index c9df0e30..68c805e8 100644 --- a/docker-compose-deploy.yml +++ b/docker-compose-deploy.yml @@ -5,9 +5,6 @@ services: api: platform: linux/amd64 container_name: haa-libre-chat - # Have to build a dockerfile as Azure multicontainer apps don't support bind mounts - #image: ghcr.io/danny-avila/librechat:v0.7.0 - #image: ghcr.io/danny-avila/librechat-dev:latest build: context: . dockerfile: ./ui/recipes-chat/Dockerfile @@ -19,6 +16,8 @@ services: depends_on: #- mongodb - rag-api + env_file: + - .env restart: always user: "${UID}:${GID}" extra_hosts: diff --git a/docker-compose.yml b/docker-compose.yml index 59f18642..cd6b6490 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -3,9 +3,6 @@ version: "3.4" services: api: container_name: haa-libre-chat - # Have to build a dockerfile as Azure multicontainer apps don't support bind mounts - #image: ghcr.io/danny-avila/librechat:v0.7.0 - #image: ghcr.io/danny-avila/librechat-dev:latest build: context: . dockerfile: ./ui/recipes-chat/Dockerfile diff --git a/ui/recipes-chat/Dockerfile b/ui/recipes-chat/Dockerfile index ec8c78e5..f0792daf 100644 --- a/ui/recipes-chat/Dockerfile +++ b/ui/recipes-chat/Dockerfile @@ -1,6 +1,6 @@ #FROM ghcr.io/danny-avila/librechat -#FROM ghcr.io/danny-avila/librechat-dev:latest -FROM ghcr.io/danny-avila/librechat:v0.7.0 +FROM ghcr.io/danny-avila/librechat-dev:latest +#FROM ghcr.io/danny-avila/librechat:v0.7.0 RUN echo "Hi! 
" COPY .env /app/.env COPY ./ui/recipes-chat/librechat.yaml /app/librechat.yaml From 70bc542ddaf0bf36769a9a29302655a7ef917ad7 Mon Sep 17 00:00:00 2001 From: Matthew Harris Date: Mon, 13 May 2024 20:09:14 -0400 Subject: [PATCH 07/10] Moved post processing functions out of core code and added flag in HAP for latest ref period by HDX ID --- ingestion/api/hapi_utils.py | 63 +++++++++++++++++++++++++++++++++++++ ingestion/ingest.py | 51 +++++++----------------------- 2 files changed, 74 insertions(+), 40 deletions(-) create mode 100644 ingestion/api/hapi_utils.py diff --git a/ingestion/api/hapi_utils.py b/ingestion/api/hapi_utils.py new file mode 100644 index 00000000..1656d84b --- /dev/null +++ b/ingestion/api/hapi_utils.py @@ -0,0 +1,63 @@ +import sys + +import pandas as pd + + +def filter_hapi_df(df, admin0_code_field): + """ + Filter a pandas DataFrame by removing columns where all values are null and removing rows where any value is null. + Hack to get around the fact HDX mixes total values in with disaggregated values in the API + + Args: + df (pandas.DataFrame): The DataFrame to be filtered. + admin0_code_field (str): The name of the column containing the admin0 code. + + Returns: + pandas.DataFrame: The filtered DataFrame. + """ + df_orig = df.copy() + + if df.shape[0] == 0: + return df_orig + + dfs = [] + if admin0_code_field in df.columns: + for country in df[admin0_code_field].unique(): + df2 = df.copy() + df2 = df2[df2[admin0_code_field] == country] + + # Remove any columns where all null + df2 = df2.dropna(axis=1, how="all") + + # Remove any rows where one of the values is null + df2 = df2.dropna(axis=0, how="any") + + dfs.append(df.iloc[df2.index]) + + df = pd.concat(dfs) + + return df + + +def post_process_data(df, standard_names): + """ + Post-processes the data by filtering and renaming columns. + + Args: + df (pandas.DataFrame): The DataFrame to be post-processed. + + Returns: + pandas.DataFrame: The post-processed DataFrame. 
+ """ + # aggregate and disaggregated data in the same tables, where the hierarchy differs by country + df = filter_hapi_df(df, standard_names["admin0_code_field"]) + + # Add a flag to indicate latest dataset by HDX ID, useful for LLM queries + if "resource_hdx_id" in df.columns: + df["latest"] = 0 + df["reference_period_start"] = pd.to_datetime(df["reference_period_start"]) + df["latest"] = df.groupby("dataset_hdx_stub")[ + "reference_period_start" + ].transform(lambda x: x == x.max()) + + return df diff --git a/ingestion/ingest.py b/ingestion/ingest.py index 2cef7bbf..935ebdb9 100644 --- a/ingestion/ingest.py +++ b/ingestion/ingest.py @@ -246,10 +246,17 @@ def process_openapi_data(api_name, files_dir, field_map, standard_names): filename = f"{files_dir}/{f}" df = pd.read_csv(filename) df = map_field_names(df, field_map) - # TODO: This is a temporary workaround to account for HAPI having - # aggregate and disaggregated data in the same tables, where the hierarchy differs by country - if api_name == "hapi": - df = filter_hdx_df(df, standard_names["admin0_code_field"]) + + # Import API-specific processing functions + import_str = f"from api.{api_name}_utils import post_process_data" + print(f"Processing {filename} with {import_str}") + exec(import_str) + post_process_str = "post_process_data(df, standard_names)" + print("Post processing with", post_process_str) + print(" Before shape", df.shape) + df = eval(post_process_str) + print(" After shape", df.shape) + df.to_csv(filename, index=False) @@ -368,42 +375,6 @@ def map_field_names(df, field_map): return df -def filter_hdx_df(df, admin0_code_field): - """ - Filter a pandas DataFrame by removing columns where all values are null and removing rows where any value is null. - Hack to get around the fact HDX mixes total values in with disaggregated values in the API - - Args: - df (pandas.DataFrame): The DataFrame to be filtered. - admin0_code_field (str): The name of the column containing the admin0 code. - - Returns: - pandas.DataFrame: The filtered DataFrame. 
- """ - df_orig = df.copy() - - if df.shape[0] == 0: - return df_orig - - dfs = [] - if admin0_code_field in df.columns: - for country in df[admin0_code_field].unique(): - df2 = df.copy() - df2 = df2[df2[admin0_code_field] == country] - - # Remove any columns where all null - df2 = df2.dropna(axis=1, how="all") - - # Remove any rows where one of the values is null - df2 = df2.dropna(axis=0, how="any") - - dfs.append(df.iloc[df2.index]) - - df = pd.concat(dfs) - - return df - - def main(): apis, field_map, standard_names = read_integration_config(INTEGRATION_CONFIG) conn = connect_to_db() From e528c43d43303c066bee1cb90b2e014933b6e6c5 Mon Sep 17 00:00:00 2001 From: Matthew Harris Date: Mon, 13 May 2024 20:16:40 -0400 Subject: [PATCH 08/10] Renaming --- .github/workflows/test_deploy.yml | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 .github/workflows/test_deploy.yml diff --git a/.github/workflows/test_deploy.yml b/.github/workflows/test_deploy.yml new file mode 100644 index 00000000..16b23cd0 --- /dev/null +++ b/.github/workflows/test_deploy.yml @@ -0,0 +1,24 @@ +name: Run checks on recipes ai repo + +on: + push: + branches: + - main + workflow_dispatch: + +jobs: + build: + runs-on: 'ubuntu-latest' + + steps: + - uses: actions/checkout@v2 + + code-quality-checks: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + with: + python-version: "3.11.4" + - uses: pre-commit/action@v3.0.1 + From 009feea6a5367d5954103c53c98a0e1017f96e1c Mon Sep 17 00:00:00 2001 From: Matthew Harris Date: Mon, 13 May 2024 20:19:59 -0400 Subject: [PATCH 09/10] Renaming --- .../main_ai-assistants-prototypes.yml | 24 ------------------- .github/workflows/test_deploy.yml | 6 +---- 2 files changed, 1 insertion(+), 29 deletions(-) delete mode 100644 .github/workflows/main_ai-assistants-prototypes.yml diff --git a/.github/workflows/main_ai-assistants-prototypes.yml b/.github/workflows/main_ai-assistants-prototypes.yml deleted file mode 100644 index 16b23cd0..00000000 --- a/.github/workflows/main_ai-assistants-prototypes.yml +++ /dev/null @@ -1,24 +0,0 @@ -name: Run checks on recipes ai repo - -on: - push: - branches: - - main - workflow_dispatch: - -jobs: - build: - runs-on: 'ubuntu-latest' - - steps: - - uses: actions/checkout@v2 - - code-quality-checks: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 - with: - python-version: "3.11.4" - - uses: pre-commit/action@v3.0.1 - diff --git a/.github/workflows/test_deploy.yml b/.github/workflows/test_deploy.yml index 16b23cd0..2f761e1c 100644 --- a/.github/workflows/test_deploy.yml +++ b/.github/workflows/test_deploy.yml @@ -1,10 +1,6 @@ name: Run checks on recipes ai repo -on: - push: - branches: - - main - workflow_dispatch: +on: [push, pull_request] jobs: build: From 42c77a1df97a9c9f22d450af15d4d0f27d413db8 Mon Sep 17 00:00:00 2001 From: Matthew Harris Date: Tue, 14 May 2024 13:55:55 -0400 Subject: [PATCH 10/10] Example prompt for generating recipes with copilot. Incomplete, will revisit. 
---
 recipes-creation/copilot_prompt.txt | 137 ++++++++++++++++++++++++++++
 1 file changed, 137 insertions(+)
 create mode 100644 recipes-creation/copilot_prompt.txt

diff --git a/recipes-creation/copilot_prompt.txt b/recipes-creation/copilot_prompt.txt
new file mode 100644
index 00000000..8f6afc42
--- /dev/null
+++ b/recipes-creation/copilot_prompt.txt
@@ -0,0 +1,137 @@
+Using the database table list below, and the columns provided in each table, generate
+Python that summarizes the following:
+
+"Count of Organizations which are active on the ground in Mali, by sector"
+
+Coding tips ...
+
+The shapefile in the database will need to be converted to a GeoSeries for plotting; here is an example:
+
+```
+# Convert the data into a DataFrame
+df = pd.DataFrame(rows, columns=["adm1_code", "population", "geometry"])
+
+# Convert the 'geometry' column into a GeoSeries
+df['geometry'] = df['geometry'].apply(lambda x: wkb.loads(x, hex=True))
+
+# Convert the DataFrame into a GeoDataFrame
+gdf = gpd.GeoDataFrame(df, geometry='geometry')
+```
+
+Also, please always save any images to a .png file.
+
+Always specify a clear title on any graphs or maps.
+Always add annotations, labels, and units on any graphs/maps.
+You can use any kind of visualization.
+
+IMPORTANT: Generate reusable code by putting it in a function with arguments, and provide an example of how to call it.
+
+Always print any SQL statements and the size of the results returned.
+
+Database connection details are in the following environment variables (saved in the .env file) ...
+
+POSTGRES_DATA_HOST
+POSTGRES_DATA_PORT
+POSTGRES_DATA_DB
+POSTGRES_DATA_USER
+POSTGRES_DATA_PASSWORD
+
+Use the Python dotenv module to load these environment variables.
+
+In SQL queries with more than one table, always use table aliases to avoid ambiguous columns.
+
+Make note of column types; if you are asked to plot the count of something, SUM will not work.
+
+Always use country codes instead of names where possible.
+
+Tables and their columns ...
+ +{ +"select table_name, summary, columns from table_metadata\n": [ + { + "table_name" : "hapi_admin1", + "summary" : "['Locations and Administrative Divisions']", + "columns" : "code (text); name (text); adm0_code (text); location_name (text); " + }, + { + "table_name" : "hapi_admin2", + "summary" : "['Locations and Administrative Divisions']", + "columns" : "code (text); name (text); adm1_code (text); adm1_name (text); adm0_code (text); location_name (text); " + }, + { + "table_name" : "hapi_age_range", + "summary" : "['Age and Gender Disaggregations']", + "columns" : "age_min (bigint); age_max (double precision); code (text); " + }, + { + "table_name" : "hapi_dataset", + "summary" : "['HDX Metadata']", + "columns" : "hdx_id (text); hdx_stub (text); title (text); hdx_provider_stub (text); hdx_provider_name (text); hdx_link (text); hdx_api_link (text); " + }, + { + "table_name" : "hapi_3w", + "summary" : "['3W Operational Presence']", + "columns" : "reference_period_end (double precision); dataset_hdx_stub (text); resource_hdx_id (text); org_acronym (text); org_name (text); sector_name (text); adm0_code (text); location_name (text); reference_period_start (text); adm1_code (text); adm1_name (text); adm2_code (text); sector_code (text); adm2_name (text); " + }, + { + "table_name" : "hapi_gender", + "summary" : "['Age and Gender Disaggregations']", + "columns" : "code (text); description (text); " + }, + { + "table_name" : "hapi_location", + "summary" : "['Locations and Administrative Divisions']", + "columns" : "code (text); name (text); " + }, + { + "table_name" : "hapi_org", + "summary" : "['Humanitarian Organizations and Sectors']", + "columns" : "org_type_code (double precision); acronym (text); name (text); org_type_description (text); " + }, + { + "table_name" : "hapi_org_type", + "summary" : "['Humanitarian Organizations and Sectors']", + "columns" : "code (bigint); description (text); " + }, + { + "table_name" : "hapi_population_group", + "summary" : "['Population Groups and Statuses']", + "columns" : "code (text); description (text); " + }, + { + "table_name" : "hapi_population_status", + "summary" : "['Population Groups and Statuses']", + "columns" : "code (text); description (text); " + }, + { + "table_name" : "hapi_resource", + "summary" : "['HDX Metadata']", + "columns" : "is_hxl (boolean); name (text); format (text); update_date (text); download_url (text); dataset_hdx_id (text); dataset_hdx_stub (text); dataset_title (text); dataset_hdx_provider_stub (text); dataset_hdx_provider_name (text); hdx_link (text); hdx_api_link (text); dataset_hdx_link (text); hdx_id (text); dataset_hdx_api_link (text); " + }, + { + "table_name" : "hapi_food_security", + "summary" : "['Food Security']", + "columns" : "population_in_phase (bigint); population_fraction_in_phase (double precision); ipc_phase_code (text); ipc_phase_name (text); ipc_type_code (text); reference_period_start (text); reference_period_end (text); dataset_hdx_stub (text); dataset_hdx_provider_stub (text); resource_hdx_id (text); adm0_code (text); location_name (text); adm1_code (text); adm1_name (text); adm2_code (text); adm2_name (text); " + }, + { + "table_name" : "hapi_humanitarian_needs", + "summary" : "['Humanitarian Needs']", + "columns" : "population (bigint); age_range_code (text); disabled_marker (text); sector_code (text); sector_name (text); population_status_code (text); population_group_code (text); reference_period_start (text); reference_period_end (text); dataset_hdx_stub (text); dataset_hdx_provider_stub 
(text); resource_hdx_id (text); adm0_code (text); location_name (text); adm1_code (text); adm1_name (text); adm2_code (text); gender_code (text); adm2_name (text); " + }, + { + "table_name" : "hapi_national_risk", + "summary" : "['National Risk']", + "columns" : "risk_class (bigint); global_rank (bigint); overall_risk (double precision); hazard_exposure_risk (double precision); vulnerability_risk (double precision); coping_capacity_risk (double precision); meta_missing_indicators_pct (double precision); meta_avg_recentness_years (double precision); reference_period_start (text); reference_period_end (text); dataset_hdx_stub (text); dataset_hdx_provider_stub (text); resource_hdx_id (text); adm0_code (text); location_name (text); " + }, + { + "table_name" : "hapi_population", + "summary" : "['Baseline Population']", + "columns" : "population (bigint); age_range_code (text); reference_period_start (text); reference_period_end (text); dataset_hdx_stub (text); resource_hdx_id (text); adm0_code (text); location_name (text); adm1_code (text); adm1_name (text); adm2_code (text); gender_code (text); adm2_name (text); " + }, + { + "table_name" : "hdx_shape_files", + "summary" : "HDX Shape Files", + "columns" : "geometry (USER-DEFINED); OBJECTID (double precision); AREA_SQKM (double precision); Shape_Area (double precision); Shape_Leng (double precision); ADM1ALT2FR (text); ADM0_FR (text); adm0_code (text); date (text); validOn (text); validTo (text); ADM2_FR (text); adm2_code (text); ADM2_REF (text); ADM2ALT1FR (text); ADM2ALT2FR (text); ADM1_EN (text); ADM1ALT1EN (text); ADM1ALT2EN (text); ADM0_EN (text); ADM2_EN (text); ADM2ALT1EN (text); ADM2ALT2EN (text); ADM1_ES (text); ADM1ALT1ES (text); ADM1ALT2ES (text); ADM0_ES (text); ADM2_ES (text); ADM2ALT1ES (text); ADM2ALT2ES (text); ValidTo (text); ADM1_HT (text); ADM1ALT1HT (text); ADM1ALT2HT (text); ADM0_HT (text); ADM2_HT (text); ADM2ALT1HT (text); ADM2ALT2HT (text); ADM1_MY (text); ADM1_ALTPC (text); ADM0_MY (text); ADM2_MY (text); ADM1_PT (text); ADM1ALT1PT (text); ADM1ALT2PT (text); ADM0_PT (text); ADM2_PT (text); ADM2ALT1PT (text); ADM2ALT2PT (text); SD_EN (text); SD_PCODE (text); ADM1_AR (text); ADM1ALT1AR (text); ADM1ALT2AR (text); ADM0_AR (text); ADM2_AR (text); ADM2ALT1AR (text); ADM2ALT2AR (text); admin1Name (text); admin1RefN (text); admin1Na_1 (text); admin1AltN (text); admin1Al_1 (text); admin0Name (text); admin2Name (text); admin2RefN (text); admin2Na_1 (text); admin2AltN (text); admin2Al_1 (text); ADM1_UA (text); ADM1_RU (text); ADM0_UA (text); ADM0_RU (text); ADM2_UA (text); ADM2_RU (text); ADM1_FR (text); adm1_code (text); ADM1_REF (text); ADM1ALT1FR (text); " + } +]} + +
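The prompt above asks the generated code to be a reusable, parameterized function that prints its SQL and result sizes. For illustration, here is a sketch of the kind of answer it is meant to elicit for the Mali question, using the `hapi_3w` table from the metadata above. Two assumptions not confirmed by this patch series: `adm0_code` holds ISO3 codes (so `MLI` is Mali), and the `POSTGRES_DATA_*` variables match the local `.env`.

```python
# Sketch only: a reusable query helper in the shape the prompt requests.
import os

import pandas as pd
import psycopg2
from dotenv import load_dotenv

load_dotenv()


def org_count_by_sector(adm0_code):
    """Count distinct organizations active in a country, grouped by sector."""
    conn = psycopg2.connect(
        host=os.getenv("POSTGRES_DATA_HOST"),
        port=os.getenv("POSTGRES_DATA_PORT"),
        dbname=os.getenv("POSTGRES_DATA_DB"),
        user=os.getenv("POSTGRES_DATA_USER"),
        password=os.getenv("POSTGRES_DATA_PASSWORD"),
    )
    # Table alias per the prompt's guidance; COUNT(DISTINCT ...) rather than
    # SUM because org_name is a text column, not a numeric one
    sql = """
        SELECT t.sector_name, COUNT(DISTINCT t.org_name) AS org_count
        FROM hapi_3w t
        WHERE t.adm0_code = %s
        GROUP BY t.sector_name
        ORDER BY org_count DESC
    """
    print(sql)  # the prompt requires printing any SQL statements
    df = pd.read_sql_query(sql, conn, params=(adm0_code,))
    print(f"Query returned {df.shape[0]} rows")
    conn.close()
    return df


# Example call ("MLI" assumed to be Mali's adm0_code)
if __name__ == "__main__":
    print(org_count_by_sector("MLI"))
```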