Skip to content

Commit

Permalink
Add execute_sql_rds.sh script to run SQL using RDS Data API (#1847)
Browse files Browse the repository at this point in the history
### Time to review: __5 mins__

## Changes proposed

- Add `execute_sql_rds.sh` script to run SQL using RDS Data API
- Add some sample sql files that it can use

## Context for reviewers

During development, this can make it easier to run queries against the
dev database.

## Additional information

![Screenshot 2024-04-26 at 10 39
33](https://github.com/HHS/simpler-grants-gov/assets/3811269/38fed12d-ccf1-43ac-8dc5-2f896ed8d0d6)

---------

Co-authored-by: Michael Chouinard <[email protected]>
  • Loading branch information
jamesbursa and chouinar authored Oct 18, 2024
1 parent 9a7cf92 commit 1fa4cef
Show file tree
Hide file tree
Showing 6 changed files with 174 additions and 20 deletions.
128 changes: 128 additions & 0 deletions api/bin/execute_sql_rds.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
#!/usr/bin/env bash
#
# Execute some SQL using the RDS Data API.
#
# Examples:
#   ./execute_sql_rds.sh <sql/table_list.sql
#   ./execute_sql_rds.sh --cluster=api-prod --multiple <sql/select_from_foreign_table.sql
#
# When using --multiple, provide one SQL statement per input line.
#

set -o errexit -o pipefail

PROGRAM_NAME=$(basename "$0")
readonly PROGRAM_NAME

# ANSI color escape codes used by print_log / error messages.
readonly CYAN='\033[96m'
readonly GREEN='\033[92m'
readonly RED='\033[01;31m'
readonly END='\033[0m'

# Default RDS cluster identifier; override with --cluster=NAME.
readonly CLUSTER=api-dev

# Default to single-statement mode. Initialized here so later tests of
# "$multiple" are well-defined even when the --multiple flag is never passed.
multiple=""

USAGE="Usage: $PROGRAM_NAME [OPTION]
  --multiple         one SQL statement per input line (otherwise expects a single multi-line statement)
  --cluster=CLUSTER  target RDS cluster (default $CLUSTER)
"

# Orchestrate one run: parse flags, resolve cluster ARNs, make a scratch
# directory, then execute SQL read from stdin (one statement per line in
# --multiple mode, otherwise a single possibly multi-line statement).
main() {
  cluster="$CLUSTER"
  parse_arguments "$@"
  print_log "using cluster $cluster"
  read_cluster_arns
  create_temporary_directory

  # Note that to use jtbl, it needs to be installed directly
  # by the user with pip - if we wanted it to work with our poetry
  # setup we'd have to run many of these commands via poetry
  if ! command -v jtbl >/dev/null 2>&1
  then
    # %b interprets the escape sequences stored in the color variables; the
    # message is never used as a printf format string (SC2059).
    printf "\n%bjtbl command not found%b - please install before running: https://github.com/kellyjonbrazil/jtbl \n\n" "$RED" "$END"
    exit 1
  fi

  # count numbers the per-statement result files written by execute_statement.
  count=1
  if [[ -n "$multiple" ]]
  then
    # IFS= and -r preserve leading whitespace and backslashes in each SQL
    # line; the || clause still processes a final line without a trailing
    # newline.
    while IFS= read -r line || [[ -n "$line" ]]
    do
      execute_statement "$line"
      count=$((count + 1))
    done
  else
    # Whole stdin is one statement.
    execute_statement "$(cat)"
  fi
}


# Parse command-line flags, setting the "multiple" and "cluster" globals.
# Any unrecognized argument prints usage and aborts with status 1.
parse_arguments() {
  local arg
  for arg in "$@"; do
    case "$arg" in
      --multiple)
        print_log "multiple mode enabled (one statement per input line)"
        multiple=1
        ;;
      --cluster=*)
        cluster="${arg#--cluster=}"
        ;;
      *)
        echo "$USAGE"
        exit 1
        ;;
    esac
  done
}


# Look up the cluster ARN and the master-user secret ARN for $cluster and
# store them in the resource_arn / secret_arn globals used by
# execute_statement. Fails fast when no managed master-user secret exists.
read_cluster_arns() {
  resource_arn=$(aws rds describe-db-clusters --db-cluster-identifier="$cluster" \
    --query='DBClusters[0].DBClusterArn' --output=text)
  secret_arn=$(aws rds describe-db-clusters --db-cluster-identifier="$cluster" \
    --query='DBClusters[0].MasterUserSecret.SecretArn' --output=text)
  # "None" is what --output=text prints for a null JMESPath result, e.g. when
  # the cluster's master password is not managed by RDS. Passing that literal
  # on to rds-data would produce a confusing downstream error.
  if [[ -z "$secret_arn" || "$secret_arn" == "None" ]]; then
    printf "%bno master user secret found for cluster %s%b\n" "$RED" "$cluster" "$END" >&2
    exit 1
  fi
  print_log "database resource $resource_arn"
}


# Create a private per-run directory for result files and store its path in
# the tmp_dir global. mktemp -d creates the leaf with mode 700 and a
# non-predictable suffix, so concurrent runs (or a pre-created hostile path
# in /tmp) cannot collide with us.
create_temporary_directory() {
  local parent="/tmp/execute_sql_rds"
  # -m only applies to the leaf being created; if the parent already exists
  # its mode is left alone, matching the original behavior.
  mkdir -m "u=rwx,g=,o=" -p "$parent"
  tmp_dir=$(mktemp -d "$parent/execute_sql_rds.$(date "+%Y-%m-%d_%H:%M:%S").XXXXXX")
  print_log "temporary directory $tmp_dir"
}


# Run one SQL statement ($1) through the RDS Data API and present the result.
# Reads the resource_arn / secret_arn / tmp_dir / count globals. Row-returning
# statements are rendered as a table on stdout and saved as CSV + JSON under
# $tmp_dir; anything else has its raw API response echoed.
execute_statement() {
  print_log "$1"
  result_path="$tmp_dir/raw_result_$count.json"
  json_result_path="$tmp_dir/result_$count.json"
  csv_result_path="$tmp_dir/result_$count.csv"

  # --continue-after-timeout lets long-running statements finish even if the
  # API call itself times out; JSON record formatting gives us the
  # .formattedRecords field post-processed below.
  aws rds-data execute-statement \
    --resource-arn "$resource_arn" \
    --database "app" \
    --secret-arn "$secret_arn" \
    --sql "$1" \
    --continue-after-timeout \
    --format-records-as JSON \
    >"$result_path"

  # Only row-returning statements include formattedRecords in the response.
  if grep -q formattedRecords "$result_path"
  then
    # Print a pretty table to the user
    jq -r .formattedRecords "$result_path" | jtbl --truncate --markdown
    # Pull the results out and write to a CSV + JSON
    jq -r .formattedRecords "$result_path" | jtbl --csv > "$csv_result_path"
    jq -r .formattedRecords "$result_path" > "$json_result_path"
    print_log "----"
    print_log "Output written to $tmp_dir/"
  else
    cat "$result_path"
  fi
}


# Utility functions

# Log a timestamped, colorized message to stdout.
# Arguments: message text; all arguments are joined by single spaces ("$*").
print_log() {
  # %b interprets the escape sequences held in the color variables, while the
  # message itself goes through %s so user data is never treated as a printf
  # format string (SC2059).
  printf "%b%s %b%s: %b%s\n" "$CYAN" "$(date "+%Y-%m-%d %H:%M:%S")" "$GREEN" "$PROGRAM_NAME" "$END" "$*"
}

# Entry point: forward all command-line arguments to main.
main "$@"
8 changes: 7 additions & 1 deletion api/bin/setup_localstack.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import logging

import boto3
import botocore.client
import botocore.exceptions

Expand Down Expand Up @@ -27,7 +28,12 @@ def does_s3_bucket_exist(s3_client: botocore.client.BaseClient, bucket_name: str

def setup_s3() -> None:
s3_config = S3Config()
s3_client = get_s3_client(s3_config)
# This is only used locally - to avoid any accidental running of commands here
# against a real AWS account (ie. you've authed in your local terminal where you're running this)
# we'll override the access keys explicitly.
s3_client = get_s3_client(
s3_config, boto3.Session(aws_access_key_id="NO_CREDS", aws_secret_access_key="NO_CREDS")
)

if s3_config.s3_opportunity_bucket is None:
raise Exception("S3_OPPORTUNITY_BUCKET env var must be set")
Expand Down
18 changes: 18 additions & 0 deletions api/bin/sql/select_from_foreign_table.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
SELECT * FROM legacy.tforecast ORDER BY created_date DESC LIMIT 8;
SELECT * FROM legacy.tforecast_hist ORDER BY created_date DESC LIMIT 8;
SELECT * FROM legacy.tapplicanttypes_forecast ORDER BY created_date DESC LIMIT 8;
SELECT * FROM legacy.tapplicanttypes_forecast_hist ORDER BY created_date DESC LIMIT 8;
SELECT * FROM legacy.tfundactcat_forecast ORDER BY created_date DESC LIMIT 8;
SELECT * FROM legacy.tfundactcat_forecast_hist ORDER BY created_date DESC LIMIT 8;
SELECT * FROM legacy.tfundinstr_forecast ORDER BY created_date DESC LIMIT 8;
SELECT * FROM legacy.tfundinstr_forecast_hist ORDER BY created_date DESC LIMIT 8;
SELECT * FROM legacy.topportunity ORDER BY created_date DESC LIMIT 8;
SELECT * FROM legacy.topportunity_cfda ORDER BY created_date DESC LIMIT 8;
SELECT * FROM legacy.tsynopsis ORDER BY created_date DESC LIMIT 8;
SELECT * FROM legacy.tsynopsis_hist ORDER BY created_date DESC LIMIT 8;
SELECT * FROM legacy.tapplicanttypes_synopsis ORDER BY created_date DESC LIMIT 8;
SELECT * FROM legacy.tapplicanttypes_synopsis_hist ORDER BY created_date DESC LIMIT 8;
SELECT * FROM legacy.tfundactcat_synopsis ORDER BY created_date DESC LIMIT 8;
SELECT * FROM legacy.tfundactcat_synopsis_hist ORDER BY created_date DESC LIMIT 8;
SELECT * FROM legacy.tfundinstr_synopsis ORDER BY created_date DESC LIMIT 8;
SELECT * FROM legacy.tfundinstr_synopsis_hist ORDER BY created_date DESC LIMIT 8;
15 changes: 15 additions & 0 deletions api/bin/sql/table_list.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
-- List all non-system relations (tables, foreign tables, views, materialized
-- views) with their schema, owner, total on-disk size, and the timestamps of
-- the most recent manual and automatic ANALYZE runs.
SELECT n.nspname as "Schema",
c.relname as "Name",
-- Translate the single-character relkind code into a human-readable type.
CASE c.relkind WHEN 'r' THEN 'table' WHEN 'v' THEN 'view' WHEN 'm' THEN 'materialized view' WHEN 'i' THEN 'index' WHEN 'S' THEN 'sequence' WHEN 't' THEN 'TOAST table' WHEN 'f' THEN 'foreign table' WHEN 'p' THEN 'partitioned table' WHEN 'I' THEN 'partitioned index' END as "Type",
pg_catalog.pg_get_userbyid(c.relowner) as "Owner",
pg_catalog.pg_size_pretty(pg_catalog.pg_table_size(c.oid)) as "Size",
pg_stat_get_last_analyze_time(c.oid) AS last_analyze,
pg_stat_get_last_autoanalyze_time(c.oid) AS last_autoanalyze
FROM pg_catalog.pg_class c
LEFT JOIN pg_catalog.pg_namespace n ON n.oid = c.relnamespace
LEFT JOIN pg_catalog.pg_am am ON am.oid = c.relam
-- Keep ordinary tables ('r'), foreign tables ('f'), views ('v'), and
-- materialized views ('m') only.
-- NOTE(review): the '' entry matches an empty relkind — presumably carried
-- over from psql's internal \d query for old-server compatibility; confirm.
WHERE c.relkind IN ('r','f','v','m','')
AND n.nspname <> 'pg_catalog'
AND n.nspname !~ '^pg_toast'
AND n.nspname <> 'information_schema'
ORDER BY 1,2;
18 changes: 0 additions & 18 deletions api/local.env
Original file line number Diff line number Diff line change
Expand Up @@ -70,24 +70,6 @@ SEARCH_PORT=9200
SEARCH_USE_SSL=FALSE
SEARCH_VERIFY_CERTS=FALSE

############################
# AWS Defaults
############################
# For these secret access keys, don't
# add them to this file to avoid mistakenly
# committing them. Set these in your shell
# by doing `export AWS_ACCESS_KEY_ID=whatever`
AWS_ACCESS_KEY_ID=DO_NOT_SET_HERE
AWS_SECRET_ACCESS_KEY=DO_NOT_SET_HERE
# These next two are commented out as we
# don't have configuration for individuals
# to use these at the moment and boto3
# tries to use them first before the keys above.
#AWS_SECURITY_TOKEN=DO_NOT_SET_HERE
#AWS_SESSION_TOKEN=DO_NOT_SET_HERE

AWS_DEFAULT_REGION=us-east-1

############################
# Localstack
############################
Expand Down
7 changes: 6 additions & 1 deletion api/src/adapters/aws/s3_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,17 @@ class S3Config(PydanticBaseEnvConfig):
s3_opportunity_bucket: str | None = None


def get_s3_client(s3_config: S3Config | None = None) -> botocore.client.BaseClient:
def get_s3_client(
s3_config: S3Config | None = None, session: boto3.Session | None = None
) -> botocore.client.BaseClient:
if s3_config is None:
s3_config = S3Config()

params = {}
if s3_config.s3_endpoint_url is not None:
params["endpoint_url"] = s3_config.s3_endpoint_url

if session is not None:
return session.client("s3", **params)

return boto3.client("s3", **params)

0 comments on commit 1fa4cef

Please sign in to comment.