Shard dynamic analysis data loading across ecosystems and simplify implementation. (#1007)

* Move Docker image cloudbuilder.yaml into a better location.

Signed-off-by: Caleb Brown <[email protected]>

* Add a script and cloudbuild config for loading dynamic results into BQ.

Signed-off-by: Caleb Brown <[email protected]>

---------

Signed-off-by: Caleb Brown <[email protected]>
calebbrown authored Feb 7, 2024
1 parent 21a31a0 commit 5a5b488
Showing 3 changed files with 81 additions and 0 deletions.
12 changes: 12 additions & 0 deletions infra/cloudbuild/dynamic_loader/cloudbuild.yaml
@@ -0,0 +1,12 @@
steps:
- name: gcr.io/google.com/cloudsdktool/cloud-sdk
  env:
  - 'PROJECT_ID=ossf-malware-analysis'
  - 'LOAD_DATASET=testing' # TODO: change after testing is completed
  - 'LOAD_TABLE_PREFIX=merge_'
  - 'DEST_DATASET=testing' # TODO: change after testing is completed
  - 'DEST_TABLE=analysis'
  - 'RESULT_BUCKET=gs://ossf-malware-analysis-results'
  - 'SCHEMA_FILE=function/loader/dynamic-analysis-schema.json'
  args: ['scripts/bq_load.sh']
timeout: 43200s # 12 hours
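
For reference, a build using this config can typically be submitted by hand with gcloud from the repository root (illustrative only; any Cloud Build trigger wiring is outside this commit):

gcloud builds submit --config=infra/cloudbuild/dynamic_loader/cloudbuild.yaml --project=ossf-malware-analysis .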
File renamed without changes.
69 changes: 69 additions & 0 deletions scripts/bq_load.sh
@@ -0,0 +1,69 @@
#!/bin/bash

if [ -z "$PROJECT_ID" ]; then
echo "PROJECT_ID must be set"
exit 1
fi

if [ -z "$LOAD_DATASET" ]; then
echo "LOAD_DATASET must be set"
exit 1
fi

if [ -z "$LOAD_TABLE_PREFIX" ]; then
echo "LOAD_TABLE_PREFIX must be set"
exit 1
fi

if [ -z "$DEST_DATASET" ]; then
echo "DEST_DATASET must be set"
exit 1
fi

if [ -z "$DEST_TABLE" ]; then
echo "DEST_TABLE must be set"
exit 1
fi

if [ -z "$RESULT_BUCKET" ]; then
echo "RESULT_BUCKET must be set"
exit 1
fi

if [ -z "$SCHEMA_FILE" ]; then
echo "SCHEMA_FILE must be set"
exit 1
fi

union=""

for bucket_prefix in `gsutil ls "$BUCKET"`; do
prefix=`echo "$bucket_prefix" | sed "s|$BUCKET/\([^\]*\)/|\1|g"`
clean_prefix=`echo "$prefix" | tr -c -d "[:alnum:]"`
table_name="$LOAD_TABLE_PREFIX$clean_prefix"

echo "## Loading $bucket_prefix into \`$PROJECT_ID.$LOAD_DATASET.$table_name\`."
bq load \
--project_id="$PROJECT_ID" \
--dataset_id="$LOAD_DATASET" \
--replace \
--time_partitioning_type="DAY" \
--time_partitioning_field="CreatedTimestamp" \
--source_format="NEWLINE_DELIMITED_JSON" \
--max_bad_records=10000 \
"$table_name" "$bucket_prefix*" "$SCHEMA_FILE"

# Construct a UNION query for joining the prefix shards together
subquery="SELECT * FROM \`$PROJECT_ID.$LOAD_DATASET.$table_name\`"
if [ -n "$union" ]; then
union="$union UNION ALL "
fi
union="$union$subquery"
done
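
# Rebuild the destination table from the shard tables: LIKE copies the shared
# schema from the last shard loaded above, and PARTITION BY keeps the daily
# partitioning on CreatedTimestamp.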

query="CREATE OR REPLACE TABLE \`$PROJECT_ID.$DEST_DATASET.$DEST_TABLE\` LIKE \`$PROJECT_ID.$LOAD_DATASET.$table_name\` PARTITION BY TIMESTAMP_TRUNC(CreatedTimestamp, DAY) AS $union;"

echo "## Updating \`$PROJECT_ID.$DEST_DATASET.$DEST_TABLE\` from shards."
echo "Executing query: '$query'"

bq query --nouse_legacy_sql --project_id="$PROJECT_ID" "$query"
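
For local testing, a minimal invocation sketch (values mirror the cloudbuild config above, including its placeholder testing datasets; adjust as needed):

PROJECT_ID=ossf-malware-analysis \
LOAD_DATASET=testing \
LOAD_TABLE_PREFIX=merge_ \
DEST_DATASET=testing \
DEST_TABLE=analysis \
RESULT_BUCKET=gs://ossf-malware-analysis-results \
SCHEMA_FILE=function/loader/dynamic-analysis-schema.json \
bash scripts/bq_load.sh

With two hypothetical shard tables, merge_npm and merge_pypi, the script would generate a final query roughly like:

CREATE OR REPLACE TABLE `ossf-malware-analysis.testing.analysis` LIKE `ossf-malware-analysis.testing.merge_pypi` PARTITION BY TIMESTAMP_TRUNC(CreatedTimestamp, DAY) AS SELECT * FROM `ossf-malware-analysis.testing.merge_npm` UNION ALL SELECT * FROM `ossf-malware-analysis.testing.merge_pypi`;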
