Shard dynamic analysis data loading across ecosystems and simplify implementation. (#1007)

* Move Docker image cloudbuilder.yaml into a better location.

Signed-off-by: Caleb Brown <[email protected]>

* Add a script and cloudbuild config for loading dynamic results into BQ.

Signed-off-by: Caleb Brown <[email protected]>

---------

Signed-off-by: Caleb Brown <[email protected]>
calebbrown authored Feb 7, 2024
1 parent 21a31a0 commit 5a5b488
Showing 3 changed files with 81 additions and 0 deletions.
12 changes: 12 additions & 0 deletions infra/cloudbuild/dynamic_loader/cloudbuild.yaml
@@ -0,0 +1,12 @@
steps:
- name: gcr.io/google.com/cloudsdktool/cloud-sdk
  env:
  - 'PROJECT_ID=ossf-malware-analysis'
  - 'LOAD_DATASET=testing' # TODO: change after testing is completed
  - 'LOAD_TABLE_PREFIX=merge_'
  - 'DEST_DATASET=testing' # TODO: change after testing is completed
  - 'DEST_TABLE=analysis'
  - 'RESULT_BUCKET=gs://ossf-malware-analysis-results'
  - 'SCHEMA_FILE=function/loader/dynamic-analysis-schema.json'
  args: ['scripts/bq_load.sh']
timeout: 43200s # 12 hours
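
For reference, a build using this config can typically be submitted by hand with gcloud from the repository root (illustrative only; any Cloud Build trigger wiring is outside this commit):

gcloud builds submit --config=infra/cloudbuild/dynamic_loader/cloudbuild.yaml --project=ossf-malware-analysis .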
File renamed without changes.
69 changes: 69 additions & 0 deletions scripts/bq_load.sh
@@ -0,0 +1,69 @@
#!/bin/bash

if [ -z "$PROJECT_ID" ]; then
echo "PROJECT_ID must be set"
exit 1
fi

if [ -z "$LOAD_DATASET" ]; then
echo "LOAD_DATASET must be set"
exit 1
fi

if [ -z "$LOAD_TABLE_PREFIX" ]; then
echo "LOAD_TABLE_PREFIX must be set"
exit 1
fi

if [ -z "$DEST_DATASET" ]; then
echo "DEST_DATASET must be set"
exit 1
fi

if [ -z "$DEST_TABLE" ]; then
echo "DEST_TABLE must be set"
exit 1
fi

if [ -z "$RESULT_BUCKET" ]; then
echo "RESULT_BUCKET must be set"
exit 1
fi

if [ -z "$SCHEMA_FILE" ]; then
echo "SCHEMA_FILE must be set"
exit 1
fi

union=""

for bucket_prefix in `gsutil ls "$BUCKET"`; do
prefix=`echo "$bucket_prefix" | sed "s|$BUCKET/\([^\]*\)/|\1|g"`
clean_prefix=`echo "$prefix" | tr -c -d "[:alnum:]"`
table_name="$LOAD_TABLE_PREFIX$clean_prefix"

echo "## Loading $bucket_prefix into \`$PROJECT_ID.$LOAD_DATASET.$table_name\`."
bq load \
--project_id="$PROJECT_ID" \
--dataset_id="$LOAD_DATASET" \
--replace \
--time_partitioning_type="DAY" \
--time_partitioning_field="CreatedTimestamp" \
--source_format="NEWLINE_DELIMITED_JSON" \
--max_bad_records=10000 \
"$table_name" "$bucket_prefix*" "$SCHEMA_FILE"

# Construct a UNION query for joining the prefix shards together
subquery="SELECT * FROM \`$PROJECT_ID.$LOAD_DATASET.$table_name\`"
if [ -n "$union" ]; then
union="$union UNION ALL "
fi
union="$union$subquery"
done
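
# Rebuild the destination table from the shard tables: LIKE copies the shared
# schema from the last shard loaded above, and PARTITION BY keeps the daily
# partitioning on CreatedTimestamp.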

query="CREATE OR REPLACE TABLE \`$PROJECT_ID.$DEST_DATASET.$DEST_TABLE\` LIKE \`$PROJECT_ID.$LOAD_DATASET.$table_name\` PARTITION BY TIMESTAMP_TRUNC(CreatedTimestamp, DAY) AS $union;"

echo "## Updating \`$PROJECT_ID.$DEST_DATASET.$DEST_TABLE\` from shards."
echo "Executing query: '$query'"

bq query --nouse_legacy_sql --project_id="$PROJECT_ID" "$query"
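
For local testing, a minimal invocation sketch (values mirror the cloudbuild config above, including its placeholder testing datasets; adjust as needed):

PROJECT_ID=ossf-malware-analysis \
LOAD_DATASET=testing \
LOAD_TABLE_PREFIX=merge_ \
DEST_DATASET=testing \
DEST_TABLE=analysis \
RESULT_BUCKET=gs://ossf-malware-analysis-results \
SCHEMA_FILE=function/loader/dynamic-analysis-schema.json \
bash scripts/bq_load.sh

With two hypothetical shard tables, merge_npm and merge_pypi, the script would generate a final query roughly like:

CREATE OR REPLACE TABLE `ossf-malware-analysis.testing.analysis` LIKE `ossf-malware-analysis.testing.merge_pypi` PARTITION BY TIMESTAMP_TRUNC(CreatedTimestamp, DAY) AS SELECT * FROM `ossf-malware-analysis.testing.merge_npm` UNION ALL SELECT * FROM `ossf-malware-analysis.testing.merge_pypi`;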
