Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Shard dynamic analysis data loading across ecosystems and simplify implementation. #1007

Merged
merged 2 commits into from
Feb 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions infra/cloudbuild/dynamic_loader/cloudbuild.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Cloud Build pipeline that loads dynamic analysis results into BigQuery.
# Runs scripts/bq_load.sh inside the cloud-sdk image; all configuration is
# passed via environment variables consumed by that script.
steps:
- name: gcr.io/google.com/cloudsdktool/cloud-sdk
  env:
  - 'PROJECT_ID=ossf-malware-analysis'
  - 'LOAD_DATASET=testing' # TODO: change after testing is completed
  # Per-ecosystem shard tables are created as "merge_<ecosystem>".
  - 'LOAD_TABLE_PREFIX=merge_'
  - 'DEST_DATASET=testing' # TODO: change after testing is completed
  - 'DEST_TABLE=analysis'
  # GCS bucket holding the analysis results, one top-level prefix per ecosystem.
  - 'RESULT_BUCKET=gs://ossf-malware-analysis-results'
  - 'SCHEMA_FILE=function/loader/dynamic-analysis-schema.json'
  args: ['scripts/bq_load.sh']
timeout: 43200s # 12 hours
File renamed without changes.
69 changes: 69 additions & 0 deletions scripts/bq_load.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#!/bin/bash
#
# Loads sharded dynamic analysis results from a GCS bucket into per-ecosystem
# BigQuery tables, then merges all shards into a single destination table.
#
# Required environment variables:
#   PROJECT_ID        - GCP project containing the datasets.
#   LOAD_DATASET      - dataset that receives the per-ecosystem shard tables.
#   LOAD_TABLE_PREFIX - name prefix for each shard table.
#   DEST_DATASET      - dataset containing the merged destination table.
#   DEST_TABLE        - name of the merged destination table.
#   RESULT_BUCKET     - gs:// bucket holding results, one prefix per ecosystem.
#   SCHEMA_FILE       - path to the BigQuery JSON schema file.

set -euo pipefail

# Fail fast with a clear message if any required variable is missing.
for var in PROJECT_ID LOAD_DATASET LOAD_TABLE_PREFIX \
           DEST_DATASET DEST_TABLE RESULT_BUCKET SCHEMA_FILE; do
  if [ -z "${!var:-}" ]; then
    echo "$var must be set" >&2
    exit 1
  fi
done

union=""
table_name=""

# Iterate over the top-level prefixes (one per ecosystem) in the result
# bucket. NOTE: the original script read "$BUCKET" here, which is never set;
# the validated variable is RESULT_BUCKET.
# A while-read over process substitution (rather than a pipeline) keeps the
# `union` accumulator in the current shell.
while IFS= read -r bucket_prefix; do
  # "gs://bucket/npm/" -> "npm" ([^/]* captures a single path component;
  # the original pattern's [^\]* matched "any non-backslash", including '/').
  prefix=$(echo "$bucket_prefix" | sed "s|$RESULT_BUCKET/\([^/]*\)/|\1|g")
  # Keep only alphanumeric characters so the result is a valid table name.
  clean_prefix=$(echo "$prefix" | tr -c -d "[:alnum:]")
  table_name="$LOAD_TABLE_PREFIX$clean_prefix"

  echo "## Loading $bucket_prefix into \`$PROJECT_ID.$LOAD_DATASET.$table_name\`."
  bq load \
    --project_id="$PROJECT_ID" \
    --dataset_id="$LOAD_DATASET" \
    --replace \
    --time_partitioning_type="DAY" \
    --time_partitioning_field="CreatedTimestamp" \
    --source_format="NEWLINE_DELIMITED_JSON" \
    --max_bad_records=10000 \
    "$table_name" "$bucket_prefix*" "$SCHEMA_FILE"

  # Accumulate a UNION ALL query joining the per-prefix shards together.
  subquery="SELECT * FROM \`$PROJECT_ID.$LOAD_DATASET.$table_name\`"
  if [ -n "$union" ]; then
    union="$union UNION ALL "
  fi
  union="$union$subquery"
done < <(gsutil ls "$RESULT_BUCKET")

# Without at least one shard the CREATE query below would be malformed.
if [ -z "$union" ]; then
  echo "No prefixes found in $RESULT_BUCKET; nothing to load." >&2
  exit 1
fi

# LIKE copies the schema from the last shard table loaded above.
query="CREATE OR REPLACE TABLE \`$PROJECT_ID.$DEST_DATASET.$DEST_TABLE\` LIKE \`$PROJECT_ID.$LOAD_DATASET.$table_name\` PARTITION BY TIMESTAMP_TRUNC(CreatedTimestamp, DAY) AS $union;"

echo "## Updating \`$PROJECT_ID.$DEST_DATASET.$DEST_TABLE\` from shards."
echo "Executing query: '$query'"

bq query --nouse_legacy_sql --project_id="$PROJECT_ID" "$query"
Loading