diff --git a/.github/workflows/osv-scanner-pr.yml b/.github/workflows/osv-scanner-pr.yml
index 4775867b..dbf3ab32 100644
--- a/.github/workflows/osv-scanner-pr.yml
+++ b/.github/workflows/osv-scanner-pr.yml
@@ -16,4 +16,4 @@ permissions:
 jobs:
   scan-pr:
-    uses: "google/osv-scanner/.github/workflows/osv-scanner-reusable-pr.yml@main"
+    uses: "google/osv-scanner-action/.github/workflows/osv-scanner-reusable-pr.yml@v1.6.2-beta1"
 
diff --git a/.github/workflows/osv-scanner-scheduled.yml b/.github/workflows/osv-scanner-scheduled.yml
index e555cb1d..a2546389 100644
--- a/.github/workflows/osv-scanner-scheduled.yml
+++ b/.github/workflows/osv-scanner-scheduled.yml
@@ -15,4 +15,4 @@ permissions:
 jobs:
   scan-scheduled:
-    uses: "google/osv-scanner/.github/workflows/osv-scanner-reusable.yml@main"
+    uses: "google/osv-scanner-action/.github/workflows/osv-scanner-reusable.yml@v1.6.2-beta1"
 
diff --git a/infra/cloudbuild/dynamic_loader/cloudbuild.yaml b/infra/cloudbuild/dynamic_loader/cloudbuild.yaml
new file mode 100644
index 00000000..248bfcc5
--- /dev/null
+++ b/infra/cloudbuild/dynamic_loader/cloudbuild.yaml
@@ -0,0 +1,15 @@
+steps:
+- name: gcr.io/google.com/cloudsdktool/cloud-sdk
+  env:
+  - 'PROJECT_ID=ossf-malware-analysis'
+  - 'LOAD_DATASET=testing' # TODO: change after testing is completed
+  - 'LOAD_TABLE_PREFIX=merge_'
+  - 'DEST_DATASET=testing' # TODO: change after testing is completed
+  - 'DEST_TABLE=analysis'
+  - 'RESULT_BUCKET=gs://ossf-malware-analysis-results'
+  - 'SCHEMA_FILE=function/loader/dynamic-analysis-schema.json'
+  entrypoint: '/bin/bash'
+  args: ['scripts/bq_load.sh']
+timeout: 43200s # 12 hours
+options:
+  logging: CLOUD_LOGGING_ONLY
diff --git a/build/cloudbuild.yaml b/infra/cloudbuild/image_build/cloudbuild.yaml
similarity index 100%
rename from build/cloudbuild.yaml
rename to infra/cloudbuild/image_build/cloudbuild.yaml
diff --git a/scripts/bq_load.sh b/scripts/bq_load.sh
new file mode 100755
index 00000000..888c4d4f
--- /dev/null
+++ b/scripts/bq_load.sh
@@ -0,0 +1,70 @@
+#!/bin/bash
+
+if [ -z "$PROJECT_ID" ]; then
+  echo "PROJECT_ID must be set"
+  exit 1
+fi
+
+if [ -z "$LOAD_DATASET" ]; then
+  echo "LOAD_DATASET must be set"
+  exit 1
+fi
+
+if [ -z "$LOAD_TABLE_PREFIX" ]; then
+  echo "LOAD_TABLE_PREFIX must be set"
+  exit 1
+fi
+
+if [ -z "$DEST_DATASET" ]; then
+  echo "DEST_DATASET must be set"
+  exit 1
+fi
+
+if [ -z "$DEST_TABLE" ]; then
+  echo "DEST_TABLE must be set"
+  exit 1
+fi
+
+if [ -z "$RESULT_BUCKET" ]; then
+  echo "RESULT_BUCKET must be set"
+  exit 1
+fi
+
+if [ -z "$SCHEMA_FILE" ]; then
+  echo "SCHEMA_FILE must be set"
+  exit 1
+fi
+
+union=""
+
+for bucket_prefix in `gsutil ls "$RESULT_BUCKET"`; do
+  prefix=`echo "$bucket_prefix" | sed "s|$RESULT_BUCKET/\([^\]*\)/|\1|g"`
+  clean_prefix=`echo "$prefix" | tr -c -d "[:alnum:]"`
+  table_name="$LOAD_TABLE_PREFIX$clean_prefix"
+
+  echo "## Loading $bucket_prefix into \`$PROJECT_ID.$LOAD_DATASET.$table_name\`."
+  bq load \
+    --headless \
+    --project_id="$PROJECT_ID" \
+    --dataset_id="$LOAD_DATASET" \
+    --replace \
+    --time_partitioning_type="DAY" \
+    --time_partitioning_field="CreatedTimestamp" \
+    --source_format="NEWLINE_DELIMITED_JSON" \
+    --max_bad_records=10000 \
+    "$table_name" "$bucket_prefix*" "$SCHEMA_FILE"
+
+  # Construct a UNION query for joining the prefix shards together
+  subquery="SELECT * FROM \`$PROJECT_ID.$LOAD_DATASET.$table_name\`"
+  if [ -n "$union" ]; then
+    union="$union UNION ALL "
+  fi
+  union="$union$subquery"
+done
+
+query="CREATE OR REPLACE TABLE \`$PROJECT_ID.$DEST_DATASET.$DEST_TABLE\` LIKE \`$PROJECT_ID.$LOAD_DATASET.$table_name\` PARTITION BY TIMESTAMP_TRUNC(CreatedTimestamp, DAY) OPTIONS(expiration_timestamp=NULL) AS $union;"
+
+echo "## Updating \`$PROJECT_ID.$DEST_DATASET.$DEST_TABLE\` from shards."
+echo "Executing query: '$query'"
+
+bq query --headless --nouse_legacy_sql --project_id="$PROJECT_ID" "$query"