From 21a31a0dcd482a4baf147f55f003297af12996af Mon Sep 17 00:00:00 2001
From: Rex P <106129829+another-rex@users.noreply.github.com>
Date: Fri, 2 Feb 2024 12:39:27 +1100
Subject: [PATCH 1/7] Switch to real repo, pin version (#1006)

Signed-off-by: Rex P
---
 .github/workflows/osv-scanner-pr.yml        | 2 +-
 .github/workflows/osv-scanner-scheduled.yml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/osv-scanner-pr.yml b/.github/workflows/osv-scanner-pr.yml
index 4775867b..dbf3ab32 100644
--- a/.github/workflows/osv-scanner-pr.yml
+++ b/.github/workflows/osv-scanner-pr.yml
@@ -16,4 +16,4 @@ permissions:
 
 jobs:
   scan-pr:
-    uses: "google/osv-scanner/.github/workflows/osv-scanner-reusable-pr.yml@main"
+    uses: "google/osv-scanner-action/.github/workflows/osv-scanner-reusable-pr.yml@v1.6.2-beta1"
diff --git a/.github/workflows/osv-scanner-scheduled.yml b/.github/workflows/osv-scanner-scheduled.yml
index e555cb1d..a2546389 100644
--- a/.github/workflows/osv-scanner-scheduled.yml
+++ b/.github/workflows/osv-scanner-scheduled.yml
@@ -15,4 +15,4 @@ permissions:
 
 jobs:
   scan-scheduled:
-    uses: "google/osv-scanner/.github/workflows/osv-scanner-reusable.yml@main"
+    uses: "google/osv-scanner-action/.github/workflows/osv-scanner-reusable.yml@v1.6.2-beta1"

From 5a5b488cd790ed0b337472dc9bf58d31b5cc83d3 Mon Sep 17 00:00:00 2001
From: Caleb Brown
Date: Wed, 7 Feb 2024 13:17:11 +1100
Subject: [PATCH 2/7] Shard dynamic analysis data loading across ecosystems
 and simplify implementation. (#1007)

* Move Docker image cloudbuilder.yaml into a better location.

Signed-off-by: Caleb Brown

* Add a script and cloudbuild config for loading dynamic results into BQ.

Signed-off-by: Caleb Brown

---------

Signed-off-by: Caleb Brown
---
 .../cloudbuild/dynamic_loader/cloudbuild.yaml | 12 ++++
 .../cloudbuild/image_build}/cloudbuild.yaml   |  0
 scripts/bq_load.sh                            | 69 +++++++++++++++++++
 3 files changed, 81 insertions(+)
 create mode 100644 infra/cloudbuild/dynamic_loader/cloudbuild.yaml
 rename {build => infra/cloudbuild/image_build}/cloudbuild.yaml (100%)
 create mode 100644 scripts/bq_load.sh

diff --git a/infra/cloudbuild/dynamic_loader/cloudbuild.yaml b/infra/cloudbuild/dynamic_loader/cloudbuild.yaml
new file mode 100644
index 00000000..c878ba91
--- /dev/null
+++ b/infra/cloudbuild/dynamic_loader/cloudbuild.yaml
@@ -0,0 +1,12 @@
+steps:
+- name: gcr.io/google.com/cloudsdktool/cloud-sdk
+  env:
+  - 'PROJECT_ID=ossf-malware-analysis'
+  - 'LOAD_DATASET=testing' # TODO: change after testing is completed
+  - 'LOAD_TABLE_PREFIX=merge_'
+  - 'DEST_DATASET=testing' # TODO: change after testing is completed
+  - 'DEST_TABLE=analysis'
+  - 'RESULT_BUCKET=gs://ossf-malware-analysis-results'
+  - 'SCHEMA_FILE=function/loader/dynamic-analysis-schema.json'
+  args: ['scripts/bq_load.sh']
+timeout: 43200s # 12 hours
diff --git a/build/cloudbuild.yaml b/infra/cloudbuild/image_build/cloudbuild.yaml
similarity index 100%
rename from build/cloudbuild.yaml
rename to infra/cloudbuild/image_build/cloudbuild.yaml
diff --git a/scripts/bq_load.sh b/scripts/bq_load.sh
new file mode 100644
index 00000000..c555d023
--- /dev/null
+++ b/scripts/bq_load.sh
@@ -0,0 +1,69 @@
+#!/bin/bash
+
+if [ -z "$PROJECT_ID" ]; then
+  echo "PROJECT_ID must be set"
+  exit 1
+fi
+
+if [ -z "$LOAD_DATASET" ]; then
+  echo "LOAD_DATASET must be set"
+  exit 1
+fi
+
+if [ -z "$LOAD_TABLE_PREFIX" ]; then
+  echo "LOAD_TABLE_PREFIX must be set"
+  exit 1
+fi
+
+if [ -z "$DEST_DATASET" ]; then
+  echo "DEST_DATASET must be set"
+  exit 1
+fi
+
+if [ -z "$DEST_TABLE" ]; then
+  echo "DEST_TABLE must be set"
+  exit 1
+fi
+
+if [ -z "$RESULT_BUCKET" ]; then
+  echo "RESULT_BUCKET must be set"
+  exit 1
+fi
+
+if [ -z "$SCHEMA_FILE" ]; then
+  echo "SCHEMA_FILE must be set"
+  exit 1
+fi
+
+union=""
+
+for bucket_prefix in `gsutil ls "$BUCKET"`; do
+  prefix=`echo "$bucket_prefix" | sed "s|$BUCKET/\([^\]*\)/|\1|g"`
+  clean_prefix=`echo "$prefix" | tr -c -d "[:alnum:]"`
+  table_name="$LOAD_TABLE_PREFIX$clean_prefix"
+
+  echo "## Loading $bucket_prefix into \`$PROJECT_ID.$LOAD_DATASET.$table_name\`."
+  bq load \
+    --project_id="$PROJECT_ID" \
+    --dataset_id="$LOAD_DATASET" \
+    --replace \
+    --time_partitioning_type="DAY" \
+    --time_partitioning_field="CreatedTimestamp" \
+    --source_format="NEWLINE_DELIMITED_JSON" \
+    --max_bad_records=10000 \
+    "$table_name" "$bucket_prefix*" "$SCHEMA_FILE"
+
+  # Construct a UNION query for joining the prefix shards together
+  subquery="SELECT * FROM \`$PROJECT_ID.$LOAD_DATASET.$table_name\`"
+  if [ -n "$union" ]; then
+    union="$union UNION ALL "
+  fi
+  union="$union$subquery"
+done
+
+query="CREATE OR REPLACE TABLE \`$PROJECT_ID.$DEST_DATASET.$DEST_TABLE\` LIKE \`$PROJECT_ID.$LOAD_DATASET.$table_name\` PARTITION BY TIMESTAMP_TRUNC(CreatedTimestamp, DAY) AS $union;"
+
+echo "## Updating \`$PROJECT_ID.$DEST_DATASET.$DEST_TABLE\` from shards."
+echo "Executing query: '$query'"
+
+bq query --nouse_legacy_sql --project_id="$PROJECT_ID" "$query"

From d2372b7d1c19dcfe39bb7659043c45d4afd37931 Mon Sep 17 00:00:00 2001
From: Caleb Brown
Date: Wed, 7 Feb 2024 13:57:01 +1100
Subject: [PATCH 3/7] Add option to force cloud logging. (#1008)

Signed-off-by: Caleb Brown
---
 infra/cloudbuild/dynamic_loader/cloudbuild.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/infra/cloudbuild/dynamic_loader/cloudbuild.yaml b/infra/cloudbuild/dynamic_loader/cloudbuild.yaml
index c878ba91..b5bd6163 100644
--- a/infra/cloudbuild/dynamic_loader/cloudbuild.yaml
+++ b/infra/cloudbuild/dynamic_loader/cloudbuild.yaml
@@ -10,3 +10,5 @@ steps:
   - 'SCHEMA_FILE=function/loader/dynamic-analysis-schema.json'
   args: ['scripts/bq_load.sh']
 timeout: 43200s # 12 hours
+options:
+  logging: CLOUD_LOGGING_ONLY

From 8f1905e9260eb0203c902b86870a865a75b47aa1 Mon Sep 17 00:00:00 2001
From: Caleb Brown
Date: Wed, 7 Feb 2024 15:06:38 +1100
Subject: [PATCH 4/7] Set the entrypoint explicitly to /bin/bash in the
 cloudbuild.yaml (#1009)

Signed-off-by: Caleb Brown
---
 infra/cloudbuild/dynamic_loader/cloudbuild.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/infra/cloudbuild/dynamic_loader/cloudbuild.yaml b/infra/cloudbuild/dynamic_loader/cloudbuild.yaml
index b5bd6163..248bfcc5 100644
--- a/infra/cloudbuild/dynamic_loader/cloudbuild.yaml
+++ b/infra/cloudbuild/dynamic_loader/cloudbuild.yaml
@@ -8,6 +8,7 @@ steps:
   - 'DEST_TABLE=analysis'
   - 'RESULT_BUCKET=gs://ossf-malware-analysis-results'
   - 'SCHEMA_FILE=function/loader/dynamic-analysis-schema.json'
+  entrypoint: '/bin/bash'
   args: ['scripts/bq_load.sh']
 timeout: 43200s # 12 hours
 options:

From b6761328bf732ca6817dd48dd6effecd427b3c32 Mon Sep 17 00:00:00 2001
From: Caleb Brown
Date: Wed, 7 Feb 2024 15:34:10 +1100
Subject: [PATCH 5/7] Fix a bug where the RESULT_BUCKET env var wasn't used
 correctly. (#1010)

Signed-off-by: Caleb Brown
---
 scripts/bq_load.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
 mode change 100644 => 100755 scripts/bq_load.sh

diff --git a/scripts/bq_load.sh b/scripts/bq_load.sh
old mode 100644
new mode 100755
index c555d023..350eba85
--- a/scripts/bq_load.sh
+++ b/scripts/bq_load.sh
@@ -37,8 +37,8 @@ fi
 
 union=""
 
-for bucket_prefix in `gsutil ls "$BUCKET"`; do
-  prefix=`echo "$bucket_prefix" | sed "s|$BUCKET/\([^\]*\)/|\1|g"`
+for bucket_prefix in `gsutil ls "$RESULT_BUCKET"`; do
+  prefix=`echo "$bucket_prefix" | sed "s|$RESULT_BUCKET/\([^\]*\)/|\1|g"`
   clean_prefix=`echo "$prefix" | tr -c -d "[:alnum:]"`
   table_name="$LOAD_TABLE_PREFIX$clean_prefix"

From 1933a7504c55035fe395b8f7a8985f5a9fde6917 Mon Sep 17 00:00:00 2001
From: Caleb Brown
Date: Wed, 7 Feb 2024 16:18:11 +1100
Subject: [PATCH 6/7] Add headless flag to improve output. (#1011)

Signed-off-by: Caleb Brown
---
 scripts/bq_load.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/scripts/bq_load.sh b/scripts/bq_load.sh
index 350eba85..b7831f7e 100755
--- a/scripts/bq_load.sh
+++ b/scripts/bq_load.sh
@@ -44,6 +44,7 @@ for bucket_prefix in `gsutil ls "$RESULT_BUCKET"`; do
 
   echo "## Loading $bucket_prefix into \`$PROJECT_ID.$LOAD_DATASET.$table_name\`."
   bq load \
+    --headless \
     --project_id="$PROJECT_ID" \
    --dataset_id="$LOAD_DATASET" \
     --replace \
@@ -66,4 +67,4 @@ query="CREATE OR REPLACE TABLE \`$PROJECT_ID.$DEST_DATASET.$DEST_TABLE\` LIKE \`
 echo "## Updating \`$PROJECT_ID.$DEST_DATASET.$DEST_TABLE\` from shards."
 echo "Executing query: '$query'"
 
-bq query --nouse_legacy_sql --project_id="$PROJECT_ID" "$query"
+bq query --headless --nouse_legacy_sql --project_id="$PROJECT_ID" "$query"

From 6b875a6dec5a75e56fbd787e3079cee0cecd0617 Mon Sep 17 00:00:00 2001
From: Caleb Brown
Date: Thu, 8 Feb 2024 09:13:04 +1100
Subject: [PATCH 7/7] Add option to bq load query to remove expiration. (#1012)

Signed-off-by: Caleb Brown
---
 scripts/bq_load.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/bq_load.sh b/scripts/bq_load.sh
index b7831f7e..888c4d4f 100755
--- a/scripts/bq_load.sh
+++ b/scripts/bq_load.sh
@@ -62,7 +62,7 @@ for bucket_prefix in `gsutil ls "$RESULT_BUCKET"`; do
   union="$union$subquery"
 done
 
-query="CREATE OR REPLACE TABLE \`$PROJECT_ID.$DEST_DATASET.$DEST_TABLE\` LIKE \`$PROJECT_ID.$LOAD_DATASET.$table_name\` PARTITION BY TIMESTAMP_TRUNC(CreatedTimestamp, DAY) AS $union;"
+query="CREATE OR REPLACE TABLE \`$PROJECT_ID.$DEST_DATASET.$DEST_TABLE\` LIKE \`$PROJECT_ID.$LOAD_DATASET.$table_name\` PARTITION BY TIMESTAMP_TRUNC(CreatedTimestamp, DAY) OPTIONS(expiration_timestamp=NULL) AS $union;"
 
 echo "## Updating \`$PROJECT_ID.$DEST_DATASET.$DEST_TABLE\` from shards."
 echo "Executing query: '$query'"
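
Note (illustrative only, not part of the patches above): a minimal sketch of driving scripts/bq_load.sh by hand, outside of Cloud Build, assuming the bq and gsutil CLIs are installed and authenticated against the project. The values simply mirror those set in infra/cloudbuild/dynamic_loader/cloudbuild.yaml.

  # Values copied from the dynamic_loader Cloud Build step above.
  export PROJECT_ID=ossf-malware-analysis
  export LOAD_DATASET=testing
  export LOAD_TABLE_PREFIX=merge_
  export DEST_DATASET=testing
  export DEST_TABLE=analysis
  export RESULT_BUCKET=gs://ossf-malware-analysis-results
  export SCHEMA_FILE=function/loader/dynamic-analysis-schema.json

  # Loads each result-bucket prefix into its own shard table under
  # $LOAD_DATASET, then rebuilds $DEST_DATASET.$DEST_TABLE as a
  # UNION ALL of those shards.
  ./scripts/bq_load.sh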