From d44f2ae43fabf9f660c0d836fdf2e537fbbf8880 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Tue, 26 Mar 2024 12:41:59 -0700 Subject: [PATCH 1/5] Start GH Action workflow for automation Currently just runs the ingest workflow and uploads the results to AWS S3. Subsequent commits will add automation for the phylogenetic workflow. --- .github/workflows/ingest-to-phylogenetic.yaml | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 .github/workflows/ingest-to-phylogenetic.yaml diff --git a/.github/workflows/ingest-to-phylogenetic.yaml b/.github/workflows/ingest-to-phylogenetic.yaml new file mode 100644 index 0000000..1177c25 --- /dev/null +++ b/.github/workflows/ingest-to-phylogenetic.yaml @@ -0,0 +1,40 @@ +name: Ingest to phylogenetic + +defaults: + run: + # This is the same as GitHub Action's `bash` keyword as of 20 June 2023: + # https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsshell + # + # Completely spelling it out here so that GitHub can't change it out from under us + # and we don't have to refer to the docs to know the expected behavior. + shell: bash --noprofile --norc -eo pipefail {0} + +on: + workflow_dispatch: + +jobs: + ingest: + permissions: + id-token: write + uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master + secrets: inherit + with: + # Starting with the default docker runtime + # We can migrate to AWS Batch when/if we need to for more resources or if + # the job runs longer than the GH Action limit of 6 hours. 
+ runtime: docker + run: | + nextstrain build \ + --env AWS_ACCESS_KEY_ID \ + --env AWS_SECRET_ACCESS_KEY \ + ingest \ + upload_all \ + --configfile build-configs/nextstrain-automation/config.yaml + # Specifying artifact name to differentiate ingest build outputs from + # the phylogenetic build outputs + artifact-name: ingest-build-output + artifact-paths: | + ingest/results/ + ingest/benchmarks/ + ingest/logs/ + ingest/.snakemake/log/ From 2c415e709e53673ae3ed3e636ec7e9c1784e6aa9 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Tue, 26 Mar 2024 12:43:37 -0700 Subject: [PATCH 2/5] ingest-to-phylogenetic: Add phylogenetic job The phylogenetic workflow will run after the ingest workflow has completed successfully to use the latest available data. Subsequent commits will check if the ingest results included new data to only run the phylogenetic workflow when there's new data. --- .github/workflows/ingest-to-phylogenetic.yaml | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/.github/workflows/ingest-to-phylogenetic.yaml b/.github/workflows/ingest-to-phylogenetic.yaml index 1177c25..03d65b3 100644 --- a/.github/workflows/ingest-to-phylogenetic.yaml +++ b/.github/workflows/ingest-to-phylogenetic.yaml @@ -38,3 +38,34 @@ jobs: ingest/benchmarks/ ingest/logs/ ingest/.snakemake/log/ + + # TKTK check if ingest results include new data + # potentially use actions/cache to store Metadata.sha256sum of S3 files + + phylogenetic: + needs: [ingest] + permissions: + id-token: write + uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master + secrets: inherit + with: + # Starting with the default docker runtime + # We can migrate to AWS Batch when/if we need to for more resources or if + # the job runs longer than the GH Action limit of 6 hours. 
+ runtime: docker + run: | + nextstrain build \ + --env AWS_ACCESS_KEY_ID \ + --env AWS_SECRET_ACCESS_KEY \ + phylogenetic \ + deploy_all \ + --configfile build-configs/nextstrain-automation/config.yaml + # Specifying artifact name to differentiate ingest build outputs from + # the phylogenetic build outputs + artifact-name: phylogenetic-build-output + artifact-paths: | + phylogenetic/auspice/ + phylogenetic/results/ + phylogenetic/benchmarks/ + phylogenetic/logs/ + phylogenetic/.snakemake/log/ From eb5e76de0f5e69666c11a64df7a59242601a3d1f Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Tue, 26 Mar 2024 16:59:59 -0700 Subject: [PATCH 3/5] ingest-to-phylogenetic: Use cache to check new data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Uses GitHub Actions cache to store a file that contains the `Metadata.sha256sum` of the ingest files on S3 and uses the `hashFiles` function to create a unique cache key. Then the existence of the cache key is an indicator that the ingest file contents have not been updated since a previous run on GH Actions. This does come with a big caveat that GH will remove any cache entries that have not been accessed in over 7 days.¹ If the workflow does not run at least once every 7 days, the cache entry will have been evicted and the phylogenetic job will always run. If this works well, then we may want to consider moving this within the `pathogen-repo-build` reusable workflow to have the same functionality across pathogen automation workflows. 
¹ https://docs.github.com/en/actions/using-workflows/caching-dependencies-to-speed-up-workflows#usage-limits-and-eviction-policy --- .github/workflows/ingest-to-phylogenetic.yaml | 44 +++++++++++++++++-- 1 file changed, 41 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ingest-to-phylogenetic.yaml b/.github/workflows/ingest-to-phylogenetic.yaml index 03d65b3..980a399 100644 --- a/.github/workflows/ingest-to-phylogenetic.yaml +++ b/.github/workflows/ingest-to-phylogenetic.yaml @@ -39,11 +39,49 @@ jobs: ingest/logs/ ingest/.snakemake/log/ - # TKTK check if ingest results include new data - # potentially use actions/cache to store Metadata.sha256sum of S3 files + # Check if ingest results include new data by checking for the cache + # of the file with the results' Metadata.sha256sum (which should have been added within upload-to-s3) + # GitHub will remove any cache entries that have not been accessed in over 7 days, + # so if the workflow has not been run for over 7 days then it will trigger phylogenetic. 
+ check-new-data: + needs: [ingest] + runs-on: ubuntu-latest + outputs: + cache-hit: ${{ steps.check-cache.outputs.cache-hit }} + steps: + - name: Get sha256sum + id: get-sha256sum + run: | + s3_urls=( + "s3://nextstrain-data/files/workflows/zika/metadata.tsv.zst" + "s3://nextstrain-data/files/workflows/zika/sequences.fasta.zst" + ) + + # Code below is modified from ingest/upload-to-s3 + # https://github.com/nextstrain/ingest/blob/c0b4c6bb5e6ccbba86374d2c09b42077768aac23/upload-to-s3#L23-L29 + + no_hash=0000000000000000000000000000000000000000000000000000000000000000 + + for s3_url in "${s3_urls[@]}"; do + s3path="${s3_url#s3://}" + bucket="${s3path%%/*}" + key="${s3path#*/}" + + s3_hash="$(aws s3api head-object --no-sign-request --bucket "$bucket" --key "$key" --query Metadata.sha256sum --output text 2>/dev/null || echo "$no_hash")" + echo "${s3_hash}" >> ingest-output-sha256sum + done + + - name: Check cache + id: check-cache + uses: actions/cache@v4 + with: + path: ingest-output-sha256sum + key: ingest-output-sha256sum-${{ hashFiles('ingest-output-sha256sum') }} + lookup-only: true phylogenetic: - needs: [ingest] + needs: [check-new-data] + if: ${{ needs.check-new-data.outputs.cache-hit != 'true' }} permissions: id-token: write uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master From 65a8acc303f510318df58bb7aa8893dcc0421660 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Fri, 29 Mar 2024 15:27:21 -0700 Subject: [PATCH 4/5] ingest-to-phylo: Add inputs for Docker image Add individual inputs per workflow to override the default Docker image used by `nextstrain build`. Having this input has been extremely helpful to continue running pathogen workflows when we run into new bugs that are not present in older nextstrain-base images. I've made separate image inputs for the two workflows because they use different tools and may require different versions of images. 
--- .github/workflows/ingest-to-phylogenetic.yaml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/.github/workflows/ingest-to-phylogenetic.yaml b/.github/workflows/ingest-to-phylogenetic.yaml index 980a399..bed143e 100644 --- a/.github/workflows/ingest-to-phylogenetic.yaml +++ b/.github/workflows/ingest-to-phylogenetic.yaml @@ -11,6 +11,13 @@ defaults: on: workflow_dispatch: + inputs: + ingest_image: + description: 'Specific container image to use for ingest workflow (will override the default of "nextstrain build")' + required: false + phylogenetic_image: + description: 'Specific container image to use for phylogenetic workflow (will override the default of "nextstrain build")' + required: false jobs: ingest: @@ -23,6 +30,8 @@ jobs: # We can migrate to AWS Batch when/if we need to for more resources or if # the job runs longer than the GH Action limit of 6 hours. runtime: docker + env: | + NEXTSTRAIN_DOCKER_IMAGE: ${{ inputs.ingest_image }} run: | nextstrain build \ --env AWS_ACCESS_KEY_ID \ @@ -91,6 +100,8 @@ jobs: # We can migrate to AWS Batch when/if we need to for more resources or if # the job runs longer than the GH Action limit of 6 hours. runtime: docker + env: | + NEXTSTRAIN_DOCKER_IMAGE: ${{ inputs.phylogenetic_image }} run: | nextstrain build \ --env AWS_ACCESS_KEY_ID \ From 77ca1d423ca753887a39dadf6db46e03c575a41e Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Fri, 29 Mar 2024 16:24:36 -0700 Subject: [PATCH 5/5] ingest-to-phylo: Add schedule Copied daily schedule of mpox ingest https://github.com/nextstrain/mpox/blob/e439235ff1c1d66e7285b774e9536e2896d9cd2f/.github/workflows/fetch-and-ingest.yaml#L4-L21 Daily runs seem fine since the ingest workflow currently takes less than 2 minutes to complete and it will not trigger the phylogenetic workflow if there's no new data. We can bring this down to once a week if it seems like overkill. 
--- .github/workflows/ingest-to-phylogenetic.yaml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/.github/workflows/ingest-to-phylogenetic.yaml b/.github/workflows/ingest-to-phylogenetic.yaml index bed143e..42216c4 100644 --- a/.github/workflows/ingest-to-phylogenetic.yaml +++ b/.github/workflows/ingest-to-phylogenetic.yaml @@ -10,6 +10,25 @@ defaults: shell: bash --noprofile --norc -eo pipefail {0} on: + schedule: + # Note times are in UTC, which is 1 or 2 hours behind CET depending on daylight saving time. + # + # Note the actual runs might be late. + # Numerous people were confused about that, including me: + # - https://github.community/t/scheduled-action-running-consistently-late/138025/11 + # - https://github.com/github/docs/issues/3059 + # + # Note, '*' is a special character in YAML, so you have to quote this string. + # + # Docs: + # - https://docs.github.com/en/actions/learn-github-actions/events-that-trigger-workflows#schedule + # + # Tool that deciphers this particular format of crontab string: + # - https://crontab.guru/ + # + # Runs at 4pm UTC (12pm EDT) since curation by NCBI happens on the East Coast. + - cron: '0 16 * * *' + workflow_dispatch: inputs: ingest_image: