From bd676bb588beb3de75f88ca6bfd0ba4ab56a4a3c Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Thu, 11 Apr 2024 17:14:54 -0700 Subject: [PATCH 1/6] Split out ingest and phylogenetic GH Action workflows Create reusable GH Action workflows for ingest and phylogenetic that are called within the ingest-to-phylogenetic workflow. Future commits will update the independent GH Action workflows to allow manual workflow dispatches and trial runs. --- .github/workflows/ingest-to-phylogenetic.yaml | 49 ++----------------- .github/workflows/ingest.yaml | 47 ++++++++++++++++++ .github/workflows/phylogenetic.yaml | 48 ++++++++++++++++++ 3 files changed, 99 insertions(+), 45 deletions(-) create mode 100644 .github/workflows/ingest.yaml create mode 100644 .github/workflows/phylogenetic.yaml diff --git a/.github/workflows/ingest-to-phylogenetic.yaml b/.github/workflows/ingest-to-phylogenetic.yaml index 690e4f3..34db5d6 100644 --- a/.github/workflows/ingest-to-phylogenetic.yaml +++ b/.github/workflows/ingest-to-phylogenetic.yaml @@ -42,30 +42,10 @@ jobs: ingest: permissions: id-token: write - uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master + uses: ./.github/workflows/ingest.yaml secrets: inherit with: - # Starting with the default docker runtime - # We can migrate to AWS Batch when/if we need to for more resources or if - # the job runs longer than the GH Action limit of 6 hours. 
- runtime: docker - env: | - NEXTSTRAIN_DOCKER_IMAGE: ${{ inputs.ingest_image }} - run: | - nextstrain build \ - --env AWS_ACCESS_KEY_ID \ - --env AWS_SECRET_ACCESS_KEY \ - ingest \ - upload_all \ - --configfile build-configs/nextstrain-automation/config.yaml - # Specifying artifact name to differentiate ingest build outputs from - # the phylogenetic build outputs - artifact-name: ingest-build-output - artifact-paths: | - ingest/results/ - ingest/benchmarks/ - ingest/logs/ - ingest/.snakemake/log/ + image: ${{ inputs.ingest_image }} # Check if ingest results include new data by checking for the cache # of the file with the results' Metadata.sh256sum (which should have been added within upload-to-s3) @@ -114,28 +94,7 @@ jobs: if: ${{ needs.check-new-data.outputs.cache-hit != 'true' }} permissions: id-token: write - uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master + uses: ./.github/workflows/phylogenetic.yaml secrets: inherit with: - # Starting with the default docker runtime - # We can migrate to AWS Batch when/if we need to for more resources or if - # the job runs longer than the GH Action limit of 6 hours. 
- runtime: docker - env: | - NEXTSTRAIN_DOCKER_IMAGE: ${{ inputs.phylogenetic_image }} - run: | - nextstrain build \ - --env AWS_ACCESS_KEY_ID \ - --env AWS_SECRET_ACCESS_KEY \ - phylogenetic \ - deploy_all \ - --configfile build-configs/nextstrain-automation/config.yaml - # Specifying artifact name to differentiate ingest build outputs from - # the phylogenetic build outputs - artifact-name: phylogenetic-build-output - artifact-paths: | - phylogenetic/auspice/ - phylogenetic/results/ - phylogenetic/benchmarks/ - phylogenetic/logs/ - phylogenetic/.snakemake/log/ + image: ${{ inputs.phylogenetic_image }} diff --git a/.github/workflows/ingest.yaml b/.github/workflows/ingest.yaml new file mode 100644 index 0000000..99dfd74 --- /dev/null +++ b/.github/workflows/ingest.yaml @@ -0,0 +1,47 @@ +name: Ingest + +defaults: + run: + # This is the same as GitHub Action's `bash` keyword as of 20 June 2023: + # https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsshell + # + # Completely spelling it out here so that GitHub can't change it out from under us + # and we don't have to refer to the docs to know the expected behavior. + shell: bash --noprofile --norc -eo pipefail {0} + +on: + workflow_call: + inputs: + image: + description: 'Specific container image to use for ingest workflow (will override the default of "nextstrain build")' + required: false + type: string + +jobs: + ingest: + permissions: + id-token: write + uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master + secrets: inherit + with: + # Starting with the default docker runtime + # We can migrate to AWS Batch when/if we need to for more resources or if + # the job runs longer than the GH Action limit of 6 hours. 
+ runtime: docker + env: | + NEXTSTRAIN_DOCKER_IMAGE: ${{ inputs.image }} + run: | + nextstrain build \ + --env AWS_ACCESS_KEY_ID \ + --env AWS_SECRET_ACCESS_KEY \ + ingest \ + upload_all \ + --configfile build-configs/nextstrain-automation/config.yaml + # Specifying artifact name to differentiate ingest build outputs from + # the phylogenetic build outputs + artifact-name: ingest-build-output + artifact-paths: | + ingest/results/ + ingest/benchmarks/ + ingest/logs/ + ingest/.snakemake/log/ diff --git a/.github/workflows/phylogenetic.yaml b/.github/workflows/phylogenetic.yaml new file mode 100644 index 0000000..f80f578 --- /dev/null +++ b/.github/workflows/phylogenetic.yaml @@ -0,0 +1,48 @@ +name: Phylogenetic + +defaults: + run: + # This is the same as GitHub Action's `bash` keyword as of 20 June 2023: + # https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsshell + # + # Completely spelling it out here so that GitHub can't change it out from under us + # and we don't have to refer to the docs to know the expected behavior. + shell: bash --noprofile --norc -eo pipefail {0} + +on: + workflow_call: + inputs: + image: + description: 'Specific container image to use for phylogenetic workflow (will override the default of "nextstrain build")' + required: false + type: string + +jobs: + phylogenetic: + permissions: + id-token: write + uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master + secrets: inherit + with: + # Starting with the default docker runtime + # We can migrate to AWS Batch when/if we need to for more resources or if + # the job runs longer than the GH Action limit of 6 hours. 
+ runtime: docker + env: | + NEXTSTRAIN_DOCKER_IMAGE: ${{ inputs.image }} + run: | + nextstrain build \ + --env AWS_ACCESS_KEY_ID \ + --env AWS_SECRET_ACCESS_KEY \ + phylogenetic \ + deploy_all \ + --configfile build-configs/nextstrain-automation/config.yaml + # Specifying artifact name to differentiate ingest build outputs from + # the phylogenetic build outputs + artifact-name: phylogenetic-build-output + artifact-paths: | + phylogenetic/auspice/ + phylogenetic/results/ + phylogenetic/benchmarks/ + phylogenetic/logs/ + phylogenetic/.snakemake/log/ From 52ed62e597bde6815f81cb5063cd44cc86490bae Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Thu, 11 Apr 2024 17:34:50 -0700 Subject: [PATCH 2/6] ingest/phylo GH Actions: allow manual triggers This will allow manual triggers of the independent ingest and phylogenetic workflows. Currently just includes the existing `image` input but subsequent commits will allow trial runs of each workflow. --- .github/workflows/ingest.yaml | 7 +++++++ .github/workflows/phylogenetic.yaml | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/.github/workflows/ingest.yaml b/.github/workflows/ingest.yaml index 99dfd74..3e0cc52 100644 --- a/.github/workflows/ingest.yaml +++ b/.github/workflows/ingest.yaml @@ -17,6 +17,13 @@ on: required: false type: string + workflow_dispatch: + inputs: + image: + description: 'Specific container image to use for ingest workflow (will override the default of "nextstrain build")' + required: false + type: string + jobs: ingest: permissions: diff --git a/.github/workflows/phylogenetic.yaml b/.github/workflows/phylogenetic.yaml index f80f578..391c789 100644 --- a/.github/workflows/phylogenetic.yaml +++ b/.github/workflows/phylogenetic.yaml @@ -17,6 +17,13 @@ on: required: false type: string + workflow_dispatch: + inputs: + image: + description: 'Specific container image to use for ingest workflow (will override the default of "nextstrain build")' + required: false + type: string + jobs: phylogenetic: 
permissions: From fa3ba4739d61afc4406716adcccaa269bc5bac31 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Fri, 12 Apr 2024 13:08:54 -0700 Subject: [PATCH 3/6] ingest/phylo GH Actions: add `trial_name` inputs Allow trial builds of the ingest and phylogenetic workflows to be uploaded to trial S3 URLs. The `trial_name` inputs are _not_ added to the `workflow_call` trigger since the `workflow_call` trigger is only being used by our automated ingest-to-phylogenetic workflow. This may change in the future, but leaving it out for now. For the phylogenetic workflow to run with trial outputs from the ingest workflow, the phylogenetic workflow needs to accept inputs from the config. I'll make these changes separately. --- .github/workflows/ingest.yaml | 30 ++++++++++++++++++++++++++++- .github/workflows/phylogenetic.yaml | 30 ++++++++++++++++++++++++++++- 2 files changed, 58 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ingest.yaml b/.github/workflows/ingest.yaml index 3e0cc52..9de5d3a 100644 --- a/.github/workflows/ingest.yaml +++ b/.github/workflows/ingest.yaml @@ -23,9 +23,35 @@ on: description: 'Specific container image to use for ingest workflow (will override the default of "nextstrain build")' required: false type: string + trial_name: + description: | + Trial name for outputs. 
+ If not set, outputs will overwrite files at s3://nextstrain-data/files/workflows/zika/ + If set, outputs will be uploaded to s3://nextstrain-data/files/workflows/zika/trials/<trial_name>/ + required: false + type: string jobs: + set_config_overrides: + runs-on: ubuntu-latest + steps: + - id: config + name: Set config overrides + env: + TRIAL_NAME: ${{ inputs.trial_name }} + run: | + config="" + if [[ "$TRIAL_NAME" ]]; then + config+="--config" + config+=" s3_dst='s3://nextstrain-data/files/workflows/zika/trials/"$TRIAL_NAME"'" + fi + + echo "config=$config" >> "$GITHUB_OUTPUT" + outputs: + config_overrides: ${{ steps.config.outputs.config }} + ingest: + needs: [set_config_overrides] permissions: id-token: write uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master @@ -37,13 +63,15 @@ jobs: runtime: docker env: | NEXTSTRAIN_DOCKER_IMAGE: ${{ inputs.image }} + CONFIG_OVERRIDES: ${{ needs.set_config_overrides.outputs.config_overrides }} run: | nextstrain build \ --env AWS_ACCESS_KEY_ID \ --env AWS_SECRET_ACCESS_KEY \ ingest \ upload_all \ - --configfile build-configs/nextstrain-automation/config.yaml + --configfile build-configs/nextstrain-automation/config.yaml \ + $CONFIG_OVERRIDES # Specifying artifact name to differentiate ingest build outputs from # the phylogenetic build outputs artifact-name: ingest-build-output diff --git a/.github/workflows/phylogenetic.yaml b/.github/workflows/phylogenetic.yaml index 391c789..2ff990e 100644 --- a/.github/workflows/phylogenetic.yaml +++ b/.github/workflows/phylogenetic.yaml @@ -23,9 +23,35 @@ on: description: 'Specific container image to use for ingest workflow (will override the default of "nextstrain build")' required: false type: string + trial_name: + description: | + Trial name for deploying builds. 
+ If not set, builds will overwrite existing builds at s3://nextstrain-data/zika* + If set, builds will be deployed to s3://nextstrain-staging/zika_trials_<trial_name>_* + required: false + type: string jobs: + set_config_overrides: + runs-on: ubuntu-latest + steps: + - id: config + name: Set config overrides + env: + TRIAL_NAME: ${{ inputs.trial_name }} + run: | + config="" + if [[ "$TRIAL_NAME" ]]; then + config+="--config" + config+=" deploy_url='s3://nextstrain-staging/zika_trials_"$TRIAL_NAME"_'" + fi + + echo "config=$config" >> "$GITHUB_OUTPUT" + outputs: + config_overrides: ${{ steps.config.outputs.config }} + phylogenetic: + needs: [set_config_overrides] permissions: id-token: write uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master @@ -37,13 +63,15 @@ jobs: runtime: docker env: | NEXTSTRAIN_DOCKER_IMAGE: ${{ inputs.image }} + CONFIG_OVERRIDES: ${{ needs.set_config_overrides.outputs.config_overrides }} run: | nextstrain build \ --env AWS_ACCESS_KEY_ID \ --env AWS_SECRET_ACCESS_KEY \ phylogenetic \ deploy_all \ - --configfile build-configs/nextstrain-automation/config.yaml + --configfile build-configs/nextstrain-automation/config.yaml \ + $CONFIG_OVERRIDES # Specifying artifact name to differentiate ingest build outputs from # the phylogenetic build outputs artifact-name: phylogenetic-build-output From 27c655720630dff1266660e5198f61168842f4cf Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Fri, 12 Apr 2024 14:52:19 -0700 Subject: [PATCH 4/6] phylogenetic: Move input URLs to config YAML Allows us to easily update the input URLs both from the config YAML and `--config` args. This is mostly motivated by the desire to run phylogenetic builds from trial ingest outputs. 
--- phylogenetic/defaults/config_zika.yaml | 5 +++++ phylogenetic/rules/prepare_sequences.smk | 6 +++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/phylogenetic/defaults/config_zika.yaml b/phylogenetic/defaults/config_zika.yaml index 9b0ab9a..d25c2c6 100644 --- a/phylogenetic/defaults/config_zika.yaml +++ b/phylogenetic/defaults/config_zika.yaml @@ -1,3 +1,8 @@ +# Sequences must be FASTA and metadata must be TSV +# Both files must be zstd compressed +sequences_url: "https://data.nextstrain.org/files/workflows/zika/sequences.fasta.zst" +metadata_url: "https://data.nextstrain.org/files/workflows/zika/metadata.tsv.zst" + strain_id_field: "accession" display_strain_field: "strain" diff --git a/phylogenetic/rules/prepare_sequences.smk b/phylogenetic/rules/prepare_sequences.smk index fda49c3..39bd66f 100644 --- a/phylogenetic/rules/prepare_sequences.smk +++ b/phylogenetic/rules/prepare_sequences.smk @@ -27,8 +27,8 @@ rule download: sequences = "data/sequences.fasta.zst", metadata = "data/metadata.tsv.zst" params: - sequences_url = "https://data.nextstrain.org/files/workflows/zika/sequences.fasta.zst", - metadata_url = "https://data.nextstrain.org/files/workflows/zika/metadata.tsv.zst" + sequences_url = config["sequences_url"], + metadata_url = config["metadata_url"], shell: """ curl -fsSL --compressed {params.sequences_url:q} --output {output.sequences} @@ -101,4 +101,4 @@ rule align: --output {output.alignment} \ --fill-gaps \ --remove-reference - """ \ No newline at end of file + """ From 779af70da183efeb9c29bd58dc9ce9a552cb6828 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Fri, 12 Apr 2024 15:01:24 -0700 Subject: [PATCH 5/6] phylogenetic GH Action: Add input URLs Allows customization of input sequences and metadata URLs. This will allow us to test trial outputs from ingest workflows. 
--- .github/workflows/phylogenetic.yaml | 31 +++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/.github/workflows/phylogenetic.yaml b/.github/workflows/phylogenetic.yaml index 2ff990e..857b021 100644 --- a/.github/workflows/phylogenetic.yaml +++ b/.github/workflows/phylogenetic.yaml @@ -30,6 +30,18 @@ on: If set, builds will be deployed to s3://nextstrain-staging/zika_trials_<trial_name>_* required: false type: string + sequences_url: + description: | + URL for a sequences.fasta.zst file. + If not provided, will use default sequences_url from phylogenetic/defaults/config_zika.yaml + required: false + type: string + metadata_url: + description: | + URL for a metadata.tsv.zst file. + If not provided, will use default metadata_url from phylogenetic/defaults/config_zika.yaml + required: false + type: string jobs: set_config_overrides: @@ -39,11 +51,26 @@ jobs: name: Set config overrides env: TRIAL_NAME: ${{ inputs.trial_name }} + SEQUENCES_URL: ${{ inputs.sequences_url }} + METADATA_URL: ${{ inputs.metadata_url }} run: | config="" - if [[ "$TRIAL_NAME" ]]; then + if [[ "$TRIAL_NAME" || "$SEQUENCES_URL" || "$METADATA_URL" ]]; then + config+="--config" - config+=" deploy_url='s3://nextstrain-staging/zika_trials_"$TRIAL_NAME"_'" + + if [[ "$TRIAL_NAME" ]]; then + config+=" deploy_url='s3://nextstrain-staging/zika_trials_"$TRIAL_NAME"_'" + fi + + if [[ "$SEQUENCES_URL" ]]; then + config+=" sequences_url='"$SEQUENCES_URL"'" + fi + + if [[ "$METADATA_URL" ]]; then + config+=" metadata_url='"$METADATA_URL"'" + fi + fi echo "config=$config" >> "$GITHUB_OUTPUT" From 267e72d19ea6d0b7418941901e12bf3c3c72b637 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Thu, 18 Apr 2024 15:08:48 -0700 Subject: [PATCH 6/6] phylogenetic.yaml: Simplify config override logic Co-authored-by: Victor Lin <13424970+victorlin@users.noreply.github.com> --- .github/workflows/phylogenetic.yaml | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff 
--git a/.github/workflows/phylogenetic.yaml b/.github/workflows/phylogenetic.yaml index 857b021..04901b7 100644 --- a/.github/workflows/phylogenetic.yaml +++ b/.github/workflows/phylogenetic.yaml @@ -55,22 +55,21 @@ jobs: METADATA_URL: ${{ inputs.metadata_url }} run: | config="" - if [[ "$TRIAL_NAME" || "$SEQUENCES_URL" || "$METADATA_URL" ]]; then - config+="--config" - - if [[ "$TRIAL_NAME" ]]; then - config+=" deploy_url='s3://nextstrain-staging/zika_trials_"$TRIAL_NAME"_'" - fi + if [[ "$TRIAL_NAME" ]]; then + config+=" deploy_url='s3://nextstrain-staging/zika_trials_"$TRIAL_NAME"_'" + fi - if [[ "$SEQUENCES_URL" ]]; then - config+=" sequences_url='"$SEQUENCES_URL"'" - fi + if [[ "$SEQUENCES_URL" ]]; then + config+=" sequences_url='"$SEQUENCES_URL"'" + fi - if [[ "$METADATA_URL" ]]; then - config+=" metadata_url='"$METADATA_URL"'" - fi + if [[ "$METADATA_URL" ]]; then + config+=" metadata_url='"$METADATA_URL"'" + fi + if [[ $config ]]; then + config="--config $config" fi echo "config=$config" >> "$GITHUB_OUTPUT"