diff --git a/.github/workflows/ingest-to-phylogenetic.yaml b/.github/workflows/ingest-to-phylogenetic.yaml index 690e4f3..34db5d6 100644 --- a/.github/workflows/ingest-to-phylogenetic.yaml +++ b/.github/workflows/ingest-to-phylogenetic.yaml @@ -42,30 +42,10 @@ jobs: ingest: permissions: id-token: write - uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master + uses: ./.github/workflows/ingest.yaml secrets: inherit with: - # Starting with the default docker runtime - # We can migrate to AWS Batch when/if we need to for more resources or if - # the job runs longer than the GH Action limit of 6 hours. - runtime: docker - env: | - NEXTSTRAIN_DOCKER_IMAGE: ${{ inputs.ingest_image }} - run: | - nextstrain build \ - --env AWS_ACCESS_KEY_ID \ - --env AWS_SECRET_ACCESS_KEY \ - ingest \ - upload_all \ - --configfile build-configs/nextstrain-automation/config.yaml - # Specifying artifact name to differentiate ingest build outputs from - # the phylogenetic build outputs - artifact-name: ingest-build-output - artifact-paths: | - ingest/results/ - ingest/benchmarks/ - ingest/logs/ - ingest/.snakemake/log/ + image: ${{ inputs.ingest_image }} # Check if ingest results include new data by checking for the cache # of the file with the results' Metadata.sh256sum (which should have been added within upload-to-s3) @@ -114,28 +94,7 @@ jobs: if: ${{ needs.check-new-data.outputs.cache-hit != 'true' }} permissions: id-token: write - uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master + uses: ./.github/workflows/phylogenetic.yaml secrets: inherit with: - # Starting with the default docker runtime - # We can migrate to AWS Batch when/if we need to for more resources or if - # the job runs longer than the GH Action limit of 6 hours. 
- runtime: docker - env: | - NEXTSTRAIN_DOCKER_IMAGE: ${{ inputs.phylogenetic_image }} - run: | - nextstrain build \ - --env AWS_ACCESS_KEY_ID \ - --env AWS_SECRET_ACCESS_KEY \ - phylogenetic \ - deploy_all \ - --configfile build-configs/nextstrain-automation/config.yaml - # Specifying artifact name to differentiate ingest build outputs from - # the phylogenetic build outputs - artifact-name: phylogenetic-build-output - artifact-paths: | - phylogenetic/auspice/ - phylogenetic/results/ - phylogenetic/benchmarks/ - phylogenetic/logs/ - phylogenetic/.snakemake/log/ + image: ${{ inputs.phylogenetic_image }} diff --git a/.github/workflows/ingest.yaml b/.github/workflows/ingest.yaml new file mode 100644 index 0000000..9de5d3a --- /dev/null +++ b/.github/workflows/ingest.yaml @@ -0,0 +1,82 @@ +name: Ingest + +defaults: + run: + # This is the same as GitHub Action's `bash` keyword as of 20 June 2023: + # https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsshell + # + # Completely spelling it out here so that GitHub can't change it out from under us + # and we don't have to refer to the docs to know the expected behavior. + shell: bash --noprofile --norc -eo pipefail {0} + +on: + workflow_call: + inputs: + image: + description: 'Specific container image to use for ingest workflow (will override the default of "nextstrain build")' + required: false + type: string + + workflow_dispatch: + inputs: + image: + description: 'Specific container image to use for ingest workflow (will override the default of "nextstrain build")' + required: false + type: string + trial_name: + description: | + Trial name for outputs. 
+ If not set, outputs will overwrite files at s3://nextstrain-data/files/workflows/zika/ + If set, outputs will be uploaded to s3://nextstrain-data/files/workflows/zika/trials/<trial_name>/ + required: false + type: string + +jobs: + set_config_overrides: + runs-on: ubuntu-latest + steps: + - id: config + name: Set config overrides + env: + TRIAL_NAME: ${{ inputs.trial_name }} + run: | + config="" + if [[ "$TRIAL_NAME" ]]; then + config+="--config" + config+=" s3_dst='s3://nextstrain-data/files/workflows/zika/trials/"$TRIAL_NAME"'" + fi + + echo "config=$config" >> "$GITHUB_OUTPUT" + outputs: + config_overrides: ${{ steps.config.outputs.config }} + + ingest: + needs: [set_config_overrides] + permissions: + id-token: write + uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master + secrets: inherit + with: + # Starting with the default docker runtime + # We can migrate to AWS Batch when/if we need to for more resources or if + # the job runs longer than the GH Action limit of 6 hours. 
+ runtime: docker + env: | + NEXTSTRAIN_DOCKER_IMAGE: ${{ inputs.image }} + CONFIG_OVERRIDES: ${{ needs.set_config_overrides.outputs.config_overrides }} + run: | + nextstrain build \ + --env AWS_ACCESS_KEY_ID \ + --env AWS_SECRET_ACCESS_KEY \ + ingest \ + upload_all \ + --configfile build-configs/nextstrain-automation/config.yaml \ + $CONFIG_OVERRIDES + # Specifying artifact name to differentiate ingest build outputs from + # the phylogenetic build outputs + artifact-name: ingest-build-output + artifact-paths: | + ingest/results/ + ingest/benchmarks/ + ingest/logs/ + ingest/.snakemake/log/ diff --git a/.github/workflows/phylogenetic.yaml b/.github/workflows/phylogenetic.yaml new file mode 100644 index 0000000..04901b7 --- /dev/null +++ b/.github/workflows/phylogenetic.yaml @@ -0,0 +1,109 @@ +name: Phylogenetic + +defaults: + run: + # This is the same as GitHub Action's `bash` keyword as of 20 June 2023: + # https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsshell + # + # Completely spelling it out here so that GitHub can't change it out from under us + # and we don't have to refer to the docs to know the expected behavior. + shell: bash --noprofile --norc -eo pipefail {0} + +on: + workflow_call: + inputs: + image: + description: 'Specific container image to use for phylogenetic workflow (will override the default of "nextstrain build")' + required: false + type: string + + workflow_dispatch: + inputs: + image: + description: 'Specific container image to use for phylogenetic workflow (will override the default of "nextstrain build")' + required: false + type: string + trial_name: + description: | + Trial name for deploying builds. + If not set, builds will overwrite existing builds at s3://nextstrain-data/zika* + If set, builds will be deployed to s3://nextstrain-staging/zika_trials_<trial_name>_* + required: false + type: string + sequences_url: + description: | + URL for a sequences.fasta.zst file. 
+ If not provided, will use default sequences_url from phylogenetic/defaults/config_zika.yaml + required: false + type: string + metadata_url: + description: | + URL for a metadata.tsv.zst file. + If not provided, will use default metadata_url from phylogenetic/defaults/config_zika.yaml + required: false + type: string + +jobs: + set_config_overrides: + runs-on: ubuntu-latest + steps: + - id: config + name: Set config overrides + env: + TRIAL_NAME: ${{ inputs.trial_name }} + SEQUENCES_URL: ${{ inputs.sequences_url }} + METADATA_URL: ${{ inputs.metadata_url }} + run: | + config="" + + if [[ "$TRIAL_NAME" ]]; then + config+=" deploy_url='s3://nextstrain-staging/zika_trials_"$TRIAL_NAME"_'" + fi + + if [[ "$SEQUENCES_URL" ]]; then + config+=" sequences_url='"$SEQUENCES_URL"'" + fi + + if [[ "$METADATA_URL" ]]; then + config+=" metadata_url='"$METADATA_URL"'" + fi + + if [[ $config ]]; then + config="--config $config" + fi + + echo "config=$config" >> "$GITHUB_OUTPUT" + outputs: + config_overrides: ${{ steps.config.outputs.config }} + + phylogenetic: + needs: [set_config_overrides] + permissions: + id-token: write + uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master + secrets: inherit + with: + # Starting with the default docker runtime + # We can migrate to AWS Batch when/if we need to for more resources or if + # the job runs longer than the GH Action limit of 6 hours. 
+ runtime: docker + env: | + NEXTSTRAIN_DOCKER_IMAGE: ${{ inputs.image }} + CONFIG_OVERRIDES: ${{ needs.set_config_overrides.outputs.config_overrides }} + run: | + nextstrain build \ + --env AWS_ACCESS_KEY_ID \ + --env AWS_SECRET_ACCESS_KEY \ + phylogenetic \ + deploy_all \ + --configfile build-configs/nextstrain-automation/config.yaml \ + $CONFIG_OVERRIDES + # Specifying artifact name to differentiate ingest build outputs from + # the phylogenetic build outputs + artifact-name: phylogenetic-build-output + artifact-paths: | + phylogenetic/auspice/ + phylogenetic/results/ + phylogenetic/benchmarks/ + phylogenetic/logs/ + phylogenetic/.snakemake/log/ diff --git a/phylogenetic/defaults/config_zika.yaml b/phylogenetic/defaults/config_zika.yaml index 9b0ab9a..d25c2c6 100644 --- a/phylogenetic/defaults/config_zika.yaml +++ b/phylogenetic/defaults/config_zika.yaml @@ -1,3 +1,8 @@ +# Sequences must be FASTA and metadata must be TSV +# Both files must be zstd compressed +sequences_url: "https://data.nextstrain.org/files/workflows/zika/sequences.fasta.zst" +metadata_url: "https://data.nextstrain.org/files/workflows/zika/metadata.tsv.zst" + strain_id_field: "accession" display_strain_field: "strain" diff --git a/phylogenetic/rules/prepare_sequences.smk b/phylogenetic/rules/prepare_sequences.smk index fda49c3..39bd66f 100644 --- a/phylogenetic/rules/prepare_sequences.smk +++ b/phylogenetic/rules/prepare_sequences.smk @@ -27,8 +27,8 @@ rule download: sequences = "data/sequences.fasta.zst", metadata = "data/metadata.tsv.zst" params: - sequences_url = "https://data.nextstrain.org/files/workflows/zika/sequences.fasta.zst", - metadata_url = "https://data.nextstrain.org/files/workflows/zika/metadata.tsv.zst" + sequences_url = config["sequences_url"], + metadata_url = config["metadata_url"], shell: """ curl -fsSL --compressed {params.sequences_url:q} --output {output.sequences} @@ -101,4 +101,4 @@ rule align: --output {output.alignment} \ --fill-gaps \ --remove-reference - """ \ No 
newline at end of file + """