# ingest-to-phylogenetic: Use cache to check new data (#6)
---

name: Ingest to phylogenetic
on:
workflow_dispatch:
jobs:
ingest:
permissions:
id-token: write
uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master
secrets: inherit
with:
# Starting with the default docker runtime
# We can migrate to AWS Batch when/if we need to for more resources or if
# the job runs longer than the GH Action limit of 6 hours.
runtime: docker
run: |
nextstrain build \
--env AWS_ACCESS_KEY_ID \
--env AWS_SECRET_ACCESS_KEY \
ingest \
upload_all \
--configfile build-configs/nextstrain-automation/config.yaml
# Specifying artifact name to differentiate ingest build outputs from
# the phylogenetic build outputs
artifact-name: ingest-build-output
artifact-paths: |
ingest/results/
ingest/benchmarks/
ingest/logs/
ingest/.snakemake/log/
# Check if ingest results include new data by checking for the cache
# of the file with the results' Metadata.sh256sum (which should have been added within upload-to-s3)
# GitHub will remove any cache entries that have not been accessed in over 7 days,
# so if the workflow has not been run over 7 days then it will trigger phylogenetic.
check-new-data:
needs: [ingest]
runs-on: ubuntu-latest
outputs:
cache-hit: ${{ steps.check-cache.outputs.cache-hit }}
steps:
- name: Get sha256sum
id: get-sha256sum
run: |
s3_urls=(
"s3://nextstrain-data/files/workflows/zika/metadata.tsv.zst"
"s3://nextstrain-data/files/workflows/zika/sequences.fasta.zst"
)
for s3_url in "${s3_urls[@]}"; do
s3path="${s3_url#s3://}"
bucket="${s3path%%/*}"
key="${s3path#*/}"
s3_hash="$(aws s3api head-object --no-sign-request --bucket "$bucket" --key "$key" --query Metadata.sha256sum --output text 2>/dev/null || echo "$no_hash")"
echo "${s3_hash}" >> ingest-output-sha256sum
done
- name: Check cache
id: check-cache
uses:uses: actions/cache@v4
with:
path: ingest-output-sha256sum
key: ingest-output-sha256sum-${{ hashFiles('ingest-output-sha256sum') }}
lookup-only: true
phylogenetic:
needs: [check-new-data]
if: ${{ needs.check-new-data.outputs.cache-hit != 'true' }}
permissions:
id-token: write
uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master
secrets: inherit
with:
# Starting with the default docker runtime
# We can migrate to AWS Batch when/if we need to for more resources or if
# the job runs longer than the GH Action limit of 6 hours.
runtime: docker
run: |
nextstrain build \
--env AWS_ACCESS_KEY_ID \
--env AWS_SECRET_ACCESS_KEY \
phylogenetic \
deploy_all \
--configfile build-configs/nextstrain-automation/config.yaml
# Specifying artifact name to differentiate ingest build outputs from
# the phylogenetic build outputs
artifact-name: phylogenetic-build-output
artifact-paths: |
phylogenetic/auspice/
phylogenetic/results/
phylogenetic/benchmarks/
phylogenetic/logs/
phylogenetic/.snakemake/log/