From d44f2ae43fabf9f660c0d836fdf2e537fbbf8880 Mon Sep 17 00:00:00 2001
From: Jover Lee <joverlee521@gmail.com>
Date: Tue, 26 Mar 2024 12:41:59 -0700
Subject: [PATCH 1/5] Start GH Action workflow for automation

Currently just runs the ingest workflow and uploads the results
to AWS S3. Subsequent commits will add automation for the phylogenetic
workflow.
---
 .github/workflows/ingest-to-phylogenetic.yaml | 40 +++++++++++++++++++
 1 file changed, 40 insertions(+)
 create mode 100644 .github/workflows/ingest-to-phylogenetic.yaml

diff --git a/.github/workflows/ingest-to-phylogenetic.yaml b/.github/workflows/ingest-to-phylogenetic.yaml
new file mode 100644
index 0000000..1177c25
--- /dev/null
+++ b/.github/workflows/ingest-to-phylogenetic.yaml
@@ -0,0 +1,40 @@
+name: Ingest to phylogenetic
+
+defaults:
+  run:
+    # This is the same as GitHub Action's `bash` keyword as of 20 June 2023:
+    # https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsshell
+    #
+    # Completely spelling it out here so that GitHub can't change it out from under us
+    # and we don't have to refer to the docs to know the expected behavior.
+    shell: bash --noprofile --norc -eo pipefail {0}
+
+on:
+  workflow_dispatch:
+
+jobs:
+  ingest:
+    permissions:
+      id-token: write
+    uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master
+    secrets: inherit
+    with:
+      # Starting with the default docker runtime
+      # We can migrate to AWS Batch when/if we need to for more resources or if
+      # the job runs longer than the GH Action limit of 6 hours.
+      runtime: docker
+      run: |
+        nextstrain build \
+          --env AWS_ACCESS_KEY_ID \
+          --env AWS_SECRET_ACCESS_KEY \
+          ingest \
+            upload_all \
+            --configfile build-configs/nextstrain-automation/config.yaml
+      # Specifying artifact name to differentiate ingest build outputs from
+      # the phylogenetic build outputs
+      artifact-name: ingest-build-output
+      artifact-paths: |
+        ingest/results/
+        ingest/benchmarks/
+        ingest/logs/
+        ingest/.snakemake/log/

From 2c415e709e53673ae3ed3e636ec7e9c1784e6aa9 Mon Sep 17 00:00:00 2001
From: Jover Lee <joverlee521@gmail.com>
Date: Tue, 26 Mar 2024 12:43:37 -0700
Subject: [PATCH 2/5] ingest-to-phylogenetic: Add phylogenetic job

The phylogenetic workflow will run after the ingest workflow has
completed successfully to use the latest available data.

Subsequent commits will check if the ingest results included new
data to only run the phylogenetic workflow when there's new data.
---
 .github/workflows/ingest-to-phylogenetic.yaml | 31 +++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/.github/workflows/ingest-to-phylogenetic.yaml b/.github/workflows/ingest-to-phylogenetic.yaml
index 1177c25..03d65b3 100644
--- a/.github/workflows/ingest-to-phylogenetic.yaml
+++ b/.github/workflows/ingest-to-phylogenetic.yaml
@@ -38,3 +38,34 @@ jobs:
         ingest/benchmarks/
         ingest/logs/
         ingest/.snakemake/log/
+
+  # TKTK check if ingest results include new data
+  # potentially use actions/cache to store Metadata.sha256sum of S3 files
+
+  phylogenetic:
+    needs: [ingest]
+    permissions:
+      id-token: write
+    uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master
+    secrets: inherit
+    with:
+      # Starting with the default docker runtime
+      # We can migrate to AWS Batch when/if we need to for more resources or if
+      # the job runs longer than the GH Action limit of 6 hours.
+      runtime: docker
+      run: |
+        nextstrain build \
+          --env AWS_ACCESS_KEY_ID \
+          --env AWS_SECRET_ACCESS_KEY \
+          phylogenetic \
+            deploy_all \
+            --configfile build-configs/nextstrain-automation/config.yaml
+      # Specifying artifact name to differentiate ingest build outputs from
+      # the phylogenetic build outputs
+      artifact-name: phylogenetic-build-output
+      artifact-paths: |
+        phylogenetic/auspice/
+        phylogenetic/results/
+        phylogenetic/benchmarks/
+        phylogenetic/logs/
+        phylogenetic/.snakemake/log/

From eb5e76de0f5e69666c11a64df7a59242601a3d1f Mon Sep 17 00:00:00 2001
From: Jover Lee <joverlee521@gmail.com>
Date: Tue, 26 Mar 2024 16:59:59 -0700
Subject: [PATCH 3/5] ingest-to-phylogenetic: Use cache to check new data
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Uses GitHub Actions cache to store a file that contains the
`Metadata.sha256sum` of the ingest files on S3 and uses
the `hashFiles` function to create a unique cache key.

Then the existence of the cache key is an indicator that the ingest
file contents have not been updated since a previous run on GH Actions.
This does come with a big caveat that GH will remove any cache entries
that have not been accessed in over 7 days.¹ If the workflow is not
run at least once every 7 days, the cache entry will have been evicted
and the phylogenetic job will run even if the data has not changed.

If this works well, then we may want to consider moving this within
the `pathogen-repo-build` reusable workflow to have the same
functionality across pathogen automation workflows.

¹ https://docs.github.com/en/actions/using-workflows/caching-dependencies-to-speed-up-workflows#usage-limits-and-eviction-policy
---
 .github/workflows/ingest-to-phylogenetic.yaml | 44 +++++++++++++++++--
 1 file changed, 41 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/ingest-to-phylogenetic.yaml b/.github/workflows/ingest-to-phylogenetic.yaml
index 03d65b3..980a399 100644
--- a/.github/workflows/ingest-to-phylogenetic.yaml
+++ b/.github/workflows/ingest-to-phylogenetic.yaml
@@ -39,11 +39,49 @@ jobs:
         ingest/logs/
         ingest/.snakemake/log/
 
-  # TKTK check if ingest results include new data
-  # potentially use actions/cache to store Metadata.sha256sum of S3 files
+  # Check if ingest results include new data by checking for the cache
+  # of the file with the results' Metadata.sha256sum (which should have been added within upload-to-s3).
+  # GitHub will remove any cache entries that have not been accessed in over 7 days,
+  # so if the workflow has not been run in over 7 days then it will trigger phylogenetic.
+  check-new-data:
+    needs: [ingest]
+    runs-on: ubuntu-latest
+    outputs:
+      cache-hit: ${{ steps.check-cache.outputs.cache-hit }}
+    steps:
+      - name: Get sha256sum
+        id: get-sha256sum
+        run: |
+          s3_urls=(
+            "s3://nextstrain-data/files/workflows/zika/metadata.tsv.zst"
+            "s3://nextstrain-data/files/workflows/zika/sequences.fasta.zst"
+          )
+
+          # Code below is modified from ingest/upload-to-s3
+          # https://github.com/nextstrain/ingest/blob/c0b4c6bb5e6ccbba86374d2c09b42077768aac23/upload-to-s3#L23-L29
+
+          no_hash=0000000000000000000000000000000000000000000000000000000000000000
+
+          for s3_url in "${s3_urls[@]}"; do
+            s3path="${s3_url#s3://}"
+            bucket="${s3path%%/*}"
+            key="${s3path#*/}"
+
+            s3_hash="$(aws s3api head-object --no-sign-request --bucket "$bucket" --key "$key" --query Metadata.sha256sum --output text 2>/dev/null || echo "$no_hash")"
+            echo "${s3_hash}" >> ingest-output-sha256sum
+          done
+
+      - name: Check cache
+        id: check-cache
+        uses: actions/cache@v4
+        with:
+          path: ingest-output-sha256sum
+          key: ingest-output-sha256sum-${{ hashFiles('ingest-output-sha256sum') }}
+          lookup-only: true
 
   phylogenetic:
-    needs: [ingest]
+    needs: [check-new-data]
+    if: ${{ needs.check-new-data.outputs.cache-hit != 'true' }}
     permissions:
       id-token: write
     uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master

From 65a8acc303f510318df58bb7aa8893dcc0421660 Mon Sep 17 00:00:00 2001
From: Jover Lee <joverlee521@gmail.com>
Date: Fri, 29 Mar 2024 15:27:21 -0700
Subject: [PATCH 4/5] ingest-to-phylo: Add inputs for Docker image

Add individual inputs per workflow to override the default Docker image
used by `nextstrain build`. Having this input has been extremely helpful
to continue running pathogen workflows when we run into new bugs that
are not present in older nextstrain-base images.

I've made separate image inputs for the two workflows because they use
different tools and may require different versions of images.
---
 .github/workflows/ingest-to-phylogenetic.yaml | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/.github/workflows/ingest-to-phylogenetic.yaml b/.github/workflows/ingest-to-phylogenetic.yaml
index 980a399..bed143e 100644
--- a/.github/workflows/ingest-to-phylogenetic.yaml
+++ b/.github/workflows/ingest-to-phylogenetic.yaml
@@ -11,6 +11,13 @@ defaults:
 
 on:
   workflow_dispatch:
+    inputs:
+      ingest_image:
+        description: 'Specific container image to use for ingest workflow (will override the default of "nextstrain build")'
+        required: false
+      phylogenetic_image:
+        description: 'Specific container image to use for phylogenetic workflow (will override the default of "nextstrain build")'
+        required: false
 
 jobs:
   ingest:
@@ -23,6 +30,8 @@ jobs:
       # We can migrate to AWS Batch when/if we need to for more resources or if
       # the job runs longer than the GH Action limit of 6 hours.
       runtime: docker
+      env: |
+        NEXTSTRAIN_DOCKER_IMAGE: ${{ inputs.ingest_image }}
       run: |
         nextstrain build \
           --env AWS_ACCESS_KEY_ID \
@@ -91,6 +100,8 @@ jobs:
       # We can migrate to AWS Batch when/if we need to for more resources or if
       # the job runs longer than the GH Action limit of 6 hours.
       runtime: docker
+      env: |
+        NEXTSTRAIN_DOCKER_IMAGE: ${{ inputs.phylogenetic_image }}
       run: |
         nextstrain build \
           --env AWS_ACCESS_KEY_ID \

From 77ca1d423ca753887a39dadf6db46e03c575a41e Mon Sep 17 00:00:00 2001
From: Jover Lee <joverlee521@gmail.com>
Date: Fri, 29 Mar 2024 16:24:36 -0700
Subject: [PATCH 5/5] ingest-to-phylo: Add schedule

Copied daily schedule of mpox ingest
https://github.com/nextstrain/mpox/blob/e439235ff1c1d66e7285b774e9536e2896d9cd2f/.github/workflows/fetch-and-ingest.yaml#L4-L21

Daily runs seem fine since the ingest workflow currently takes less
than 2 minutes to complete and it will not trigger the phylogenetic
workflow if there's no new data.

We can bring this down to once a week if it seems like overkill.
---
 .github/workflows/ingest-to-phylogenetic.yaml | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/.github/workflows/ingest-to-phylogenetic.yaml b/.github/workflows/ingest-to-phylogenetic.yaml
index bed143e..42216c4 100644
--- a/.github/workflows/ingest-to-phylogenetic.yaml
+++ b/.github/workflows/ingest-to-phylogenetic.yaml
@@ -10,6 +10,25 @@ defaults:
     shell: bash --noprofile --norc -eo pipefail {0}
 
 on:
+  schedule:
+    # Note times are in UTC, which is 1 hour behind CET and 2 hours behind CEST (daylight saving time).
+    #
+    # Note the actual runs might be late.
+    # Numerous people were confused about that, including me:
+    #  - https://github.community/t/scheduled-action-running-consistently-late/138025/11
+    #  - https://github.com/github/docs/issues/3059
+    #
+    # Note, '*' is a special character in YAML, so you have to quote this string.
+    #
+    # Docs:
+    #  - https://docs.github.com/en/actions/learn-github-actions/events-that-trigger-workflows#schedule
+    #
+    # Tool that deciphers this particular format of crontab string:
+    #  - https://crontab.guru/
+    #
+    # Runs at 4pm UTC (12pm EDT / 11am EST) since curation by NCBI happens on the East Coast.
+    - cron: '0 16 * * *'
+
   workflow_dispatch:
     inputs:
       ingest_image: