diff --git a/ingest/README.md b/ingest/README.md index 5d0bb0f..51c86be 100644 --- a/ingest/README.md +++ b/ingest/README.md @@ -31,7 +31,12 @@ This will produce two files (within the `ingest` directory): Run the complete ingest pipeline and upload results to AWS S3 with ```sh -nextstrain build . --configfiles defaults/config.yaml defaults/optional.yaml +nextstrain build \ + --env AWS_ACCESS_KEY_ID \ + --env AWS_SECRET_ACCESS_KEY \ + . \ + upload_all \ + --configfile build-configs/nextstrain-automation/config.yaml ``` ### Adding new sequences not from GenBank diff --git a/ingest/build-configs/nextstrain-automation/config.yaml b/ingest/build-configs/nextstrain-automation/config.yaml new file mode 100644 index 0000000..4490f64 --- /dev/null +++ b/ingest/build-configs/nextstrain-automation/config.yaml @@ -0,0 +1,20 @@ +# This configuration file should contain all required configuration parameters +# for the ingest workflow to run with additional Nextstrain automation rules. + +# Custom rules to run as part of the Nextstrain automated workflow +# The paths should be relative to the ingest directory. +custom_rules: + - build-configs/nextstrain-automation/upload.smk + +# Nextstrain CloudFront domain to ensure that we invalidate CloudFront after the S3 uploads +# This is required as long as we are using the AWS CLI for uploads +cloudfront_domain: "data.nextstrain.org" + +# Nextstrain AWS S3 Bucket with pathogen prefix +# Replace with the pathogen repo name. +s3_dst: "s3://nextstrain-data/files/workflows/zika" + +files_to_upload: + metadata.tsv.zst: results/metadata.tsv + sequences.fasta.zst: results/sequences.fasta + diff --git a/ingest/build-configs/nextstrain-automation/upload.smk b/ingest/build-configs/nextstrain-automation/upload.smk new file mode 100644 index 0000000..26d1346 --- /dev/null +++ b/ingest/build-configs/nextstrain-automation/upload.smk @@ -0,0 +1,47 @@ +""" +This part of the workflow handles uploading files to AWS S3. + +Files to upload must be defined in the `files_to_upload` config param, where +the keys are the remote files and the values are the local filepaths +relative to the ingest directory. + +Produces a single file for each uploaded file: + "results/upload/{remote_file}.upload" + +The rule `upload_all` can be used as a target to upload all files. +""" +import os + +slack_envvars_defined = "SLACK_CHANNELS" in os.environ and "SLACK_TOKEN" in os.environ +send_notifications = ( + config.get("send_slack_notifications", False) and slack_envvars_defined +) + + +rule upload_to_s3: + input: + file_to_upload=lambda wildcards: config["files_to_upload"][wildcards.remote_file], + output: + "results/upload/{remote_file}.upload", + params: + quiet="" if send_notifications else "--quiet", + s3_dst=config["s3_dst"], + cloudfront_domain=config["cloudfront_domain"], + shell: + """ + ./vendored/upload-to-s3 \ + {params.quiet} \ + {input.file_to_upload:q} \ + {params.s3_dst:q}/{wildcards.remote_file:q} \ + {params.cloudfront_domain} 2>&1 | tee {output} + """ + + +rule upload_all: + input: + uploads=[ + f"results/upload/{remote_file}.upload" + for remote_file in config["files_to_upload"].keys() + ], + output: + touch("results/upload_all.done") \ No newline at end of file diff --git a/ingest/defaults/optional.yaml b/ingest/defaults/optional.yaml deleted file mode 100644 index 9f00a7e..0000000 --- a/ingest/defaults/optional.yaml +++ /dev/null @@ -1,25 +0,0 @@ -# Optional configs used by Nextstrain team -# Params for uploads -upload: - # Upload params for AWS S3 - s3: - # AWS S3 Bucket with prefix - dst: 's3://nextstrain-data/files/workflows/zika' - # Mapping of files to upload, with key as remote file name and the value - # the local file path relative to the ingest directory. - files_to_upload: - genbank.ndjson.xz: data/genbank.ndjson - all_sequences.ndjson.xz: data/sequences.ndjson - metadata.tsv.gz: results/metadata.tsv - sequences.fasta.xz: results/sequences.fasta - alignment.fasta.xz: data/alignment.fasta - insertions.csv.gz: data/insertions.csv - translations.zip: data/translations.zip - - cloudfront_domain: 'data.nextstrain.org' - -# Toggle for Slack notifications -send_slack_notifications: True - -# Toggle for triggering builds -trigger_rebuild: True diff --git a/ingest/rules/slack_notifications.smk b/ingest/rules/slack_notifications.smk deleted file mode 100644 index 2b7ec61..0000000 --- a/ingest/rules/slack_notifications.smk +++ /dev/null @@ -1,55 +0,0 @@ -""" -This part of the workflow handles various Slack notifications. -Designed to be used internally by the Nextstrain team with hard-coded paths -to files on AWS S3. - -All rules here require two environment variables: - * SLACK_TOKEN - * SLACK_CHANNELS -""" -import os -import sys - -slack_envvars_defined = "SLACK_CHANNELS" in os.environ and "SLACK_TOKEN" in os.environ -if not slack_envvars_defined: - print( - "ERROR: Slack notifications require two environment variables: 'SLACK_CHANNELS' and 'SLACK_TOKEN'.", - file=sys.stderr, - ) - sys.exit(1) - -S3_SRC = "s3://nextstrain-data/files/workflows/zika" - - -rule notify_on_genbank_record_change: - input: - genbank_ndjson="data/genbank.ndjson", - output: - touch("data/notify/genbank-record-change.done"), - params: - s3_src=S3_SRC, - shell: - """ - ./vendored/notify-on-record-change {input.genbank_ndjson} {params.s3_src:q}/genbank.ndjson.xz Genbank - """ - - -rule notify_on_metadata_diff: - input: - metadata="results/metadata.tsv", - output: - touch("data/notify/metadata-diff.done"), - params: - s3_src=S3_SRC, - shell: - """ - ./vendored/notify-on-diff {input.metadata} {params.s3_src:q}/metadata.tsv.gz - """ - - -onstart: - shell("./vendored/notify-on-job-start Ingest nextstrain/zika") - - -onerror: - shell("./vendored/notify-on-job-fail Ingest nextstrain/zika") diff --git a/ingest/rules/trigger_rebuild.smk b/ingest/rules/trigger_rebuild.smk deleted file mode 100644 index 0cf6731..0000000 --- a/ingest/rules/trigger_rebuild.smk +++ /dev/null @@ -1,22 +0,0 @@ -""" -This part of the workflow handles triggering new zika builds after the -latest metadata TSV and sequence FASTA files have been uploaded to S3. - -Designed to be used internally by the Nextstrain team with hard-coded paths -to expected upload flag files. -""" - - -rule trigger_build: - """ - Triggering zika builds via repository action type `rebuild`. - """ - input: - metadata_upload="data/upload/s3/metadata.tsv.gz.done", - fasta_upload="data/upload/s3/sequences.fasta.xz.done", - output: - touch("data/trigger/rebuild.done"), - shell: - """ - ./vendored/trigger-on-new-data nextstrain/zika rebuild {input.metadata_upload} {input.fasta_upload} - """ diff --git a/ingest/rules/upload.smk b/ingest/rules/upload.smk deleted file mode 100644 index 60c5c9b..0000000 --- a/ingest/rules/upload.smk +++ /dev/null @@ -1,64 +0,0 @@ -""" -This part of the workflow handles uploading files to a specified destination. - -Uses predefined wildcard `file_to_upload` determine input and predefined -wildcard `remote_file_name` as the remote file name in the specified destination. - -Produces output files as `data/upload/{upload_target_name}/{remote_file_name}.done`. - -Currently only supports uploads to AWS S3, but additional upload rules can -be easily added as long as they follow the output pattern described above. -""" -import os - -slack_envvars_defined = "SLACK_CHANNELS" in os.environ and "SLACK_TOKEN" in os.environ -send_notifications = ( - config.get("send_slack_notifications", False) and slack_envvars_defined -) - - -def _get_upload_inputs(wildcards): - """ - If the file_to_upload has Slack notifications that depend on diffs with S3 files, - then we want the upload rule to run after the notification rule. - - This function is mostly to keep track of which flag files to expect for - the rules in `slack_notifications.smk`, so it only includes flag files if - `send_notifications` is True. - """ - inputs = { - "file_to_upload": config["upload"]["s3"]["files_to_upload"][ - wildcards.remote_file_name - ], - } - - if send_notifications: - flag_file = [] - - if file_to_upload == "data/genbank.ndjson": - flag_file = "data/notify/genbank-record-change.done" - elif file_to_upload == "results/metadata.tsv": - flag_file = "data/notify/metadata-diff.done" - - inputs["notify_flag_file"] = flag_file - - return inputs - - -rule upload_to_s3: - input: - unpack(_get_upload_inputs), - output: - "data/upload/s3/{remote_file_name}.done", - params: - quiet="" if send_notifications else "--quiet", - s3_dst=config["upload"].get("s3", {}).get("dst", ""), - cloudfront_domain=config["upload"].get("s3", {}).get("cloudfront_domain", ""), - shell: - """ - ./vendored/upload-to-s3 \ - {params.quiet} \ - {input.file_to_upload:q} \ - {params.s3_dst:q}/{wildcards.remote_file_name:q} \ - {params.cloudfront_domain} 2>&1 | tee {output} - """