diff --git a/config/auspice_config_h5n1-cattle-outbreak.json b/config/auspice_config_h5n1-cattle-outbreak.json index 9d9b226..c723817 100755 --- a/config/auspice_config_h5n1-cattle-outbreak.json +++ b/config/auspice_config_h5n1-cattle-outbreak.json @@ -80,6 +80,11 @@ "title": "Subtype", "type": "categorical" }, + { + "key": "clade", + "title": "Nextclade Clade", + "type": "categorical" + }, { "key": "furin_cleavage_motif", "title": "Furin Cleavage Motif", @@ -127,6 +132,7 @@ "country", "division", "subtype", + "clade", "author", "originating_lab", "submitting_lab", diff --git a/config/auspice_config_h5n1.json b/config/auspice_config_h5n1.json index 264d44c..ae10c8c 100755 --- a/config/auspice_config_h5n1.json +++ b/config/auspice_config_h5n1.json @@ -39,7 +39,7 @@ "key": "division", "title": "Admin Division", "type": "categorical" - }, + }, { "key": "host", "title": "Host", @@ -65,6 +65,11 @@ "title": "GISAID Clade", "type": "categorical" }, + { + "key": "clade", + "title": "Nextclade Clade", + "type": "categorical" + }, { "key": "furin_cleavage_motif", "title": "Furin Cleavage Motif", @@ -109,6 +114,7 @@ "subtype", "h5_label_clade", "gisaid_clade", + "clade", "authors", "originating_lab", "submitting_lab" diff --git a/config/auspice_config_h5nx.json b/config/auspice_config_h5nx.json index dcda973..88a943f 100755 --- a/config/auspice_config_h5nx.json +++ b/config/auspice_config_h5nx.json @@ -65,6 +65,11 @@ "title": "GISAID Clade", "type": "categorical" }, + { + "key": "clade", + "title": "Nextclade Clade", + "type": "categorical" + }, { "key": "furin_cleavage_motif", "title": "Furin Cleavage Motif", @@ -109,6 +114,7 @@ "subtype", "h5_label_clade", "gisaid_clade", + "clade", "authors", "originating_lab", "submitting_lab" diff --git a/ingest/Snakefile b/ingest/Snakefile index 7cb238f..c8a48f3 100644 --- a/ingest/Snakefile +++ b/ingest/Snakefile @@ -20,9 +20,11 @@ rule upload_all: input: sequences=expand("fauna/s3/sequences_{segment}.done", 
segment=config["segments"]), metadata="fauna/s3/metadata.done", + nextclade="fauna/s3/nextclade.done", include: "rules/ingest_fauna.smk" include: "rules/merge_segment_metadata.smk" +include: "rules/nextclade.smk" include: "rules/upload_to_s3.smk" # Allow users to import custom rules provided via the config. diff --git a/ingest/build-configs/ncbi/Snakefile b/ingest/build-configs/ncbi/Snakefile index c1052d9..a8856f0 100644 --- a/ingest/build-configs/ncbi/Snakefile +++ b/ingest/build-configs/ncbi/Snakefile @@ -40,6 +40,7 @@ rule upload_all_ncbi: expand([ "{data_source}/s3/sequences_{segment}.done", "{data_source}/s3/metadata.done", + "{data_source}/s3/nextclade.done", ], data_source=NCBI_DATA_SOURCES, segment=config["segments"]), diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml index cdbcfef..3596973 100644 --- a/ingest/defaults/config.yaml +++ b/ingest/defaults/config.yaml @@ -10,3 +10,8 @@ segments: s3_dst: fauna: s3://nextstrain-data-private/files/workflows/avian-flu + +nextclade: + dataset_name: community/moncla-lab/iav-h5/ha/all-clades + field_map: defaults/nextclade_field_map.tsv + id_field: seqName diff --git a/ingest/defaults/nextclade_field_map.tsv b/ingest/defaults/nextclade_field_map.tsv new file mode 100644 index 0000000..e23c864 --- /dev/null +++ b/ingest/defaults/nextclade_field_map.tsv @@ -0,0 +1,28 @@ +# TSV file that is a mapping of column names for Nextclade output TSV +# The first column should be the original column name of the Nextclade TSV +# The second column should be the new column name to use in the final metadata TSV +# Nextclade can have pathogen specific output columns so make sure to check which +# columns would be useful for your downstream phylogenetic analysis. 
+seqName	seqName
+clade	clade
+coverage	coverage
+totalMissing	missing_data
+totalSubstitutions	divergence
+totalNonACGTNs	nonACGTN
+qc.overallStatus	QC_overall
+qc.missingData.status	QC_missing_data
+qc.mixedSites.status	QC_mixed_sites
+qc.privateMutations.status	QC_rare_mutations
+qc.snpClusters.status	QC_snp_clusters
+qc.frameShifts.status	QC_frame_shifts
+qc.stopCodons.status	QC_stop_codons
+frameShifts	frame_shifts
+privateNucMutations.reversionSubstitutions	private_reversion_substitutions
+privateNucMutations.labeledSubstitutions	private_labeled_substitutions
+privateNucMutations.unlabeledSubstitutions	private_unlabeled_substitutions
+privateNucMutations.totalReversionSubstitutions	private_total_reversion_substitutions
+privateNucMutations.totalLabeledSubstitutions	private_total_labeled_substitutions
+privateNucMutations.totalUnlabeledSubstitutions	private_total_unlabeled_substitutions
+privateNucMutations.totalPrivateSubstitutions	private_total_private_substitutions
+qc.snpClusters.clusteredSNPs	private_snp_clusters
+qc.snpClusters.totalSNPs	private_total_snp_clusters
diff --git a/ingest/rules/merge_segment_metadata.smk b/ingest/rules/merge_segment_metadata.smk
index d421167..8c00bbd 100644
--- a/ingest/rules/merge_segment_metadata.smk
+++ b/ingest/rules/merge_segment_metadata.smk
@@ -16,7 +16,7 @@ rule merge_segment_metadata:
         segments = expand("{{data_source}}/data/metadata_{segment}.tsv", segment=config["segments"]),
         metadata = "{data_source}/data/metadata_ha.tsv",
     output:
-        metadata = "{data_source}/results/metadata.tsv",
+        metadata = "{data_source}/data/merged_segment_metadata.tsv",
     shell:
         """
         python scripts/add_segment_counts.py \
diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk
new file mode 100644
index 0000000..ffbc8c3
--- /dev/null
+++ b/ingest/rules/nextclade.smk
@@ -0,0 +1,85 @@
+"""
+This part of the workflow handles running Nextclade on the curated metadata
+and sequences.
+"""
+
+
+DATASET_NAME = config["nextclade"]["dataset_name"]
+
+
+rule get_nextclade_dataset:
+    """Download Nextclade dataset"""
+    output:
+        dataset=f"data/nextclade/{DATASET_NAME}.zip",
+    benchmark:
+        "benchmarks/get_nextclade_dataset.txt"
+    params:
+        dataset_name=DATASET_NAME
+    shell:
+        """
+        nextclade3 dataset get \
+            --name={params.dataset_name:q} \
+            --output-zip={output.dataset} \
+            --verbose
+        """
+
+
+rule run_nextclade:
+    """Run Nextclade on the HA sequences of a data source and write its TSV report"""
+    input:
+        dataset=f"data/nextclade/{DATASET_NAME}.zip",
+        # The H5NX datasets should only be for the HA segment
+        sequences="{data_source}/results/sequences_ha.fasta",
+    output:
+        nextclade="{data_source}/results/nextclade.tsv",
+    benchmark:
+        "{data_source}/benchmarks/run_nextclade.txt"
+    shell:
+        """
+        nextclade3 run \
+            {input.sequences} \
+            --input-dataset {input.dataset} \
+            --output-tsv {output.nextclade}
+        """
+
+
+rule join_metadata_and_nextclade:
+    """
+    Append the Nextclade columns listed in the field map (renamed to the
+    field map's second column) to the merged segment metadata.
+    """
+    input:
+        nextclade="{data_source}/results/nextclade.tsv",
+        metadata="{data_source}/data/merged_segment_metadata.tsv",
+        nextclade_field_map=config["nextclade"]["field_map"],
+    output:
+        metadata="{data_source}/results/metadata.tsv",
+    params:
+        # Making this param optional because we don't have a curate pipeline for fauna data
+        metadata_id_field=config.get("curate", {}).get("output_id_field", "strain"),
+        nextclade_id_field=config["nextclade"]["id_field"],
+    # The tsv-join fill value is quoted as '?' because a bare ? is a shell glob
+    # that would expand to any single-character file name in the working directory.
+    shell:
+        """
+        export SUBSET_FIELDS=`grep -v '^#' {input.nextclade_field_map} | awk '{{print $1}}' | tr '\n' ',' | sed 's/,$//g'`
+
+        csvtk fix-quotes -t {input.nextclade} \
+        | csvtk -t cut -f $SUBSET_FIELDS \
+        | csvtk -t rename2 \
+            -F \
+            -f '*' \
+            -p '(.+)' \
+            -r '{{kv}}' \
+            -k {input.nextclade_field_map} \
+        | csvtk del-quotes -t \
+        | tsv-join -H \
+            --filter-file - \
+            --key-fields {params.nextclade_id_field} \
+            --data-fields {params.metadata_id_field} \
+            --append-fields '*' \
+            --write-all '?' \
+            {input.metadata} \
+        | tsv-select -H --exclude {params.nextclade_id_field} \
+            > {output.metadata}
+        """
diff --git a/ingest/rules/upload_to_s3.smk b/ingest/rules/upload_to_s3.smk
index 5fb123c..a9c309a 100644
--- a/ingest/rules/upload_to_s3.smk
+++ b/ingest/rules/upload_to_s3.smk
@@ -37,3 +37,22 @@ rule upload_metadata:
             {params.s3_dst:q}/metadata.tsv.zst \
             {params.cloudfront_domain} 2>&1 | tee {output.flag}
         """
+
+
+rule upload_nextclade_tsv:
+    """Upload the Nextclade TSV of a data source to its configured S3 destination"""
+    input:
+        nextclade="{data_source}/results/nextclade.tsv",
+    output:
+        flag="{data_source}/s3/nextclade.done",
+    params:
+        s3_dst=lambda wildcards: config["s3_dst"][wildcards.data_source],
+        cloudfront_domain=config.get("cloudfront_domain", ""),
+    shell:
+        """
+        ./vendored/upload-to-s3 \
+            --quiet \
+            {input.nextclade:q} \
+            {params.s3_dst:q}/nextclade.tsv.zst \
+            {params.cloudfront_domain} 2>&1 | tee {output.flag}
+        """