Skip to content

Commit

Permalink
ingest: join metadata and Nextclade output
Browse files Browse the repository at this point in the history
Using the nextclade_field_map that's currently used in the measles
ingest workflow.¹ We can cut down on the columns used if they are not
useful for avian flu.

¹ <https://github.com/nextstrain/measles/blob/957fc744c64b8f5a722b5c525687d0746755add6/ingest/defaults/nextclade_field_map.tsv>
  • Loading branch information
joverlee521 committed Jun 21, 2024
1 parent b37234e commit 245f152
Show file tree
Hide file tree
Showing 4 changed files with 68 additions and 2 deletions.
4 changes: 3 additions & 1 deletion ingest/defaults/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,6 @@ s3_dst:
fauna: s3://nextstrain-data-private/files/workflows/avian-flu

nextclade:
dataset_name: "community/moncla-lab/iav-h5/ha/all-clades"
dataset_name: community/moncla-lab/iav-h5/ha/all-clades
field_map: defaults/nextclade_field_map.tsv
id_field: seqName
28 changes: 28 additions & 0 deletions ingest/defaults/nextclade_field_map.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# TSV file that is a mapping of column names for Nextclade output TSV
# The first column should be the original column name of the Nextclade TSV
# The second column should be the new column name to use in the final metadata TSV
# Nextclade can have pathogen specific output columns so make sure to check which
# columns would be useful for your downstream phylogenetic analysis.
seqName seqName
clade clade
coverage coverage
totalMissing missing_data
totalSubstitutions divergence
totalNonACGTNs nonACGTN
qc.overallStatus QC_overall
qc.missingData.status QC_missing_data
qc.mixedSites.status QC_mixed_sites
qc.privateMutations.status QC_rare_mutations
qc.snpClusters.status QC_snp_clusters
qc.frameShifts.status QC_frame_shifts
qc.stopCodons.status QC_stop_codons
frameShifts frame_shifts
privateNucMutations.reversionSubstitutions private_reversion_substitutions
privateNucMutations.labeledSubstitutions private_labeled_substitutions
privateNucMutations.unlabeledSubstitutions private_unlabeled_substitutions
privateNucMutations.totalReversionSubstitutions private_total_reversion_substitutions
privateNucMutations.totalLabeledSubstitutions private_total_labeled_substitutions
privateNucMutations.totalUnlabeledSubstitutions private_total_unlabeled_substitutions
privateNucMutations.totalPrivateSubstitutions private_total_private_substitutions
qc.snpClusters.clusteredSNPs private_snp_clusters
qc.snpClusters.totalSNPs private_total_snp_clusters
2 changes: 1 addition & 1 deletion ingest/rules/merge_segment_metadata.smk
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ rule merge_segment_metadata:
segments = expand("{{data_source}}/data/metadata_{segment}.tsv", segment=config["segments"]),
metadata = "{data_source}/data/metadata_ha.tsv",
output:
metadata = "{data_source}/results/metadata.tsv",
metadata = "{data_source}/data/merged_segment_metadata.tsv",
shell:
"""
python scripts/add_segment_counts.py \
Expand Down
36 changes: 36 additions & 0 deletions ingest/rules/nextclade.smk
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,39 @@ rule run_nextclade:
--output-tsv {output.nextclade} \
--output-fasta {output.alignment}
"""


rule join_metadata_and_nextclade:
input:
nextclade="{data_source}/results/nextclade.tsv",
metadata="{data_source}/data/merged_segment_metadata.tsv",
nextclade_field_map=config["nextclade"]["field_map"],
output:
metadata="{data_source}/results/metadata.tsv",
params:
# Making this param optional because we don't have curate pipeline for fauna data
metadata_id_field=config.get("curate", {}).get("output_id_field", "strain"),
nextclade_id_field=config["nextclade"]["id_field"],
shell:
"""
export SUBSET_FIELDS=`grep -v '^#' {input.nextclade_field_map} | awk '{{print $1}}' | tr '\n' ',' | sed 's/,$//g'`
csvtk fix-quotes -t {input.nextclade} \
| csvtk -t cut -f $SUBSET_FIELDS \
| csvtk -t rename2 \
-F \
-f '*' \
-p '(.+)' \
-r '{{kv}}' \
-k {input.nextclade_field_map} \
| csvtk del-quotes -t \
| tsv-join -H \
--filter-file - \
--key-fields {params.nextclade_id_field} \
--data-fields {params.metadata_id_field} \
--append-fields '*' \
--write-all ? \
{input.metadata} \
| tsv-select -H --exclude {params.nextclade_id_field} \
> {output.metadata}
"""

0 comments on commit 245f152

Please sign in to comment.