diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml
index 967ec6e..3596973 100644
--- a/ingest/defaults/config.yaml
+++ b/ingest/defaults/config.yaml
@@ -12,4 +12,6 @@ s3_dst:
   fauna: s3://nextstrain-data-private/files/workflows/avian-flu
 
 nextclade:
-  dataset_name: "community/moncla-lab/iav-h5/ha/all-clades"
+  dataset_name: community/moncla-lab/iav-h5/ha/all-clades
+  field_map: defaults/nextclade_field_map.tsv
+  id_field: seqName
diff --git a/ingest/defaults/nextclade_field_map.tsv b/ingest/defaults/nextclade_field_map.tsv
new file mode 100644
index 0000000..e23c864
--- /dev/null
+++ b/ingest/defaults/nextclade_field_map.tsv
@@ -0,0 +1,28 @@
+# TSV file that is a mapping of column names for Nextclade output TSV
+# The first column should be the original column name of the Nextclade TSV
+# The second column should be the new column name to use in the final metadata TSV
+# Nextclade can have pathogen specific output columns so make sure to check which
+# columns would be useful for your downstream phylogenetic analysis.
+seqName	seqName
+clade	clade
+coverage	coverage
+totalMissing	missing_data
+totalSubstitutions	divergence
+totalNonACGTNs	nonACGTN
+qc.overallStatus	QC_overall
+qc.missingData.status	QC_missing_data
+qc.mixedSites.status	QC_mixed_sites
+qc.privateMutations.status	QC_rare_mutations
+qc.snpClusters.status	QC_snp_clusters
+qc.frameShifts.status	QC_frame_shifts
+qc.stopCodons.status	QC_stop_codons
+frameShifts	frame_shifts
+privateNucMutations.reversionSubstitutions	private_reversion_substitutions
+privateNucMutations.labeledSubstitutions	private_labeled_substitutions
+privateNucMutations.unlabeledSubstitutions	private_unlabeled_substitutions
+privateNucMutations.totalReversionSubstitutions	private_total_reversion_substitutions
+privateNucMutations.totalLabeledSubstitutions	private_total_labeled_substitutions
+privateNucMutations.totalUnlabeledSubstitutions	private_total_unlabeled_substitutions
+privateNucMutations.totalPrivateSubstitutions	private_total_private_substitutions
+qc.snpClusters.clusteredSNPs	private_snp_clusters
+qc.snpClusters.totalSNPs	private_total_snp_clusters
diff --git a/ingest/rules/merge_segment_metadata.smk b/ingest/rules/merge_segment_metadata.smk
index d421167..8c00bbd 100644
--- a/ingest/rules/merge_segment_metadata.smk
+++ b/ingest/rules/merge_segment_metadata.smk
@@ -16,7 +16,7 @@ rule merge_segment_metadata:
         segments = expand("{{data_source}}/data/metadata_{segment}.tsv", segment=config["segments"]),
         metadata = "{data_source}/data/metadata_ha.tsv",
     output:
-        metadata = "{data_source}/results/metadata.tsv",
+        metadata = "{data_source}/data/merged_segment_metadata.tsv",
     shell:
         """
         python scripts/add_segment_counts.py \
diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk
index 8c15160..a81ad7c 100644
--- a/ingest/rules/nextclade.smk
+++ b/ingest/rules/nextclade.smk
@@ -42,3 +42,50 @@ rule run_nextclade:
             --output-tsv {output.nextclade} \
             --output-fasta {output.alignment}
         """
+
+
+# Append selected Nextclade columns to the merged segment metadata.
+#
+# Only the Nextclade columns named in the first column of the field map are
+# kept; they are renamed to the names in its second column. Records are
+# matched on `nextclade_id_field` (Nextclade side) and `metadata_id_field`
+# (metadata side). Every metadata record is written; records with no
+# Nextclade match get `?` in the appended columns. The Nextclade id column
+# is dropped from the final output.
+#
+# The `?` fill value and $SUBSET_FIELDS are quoted so the shell cannot
+# glob-expand or word-split them.
+rule join_metadata_and_nextclade:
+    input:
+        nextclade="{data_source}/results/nextclade.tsv",
+        metadata="{data_source}/data/merged_segment_metadata.tsv",
+        nextclade_field_map=config["nextclade"]["field_map"],
+    output:
+        metadata="{data_source}/results/metadata.tsv",
+    params:
+        # Making this param optional because we don't have curate pipeline for fauna data
+        metadata_id_field=config.get("curate", {}).get("output_id_field", "strain"),
+        nextclade_id_field=config["nextclade"]["id_field"],
+    shell:
+        """
+        export SUBSET_FIELDS=`grep -v '^#' {input.nextclade_field_map} | awk '{{print $1}}' | tr '\n' ',' | sed 's/,$//g'`
+
+        csvtk fix-quotes -t {input.nextclade} \
+        | csvtk -t cut -f "$SUBSET_FIELDS" \
+        | csvtk -t rename2 \
+            -F \
+            -f '*' \
+            -p '(.+)' \
+            -r '{{kv}}' \
+            -k {input.nextclade_field_map} \
+        | csvtk del-quotes -t \
+        | tsv-join -H \
+            --filter-file - \
+            --key-fields {params.nextclade_id_field} \
+            --data-fields {params.metadata_id_field} \
+            --append-fields '*' \
+            --write-all '?' \
+            {input.metadata} \
+        | tsv-select -H --exclude {params.nextclade_id_field} \
+        > {output.metadata}
+        """