From 8d87951e16fe699351f8d488ea34b9d15f36562c Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Tue, 21 Nov 2023 13:32:25 -0800 Subject: [PATCH] Append usvi data --- phylogenetic/Snakefile | 14 +- phylogenetic/example_data/metadata_usvi.tsv | 2 + .../example_data/sequences_usvi.fasta | 137 ++++++++++++++ phylogenetic/scripts/uniq_merge.py | 171 ++++++++++++++++++ .../workflow/snakemake_rules/usvi.smk | 47 +++++ 5 files changed, 365 insertions(+), 6 deletions(-) create mode 100644 phylogenetic/example_data/metadata_usvi.tsv create mode 100644 phylogenetic/example_data/sequences_usvi.fasta create mode 100644 phylogenetic/scripts/uniq_merge.py create mode 100644 phylogenetic/workflow/snakemake_rules/usvi.smk diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile index 15ff6a9..ddd301d 100644 --- a/phylogenetic/Snakefile +++ b/phylogenetic/Snakefile @@ -16,6 +16,8 @@ rule files: files = rules.files.params +include: "workflow/snakemake_rules/usvi.smk" + rule download: """Downloading sequences and metadata from data.nextstrain.org""" output: @@ -53,8 +55,8 @@ rule filter: - minimum genome length of {params.min_length} (50% of Zika virus genome) """ input: - sequences = "data/sequences.fasta", - metadata = "data/metadata.tsv", + sequences = "data/sequences_all.fasta", + metadata = "data/metadata_all.tsv", exclude = files.dropped_strains output: sequences = "results/filtered.fasta" @@ -122,7 +124,7 @@ rule refine: input: tree = "results/tree_raw.nwk", alignment = "results/aligned.fasta", - metadata = "data/metadata.tsv" + metadata = "data/metadata_all.tsv" output: tree = "results/tree.nwk", node_data = "results/branch_lengths.json" @@ -189,7 +191,7 @@ rule traits: """ input: tree = "results/tree.nwk", - metadata = "data/metadata.tsv" + metadata = "data/metadata_all.tsv" output: node_data = "results/traits.json", params: @@ -212,7 +214,7 @@ rule export: """Exporting data files for for auspice""" input: tree = "results/tree.nwk", - metadata = "data/metadata.tsv", + 
metadata = "data/metadata_all.tsv", branch_lengths = "results/branch_lengths.json", traits = "results/traits.json", nt_muts = "results/nt_muts.json", @@ -242,7 +244,7 @@ rule export: rule final_strain_name: input: auspice_json="results/raw_zika.json", - metadata="data/metadata.tsv", + metadata="data/metadata_all.tsv", root_sequence="results/raw_zika_root-sequence.json", output: auspice_json="auspice/zika.json", diff --git a/phylogenetic/example_data/metadata_usvi.tsv b/phylogenetic/example_data/metadata_usvi.tsv new file mode 100644 index 0000000..96d3d52 --- /dev/null +++ b/phylogenetic/example_data/metadata_usvi.tsv @@ -0,0 +1,2 @@ +genbank_accession genbank_accession_rev accession strain date region country division location length host release_date update_date sra_accessions authors institution url +USVI/37/2016 VI37 USVI/37/2016 2016-10-06 North America Usvi Saint Croix Saint Croix 10807 Homo sapiens Black et al FH https://github.com/blab/zika-usvi/ diff --git a/phylogenetic/example_data/sequences_usvi.fasta b/phylogenetic/example_data/sequences_usvi.fasta new file mode 100644 index 0000000..5cc712f --- /dev/null +++ b/phylogenetic/example_data/sequences_usvi.fasta @@ -0,0 +1,137 @@ +>USVI/37/2016 +nnnnnnnnnnnnnnnnnnnnnnnnnnnngacagttcgagtttgaagcgaaagctagcaacagtatcaacaggttttattt +tggatttggaaacgagagtttctggtcatgaaaaacccaaaaaagaaatccggaggattccggattgtcaatatgctaaa +acgcggagtagcccgtgtgagcccctttgggggcttgaagaggctgccagccggacttctgctgggtcatgggcccatca +ggatggtcttggcgattctagcctttttgagattcacggcaatcaagccatcactgggcctcatcaatagatggggttca +gtggggaaaaaagaggctatggaaacaataaagaagttcaagaaagatctggctgccatgctgagaataatcaatgctag +gaaggagaagaagagacgaggcgcagatactagtgtcggaattgttggcctcctgctgaccacagctatggcagcggagg +tcactagacgtgggagtgcatactatatgtacttggacagaaacgatgctggggaggccatatcttttccaaccacattg +gggatgaataagtgttatatacagatcatggatcttggacacatgtgtgatgccaccatgagctatgaatgccctatgct +ggatgagggggtggaaccagatgacgtcgattgttggtgcaacacgacgtcaacttgggttgtgtacggaacctgccatc 
+acaaaaaaggtgaagcacggagatctagaagagctgtgacgctcccctcccattccaccaggaagctgcaaacgcggtcg +caaacctggttggaatcaagagaatacacaaagcacttgattagagtcgaaaattggatattcaggaaccctggcttcgc +gttagcagcagctgccatcgcttggcttttgggaagctcaacgagccaaaaagtcatatacttggtcatgatactgctga +ttgccccggcatacagcatcaggtgcataggagtcagcaatagggactttgtggaaggtatgtcaggtgggacttgggtt +gatgttgtcttggaacatggaggttgtgtcaccgtaatggcacaggacaaaccgactgtcgacatagagctggttacaac +aacagtcagcaacatggcggaggtaagatcctactgctatgaggcatcaatatcagacatggcttctgacagccgctgcc +caacacaaggtgaagcctaccttgacaagcaatcagacactcaatatgtctgcaaaagaacgttagtggacagaggctgg +ggaaatggatgtggactttttggcaaagggagcctggtgacatgcgctaagtttgcatgctccaagaaaatgaccgggaa +gagcatccagccagagaatctggagtaccggataatgctgtcagttcatggctcccagcacagtgggatgatcgttaatg +acacaggacatgaaactgatgagaatagagcgaaagttgagataacgcccaattcaccgagagccgaagccaccctgggg +ggttttggaagcctaggacttgattgtgaaccgaggacaggccttgacttttcagatttgtattacttgactatgaataa +caagcactggttggttcacaaggagtggttccacgacattccattaccttggcacgctggggcagacaccggaactccac +actggaacaacaaagaagcactggtagagttcaaggacgcacatgccaaaaggcaaactgtcgtggttctagggagtcaa +gaaggagcagttcacacggcccttgctggagctctggaggctgagatggatggtgcaaagggaaggctgtcctctggcca +cttgaaatgtcgcctgaaaatggataaacttagattgaagggcgtgtcatactccttgtgtactgcagcgttcacattca +ccaagatcccggctgaaacactgcacgggacagtcacagtggaggtacagtacgcagggacagatggaccttgcaaggtt +ccagctcagatggcggtggacatgcaaactctgaccccagttgggaggttgataaccgctaaccccgtaatcactgaaag +cactgagaactctaagatgatgctggaacttgatccaccatttggggactcttacattgtcataggagtcggggagaaga +agatcacccaccactggcacaggagtggcagcaccattggaaaagcatttgaagccactgtgagaggtgccaagagaatg +gcagtcttgggagacacagcctgggactttggatcagttggaggcgctctcaactcattgggcaagggcatccatcaaat +ttttggagcagctttcaaatcattgtttggaggaatgtcctggttctcacaaattctcattggaacgttgctgatgtggt +tgggtctgaacacaaagaatggatctatttcccttatgtgcttggccttagggggagtgttgatcttcttatccacagcc +gtctctgctgatgtggggtgctcggtggacttctcaaagaaggagacgagatgcggtacaggggtgttcgtctataacga +cgttgaagcctggagggacaggtacaagtaccatcctgactccccccgtagattggcagcagcagttaagcaagcctggg 
+aagatggtatctgcgggatctcctctgtttcaagaatggaaaacatcatgtggagatcagtagaaggggagctcaacgca +atcctggaagagaatggagttcaactgacggtcgttgtgggatctgtaaaaaaccccatgtggagaggtccacagagatt +gcccgtgcctgtgaacgagctgccccacggctggaaggcttgggggaaatcgtacttcgtcagagcagcaaagacaaata +acagctttgtcgtggatggtgacacactgaaggaatgcccactcaaacatagagcatggaacagctttcttgtggaggat +catgggttcggggtatttcacactagtgtctggctcaaggttagagaagattattcattagagtgtgatccagccgttat +tggaacagctgttaagggaaaggaggctgtacacagtgatctaggctactggattgagagtgagaagaatgacacatgga +ggctggagagggcccatctgatcgagatgaaaacatgtgaatggccaaagtcccacacattgtggacagatggaatagaa +gagagtgatctgatcatacccaagtctttagctgggccactcagccatcacaataccagagagggctacaggacccaaat +gaaagggccatggcacagtgaagagcttgaaattcggtttgaggaatgcccaggcactaaggtccacgtggaggaaacat +gtggaacaagaggaccatctctgagatcaaccactgcaagcggaagggtgatcgaggaatggtgctgcagggagtgcaca +atgcccccactgtcgttccgggctaaagatggctgttggtatggaatggagataaggcccaggaaagaaccagaaagcaa +cttagtaaggtcaatggtgactgcaggatcaactgatcacatggaccacttctcccttggagtgcttgtgatcctgctca +tggtgcaggaagggctgaagaagagaatgaccacaaagatcatcataagcacatcaatggcagtgctggtagctatgatc +ctgggaggattttcaatgagtgacctggctaagcttgcaattttgatgggtgccaccttcgcggaaatgaacactggagg +agatgtagctcatctggcgctgatagcggcattcaaagtcagaccagcgttgctggtatctttcatcttcagagctaatt +ggacaccccgtgaaagcatgctgctggccttggcctcgtgtcttttgcaaactgcgatctccgccttggaaggcgacctg +atggttctcatcaatggttttgctttggcctggttggcaatacgagcgatggttgttccacgcactgataacatcacctt +ggcaatcctggctgctctgacaccactggcccggggcacactgcttgtggcgtggagagcaggccttgctacttgcgggg +ggtttatgctcctctctctgaagggaaaaggcagtgtgaagaagaacttaccatttgtcatggccctgggactaaccgct +gtgaggctggtcgaccccatcaacgtggtgggactgctgttgctcacaaggagtgggaagcggagctggccccctagcga +agtactcacagctgttggcctgatatgcgcattggctggagggttcgccaaggcagatatagagatggctgggcccatgg +ccgcggtcggtctgctaattgtcagttacgtggtctcaggaaagagtgtggacatgtacattgaaagagcaggtgacatc +acatgggaaaaagatgcggaagtcactggaaacagtccccggctcgatgtggcgctagatgagagtggtgatttctccct +ggtggaggatgacggtccccccatgagagagatcatactcaaggtggtcctgatgaccatctgtggcatgaacccaatag 
+ccataccctttgcagctggagcgtggtacgtatacgtgaagactggaaaaaggagtggtgctctatgggatgtgcctgct +cccaaggaagtaaaaaagggggagaccacagatggagtgtacagagtaatgactcgtagactgctaggttcaacacaagt +tggagtgggagttatgcaagagggggtctttcacactatgtggcacgtcacaaaaggatccgcgctgagaagcggtgaag +ggagacttgatccatactggggagatgtcaagcaggatctggtgtcatactgtggtccatggaagctagatgccgcctgg +gatgggcacagcgaggtgcagctcttggccgtgccccccggagagagagcgaggaacatccagactctgcccggaatatt +taagacaaaggatggggacattggagcggttgcgctggattacccagcaggaacttcaggatctccaatcctagacaagt +gtgggagagtgataggactttatggcaatggggtcgtgatcaaaaacgggagttatgttagtgccatcacccaagggagg +agggaggaagagactcctgttgagtgcttcgagccctcgatgctgaagaagaagcagctaactgtcttagacttgcatcc +tggagctgggaaaaccaggagagttcttcctgaaatagtccgtgaagccataaaaacaagactccgtactgtgatcttag +ctccaaccagggttgtcgctgctgaaatggaggaggcccttagagggcttccagtgcgttatatgacaacagcagtcaat +gtcacccactctggaacagaaatcgtcgacttaatgtgccatgccaccttcacttcacgtctactacagccaatcagagt +ccccaactataatctgtatattatggatgaggcccacttcacagatccctcaagtatagcagcaagaggatacatttcaa +caagggttgagatgggcgaggcggctgccatcttcatgaccgccacgccaccaggaacccgtgacgcatttccggactcc +aactcaccaattatggacaccgaagtggaagtcccagagagagcctggagctcaggctttgattgggtgacggatcattc +tggaaaaacagtttggtttgttccaagcgtgaggaacggcaatgagatcgcagcttgtctgacaaaggctggaaaacggg +tcatacagctcagcagaaagacttttgagacagagttccagaaaacaaaacatcaagagtgggactttgtcgtgacaact +gacatttcagagatgggcgccaactttaaagctgaccgtgtcatagattccaggagatgcctaaagccggtcatacttga +tggcgagagagtcattctggctggacccatgcctgtcacacatgccagcgctgcccagaggagggggcgcataggcagga +atcccaacaaacctggagatgagtatctgtatggaggtgggtgcgcagagactgacgaagaccatgcacactggcttgaa +gcaagaatgctccttgacaatatttacctccaagatggcctcatagcctcgctctatcgacctgaggccgacaaagtagc +agccattgagggagagttcaagcttaggacggagcaaaggaagacctttgtggaactcatgaaaagaggagatcttcctg +tttggctggcctatcaggttgcatctgccggaataacctacacagatagaagatggtgctttgatggcacgaccaacaac +accataatggaagacagtgtgccggcagaggtgtggaccagacacggagagaaaagagtgctcaaaccgaggtggatgga +cgccagagtttgttcagatcatgcggccctgaagtcattcaaggagtttgccgctgggaaaagaggagcggcttttggag 
+tgatggaagccctgggaacactgccaggacacatgacnnagagattccaggaagcnattgacaacctcgctgtgctcatg +cgngcagagactggaagcaggccttacaaagccgcggcggcccaattgccggagaccctagagaccataatgcntttggg +gttgctgggaacagtctcgctgggaatcttcttcgtcttgatgaggaacaagggcatagggaagatgggctttggaatgg +tgactcttggggccagcgcatggctcatgtggctctcggaaattgagccagccagaattgcatgtgtcctcattgttgtg +ttcctattgctggtggtgctcatacctgagccagaaaagcaaagatctccccaggacaaccaaatggcaatcatcatcat +ggtagcagtaggtcttttgggcttgattaccgccaatgaactcggatggttggagagaacaaagagtgacctaagccatc +taatgggaaggagagaggagggggcaaccataggattctcaatggacattgacctgcggccagcctcagcttgggccatc +tatgctgccttgacaactttcattaccccagccgtccaacatgcagtgaccacctcatacaacaactactccttaatggc +gatggccacgcaagctggagtgttgtttggcatgggcaaagggatgccattctacgcatgggactttggagtcccgctgc +taatgataggttgctactcacaattaacacccctgaccctaatagtggccatcattttgctcgtggcgcactacatgtac +ttgatcccagggctgcaggcagcagctgcgcgtgctgcccagaagagaacggcagctggcatcatgaagaaccctgttgt +ggatggaatagtggtgactgacattgacacaatgacaattgacccccaagtggagaaaaagatgggacaggtgctactca +tagcagtggccgtctccagcgccatactgtcgcggaccgcctgggggtggggggaggctggggctctgatcacagccgca +acttccactttgtgggaaggctctccgaacaagtactggaactcctctacagccacttcactgtgtaacatttttagggg +aagttacttggctggagcttctctaatctacacagtaacaagaaacgctggcttggtcaagagacgtgggggtggaacag +gagagaccctgggagagaaatggaaggcccgcttgaaccagatgtcggccctggagttctactcctacaaaaagtcaggc +atcaccgaggtgtgcagagaagaggcccgccgcgccctcaaggacggtgtggcaacgggaggccatgctgtgtcccgagg +aagtgcaaagctgagatggttggtggagcggggatacctgcagccctatggaaaggtcattgatcttggatgtggcagag +ggggctggagttactacgccgccaccatccgcaaagttcaagaagtgaaaggatacacaaaaggaggccctggtcatgaa +gaacccgtgttggtgcaaagctatgggtggaacatagtccgtcttaagagtggggtggacgtctttcatatggcggctga +gccgtgtgacacgttgctgtgtgacataggtgagtcatcatctagtcctgaagtggaagaagcacggacgctcagagtcc +tctccatggtgggggattggcttgaaaaaagaccaggagccttttgtataaaagtgttgtgcccatacaccagcactatg +atggaaaccctggagcgactgcagcgtaggtatgggggaggactggtcagagtgccactctcccgcaactctacacatga +gatgtactgggtctctggagcgaaaagcaacaccataaaaagtgtgtccaccacgagccagctcctcttggggcgcatgg 
+acgggcctaggaggccagtgaaatatgaggaggatgtgaatctcggctctggcacgcgggctgtggtaagctgcgctgaa +gctcccaacatgaagatcattggtaaccgcattgaaaggatccgcagtgagcacgcggaaacgtggttctttgacgagaa +ccacccatataggacatgggcttaccatggaagctatgaggcccccacacaagggtcagcgtcctctctaataaacgggg +ttgtcaggctcctgtcaaaaccctgggatgtggtgactggagtcacaggaatagccatgaccgacaccacaccgtatggt +cagcaaagagttttcaaggaaaaagtggacactagggtgccagacccccaagaaggcactcgtcaggttatgagcatggt +ctcttcctggttgtggaaagagctaggcaaacacaaacggccacgagtctgcaccaaagaagagttcatcaacaaggttc +gtagcaatgcagcattaggggcaatatttgaggaggaaaaagagtggaagactgcagtggaagctgtgaacgatccaagg +ttctgggctctagtggacaaggaaagagagcaccacctgagaggagagtgccagagctgtgtgtacaacatgatgggaaa +aagagaaaagaaacaaggggaatttggaaaggccaagggcagccgcgccatctggtatatgtggctaggggctagatttc +tagagttcgaagcccttggattcttgaacgaggatcactggatggggagagagaactcaggaggtggtgttgaagggctg +ggattacaaagactcggatatgtcctagaagagatgagtcgtataccaggaggaaggatgtatgcagatgacactgctgg +ctgggacacccgcattagcaggtttgatctggagaatgaagctctaatcaccaaccaaatggagaaagggcacagggcct +tggcattggccataatcaagtacacataccaaaacaaagtggtaaaggtccttagaccagctgaaaaagggaaaacagtt +atggacattatttcgagacaagaccaaagggggagcggacaagttgtcacttacgctcttaacacatttaccaacctagt +ggtgcaactcattcggaatatggaggctgaggaagttctagagatgcaagacttgtggctgctgcggaggtcagagaaag +tgaccaactggttgcagagcaacggatgggataggctcaaacgaatggcagtcagtggagatgattgcgttgtgaagcca +attgatgataggtttgcacatgccctcaggttcttgaatgatatgggaaaagttaggaaggacacacaagagtggaaacc +ctcaactggatgggacaactgggaagaagttccgttttgctcccaccacttcaacaagctccatctcaaggacgggaggt +ccattgtggttccctgccgccaccaagatgaactgattggtcgggcccgcgtctctccaggggcgggatggagcatccgg +gagactgcttgcctagcaaaatcatatgcgcaaatgtggcagctcctttatttccacagaagggacctccgactgatggc +caatgccatttgttcatctgtgccagttgactgggttccaactgggagaactacctggtcaatccatggaaagggagaat +ggatgaccactgaagacatgcttgtggtgtggaacagagtgtggatnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn +nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn +nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn 
+nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn
+nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn
+nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn
+nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn
+nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn
+nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn
+nnnnnnn
diff --git a/phylogenetic/scripts/uniq_merge.py b/phylogenetic/scripts/uniq_merge.py
new file mode 100644
index 0000000..fd08b10
--- /dev/null
+++ b/phylogenetic/scripts/uniq_merge.py
@@ -0,0 +1,171 @@
+#! /usr/bin/env python
+
+"""Harmonize and merge pandas DataTables such that conflicting data is not lost.
+
+Rows from a cleaned "cache" table and a "new" table are stacked, grouped by an
+id column and collapsed to one row per id. Placeholder values ("", "-N/A-",
+"?") are ignored, and values that disagree within a column are kept side by
+side as a comma separated list.
+
+  Typical usage example:
+
+  one_df = pd.DataFrame(
+    {'strain': ['A', 'B', 'C'],
+     'date': ['2022-01-01', '2022-02-02', '2022-03-03'],
+     'clade': ['alpha', 'beta', 'gamma'],
+     'geo':['iowa', 'washington', np.nan]})
+  one_df['age'] = '-N/A-'
+
+  two_df = pd.DataFrame(
+    {'strain': ['D', 'B', 'C'],
+     'clade': ['delta', 'beta2', 'gamma'],
+     'patient': ['bob', 'marley', 'rick']})
+  two_df['col_date'] = np.nan
+  two_df['group'] = ''
+
+  print(one_df)
+  print(two_df)
+
+  merged_df = merge_two(one_df, two_df)
+  print(merged_df)
+"""
+# ===== Dependencies
+import argparse
+import os
+import sys
+
+import numpy as np
+import pandas as pd
+
+
+# ===== Command line interface
+def _str_to_bool(value: str) -> bool:
+    """Interprets a command line string such as "True" or "no" as a boolean."""
+    return value.strip().lower() in {"1", "true", "t", "yes", "y"}
+
+
+def parse_args():
+    """Defines and parses the command line arguments of this script."""
+    parser = argparse.ArgumentParser(
+        description="Harmonize and merge pandas DataTables such that conflicting data is not lost."
+    )
+    parser.add_argument("--cache", help="Path to cache of cleaned data.", required=True)
+    parser.add_argument("--new", help="Path to new data.", required=True)
+    parser.add_argument(
+        "--cache_delim",
+        default="\t",
+        help="delimiter for cache of cleaned data.",
+    )
+    parser.add_argument("--new_delim", default="\t", help="delimiter for new data.")
+    parser.add_argument(
+        "--outfile",
+        default="merged_cache_new.tsv",
+        help="Merged file [default: merged_cache_new.tsv].",
+    )
+    parser.add_argument(
+        "--outfile_excel",
+        help="Will export an excell merged file if defined. [Example: merged_cache_new.xlsx].",
+    )
+    parser.add_argument(
+        "--outfile_delim",
+        default="\t",
+        help="delimiter for outfile data.",
+    )
+    parser.add_argument(
+        "--groupby_col",
+        default="strain",
+        help="Group by column name [default 'strain'].",
+    )
+    # Accepted as a bare flag or with an explicit value such as "True"/"False"
+    parser.add_argument(
+        "--drop_uninformative_cols",
+        nargs="?",
+        const=True,
+        default=False,
+        type=_str_to_bool,
+        help="Drop uninformative columns [default False].",
+    )
+
+    return parser.parse_args()
+
+
+# ===== Reusable functions
+def _drop_uninformative_cols(df: pd.DataFrame) -> pd.DataFrame:
+    """Drops uninformative columns from a pandas DataFrame for being all empty. Used by merge_two."""
+    return df.replace("", np.nan).replace("-N/A-", np.nan).dropna(how="all", axis=1)
+
+
+def _uniq_merge(x: "pd.Series[str]") -> str:
+    """Merges unique values by group and joins conflicting values in a comma separated list. Used by merge_two."""
+    cx = x.replace("", np.nan).replace("-N/A-", np.nan).replace("?", np.nan).dropna()
+    # Cells may already hold comma separated lists, so split before deduplicating
+    flat_list = [item for cell in cx for item in str(cell).split(",")]
+    # dict.fromkeys keeps first-seen order; a set would make the order of
+    # conflicting values vary between runs
+    return ",".join(dict.fromkeys(flat_list))
+
+
+# Merge and harmonize two datasets, flag conflicts with commas
+def merge_two(
+    df1: pd.DataFrame, df2: pd.DataFrame, groupby_col: str = "strain", drop_uninformative_cols: bool = False
+) -> pd.DataFrame:
+    """Harmonizes and merges two pandas DataFrames.
+
+    Takes two pandas DataFrames through the following 3 steps:
+
+    1. Optionally drops any columns in either which are all NA, "-N/A-", or empty strings
+    2. Harmonizes their columns such that columns in the left DataFrame are preferentially listed first
+    3. Combines the DataFrames by group defined in groupby_col such that:
+        * unique values are merged
+        * conflicting values are joined in a comma separated list
+
+    Args:
+        df1:
+            The left hand side (lhs) pandas DataTable, will preferentially decide column order of merged DataTable
+        df2:
+            The right hand side (rhs) pandas DataTable, will be merged with df1 and new columns will be listed later.
+        groupby_col:
+            The id column that is shared by both df1 and df2 to allow for merging and harmonization of datasets
+        drop_uninformative_cols:
+            If True, step 1 above is applied to both DataFrames before merging.
+
+    Returns:
+        A merged and harmonized dataset containing information from df1 and df2,
+        indexed by groupby_col with one row per distinct id.
+
+    Raises:
+        KeyError: If groupby_col is not a column of either DataFrame.
+    """
+    # Optionally drop uninformative columns
+    if drop_uninformative_cols:
+        df1 = _drop_uninformative_cols(df1)
+        df2 = _drop_uninformative_cols(df2)
+
+    # Harmonize columns: df1's columns first, then the ones only df2 has
+    all_cols = df1.columns.tolist() + [c for c in df2.columns if c not in df1.columns]
+    h_df1_df = df1.reindex(all_cols, axis=1)
+    h_df2_df = df2.reindex(all_cols, axis=1)
+
+    # Unique and merge conflicting data
+    merged_df = pd.concat([h_df1_df, h_df2_df]).groupby(groupby_col).agg(_uniq_merge)
+    return merged_df
+
+
+def main():
+    """Reads both tables as strings, merges them and writes the result."""
+    args = parse_args()
+
+    old = pd.read_csv(args.cache, sep=args.cache_delim, header=0, dtype=str)
+    new = pd.read_csv(args.new, sep=args.new_delim, header=0, dtype=str)
+
+    merged = merge_two(old, new, groupby_col=args.groupby_col, drop_uninformative_cols=args.drop_uninformative_cols)
+
+    # Export to the Excel path exactly as given, otherwise to the delimited outfile
+    if args.outfile_excel is not None:
+        merged.to_excel(args.outfile_excel)
+    else:
+        merged.to_csv(args.outfile, sep=args.outfile_delim)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/phylogenetic/workflow/snakemake_rules/usvi.smk b/phylogenetic/workflow/snakemake_rules/usvi.smk
new file mode 100644
index 0000000..a458574
--- /dev/null
+++ b/phylogenetic/workflow/snakemake_rules/usvi.smk
@@ -0,0 +1,47 @@
+rule download_usvi:
+    """Downloading USVI sequences and metadata from data.nextstrain.org"""
+    output:
+        sequences = "data/sequences_usvi.fasta.zst",
+        metadata = "data/metadata_usvi.tsv.zst"
+    params:
+        sequences_url = "https://data.nextstrain.org/files/zika/sequences_usvi.fasta.zst",
+        metadata_url = "https://data.nextstrain.org/files/zika/metadata_usvi.tsv.zst"
+    shell:
+        """
+        curl -fsSL --compressed {params.sequences_url:q} --output {output.sequences}
+        curl -fsSL --compressed {params.metadata_url:q} --output {output.metadata}
+        """
+
+rule decompress_usvi:
+    """Decompressing sequences and metadata"""
+    input:
+        sequences = "data/sequences_usvi.fasta.zst",
+        metadata = "data/metadata_usvi.tsv.zst"
+    output:
+        sequences = "data/sequences_usvi.fasta",
+        metadata = "data/metadata_usvi.tsv"
+    shell:
+        """
+        zstd -d -c {input.sequences} > {output.sequences}
+        zstd -d -c {input.metadata} > {output.metadata}
+        """
+
+rule append_usvi:
+    """Appending USVI sequences"""
+    input:
+        sequences = "data/sequences.fasta",
+        metadata = "data/metadata.tsv",
+        usvi_sequences = "data/sequences_usvi.fasta",
+        usvi_metadata = "data/metadata_usvi.tsv"
+    output:
+        sequences = "data/sequences_all.fasta",
+        metadata = "data/metadata_all.tsv"
+    shell:
+        """
+        cat {input.sequences} {input.usvi_sequences} > {output.sequences}
+        python scripts/uniq_merge.py \
+            --cache {input.metadata} \
+            --new {input.usvi_metadata} \
+            --groupby_col genbank_accession \
+            --outfile {output.metadata}
+        """
\ No newline at end of file