Skip to content

Commit

Permalink
Use genbank_accession column as ID column
Browse files Browse the repository at this point in the history
  • Loading branch information
j23414 committed Nov 14, 2023
1 parent b117665 commit f241e07
Show file tree
Hide file tree
Showing 6 changed files with 230 additions and 155 deletions.
40 changes: 37 additions & 3 deletions Snakefile
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# Fall back to the bundled Zika config only when `config` is empty; the
# default file is skipped entirely when config values were already supplied.
if not config:
configfile: "config/config_zika.yaml"

rule all:
input:
auspice_json = "auspice/zika.json",
Expand Down Expand Up @@ -59,12 +62,14 @@ rule filter:
group_by = "country year month",
sequences_per_group = 40,
min_date = 2012,
min_length = 5385
min_length = 5385,
strain_id = config.get("strain_id_field", "strain"),
shell:
"""
augur filter \
--sequences {input.sequences} \
--metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--exclude {input.exclude} \
--output {output.sequences} \
--group-by {params.group_by} \
Expand Down Expand Up @@ -124,13 +129,15 @@ rule refine:
params:
coalescent = "opt",
date_inference = "marginal",
clock_filter_iqd = 4
clock_filter_iqd = 4,
strain_id = config.get("strain_id_field", "strain"),
shell:
"""
augur refine \
--tree {input.tree} \
--alignment {input.alignment} \
--metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--output-tree {output.tree} \
--output-node-data {output.node_data} \
--timetree \
Expand Down Expand Up @@ -212,12 +219,16 @@ rule export:
auspice_config = files.auspice_config,
description = files.description
output:
auspice_json = rules.all.input.auspice_json
auspice_json = "results/raw_zika.json",
root_sequence = "results/raw_zika_root-sequence.json",
params:
strain_id = config.get("strain_id_field", "strain"),
shell:
"""
augur export v2 \
--tree {input.tree} \
--metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--node-data {input.branch_lengths} {input.traits} {input.nt_muts} {input.aa_muts} \
--colors {input.colors} \
--auspice-config {input.auspice_config} \
Expand All @@ -226,6 +237,29 @@ rule export:
--output {output.auspice_json}
"""

rule final_strain_name:
    """Post-process the raw Auspice JSON with scripts/set_final_strain_name.py and copy the root-sequence JSON next to it"""
    input:
        auspice_json="results/raw_zika.json",
        metadata="data/metadata.tsv",
        root_sequence="results/raw_zika_root-sequence.json",
    output:
        auspice_json="auspice/zika.json",
        root_sequence="auspice/zika_root-sequence.json",
    params:
        # Use the same "strain" fallback as the filter, refine and export
        # rules so a config without `strain_id_field` does not raise KeyError
        # here while every other rule silently defaults.
        strain_id=config.get("strain_id_field", "strain"),
        display_strain_field=config.get("display_strain_field", "strain"),
    shell:
        """
        python3 scripts/set_final_strain_name.py \
            --metadata {input.metadata} \
            --metadata-id-columns {params.strain_id} \
            --input-auspice-json {input.auspice_json} \
            --display-strain-name {params.display_strain_field} \
            --output {output.auspice_json}
        cp {input.root_sequence} {output.root_sequence}
        """

rule clean:
"""Removing directories: {params}"""
params:
Expand Down
2 changes: 2 additions & 0 deletions config/config_zika.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Metadata column used as the record ID; the Snakefile passes it to
# `--metadata-id-columns` in the filter, refine, export and
# final_strain_name rules.
strain_id_field: "genbank_accession"
# Metadata column the Snakefile passes to scripts/set_final_strain_name.py
# as `--display-strain-name`.
display_strain_field: "strain"
167 changes: 84 additions & 83 deletions config/dropped_strains.txt
Original file line number Diff line number Diff line change
@@ -1,86 +1,87 @@
PF13/251013_18 # reference included in config/zika_reference.gb
AFMC_U # too basal
AFMC_S # too basal
Boracay/16423 # too basal
JMB_185 # too basal
PHL/2012/CPC_0740 # too basal
MG827392
KX369547 # PF13/251013_18 # reference included in config/zika_reference.gb
KY553111 # AFMC_U # too basal
KY962729 # AFMC_S # too basal
KY120353 # Boracay/16423 # too basal
KU179098 # JMB_185 # too basal
KU681082 # PHL/2012/CPC_0740 # too basal
VIE/Bra/2016 # too basal
Dominican_Republic/2016/PD2 # duplicate of other strain in dataset
GD01 # duplicate of other strain in dataset
GDZ16001 # duplicate of other strain in dataset
VEN/UF_2/2016 # duplicate of other strain in dataset
ZZ_1 # duplicate of other strain in dataset
VR10599/Pavia/2016 # export with unknown origin
34997/Pavia/2016 # export with unknown origin
COL/FLR_00001/2015 # duplicate of COL/FLR/2015
COL/FLR_00002/2015 # duplicate of COL/FLR/2015
COL/FLR_00003/2015 # duplicate of COL/FLR/2015
COL/FLR_00004/2015 # duplicate of COL/FLR/2015
COL/FLR_00005/2015 # duplicate of COL/FLR/2015
COL/FLR_00006/2015 # duplicate of COL/FLR/2015
COL/FLR_00007/2015 # duplicate of COL/FLR/2015
COL/FLR_00008/2015 # duplicate of COL/FLR/2015
COL/FLR_00009/2015 # duplicate of COL/FLR/2015
COL/FLR_00010/2015 # duplicate of COL/FLR/2015
COL/FLR_00011/2015 # duplicate of COL/FLR/2015
COL/FLR_00012/2015 # duplicate of COL/FLR/2015
COL/FLR_00013/2015 # duplicate of COL/FLR/2015
COL/FLR_00014/2015 # duplicate of COL/FLR/2015
COL/FLR_00015/2015 # duplicate of COL/FLR/2015
COL/FLR_00016/2015 # duplicate of COL/FLR/2015
COL/FLR_00017/2015 # duplicate of COL/FLR/2015
COL/FLR_00018/2015 # duplicate of COL/FLR/2015
COL/FLR_00019/2015 # duplicate of COL/FLR/2015
COL/FLR_00020/2015 # duplicate of COL/FLR/2015
COL/FLR_00021/2015 # duplicate of COL/FLR/2015
COL/FLR_00022/2015 # duplicate of COL/FLR/2015
COL/FLR_00023/2015 # duplicate of COL/FLR/2015
COL/FLR_00024/2015 # duplicate of COL/FLR/2015
COL/FLR_00025/2015 # duplicate of COL/FLR/2015
COL/FLR_00026/2015 # duplicate of COL/FLR/2015
COL/FLR_00034/2015 # duplicate of COL/FLR/2015
COL/FLR_00035/2015 # duplicate of COL/FLR/2015
COL/FLR_00036/2015 # duplicate of COL/FLR/2015
COL/FLR_00038/2015 # duplicate of COL/FLR/2015
COL/FLR_00040/2015 # duplicate of COL/FLR/2015
COL/FLR_00041/2015 # duplicate of COL/FLR/2015
COL/FLR_00042/2015 # duplicate of COL/FLR/2015
COL/PRV_00027/2015 # misdated
COL/PRV_00028/2015 # misdated
COL/PAN_00029/2015 # misdated
COL/PAN_00030/2015 # misdated
BRA/2016/FC_DQ12D1 # large indel
Brazil/2016/ZBRX8 # large indel
Brazil/2016/ZBRX11 # large indel
CX17 # large indel
MEX/2016/mex27 # large indel
MEX/2016/mex50 # large indel
SLV/2016/ElSalvador_1055 # large indel
USVI/20/2016 # large indel
KU853013 # Dominican_Republic/2016/PD2 # duplicate of other strain in dataset
KU740184 # GD01 # duplicate of other strain in dataset
KU761564 # GDZ16001 # duplicate of other strain in dataset
KX893855 # VEN/UF_2/2016 # duplicate of other strain in dataset
KY927808 # ZZ_1 # duplicate of other strain in dataset
KY003154 # VR10599/Pavia/2016 # export with unknown origin
KY003153 # 34997/Pavia/2016 # export with unknown origin
MF574552 # COL/FLR_00001/2015 # duplicate of COL/FLR/2015
MF574559 # COL/FLR_00002/2015 # duplicate of COL/FLR/2015
MF574560 # COL/FLR_00003/2015 # duplicate of COL/FLR/2015
MF574561 # COL/FLR_00004/2015 # duplicate of COL/FLR/2015
MF574571 # COL/FLR_00005/2015 # duplicate of COL/FLR/2015
MF574555 # COL/FLR_00006/2015 # duplicate of COL/FLR/2015
MF574557 # COL/FLR_00007/2015 # duplicate of COL/FLR/2015
MF574562 # COL/FLR_00008/2015 # duplicate of COL/FLR/2015
MF574572 # COL/FLR_00009/2015 # duplicate of COL/FLR/2015
MF574570 # COL/FLR_00010/2015 # duplicate of COL/FLR/2015
MF574565 # COL/FLR_00011/2015 # duplicate of COL/FLR/2015
MF574568 # COL/FLR_00012/2015 # duplicate of COL/FLR/2015
MF574558 # COL/FLR_00013/2015 # duplicate of COL/FLR/2015
MF574576 # COL/FLR_00014/2015 # duplicate of COL/FLR/2015
MF574567 # COL/FLR_00015/2015 # duplicate of COL/FLR/2015
MF574575 # COL/FLR_00016/2015 # duplicate of COL/FLR/2015
MF574553 # COL/FLR_00017/2015 # duplicate of COL/FLR/2015
MF574573 # COL/FLR_00018/2015 # duplicate of COL/FLR/2015
MF574574 # COL/FLR_00019/2015 # duplicate of COL/FLR/2015
MF574577 # COL/FLR_00020/2015 # duplicate of COL/FLR/2015
MF574556 # COL/FLR_00021/2015 # duplicate of COL/FLR/2015
MF574554 # COL/FLR_00022/2015 # duplicate of COL/FLR/2015
MF574566 # COL/FLR_00023/2015 # duplicate of COL/FLR/2015
MF574569 # COL/FLR_00024/2015 # duplicate of COL/FLR/2015
MF574563 # COL/FLR_00025/2015 # duplicate of COL/FLR/2015
MF574564 # COL/FLR_00026/2015 # duplicate of COL/FLR/2015
MF574581 # COL/FLR_00034/2015 # duplicate of COL/FLR/2015
MF574588 # COL/FLR_00035/2015 # duplicate of COL/FLR/2015
MF574582 # COL/FLR_00036/2015 # duplicate of COL/FLR/2015
MF574586 # COL/FLR_00038/2015 # duplicate of COL/FLR/2015
MF574584 # COL/FLR_00040/2015 # duplicate of COL/FLR/2015
MF574583 # COL/FLR_00041/2015 # duplicate of COL/FLR/2015
MF574580 # COL/FLR_00042/2015 # duplicate of COL/FLR/2015
MF574579 # COL/PRV_00027/2015 # misdated
MF574578 # COL/PRV_00028/2015 # misdated
MF574585 # COL/PAN_00029/2015 # misdated
MF574587 # COL/PAN_00030/2015 # misdated
KY785436 # BRA/2016/FC_DQ12D1 # large indel
KY559010 # Brazil/2016/ZBRX8 # large indel
KY559011 # Brazil/2016/ZBRX11 # large indel
KX986761 # CX17 # large indel
MF801405 # MEX/2016/mex27 # large indel
MF801424 # MEX/2016/mex50 # large indel
MF801377 # SLV/2016/ElSalvador_1055 # large indel
VI20_12plex # USVI/20/2016 # large indel
USVI/21/2016 # large indel
USVI/23/2016 # large indel
USVI/27/2016 # large indel
USVI/30/2016 # large indel
USVI/32/2016 # large indel
Thailand/1605aTw # excess divergence
VE_Ganxian # excess divergence
ZK_YN001 # excess divergence
Haiti/0029/2014 # contamination present
Haiti/0033/2014 # contamination present
Haiti/0036/2014 # contamination present
Haiti/0054/2014 # contamination present
Haiti/0074/2014 # contamination present
Haiti/0097/2014 # contamination present
mosquito/Haiti/1682/2016 # contamination present
VI23_12plex # USVI/23/2016 # large indel
VI27_1d # USVI/27/2016 # large indel
VI30_1d # USVI/30/2016 # large indel
VI32_12plex # USVI/32/2016 # large indel
KY126351 # Thailand/1605aTw # excess divergence
KU744693 # VE_Ganxian # excess divergence
KY328290 # ZK_YN001 # excess divergence
KY415986 # Haiti/0029/2014 # contamination present
KY415987 # Haiti/0033/2014 # contamination present
KY415990 # Haiti/0036/2014 # contamination present
KY415988 # Haiti/0054/2014 # contamination present
KY415989 # Haiti/0074/2014 # contamination present
KY415991 # Haiti/0097/2014 # contamination present
MF384325 # mosquito/Haiti/1682/2016 # contamination present
ZF36_36S # contamination present
MR766 # lab strain
Aedes_sp/MEX_I_44/2016 # duplicate of Aedes_aegypti/MEX/MEX_I_44/2016
Puerto_Rico/2015/PRVABC59 # duplicate of PRVABC59
V15555 # highly diverged
DK # lab strain
DK23 # lab strain
rGZ02a/2018 # highly diverged
rGZ02p/2018 # highly diverged
V211784 # highly diverged
LMM/AG5643
Faranah/18
MK105975 # MR766 # lab strain
KX856011 # Aedes_sp/MEX_I_44/2016 # duplicate of Aedes_aegypti/MEX/MEX_I_44/2016
MK028857 # Puerto_Rico/2015/PRVABC59 # duplicate of PRVABC59
MN025403 # V15555 # highly diverged
MT505349 # DK # lab strain
MT505350 # DK23 # lab strain
MW680969 # rGZ02a/2018 # highly diverged
MW680970 # rGZ02p/2018 # highly diverged
OK054351 # V211784 # highly diverged
MT478034 # LMM/AG5643
OL414716 # Faranah/18
Loading

0 comments on commit f241e07

Please sign in to comment.