Merge pull request #368 from broadinstitute/dp-lineages
multi-sample lineage tasks
dpark01 authored Oct 1, 2021
2 parents e4b400a + 7d9379a commit 687f21f
Showing 9 changed files with 284 additions and 141 deletions.
59 changes: 0 additions & 59 deletions pipes/WDL/tasks/tasks_nextstrain.wdl
@@ -586,65 +586,6 @@ task filter_subsample_sequences {
}
}
task filter_sequences_by_length {
meta {
description: "Filter sequences in a fasta file to enforce a minimum count of non-N bases."
}
input {
File sequences_fasta
Int min_non_N = 1
String docker = "quay.io/broadinstitute/viral-core:2.1.32"
}
parameter_meta {
sequences_fasta: {
description: "Set of sequences in fasta format",
patterns: ["*.fasta", "*.fa"]
}
min_non_N: {
description: "Minimum number of called bases (non-N, non-gap, A, T, C, G, and other non-N ambiguity codes accepted)"
}
}
String out_fname = sub(basename(sequences_fasta), ".fasta", ".filtered.fasta")
command <<<
python3 <<CODE
import Bio.SeqIO
import gzip
n_total = 0
n_kept = 0
open_or_gzopen = lambda *args, **kwargs: gzip.open(*args, **kwargs) if args[0].endswith('.gz') else open(*args, **kwargs)
with open_or_gzopen('~{sequences_fasta}', 'rt') as inf:
with open_or_gzopen('~{out_fname}', 'wt') as outf:
for seq in Bio.SeqIO.parse(inf, 'fasta'):
n_total += 1
ungapseq = seq.seq.ungap().upper()
if (len(ungapseq) - ungapseq.count('N')) >= ~{min_non_N}:
n_kept += 1
Bio.SeqIO.write(seq, outf, 'fasta')
n_dropped = n_total-n_kept
with open('IN_COUNT', 'wt') as outf:
outf.write(str(n_total)+'\n')
with open('OUT_COUNT', 'wt') as outf:
outf.write(str(n_kept)+'\n')
with open('DROP_COUNT', 'wt') as outf:
outf.write(str(n_dropped)+'\n')
CODE
>>>
runtime {
docker: docker
memory: "1 GB"
cpu : 1
disks: "local-disk 300 HDD"
dx_instance_type: "mem1_ssd1_v2_x2"
}
output {
File filtered_fasta = out_fname
Int sequences_in = read_int("IN_COUNT")
Int sequences_dropped = read_int("DROP_COUNT")
Int sequences_out = read_int("OUT_COUNT")
}
}
task filter_sequences_to_list {
meta {
description: "Filter and subsample a sequence set to a specific list of ids in a text file (one id per line)."
145 changes: 139 additions & 6 deletions pipes/WDL/tasks/tasks_sarscov2.wdl
@@ -85,7 +85,7 @@ task nextclade_many_samples {
String basename
String docker = "nextstrain/nextclade:1.2.3"
}
command {
command <<<
set -e
apt-get update
apt-get -y install python3
@@ -112,7 +112,36 @@
--output-json "~{basename}".nextclade.json \
--output-tsv "~{basename}".nextclade.tsv \
--output-tree "~{basename}".nextclade.auspice.json
}
cp genomes.aligned.fasta "~{basename}".nextalign.msa.fasta
python3 <<CODE
# transpose table
import codecs, csv, json
out_maps = {'clade':{}, 'aaSubstitutions':{}, 'aaDeletions':{}}
with codecs.open('~{basename}.nextclade.tsv', 'r', encoding='utf-8') as inf:
with codecs.open('NEXTCLADE_CLADE', 'w', encoding='utf-8') as outf_clade:
with codecs.open('NEXTCLADE_AASUBS', 'w', encoding='utf-8') as outf_aasubs:
with codecs.open('NEXTCLADE_AADELS', 'w', encoding='utf-8') as outf_aadels:
for row in csv.DictReader(inf, delimiter='\t'):
outf_clade.write('\t'.join([row['seqName'], row['clade']])+'\n')
outf_aasubs.write('\t'.join([row['seqName'], row['aaSubstitutions']])+'\n')
outf_aadels.write('\t'.join([row['seqName'], row['aaDeletions']])+'\n')
for k in ('clade','aaSubstitutions','aaDeletions'):
out_maps[k][row['seqName']] = row[k]
with codecs.open('NEXTCLADE_CLADE.json', 'w', encoding='utf-8') as outf:
json.dump(out_maps['clade'], outf)
with codecs.open('NEXTCLADE_AASUBS.json', 'w', encoding='utf-8') as outf:
json.dump(out_maps['aaSubstitutions'], outf)
with codecs.open('NEXTCLADE_AADELS.json', 'w', encoding='utf-8') as outf:
json.dump(out_maps['aaDeletions'], outf)
CODE
# gather runtime metrics
cat /proc/uptime | cut -f 1 -d ' ' > UPTIME_SEC
cat /proc/loadavg > CPU_LOAD
cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes > MEM_BYTES
>>>
runtime {
docker: docker
memory: "14 GB"
@@ -121,10 +150,20 @@
dx_instance_type: "mem1_ssd1_v2_x16"
}
output {
String nextclade_version = read_string("VERSION")
File nextclade_json = "~{basename}.nextclade.json"
File auspice_json = "~{basename}.nextclade.auspice.json"
File nextclade_tsv = "~{basename}.nextclade.tsv"
#Map[String,String] nextclade_clade = read_map("NEXTCLADE_CLADE")
#Map[String,String] aa_subs_csv = read_map("NEXTCLADE_AASUBS")
#Map[String,String] aa_dels_csv = read_map("NEXTCLADE_AADELS")
Map[String,String] nextclade_clade = read_json("NEXTCLADE_CLADE.json")
Map[String,String] aa_subs_csv = read_json("NEXTCLADE_AASUBS.json")
Map[String,String] aa_dels_csv = read_json("NEXTCLADE_AADELS.json")
String nextclade_version = read_string("VERSION")
File nextalign_msa = "~{basename}.nextalign.msa.fasta"
File nextclade_json = "~{basename}.nextclade.json"
File auspice_json = "~{basename}.nextclade.auspice.json"
File nextclade_tsv = "~{basename}.nextclade.tsv"
Int max_ram_gb = ceil(read_float("MEM_BYTES")/1000000000)
Int runtime_sec = ceil(read_float("UPTIME_SEC"))
String cpu_load = read_string("CPU_LOAD")
}
}
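The batch task's per-sample results are exposed as Map[String,String] outputs keyed by fasta sequence name (the seqName column of the nextclade TSV), round-tripped through JSON files rather than the commented-out read_map() calls. A downstream step can index these maps directly; a minimal sketch, assuming the enclosing workflow holds an Array[String] of ids matching the sequence names (the names here are illustrative, not part of this commit):

scatter (sample_id in sample_ids) {
    # look up this sample's row of the transposed nextclade table
    String clade   = nextclade_many_samples.nextclade_clade[sample_id]
    String aa_subs = nextclade_many_samples.aa_subs_csv[sample_id]
}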
@@ -154,6 +193,7 @@ task pangolin_one_sample {
~{"--min-length " + min_length} \
~{"--max-ambig " + max_ambig} \
--alignment \
--threads $(nproc) \
--verbose
cp sequences.aln.fasta "~{basename}.pangolin_msa.fasta"
@@ -198,6 +238,99 @@ task pangolin_one_sample {
}
}
task pangolin_many_samples {
meta {
description: "Pangolin classification of multiple SARS-CoV-2 samples."
}
input {
Array[File]+ genome_fastas
Int? min_length
Float? max_ambig
Boolean inference_usher=true
String basename
String docker = "quay.io/staphb/pangolin:3.1.11-pangolearn-2021-09-17"
}
command <<<
date | tee DATE
conda list -n pangolin | grep "usher" | awk -F ' +' '{print$1, $2}' | tee VERSION_PANGO_USHER
set -ex
pangolin -v | tee VERSION_PANGOLIN
pangolin -pv | tee VERSION_PANGOLEARN
pangolin --all-versions | tr '\n' ';' | cut -f -5 -d ';' | tee VERSION_PANGOLIN_ALL
cat ~{sep=" " genome_fastas} > unaligned.fasta
pangolin unaligned.fasta \
~{true='--usher' false='' inference_usher} \
--outfile "~{basename}.pangolin_report.csv" \
~{"--min-length " + min_length} \
~{"--max-ambig " + max_ambig} \
--alignment \
--threads $(nproc) \
--verbose
cp sequences.aln.fasta "~{basename}.pangolin_msa.fasta"
python3 <<CODE
import csv, json
# grab output values by column header
with open("~{basename}.pangolin_report.csv", 'rt') as csv_file:
for line in csv.DictReader(csv_file):
with open("VERSION", 'wt') as outf:
pangolin_version=line["pangolin_version"]
version=line["version"]
outf.write(f"pangolin {pangolin_version}; {version}")
break
out_maps = {'lineage':{}, 'conflict':{}, 'note':{}}
with open("~{basename}.pangolin_report.csv", 'rt') as csv_file:
with open("PANGO_LINEAGE", 'wt') as outf_lineage:
with open("PANGOLIN_CONFLICTS", 'wt') as outf_conflicts:
with open("PANGOLIN_NOTES", 'wt') as outf_note:
for row in csv.DictReader(csv_file):
outf_lineage.write('\t'.join([row['taxon'], row['lineage']])+'\n')
outf_conflicts.write('\t'.join([row['taxon'], row['conflict']])+'\n')
outf_note.write('\t'.join([row['taxon'], row['note']])+'\n')
for k in ('lineage','conflict','note'):
out_maps[k][row['taxon']] = row[k]
with open('PANGO_LINEAGE.json', 'wt') as outf:
json.dump(out_maps['lineage'], outf)
with open('PANGOLIN_CONFLICTS.json', 'wt') as outf:
json.dump(out_maps['conflict'], outf)
with open('PANGOLIN_NOTES.json', 'wt') as outf:
json.dump(out_maps['note'], outf)
CODE
# gather runtime metrics
cat /proc/uptime | cut -f 1 -d ' ' > UPTIME_SEC
cat /proc/loadavg > CPU_LOAD
cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes > MEM_BYTES
>>>
runtime {
docker: docker
memory: "14 GB"
cpu: 16
disks: "local-disk 100 HDD"
dx_instance_type: "mem1_ssd1_v2_x16"
}
output {
#Map[String,String] pango_lineage = read_map("PANGO_LINEAGE")
#Map[String,String] pangolin_conflicts = read_map("PANGOLIN_CONFLICTS")
#Map[String,String] pangolin_notes = read_map("PANGOLIN_NOTES")
Map[String,String] pango_lineage = read_json("PANGO_LINEAGE.json")
Map[String,String] pangolin_conflicts = read_json("PANGOLIN_CONFLICTS.json")
Map[String,String] pangolin_notes = read_json("PANGOLIN_NOTES.json")
String date = read_string("DATE")
String version = read_string("VERSION")
String pangolin_docker = docker
String pangolin_versions = read_string("VERSION_PANGOLIN_ALL")
String pangolin_usher_version = read_string("VERSION_PANGO_USHER")
String pangolin_version = read_string("VERSION_PANGOLIN")
String pangolearn_version = read_string("VERSION_PANGOLEARN")
File pango_lineage_report = "~{basename}.pangolin_report.csv"
File msa_fasta = "~{basename}.pangolin_msa.fasta"
Int max_ram_gb = ceil(read_float("MEM_BYTES")/1000000000)
Int runtime_sec = ceil(read_float("UPTIME_SEC"))
String cpu_load = read_string("CPU_LOAD")
}
}
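As with nextclade_many_samples, the per-sample pangolin outputs are Maps keyed by the taxon column, i.e. the fasta sequence name. A usage sketch under the same assumptions (the caller's namespace and literal values are illustrative, not part of this commit):

call sarscov2.pangolin_many_samples {
    input:
        genome_fastas = assembly_fastas,
        basename      = "batch1"
}
# then, per sample, e.g.:
# pangolin_many_samples.pango_lineage["USA/example-0001/2021"]  ->  "B.1.617.2"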
task sequencing_report {
meta {
64 changes: 62 additions & 2 deletions pipes/WDL/tasks/tasks_utils.wdl
@@ -252,6 +252,7 @@ task tsv_join {
Array[File]+ input_tsvs
String id_col
String out_basename = "merged"
String out_suffix = ".txt"
}
command <<<
@@ -291,15 +292,15 @@
out_ids = list(collections.OrderedDict(((i,0) for i in out_ids)).keys())
# write output
with open(out_basename+'.txt', 'w', newline='') as outf:
with open(out_basename+'~{out_suffix}', 'w', newline='') as outf:
writer = csv.DictWriter(outf, header, delimiter='\t', dialect=csv.unix_dialect, quoting=csv.QUOTE_MINIMAL)
writer.writeheader()
writer.writerows(out_row_by_id[row_id] for row_id in out_ids)
CODE
>>>
output {
File out_tsv = "${out_basename}.txt"
File out_tsv = "~{out_basename}~{out_suffix}"
}
runtime {
@@ -499,3 +500,62 @@ task s3_copy {
disks: "local-disk 1000 HDD"
}
}
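The new out_suffix input to tsv_join near the top of this file makes the merged file's extension configurable rather than hard-coded to ".txt". A minimal call sketch, with illustrative file and column names (only out_suffix is new in this commit):

call utils.tsv_join {
    input:
        input_tsvs   = [assembly_meta_tsv, nextclade_tsv],  # illustrative inputs
        id_col       = "sample",
        out_basename = "merged_metadata",
        out_suffix   = ".tsv"  # default remains ".txt"
}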
task filter_sequences_by_length {
meta {
description: "Filter sequences in a fasta file to enforce a minimum count of non-N bases."
}
input {
File sequences_fasta
Int min_non_N = 1
String docker = "quay.io/broadinstitute/viral-core:2.1.32"
}
parameter_meta {
sequences_fasta: {
description: "Set of sequences in fasta format",
patterns: ["*.fasta", "*.fa"]
}
min_non_N: {
description: "Minimum number of called bases (non-N, non-gap, A, T, C, G, and other non-N ambiguity codes accepted)"
}
}
String out_fname = sub(basename(sequences_fasta), ".fasta", ".filtered.fasta")
command <<<
python3 <<CODE
import Bio.SeqIO
import gzip
n_total = 0
n_kept = 0
open_or_gzopen = lambda *args, **kwargs: gzip.open(*args, **kwargs) if args[0].endswith('.gz') else open(*args, **kwargs)
with open_or_gzopen('~{sequences_fasta}', 'rt') as inf:
with open_or_gzopen('~{out_fname}', 'wt') as outf:
for seq in Bio.SeqIO.parse(inf, 'fasta'):
n_total += 1
ungapseq = seq.seq.ungap().upper()
if (len(ungapseq) - ungapseq.count('N')) >= ~{min_non_N}:
n_kept += 1
Bio.SeqIO.write(seq, outf, 'fasta')
n_dropped = n_total-n_kept
with open('IN_COUNT', 'wt') as outf:
outf.write(str(n_total)+'\n')
with open('OUT_COUNT', 'wt') as outf:
outf.write(str(n_kept)+'\n')
with open('DROP_COUNT', 'wt') as outf:
outf.write(str(n_dropped)+'\n')
CODE
>>>
runtime {
docker: docker
memory: "1 GB"
cpu : 1
disks: "local-disk 300 HDD"
dx_instance_type: "mem1_ssd1_v2_x2"
}
output {
File filtered_fasta = out_fname
Int sequences_in = read_int("IN_COUNT")
Int sequences_dropped = read_int("DROP_COUNT")
Int sequences_out = read_int("OUT_COUNT")
}
}
2 changes: 1 addition & 1 deletion pipes/WDL/workflows/augur_from_assemblies.wdl
@@ -86,7 +86,7 @@ workflow augur_from_assemblies {
infiles = assembly_fastas,
output_name = "all_samples_combined_assembly.fasta"
}
call nextstrain.filter_sequences_by_length {
call utils.filter_sequences_by_length {
input:
sequences_fasta = zcat.combined,
min_non_N = min_unambig_genome
2 changes: 1 addition & 1 deletion pipes/WDL/workflows/mafft_and_snp.wdl
@@ -36,7 +36,7 @@ workflow mafft_and_snp {
infiles = assembly_fastas,
output_name = "all_samples_combined_assembly.fasta.gz"
}
call nextstrain.filter_sequences_by_length {
call utils.filter_sequences_by_length {
input:
sequences_fasta = zcat.combined,
min_non_N = min_unambig_genome
2 changes: 1 addition & 1 deletion pipes/WDL/workflows/mafft_and_snp_annotated.wdl
@@ -37,7 +37,7 @@ workflow mafft_and_snp_annotated {
infiles = assembly_fastas,
output_name = "all_samples_combined_assembly.fasta.gz"
}
call nextstrain.filter_sequences_by_length {
call utils.filter_sequences_by_length {
input:
sequences_fasta = zcat.combined,
min_non_N = min_unambig_genome
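All three workflows above now resolve filter_sequences_by_length through a utils alias instead of nextstrain, matching the task's move into tasks_utils.wdl. The import blocks themselves sit outside these hunks; a plausible form, assuming the repo's usual relative-path convention (an assumption, not shown in this diff):

import "../tasks/tasks_utils.wdl" as utils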