Skip to content

Commit

Permalink
Merge pull request #10 from PacificBiosciences/wrowell/yak-suggestions
Browse files Browse the repository at this point in the history
Use FASTA file size to estimate depth for yak count parameters.
  • Loading branch information
gconcepcion authored Oct 2, 2023
2 parents 6c02327 + ebe6afe commit eee833f
Show file tree
Hide file tree
Showing 5 changed files with 62 additions and 198 deletions.
24 changes: 5 additions & 19 deletions wdl-ci.config.json
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@
},
"yak_count": {
"key": "yak_count",
"digest": "qysjdjudeldfcf6pm2unping3zkh4qve",
"digest": "6hlh6n3b3cqohtmjweg57of626he4c4v",
"tests": [
{
"inputs": {
Expand All @@ -150,7 +150,8 @@
"${resources_file_path}/m64017_200108_232219.hifi_reads.fasta",
"${resources_file_path}/m64017_200112_090459.hifi_reads.fasta"
],
"yak_options": "-b37",
"yak_params": "-b37",
"mem_gb": 70,
"runtime_attributes": "${default_runtime_attributes}"
},
"output_tests": {
Expand All @@ -164,21 +165,6 @@
}
}
]
},
"fasta_basecount": {
"key": "fasta_basecount",
"digest": "",
"tests": []
},
"get_total_gbp": {
"key": "get_total_gbp",
"digest": "",
"tests": []
},
"determine_yak_options": {
"key": "determine_yak_options",
"digest": "",
"tests": []
}
}
},
Expand All @@ -189,7 +175,7 @@
"tasks": {
"hifiasm_assemble": {
"key": "hifiasm_assemble",
"digest": "r4ikydzmdaed4hzsmc3t7efh6mz5e4mx",
"digest": "vhkzwee3f754jcjksog22uyps3j6myow",
"tests": [
{
"inputs": {
Expand Down Expand Up @@ -278,7 +264,7 @@
},
"align_hifiasm": {
"key": "align_hifiasm",
"digest": "77gs34t4c2i6epsg2epukfoaign2fmnt",
"digest": "4qf5jeepfn3jv3g2socql6xh7vmd4b7s",
"tests": [
{
"inputs": {
Expand Down
11 changes: 6 additions & 5 deletions workflows/assemble_genome/assemble_genome.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ workflow assemble_genome {
parameter_meta {
sample_id: {help: "Sample ID; used for naming files"}
reads_fastas: {help: "Reads in fasta format to be used for assembly; one for each movie bam to be used in assembly. Reads fastas from one or more sample may be combined to use in the assembly"}
reference: {help: "Reference genome data"}
references: {help: "Array of Reference genomes data"}
hiiasm_extra_params: {help: "[OPTIONAL] Additional parameters to pass to hifiasm assembly"}
father_yak: {help: "[OPTIONAL] kmer counts for the father; required if running trio-based assembly"}
mother_yak: {help: "[OPTIONAL] kmer counts for the mother; required if running trio-based assembly"}
Expand All @@ -98,7 +98,7 @@ task hifiasm_assemble {
String prefix = "~{sample_id}.asm"
Int threads = 48
Int mem_gb = threads * 6
Int disk_size = ceil((size(reads_fastas[0], "GB") * length(reads_fastas)) * 4 + 20)
Int disk_size = ceil(size(reads_fastas, "GB") * 4 + 20)
command <<<
set -euo pipefail
Expand Down Expand Up @@ -202,7 +202,8 @@ task align_hifiasm {
}
Int threads = 16
Int disk_size = ceil((size(query_sequences[0], "GB") * length(query_sequences) + size(reference, "GB")) * 2 + 20)
Int mem_gb = threads * 8
Int disk_size = ceil((size(query_sequences, "GB") + size(reference, "GB")) * 2 + 20)
command <<<
set -euo pipefail
Expand All @@ -218,7 +219,7 @@ task align_hifiasm {
~{reference} \
~{sep=' ' query_sequences} \
| samtools sort \
-@ 4 \
-@ 3 \
-T ./TMP \
-m 8G \
-O BAM \
Expand All @@ -235,7 +236,7 @@ task align_hifiasm {
runtime {
docker: "~{runtime_attributes.container_registry}/align_hifiasm@sha256:3968cb152a65163005ffed46297127536701ec5af4c44e8f3e7051f7b01f80fe"
cpu: threads
memory: "128 GB"
memory: mem_gb + " GB"
disk: disk_size + " GB"
disks: "local-disk " + disk_size + " HDD"
preemptible: runtime_attributes.preemptible_tries
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ workflow de_novo_assembly_sample {
sample_id = sample.sample_id,
reads_fastas = samtools_fasta.reads_fasta,
references = references,
hifiasm_extra_params = "",
backend = backend,
default_runtime_attributes = default_runtime_attributes,
on_demand_runtime_attributes = on_demand_runtime_attributes
Expand Down Expand Up @@ -82,7 +81,7 @@ workflow de_novo_assembly_sample {

parameter_meta {
sample: {help: "Sample information and associated data files"}
reference: {help: "Reference genome data"}
references: {help: "Array of Reference genomes data"}
default_runtime_attributes: {help: "Default RuntimeAttributes; spot if preemptible was set to true, otherwise on_demand"}
on_demand_runtime_attributes: {help: "RuntimeAttributes for tasks that require dedicated instances"}
}
Expand Down
164 changes: 21 additions & 143 deletions workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -41,22 +41,6 @@ workflow de_novo_assembly_trio {
}
}
# For yak, we need to know the total input coverage so we can set cloud memory resources accordingly
scatter (fasta in samtools_fasta_father.reads_fasta) {
call fasta_basecount as fasta_bc_father {
input:
reads_fasta = fasta,
runtime_attributes = default_runtime_attributes
}
}
call get_total_gbp as get_total_gbp_father {
input:
sample_id = father.sample_id,
fasta_totals = fasta_bc_father.read_total_bp,
runtime_attributes = default_runtime_attributes
}
scatter (movie_bam in mother.movie_bams) {
call SamtoolsFasta.samtools_fasta as samtools_fasta_mother {
input:
Expand All @@ -65,41 +49,32 @@ workflow de_novo_assembly_trio {
}
}
# For yak, we need to know the total input coverage so we can set cloud memory resources accordingly
scatter (fasta in samtools_fasta_mother.reads_fasta) {
call fasta_basecount as fasta_bc_mother {
input:
reads_fasta = fasta,
runtime_attributes = default_runtime_attributes
}
}
call get_total_gbp as get_total_gbp_mother {
input:
sample_id = mother.sample_id,
fasta_totals = fasta_bc_mother.read_total_bp,
runtime_attributes = default_runtime_attributes
}
# if parental coverage is low (<15x), keep singleton kmers from parents and use them to bin child reads
# if parental coverage is high (>=15x), use bloom filter and require that a kmer occur >= 5 times in
# one parent and <2 times in the other parent to be used for binning
# 60GB uncompressed FASTA ~= 10x coverage (this is not robust to big changes in mean read length)
# memory for 24 threads is 48GB with bloom filter (<=50x coverage) and 65GB without bloom filter (<=30x coverage)
Boolean low_depth = if ((size(samtools_fasta_father.reads_fasta, "GB") < 90) && (size(samtools_fasta_mother.reads_fasta, "GB") < 90)) then true else false
call determine_yak_options {
input:
father_total_gbp = get_total_gbp_father.sample_total_gbp,
mother_total_gbp = get_total_gbp_mother.sample_total_gbp,
}
String yak_params = if (low_depth) then "-b0" else "-b37"
Int yak_mem_gb = if (low_depth) then 70 else 50
String hifiasm_extra_params = if (low_depth) then "-c1 -d1" else "-c2 -d5"
call yak_count as yak_count_father {
input:
sample_id = father.sample_id,
reads_fastas = samtools_fasta_father.reads_fasta,
yak_options = determine_yak_options.yak_options,
yak_params = yak_params,
mem_gb = yak_mem_gb,
runtime_attributes = default_runtime_attributes
}
call yak_count as yak_count_mother {
input:
sample_id = mother.sample_id,
reads_fastas = samtools_fasta_mother.reads_fasta,
yak_options = determine_yak_options.yak_options,
yak_params = yak_params,
mem_gb = yak_mem_gb,
runtime_attributes = default_runtime_attributes
}
Expand All @@ -125,7 +100,7 @@ workflow de_novo_assembly_trio {
sample_id = "~{cohort.cohort_id}.~{child.sample_id}",
reads_fastas = samtools_fasta_child.reads_fasta,
references = references,
hifiasm_extra_params = "-c1 -d1",
hifiasm_extra_params = hifiasm_extra_params,
father_yak = yak_count_father.yak,
mother_yak = yak_count_mother.yak,
backend = backend,
Expand All @@ -142,12 +117,11 @@ workflow de_novo_assembly_trio {
Array[Array[File]] zipped_assembly_fastas = flatten(assemble_genome.zipped_assembly_fastas)
Array[Array[File]] assembly_stats = flatten(assemble_genome.assembly_stats)
Array[Array[IndexData]] asm_bams = flatten(assemble_genome.asm_bams)

}

parameter_meta {
cohort: {help: "Sample information for the cohort"}
references: {help: "List of reference genome data"}
references: {help: "Array of Reference genomes data"}
default_runtime_attributes: {help: "Default RuntimeAttributes; spot if preemptible was set to true, otherwise on_demand"}
on_demand_runtime_attributes: {help: "RuntimeAttributes for tasks that require dedicated instances"}
}
Expand Down Expand Up @@ -186,47 +160,27 @@ task parse_families {
}
}
task determine_yak_options {
input {
Int mother_total_gbp
Int father_total_gbp
}

command {
set -e
if [ ~{father_total_gbp} -lt 48 ] && [ ~{mother_total_gbp} -lt 48 ]; then
options=""
else
options="-b37"
fi
echo $options
}
output {
String yak_options = read_string(stdout())
}
}

task yak_count {
input {
String sample_id
Array[File] reads_fastas
String yak_options

String yak_params
Int mem_gb

RuntimeAttributes runtime_attributes
}
Int threads = 10
# Usage up to 140 GB @ 10 threads for Revio samples
Int mem_gb = 16 * threads
Int threads = 24
Int disk_size = ceil(size(reads_fastas, "GB") * 2 + 20)
command <<<
set -euo pipefail
yak count \
-t ~{threads} \
-o ~{sample_id}.yak \
~{yak_options} \
~{yak_params} \
~{sep=' ' reads_fastas}
>>>
Expand All @@ -247,79 +201,3 @@ task yak_count {
zones: runtime_attributes.zones
}
}
task fasta_basecount {
input {
File reads_fasta
String reads_fasta_basename = basename(reads_fasta)

RuntimeAttributes runtime_attributes
}
Int threads = 1
Int mem_gb = 4 * threads
Int disk_size = ceil(size(reads_fasta, "GB") * 2 + 20)
command <<<
set -euo pipefail
grep -v "^>" ~{reads_fasta} | tr -d '\n' | wc -c > ~{reads_fasta_basename}.total
>>>
output {
File read_total_bp = "~{reads_fasta_basename}.total"
}

runtime {
docker: "~{runtime_attributes.container_registry}/python@sha256:e4d921e252c3c19fe64097aa619c369c50cc862768d5fcb5e19d2877c55cfdd2"
cpu: threads
memory: mem_gb + " GB"
disk: disk_size + " GB"
disks: "local-disk " + disk_size + " HDD"
preemptible: runtime_attributes.preemptible_tries
maxRetries: runtime_attributes.max_retries
awsBatchRetryAttempts: runtime_attributes.max_retries
queueArn: runtime_attributes.queue_arn
zones: runtime_attributes.zones
}
}
task get_total_gbp {
input {
String sample_id
Array[File] fasta_totals

RuntimeAttributes runtime_attributes
}
Int threads = 1
Int mem_gb = 4 * threads
Int disk_size = ceil(size(fasta_totals[0], "GB") * 2 + 20)
command <<<
set -euo pipefail
awk '{sum+=$1}END{print sum/1000000000}' ~{sep=' ' fasta_totals} > ~{sample_id}.total
>>>
output {
Int sample_total_gbp = round(read_float("~{sample_id}.total"))
}

runtime {
docker: "~{runtime_attributes.container_registry}/python@sha256:e4d921e252c3c19fe64097aa619c369c50cc862768d5fcb5e19d2877c55cfdd2"
cpu: threads
memory: mem_gb + " GB"
disk: disk_size + " GB"
disks: "local-disk " + disk_size + " HDD"
preemptible: runtime_attributes.preemptible_tries
maxRetries: runtime_attributes.max_retries
awsBatchRetryAttempts: runtime_attributes.max_retries
queueArn: runtime_attributes.queue_arn
zones: runtime_attributes.zones
}
}
Loading

0 comments on commit eee833f

Please sign in to comment.