diff --git a/docker/Dockerfile b/docker/Dockerfile index 6adbce0f..ce9cb447 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -10,7 +10,7 @@ RUN apt-get update --allow-releaseinfo-change && \ apt-get install -y --no-install-recommends \ curl wget bzip2 make gcc cmake g++ keychain git build-essential zlib1g-dev libssl-dev lbzip2 \ libbz2-dev libcurl4-gnutls-dev libncurses5-dev libstdc++6 libncursesw5-dev liblzma-dev clang \ - libclang-dev pkg-config libtcmalloc-minimal4 python3-dev python3-setuptools && \ + libclang-dev pkg-config libtcmalloc-minimal4 python3-dev python3-setuptools fontconfig && \ rm -rf /var/lib/apt/lists/* # Reduced gcloud installation (from framegrace: https://github.com/GoogleCloudPlatform/gsutil/issues/1732#issuecomment-2029591598) @@ -59,6 +59,11 @@ RUN wget -O samtools-1.21.tar.bz2 https://github.com/samtools/samtools/releases/ # Final stage FROM python:3.9-slim +# Install some libraries in final image +RUN apt-get update && \ + apt-get install -y --no-install-recommends fontconfig libcurl4-gnutls-dev && \ + rm -rf /var/lib/apt/lists/* + # Copy necessary files from builder stage COPY --from=builder /usr/local/bin/minimap2 /usr/local/bin/minimap2 COPY --from=builder /usr/local/bin/samtools /usr/local/bin/samtools diff --git a/src/hidive/Cargo.toml b/src/hidive/Cargo.toml index 12e3f581..11d050b2 100644 --- a/src/hidive/Cargo.toml +++ b/src/hidive/Cargo.toml @@ -23,7 +23,8 @@ indicatif = { version = "0.17.8", features = ["rayon"] } itertools = "0.13.0" # linfa = "0.5" linked-hash-map = "0.5.6" -minimap2 = { version = "0.1.20+minimap2.2.28", features = ["simde"] } +# minimap2 = { version = "0.1.20+minimap2.2.28", features = ["simde"] } +minimap2 = "0.1.21" needletail = "0.5.1" # noodles = "0.78.0" # ndarray = "0.15" @@ -36,10 +37,11 @@ petgraph = "0.6.5" rand = "0.8.5" rayon = "1.10.0" #recgraph = { git = "https://github.com/AlgoLab/RecGraph" } -# regex = "1.10.5" +regex = "1.10.5" #russcip = "0.3.4" rust-htslib = { version = "0.47.0", features = ["gcs", "serde_feature"] } # rust-wfa2 = {git = "https://github.com/pairwise-alignment/rust-wfa2.git"} +sdust = "=0.1.0" serde = "1.0.204" serde_json = "1.0.120" spoa = {git = "https://github.com/nlhepler/spoa-rs.git"} diff --git a/src/hidive/src/assemble.rs b/src/hidive/src/assemble.rs new file mode 100644 index 00000000..92a241c4 --- /dev/null +++ b/src/hidive/src/assemble.rs @@ -0,0 +1,92 @@ +use std::collections::{HashMap, HashSet}; +use std::{fs::File, path::PathBuf, io::Write}; + +use needletail::Sequence; +use petgraph::graph::NodeIndex; +use rayon::prelude::*; +use indicatif::ParallelProgressIterator; + +use skydive::ldbg::LdBG; +use skydive::mldbg::MLdBG; +use skydive::utils::*; + +pub fn start( + output: &PathBuf, + gfa_output: Option, + kmer_size: usize, + model_path: &PathBuf, + long_read_fasta_path: &PathBuf, + short_read_fasta_path: &PathBuf, +) { + let long_read_seq_urls = skydive::parse::parse_file_names(&[long_read_fasta_path.clone()]); + let short_read_seq_urls = skydive::parse::parse_file_names(&[short_read_fasta_path.clone()]); + + // Read all long reads. + skydive::elog!("Processing long-read samples {:?}...", long_read_seq_urls.iter().map(|url| url.as_str()).collect::>()); + let all_lr_seqs = skydive::utils::read_fasta(&vec![long_read_fasta_path.clone()]); + + // Read all short reads. + skydive::elog!("Processing short-read samples {:?}...", short_read_seq_urls.iter().map(|url| url.as_str()).collect::>()); + let all_sr_seqs = skydive::utils::read_fasta(&vec![short_read_fasta_path.clone()]); + + let l1 = LdBG::from_sequences("lr".to_string(), kmer_size, &all_lr_seqs); + let s1 = LdBG::from_sequences("sr".to_string(), kmer_size, &all_sr_seqs); + + let m = MLdBG::from_ldbgs(vec![l1, s1]) + .score_kmers(model_path) + .collapse() + .clean(0.1) + .build_links(&all_lr_seqs, false); + + skydive::elog!("Built MLdBG with {} k-mers.", m.kmers.len()); + + skydive::elog!("Correcting reads..."); + let corrected_seqs = m.correct_seqs(&all_lr_seqs); + + let mut fa_file = File::create(output).unwrap(); + for (i, corrected_seq) in corrected_seqs.iter().enumerate() { + let _ = writeln!(fa_file, ">corrected_{}\n{}", i, String::from_utf8(corrected_seq.clone()).unwrap()); + } + + // let read = b"CAGCTGCCCATGCCACCTCCTCCTTCTCTGCCCGCCCCAGTGCCTTATGGGTCCAAGGTTGACTCCTGTCCCTAGGGCAGGCCTGTGGGCCCTGCCTGATCCCTACTGGGAGGATGGTACCTAGGGTTGGAGCCAAACAAGTGTCCTCCTCCAGCGCCAGCCTGGCCCTGAGTGCGAACTCGTCACTGGTCAGGGGTCTGTACAGCAGCGTCCCTGAGGGCCCAGAGAGGTAGCCAGTCCTGTGGTGAGGTGACGAGGCTGAGGGTGGTGGCTCAGTCCTGGGCTTCCATGGGGCCTTCCCAGGGAACGTTCTGGCACCTGCCGACTGAGCCCTGGGAGGTAGGTAGCCCTGGCCTATAGCTCCCTGACGCCATGATTTGTCTTCCGTTTTTGGGGTGTCATATATGAAGGGAGGTGACTGTGATGGTGCTGGCAGGACTGCTGTCCCTGATGTGGGGTGGGCTGAGTTAGGCCTGAAATATGGGCCTCCAGGCTGAGTCCTGCCCTCTCCACCACATCCAGGGCTGACTGACACCTCTAGTCAGCCCATTCTGGCCCCTTCCCCACATGCCAGGACAATGTAGTCCTTGTCATCAATCTGGGCAGTCAGAGTTGGGTCAGTGGGGGACATGGGATTATGGGCAAGGGTAACTGACATCTGCTCAGCCTCAACGTACCCCTGTCTCAAATGCGGCCAGGCGGTGGGGTAAGCAGGAATGAGGCAGGGGTTGGGGTTGCCCTGAGGAGGATGATCCCAACGAGGGCGTGAGCAGGGGACCCGAGTT"; + // let corrected_seqs = m.correct_seqs(&vec![read.to_vec()]); + // let mut fa_file = File::create(output).unwrap(); + // for (i, corrected_seq) in corrected_seqs.iter().enumerate() { + // let _ = writeln!(fa_file, ">corrected_{}\n{}", i, String::from_utf8(corrected_seq.clone()).unwrap()); + // } + + if let Some(gfa_output) = gfa_output { + skydive::elog!("Writing GFA to {}", gfa_output.display()); + + let g = m.traverse_all_kmers(); + + let _ = write_gfa(&mut File::create(gfa_output.clone()).unwrap(), &g); + + let csv_output = gfa_output.with_extension("csv"); + let mut csv_file = File::create(&csv_output).unwrap(); + + writeln!(csv_file, "node,label,kmer,cov,entropy").unwrap(); + + for (node_index, node_label) in g.node_indices().zip(g.node_weights()) { + let kmer = node_label.as_bytes(); + let cn_kmer = skydive::utils::canonicalize_kmer(kmer); + let score = (100.0 * *m.scores.get(&cn_kmer).unwrap_or(&0.0)) as u32; + let cov = if m.kmers.contains_key(&cn_kmer) { m.kmers.get(&cn_kmer).unwrap().coverage() } else { 0 }; + let entropy = skydive::utils::shannon_entropy(kmer); + let sources = m.sources.get(&cn_kmer).unwrap_or(&vec![]).clone(); + + let source = if sources.len() == 1 { sources[0] } else { 2 }; + + writeln!( + csv_file, + "{},{},{},{},{}", + node_index.index(), + format!("{} ({})", source, score), + node_label, + cov, + entropy, + ) + .unwrap(); + } + } +} \ No newline at end of file diff --git a/src/hidive/src/build.rs b/src/hidive/src/build.rs index b9a9702a..5ab775cb 100644 --- a/src/hidive/src/build.rs +++ b/src/hidive/src/build.rs @@ -971,7 +971,7 @@ pub fn start(output: &PathBuf, k: usize, fasta_path: &PathBuf, reference_name: S .collect(); // Filter edges with little read support. - let filtered_edges = filter_undersupported_edges(&edge_info, &stem, 4); + let filtered_edges = filter_undersupported_edges(&edge_info, &stem, 30); // Write final graph to disk. let _ = write_gfa( diff --git a/src/hidive/src/call.rs b/src/hidive/src/call.rs new file mode 100644 index 00000000..31234cb6 --- /dev/null +++ b/src/hidive/src/call.rs @@ -0,0 +1,327 @@ +use std::collections::{BTreeMap, HashMap, HashSet}; +use std::{fs::File, path::PathBuf, io::Write}; + +use itertools::Itertools; +use linked_hash_map::LinkedHashMap; +use minimap2::Aligner; +use needletail::Sequence; +use petgraph::graph::NodeIndex; +use rayon::prelude::*; +use indicatif::ParallelProgressIterator; + +use rust_htslib::bam::{FetchDefinition, Read}; +use skydive::ldbg::LdBG; +use skydive::mldbg::MLdBG; + +use skydive::wmec::*; +use spoa::AlignmentType; + +pub fn start( + output: &PathBuf, + loci_list: &Vec, + reference_fasta_path: &PathBuf, + bam_path: &PathBuf, +) { + // Get the system's temporary directory path + let cache_path = std::env::temp_dir(); + skydive::elog!("Intermediate data will be stored at {:?}.", cache_path); + + // Parse reference sequence file path. + let reference_seq_urls = skydive::parse::parse_file_names(&[reference_fasta_path.clone()]); + let reference_seq_url = reference_seq_urls.iter().next().unwrap(); + + // Parse BAM file path. + let bam_urls = skydive::parse::parse_file_names(&[bam_path.clone()]); + let bam_url = bam_urls.iter().next().unwrap(); + + // Iterate over loci + let loci = skydive::parse::parse_loci(loci_list, 0).into_iter().collect::>(); + + for (chr, start, stop, name) in loci { + let fasta = skydive::stage::open_fasta(&reference_seq_url).unwrap(); + let mut bam = skydive::stage::open_bam(&bam_url).unwrap(); + + skydive::elog!("Processing locus {} ({}:{}-{})...", name, chr, start, stop); + // skydive::elog!("Reference sequence: {}", seq); + + let mut read_ids = HashMap::new(); + let mut matrix = Vec::new(); + let mut metadata: BTreeMap = BTreeMap::new(); + let mut mloci = Vec::new(); + + let _ = bam.fetch(FetchDefinition::RegionString(chr.as_bytes(), start as i64, stop as i64)); + for (cursor, p) in bam.pileup().enumerate() { + let pileup = p.unwrap(); + + // skydive::elog!("Processing position {}:{}-{} {}...", chr, start, stop, pileup.pos() + 1); + + if pileup.pos() + 1 < start as u32 || pileup.pos() + 1 > stop as u32 { + continue; + } + + let ref_str = fasta.fetch_seq_string(&chr, usize::try_from(pileup.pos()).unwrap(), usize::try_from(pileup.pos()).unwrap()).unwrap(); + let ref_base = ref_str.as_bytes()[0]; + + let mut is_variant = false; + + let mut allele_map: BTreeMap = BTreeMap::new(); + + for alignment in pileup.alignments() { + let record = alignment.record(); + let qname = String::from_utf8(record.qname().to_vec()).unwrap(); + + if !alignment.is_del() && !alignment.is_refskip() { + let base = record.seq()[alignment.qpos().unwrap()]; + let seq = vec![base]; + let qual = vec![record.qual()[alignment.qpos().unwrap()]]; + let q = *qual.iter().min().unwrap(); + + let len = read_ids.len(); + read_ids.entry(qname.clone()).or_insert(len); + allele_map.insert(*read_ids.get(&qname).unwrap(), (String::from_utf8_lossy(&seq).to_string(), q)); + + if base != ref_base { + is_variant = true; + } + } + + match alignment.indel() { + rust_htslib::bam::pileup::Indel::Ins(len) => { + let qpos_start = alignment.qpos().unwrap(); + let qpos_stop = alignment.qpos().unwrap() + 1 + (len as usize); + let seq = record.seq().as_bytes()[qpos_start..qpos_stop].to_vec(); + let qual = record.qual()[qpos_start..qpos_stop].to_vec(); + let q = *qual.iter().min().unwrap(); + + let len = read_ids.len(); + read_ids.entry(qname.clone()).or_insert(len); + allele_map.insert(*read_ids.get(&qname).unwrap(), (String::from_utf8_lossy(&seq).to_string(), q)); + + is_variant = true; + }, + rust_htslib::bam::pileup::Indel::Del(len) => { + let seq = vec![b'-'; len as usize]; + let qual = vec![record.qual()[alignment.qpos().unwrap()]]; + let q = *qual.iter().min().unwrap(); + + let len = read_ids.len(); + read_ids.entry(qname.clone()).or_insert(len); + let full_seq = [&[ref_base], seq.as_slice()].concat(); + allele_map.insert(*read_ids.get(&qname).unwrap(), (String::from_utf8_lossy(&full_seq).to_string(), q)); + + is_variant = true; + }, + rust_htslib::bam::pileup::Indel::None => () + } + } + + if is_variant { + // skydive::elog!("Variant found at {}:{} {} ({:?})", chr, pileup.pos() + 1, ref_base, alleles); + matrix.push(allele_map); + // metadata.push((pileup.pos(), ref_base)); + // metadata.insert(cursor, ref_base); + metadata.insert(pileup.pos() + 1, String::from_utf8(vec![ref_base]).unwrap()); + mloci.push(pileup.pos()); + } + } + + let (h1, h2) = phase_variants(&matrix); + + skydive::elog!("Haplotype 1: {:?} ({})", h1, h1.len()); + skydive::elog!("Haplotype 2: {:?} ({})", h2, h2.len()); + skydive::elog!("Metadata: {:?}", metadata); + + let mut hap_alleles = HashMap::new(); + for ((pos, a1), a2) in mloci.iter().zip(h1.iter()).zip(h2.iter()) { + if a1.is_some() && a2.is_some() { + hap_alleles.insert(pos, (a1.clone().unwrap(), a2.clone().unwrap())); + } else if a1.is_some() && a2.is_none() { + hap_alleles.insert(pos, (a1.clone().unwrap(), a1.clone().unwrap())); + } else if a1.is_none() && a2.is_some() { + hap_alleles.insert(pos, (a2.clone().unwrap(), a2.clone().unwrap())); + } + } + + let fasta = skydive::stage::open_fasta(&reference_seq_url).unwrap(); + + let mut hap1 = Vec::new(); + let mut hap2 = Vec::new(); + + for pos in start as u32..stop as u32 { + let ref_str = fasta.fetch_seq_string(&chr, usize::try_from(pos).unwrap(), usize::try_from(pos).unwrap()).unwrap(); + + if hap_alleles.contains_key(&pos) { + let a1 = hap_alleles.get(&pos).unwrap().0.clone(); + let a2 = hap_alleles.get(&pos).unwrap().1.clone(); + + hap1.push(a1); + hap2.push(a2); + } else { + hap1.push(ref_str.clone()); + hap2.push(ref_str.clone()); + } + } + + for i in 0..hap1.len() { + if hap1[i].contains("-") { + let len = hap1[i].len(); + + for j in 1..len { + hap1[i+j] = "".to_string(); + } + } + + if hap2[i].contains("-") { + let len = hap2[i].len(); + + for j in 1..len { + hap2[i+j] = "".to_string(); + } + } + } + + let h1 = hap1.join("").replace("-", ""); + let h2 = hap2.join("").replace("-", ""); + + let mut output = File::create(output).expect("Failed to create output file"); + writeln!(output, ">h1").expect("Failed to write to output file"); + writeln!(output, "{}", h1).expect("Failed to write to output file"); + writeln!(output, ">h2").expect("Failed to write to output file"); + writeln!(output, "{}", h2).expect("Failed to write to output file"); + } +} + +fn phase_variants(matrix: &Vec>) -> (Vec>, Vec>) { + let num_snps = matrix.len(); + let num_reads = matrix.iter().map(|m| m.keys().max().unwrap_or(&0) + 1).max().unwrap_or(0); + + let mut reads = vec![vec![None; num_snps]; num_reads]; + let mut confidences = vec![vec![None; num_snps]; num_reads]; + let mut all_alleles = vec![HashMap::new(); num_snps]; + + for (snp_idx, column) in matrix.iter().enumerate() { + // Count frequency of each allele in this column + let allele_counts = column + .values() + .map(|(allele, _)| allele) + .fold(HashMap::new(), |mut counts, allele| { + *counts.entry(allele.clone()).or_insert(0) += 1; + counts + }) + .into_iter() + .sorted_by(|a, b| b.1.cmp(&a.1)) + .take(2) + .collect::>(); + + let alleles = allele_counts + .iter() + .map(|(a, _)| a) + .cloned() + .collect::>(); + + let allele_map = alleles.iter().enumerate().map(|(i, a)| (a, i as u8)).collect::>(); + + let mut index_map = HashMap::new(); + + for (&read_idx, (allele, qual)) in column { + if let Some(allele_idx) = allele_map.get(allele) { + reads[read_idx][snp_idx] = Some(*allele_idx); + confidences[read_idx][snp_idx] = Some(*qual as u32); + + index_map.insert(*allele_idx, allele.clone()); + } + } + + all_alleles[snp_idx] = index_map; + } + + let wmec_matrix = WMECData::new(reads, confidences); + + let (p1, p2) = skydive::wmec::phase(&wmec_matrix); + + let mut h1 = Vec::new(); + let mut h2 = Vec::new(); + for i in 0..p1.len() { + // h1.push(all_alleles[i].get(&p1[i]).unwrap().clone()); + // h2.push(all_alleles[i].get(&p2[i]).unwrap().clone()); + + let a1 = all_alleles[i].get(&p1[i]).cloned(); + let a2 = all_alleles[i].get(&p2[i]).cloned(); + + h1.push(a1); + h2.push(a2); + } + + (h1, h2) +} + +fn create_read_allele_matrix(lr_msas: &Vec) -> Vec> { + let mut matrix = Vec::new(); + + let mut index1 = 0; + while index1 < lr_msas[0].len() { + let combined_base_counts = allele_counts(lr_msas, index1, index1+1); + + if combined_base_counts.len() > 1 { + let mut index2 = index1; + let mut allele_base_counts = allele_counts(lr_msas, index2, index2+1); + + while index2 < lr_msas[0].len() && allele_base_counts.len() > 1 { + index2 += 1; + allele_base_counts = allele_counts(lr_msas, index2, index2+1); + } + + let allele_counts = allele_counts(lr_msas, index1, index2) + .into_iter() + .sorted_by(|a, b| b.1.cmp(&a.1)) + .take(2) + .collect::>(); + + // println!("{} {} {:?}", index1, index2, allele_counts); + + if allele_counts.len() == 2 { + let mut column = BTreeMap::new(); + + let alleles = allele_indices(lr_msas, index1, index2); + + let mut allele_index = 0; + for (allele, _) in &allele_counts { + alleles.iter().enumerate().for_each(|(i, a)| { + if *a == *allele { + // column.insert(i, allele.clone()); + column.insert(i, allele_index.to_string()); + } + }); + + allele_index += 1; + } + + matrix.push(column); + } + + index1 = index2; + } else { + index1 += 1; + } + } + + matrix +} + +fn allele_indices(lr_msas: &Vec, index1: usize, index2: usize) -> Vec { + let alleles = lr_msas.iter() + .map(|msa| msa[index1..index2].to_string().replace(" ", "")) + .collect::>(); + alleles +} + +fn allele_counts(lr_msas: &Vec, index1: usize, index2: usize) -> BTreeMap { + let combined_allele_counts = lr_msas.iter() + .map(|msa| msa[index1..index2].to_string().replace(" ", "")) + .filter(|allele| !allele.is_empty()) + .fold(BTreeMap::new(), |mut counts, base| { + *counts.entry(base).or_insert(0) += 1; + counts + }); + combined_allele_counts +} diff --git a/src/hidive/src/coassemble.rs b/src/hidive/src/coassemble.rs index abafca42..cd1c2573 100644 --- a/src/hidive/src/coassemble.rs +++ b/src/hidive/src/coassemble.rs @@ -3,6 +3,7 @@ use std::{fs::File, path::PathBuf, io::Write}; use itertools::Itertools; use linked_hash_map::LinkedHashMap; +use minimap2::Aligner; use needletail::Sequence; use petgraph::graph::NodeIndex; use rayon::prelude::*; @@ -45,27 +46,92 @@ pub fn start( .score_kmers(model_path) .collapse() .clean(0.1) - .build_links(&all_lr_seqs, true); + .build_links(&all_lr_seqs, false); skydive::elog!("Built MLdBG with {} k-mers.", m.kmers.len()); skydive::elog!("Correcting reads..."); - let corrected_lr_seqs = correct_reads(&m, &all_lr_seqs); + let corrected_lr_seqs = m.correct_seqs(&all_lr_seqs); skydive::elog!("Clustering reads..."); let (reads_hap1, reads_hap2) = cluster_reads(&m, &corrected_lr_seqs); skydive::elog!("Assembling haplotype 1..."); let asm1 = assemble_haplotype(&all_ref_seqs, &reads_hap1); + // let hap1 = refine_haplotype(asm1, reference_fasta_paths[0].clone(), all_ref_seqs.get(0).unwrap()); skydive::elog!("Assembling haplotype 2..."); let asm2 = assemble_haplotype(&all_ref_seqs, &reads_hap2); + // let hap2 = refine_haplotype(asm2, reference_fasta_paths[0].clone(), all_ref_seqs.get(0).unwrap()); let mut output = File::create(output).expect("Failed to create output file"); writeln!(output, ">hap1").expect("Failed to write to output file"); writeln!(output, "{}", asm1).expect("Failed to write to output file"); writeln!(output, ">hap2").expect("Failed to write to output file"); writeln!(output, "{}", asm2).expect("Failed to write to output file"); + +} + +fn refine_haplotype(asm: String, ref_path: PathBuf, ref_seq: &Vec) -> String { + let aligner = Aligner::builder() + .map_hifi() + .with_cigar() + .with_index(ref_path.clone(), None) + .expect("Failed to build aligner"); + + let results = vec![asm] + .iter() + .map(|hap| aligner.map(hap.as_bytes(), true, false, None, None, None).unwrap()) + .collect::>(); + + let mut alt_seq: Vec = Vec::new(); + for result in results { + for mapping in result { + if let Some(alignment) = &mapping.alignment { + if mapping.is_primary && !mapping.is_supplementary { + if let Some(cs) = &alignment.cs { + let re = regex::Regex::new(r"([:\*\+\-])(\w+)").unwrap(); + let mut cursor = 0; + for cap in re.captures_iter(cs) { + let op = cap.get(1).unwrap().as_str(); + let seq = cap.get(2).unwrap().as_str(); + + match op { + ":" => { + // Match (copy from reference) + let length = seq.parse::().unwrap(); + alt_seq.extend(&ref_seq[cursor..cursor + length]); + cursor += length; + } + "*" => { + // Substitution + // let ref_base = seq.as_bytes()[0]; + let alt_base = seq.to_uppercase().as_bytes()[1]; + alt_seq.push(alt_base); + cursor += 1; + } + "+" => { + // Insertion + alt_seq.extend(seq.to_uppercase().as_bytes()); + } + "-" => { + // Deletion + // alt_seq.extend(vec![b'-'; seq.len()]); + cursor += seq.len(); + } + _ => unreachable!("Invalid CIGAR operation"), + } + } + } + } + + // skydive::elog!("ref {}", String::from_utf8_lossy(ref_seq)); + // skydive::elog!("alt {}", String::from_utf8_lossy(&alt_seq)); + } + } + } + + String::from_utf8_lossy(&alt_seq).to_string() } fn assemble_haplotype(ref_seqs: &Vec>, reads: &Vec>) -> String { @@ -260,18 +326,6 @@ fn assign_reads_to_bubbles(bubbles: &LinkedHashMap<(NodeIndex, NodeIndex), Vec>) -> Vec> { - let g = m.traverse_all_kmers(); - let corrected_seqs = - seqs - .par_iter() - .map(|seq| m.correct_seq(&g, seq)) - .flatten() - .collect::>(); - - vec![corrected_seqs] -} - fn create_fully_phased_haplotypes(lr_msas: &Vec, h1: &Vec) -> (String, String) { let mut index1 = 0; let mut hap_index = 0; diff --git a/src/hidive/src/correct.rs b/src/hidive/src/correct.rs index d857ded6..aa45e0d6 100644 --- a/src/hidive/src/correct.rs +++ b/src/hidive/src/correct.rs @@ -1,82 +1,161 @@ use std::collections::{HashMap, HashSet}; use std::{fs::File, path::PathBuf, io::Write}; -use needletail::Sequence; -use petgraph::graph::NodeIndex; +use indicatif::{ProgressIterator, ParallelProgressIterator}; use rayon::prelude::*; -use indicatif::ParallelProgressIterator; +// use rayon::iter::IntoParallelRefIterator; +// use rayon::iter::ParallelIterator; use skydive::ldbg::LdBG; use skydive::mldbg::MLdBG; -use skydive::utils::*; pub fn start( output: &PathBuf, - gfa_output: Option, + loci_list: &Vec, kmer_size: usize, + window: usize, model_path: &PathBuf, long_read_fasta_path: &PathBuf, short_read_fasta_path: &PathBuf, ) { - let long_read_seq_urls = skydive::parse::parse_file_names(&[long_read_fasta_path.clone()]); - let short_read_seq_urls = skydive::parse::parse_file_names(&[short_read_fasta_path.clone()]); + // Get the system's temporary directory path + let cache_path = std::env::temp_dir(); + skydive::elog!("Intermediate data will be stored at {:?}.", cache_path); - // Read all long reads. - skydive::elog!("Processing long-read samples {:?}...", long_read_seq_urls.iter().map(|url| url.as_str()).collect::>()); - let all_lr_seqs = skydive::utils::read_fasta(&vec![long_read_fasta_path.clone()]); + // Load datasets + let long_read_seq_urls = skydive::parse::parse_file_names(&[long_read_fasta_path.clone()]); - // Read all short reads. - skydive::elog!("Processing short-read samples {:?}...", short_read_seq_urls.iter().map(|url| url.as_str()).collect::>()); + skydive::elog!("Processing short-read sample {}...", short_read_fasta_path.display()); let all_sr_seqs = skydive::utils::read_fasta(&vec![short_read_fasta_path.clone()]); - - let l1 = LdBG::from_sequences("lr".to_string(), kmer_size, &all_lr_seqs); - let s1 = LdBG::from_sequences("sr".to_string(), kmer_size, &all_sr_seqs); - - let m = MLdBG::from_ldbgs(vec![l1, s1]) - .score_kmers(model_path) - .collapse() - .clean(0.1) - .build_links(&all_lr_seqs, false); - - skydive::elog!("Built MLdBG with {} k-mers.", m.kmers.len()); - - let corrected_seqs = m.correct_seqs(&all_lr_seqs); - - let mut fa_file = File::create(output).unwrap(); - for (i, corrected_seq) in corrected_seqs.iter().enumerate() { - let _ = writeln!(fa_file, ">corrected_{}\n{}", i, String::from_utf8(corrected_seq.clone()).unwrap()); - } - - if let Some(gfa_output) = gfa_output { - skydive::elog!("Writing GFA to {}", gfa_output.display()); - - let g = m.traverse_all_kmers(); - - let _ = write_gfa(&mut File::create(gfa_output.clone()).unwrap(), &g); - - let csv_output = gfa_output.with_extension("csv"); - let mut csv_file = File::create(&csv_output).unwrap(); - - writeln!(csv_file, "node,label,kmer").unwrap(); - - for (node_index, node_label) in g.node_indices().zip(g.node_weights()) { - let kmer = node_label.as_bytes(); - let cn_kmer = skydive::utils::canonicalize_kmer(kmer); - let score = (100.0 * *m.scores.get(&cn_kmer).unwrap_or(&0.0)) as u32; - let sources = m.sources.get(&cn_kmer).unwrap_or(&vec![]).clone(); - - let source = if sources.len() == 1 { sources[0] } else { 2 }; - - // let in_read = a_kmers.contains_key(&cn_kmer); - - writeln!( - csv_file, - "{},{},{}", - node_index.index(), - format!("{} ({})", source, score), - node_label, - ) - .unwrap(); - } - } + skydive::elog!(" - {} short reads loaded", all_sr_seqs.len()); + + // Iterate over loci + let loci = skydive::parse::parse_loci(loci_list, window as u64).into_iter().collect::>(); + + // let progress_bar = skydive::utils::default_bounded_progress_bar("Processing loci", loci.len() as u64); + + let fa_file = std::sync::Mutex::new(File::create(output).unwrap()); + loci + // .par_iter() + // .progress_with(progress_bar) + .iter() + .inspect(|(chrom, start, end, name)| { + skydive::elog!("Processing locus {} ({}:{}-{})", name, chrom, start, end); + }) + .for_each(|(chrom, start, end, name)| { + let (padded_start, padded_end) = pad_interval(start, end, window); + + let mut corrected_reads = HashMap::new(); + + for window_start in (padded_start..padded_end).step_by(window) { + let window_end = window_start + window as u64; + + let locus = HashSet::from([(chrom.clone(), window_start, window_end, name.clone())]); + + let r = skydive::stage::stage_data_in_memory(&locus, &long_read_seq_urls, false, &cache_path); + if let Ok(reads) = r { + let long_reads: HashMap> = reads + .into_iter() + .map(|read| (read.id().to_string(), read.seq().to_vec())) + .collect(); + + let cn_kmers = long_reads + .values() + .map(|seq| seq.windows(kmer_size).collect::>()) + .flatten() + .map(|kmer| skydive::utils::canonicalize_kmer(kmer)) + .collect::>(); + + let sr_seqs = all_sr_seqs + .iter() + .filter_map(|seq| { + let kmers = seq.windows(kmer_size).collect::>(); + let read_kmers = kmers.iter().map(|kmer| skydive::utils::canonicalize_kmer(kmer)).collect::>(); + + let count = cn_kmers.intersection(&read_kmers).count(); + + if count > (0.5*read_kmers.len() as f64) as usize { + Some(seq) + } else { + None + } + }) + .cloned() + .collect::>(); + + let lr_seqs = long_reads.values().cloned().collect::>(); + let l1 = LdBG::from_sequences("lr".to_string(), kmer_size, &lr_seqs); + let s1 = LdBG::from_sequences("sr".to_string(), kmer_size, &sr_seqs); + + let m = MLdBG::from_ldbgs(vec![l1, s1]) + .score_kmers(model_path) + .collapse() + .clean(0.1) + .build_links(&lr_seqs, false); + + let g = m.traverse_all_kmers(); + + for (id, seq) in long_reads { + let corrected_seq = m.correct_seq(&g, &seq); + + corrected_reads.entry(id) + .or_insert_with(Vec::new) + .push((corrected_seq, m.scores.clone())); + } + } + } + + let mut file = fa_file.lock().unwrap(); + for id in corrected_reads.keys() { + let pieces = corrected_reads.get(id).unwrap(); + + let mut joined_seq = Vec::::new(); + let mut joined_scores = HashMap::new(); + for (piece, scores) in pieces { + joined_seq.extend(piece); + joined_scores.extend(scores); + } + + let mut prob = vec![1.0; joined_seq.len()]; + let mut count = vec![1; joined_seq.len()]; + for (i, kmer) in joined_seq.windows(kmer_size).enumerate() { + let cn_kmer = skydive::utils::canonicalize_kmer(kmer); + let score = **joined_scores.get(&cn_kmer).unwrap_or(&&1.0); + + for j in i..std::cmp::min(i+kmer_size, prob.len()) { + prob[j] *= score; + count[j] += 1; + } + } + + let mut qual = vec![0; joined_seq.len()]; + for i in 0..prob.len() { + prob[i] = prob[i].powf(1.0/count[i] as f32); + qual[i] = (-10.0*(1.0 - (prob[i] - 0.0001).max(0.0)).log10()) as u8 + 33; + } + + let _ = writeln!(file, "@{}\n{}\n+\n{}", id, String::from_utf8(joined_seq).unwrap(), String::from_utf8_lossy(&qual)); + } + }); +} + +fn pad_interval(start: &u64, end: &u64, window: usize) -> (u64, u64) { + // Calculate how much padding is needed to make the interval cleanly divisible by 1000 + let interval_length = end - start; + let remainder = interval_length % window as u64; + + let (padded_start, padded_end) = if remainder > 0 { + let padding_needed = window as u64 - remainder; + let half_padding = padding_needed / 2; + + // Add padding evenly to start and end + let new_start = start.saturating_sub(half_padding); + let new_end = end + (padding_needed - half_padding); + + (new_start, new_end) + } else { + (*start, *end) + }; + + (padded_start, padded_end) } \ No newline at end of file diff --git a/src/hidive/src/filter.rs b/src/hidive/src/filter.rs index d9bf68f7..6bc2545d 100644 --- a/src/hidive/src/filter.rs +++ b/src/hidive/src/filter.rs @@ -97,7 +97,7 @@ pub fn start(output: &PathBuf, gfa_path: &PathBuf, short_read_fasta_paths: &Vec< } let count = aligner - .map(&sr_seq, false, false, None, None) + .map(&sr_seq, false, false, None, None, None) .iter() .flatten() .filter_map(|a| { diff --git a/src/hidive/src/main.rs b/src/hidive/src/main.rs index fcf569ce..cd8d1a10 100644 --- a/src/hidive/src/main.rs +++ b/src/hidive/src/main.rs @@ -55,6 +55,7 @@ use clap::{Parser, Subcommand}; mod build; mod cluster; mod coassemble; +mod assemble; mod correct; mod fetch; mod filter; @@ -64,6 +65,7 @@ mod recruit; mod train; mod trim; mod eval_model; +mod call; #[derive(Debug, Parser)] #[clap(name = "hidive")] @@ -75,6 +77,7 @@ struct Cli { } const DEFAULT_KMER_SIZE: usize = 17; +const DEFAULT_WINDOW_SIZE: usize = 1000; #[derive(Debug, Subcommand)] enum Commands { @@ -315,7 +318,7 @@ enum Commands { /// Error-correct long reads using a linked de Bruijn graph. #[clap(arg_required_else_help = true)] - Correct { + Assemble { /// Output path for corrected reads. #[clap(short, long, value_parser, default_value = "/dev/stdout")] output: PathBuf, @@ -341,6 +344,38 @@ enum Commands { short_read_fasta_path: PathBuf, }, + /// Error-clean long reads using a linked de Bruijn graph. + #[clap(arg_required_else_help = true)] + Correct { + /// Output path for corrected reads. + #[clap(short, long, value_parser, default_value = "/dev/stdout")] + output: PathBuf, + + /// One or more genomic loci ("contig:start-stop[|name]", or BED format) to extract from WGS BAM files. + #[clap(short, long, value_parser, required = true)] + loci: Vec, + + /// Kmer-size + #[clap(short, long, value_parser, default_value_t = DEFAULT_KMER_SIZE)] + kmer_size: usize, + + /// Window size + #[clap(short, long, value_parser, default_value_t = DEFAULT_WINDOW_SIZE)] + window_size: usize, + + /// Trained error-cleaning model. + #[clap(short, long, required = true, value_parser)] + model_path: PathBuf, + + /// FASTA files with long-read sequences (may contain one or more samples). + #[clap(required = true, value_parser)] + long_read_fasta_path: PathBuf, + + /// FASTA files with short-read sequences (may contain one or more samples). + #[clap(required = true, value_parser)] + short_read_fasta_path: PathBuf, + }, + /// Co-assemble target locus from long- and short-read data using a linked de Bruijn graph. #[clap(arg_required_else_help = true)] Coassemble { @@ -368,6 +403,26 @@ enum Commands { #[clap(required = false, value_parser)] short_read_fasta_path: PathBuf, }, + + /// Call and phase variants. + #[clap(arg_required_else_help = true)] + Call { + /// Output path for assembled short-read sequences. + #[clap(short, long, value_parser, default_value = "/dev/stdout")] + output: PathBuf, + + /// One or more genomic loci ("contig:start-stop[|name]", or BED format) to extract from WGS BAM files. + #[clap(short, long, value_parser, required = true)] + loci: Vec, + + /// FASTA reference sequence. + #[clap(short, long, required = true, value_parser)] + reference_fasta_path: PathBuf, + + /// BAM file with integrated long- and short-read data. + #[clap(required = true, value_parser)] + bam_path: PathBuf, + }, } fn main() { @@ -483,7 +538,7 @@ fn main() { Commands::Impute { output, graph } => { impute::start(&output, &graph); } - Commands::Correct { + Commands::Assemble { output, gfa_output, kmer_size, @@ -491,7 +546,26 @@ fn main() { long_read_fasta_path, short_read_fasta_path, } => { - correct::start(&output, gfa_output, kmer_size, &model_path, &long_read_fasta_path, &short_read_fasta_path); + assemble::start(&output, gfa_output, kmer_size, &model_path, &long_read_fasta_path, &short_read_fasta_path); + } + Commands::Correct { + output, + loci, + kmer_size, + window_size, + model_path, + long_read_fasta_path, + short_read_fasta_path, + } => { + correct::start( + &output, + &loci, + kmer_size, + window_size, + &model_path, + &long_read_fasta_path, + &short_read_fasta_path + ); } Commands::Coassemble { output, @@ -510,6 +584,20 @@ fn main() { short_read_fasta_path ); } + Commands::Call { + output, + loci, + reference_fasta_path, + bam_path, + } => { + call::start( + &output, + &loci, + &reference_fasta_path, + &bam_path + ); + } + } skydive::elog!("Complete. Elapsed time: {}.", elapsed_time(start_time)); diff --git a/src/skydive/Cargo.toml b/src/skydive/Cargo.toml index d2e1dfe9..9aecc133 100644 --- a/src/skydive/Cargo.toml +++ b/src/skydive/Cargo.toml @@ -47,6 +47,7 @@ rayon = "1.10.0" regex = "1.10.5" reqwest = { version = "0.12.5", features = ["blocking"]} rust-htslib = { version = "0.47.0", features = ["curl", "gcs"] } +sdust = "=0.1.0" # serde = "1.0.204" # serde_derive = "1.0.204" serde_json = "1.0.120" diff --git a/src/skydive/src/ldbg.rs b/src/skydive/src/ldbg.rs index 11c773cf..afc5be76 100644 --- a/src/skydive/src/ldbg.rs +++ b/src/skydive/src/ldbg.rs @@ -33,6 +33,7 @@ type KmerGraph = HashMap, Record>; type KmerScores = HashMap, f32>; type Links = HashMap, HashMap>; type Sources = HashMap, Vec>; +type KmerNoise = HashSet>; /// Represents a linked de Bruijn graph with a k-mer size specified at construction time. #[derive(Debug, Clone)] @@ -43,6 +44,7 @@ pub struct LdBG { pub scores: KmerScores, pub links: Links, pub sources: Sources, + pub noise: KmerNoise, pub verbose: bool, } @@ -56,6 +58,7 @@ impl LdBG { scores: KmerScores::new(), links: Links::new(), sources: Sources::new(), + noise: KmerNoise::new(), verbose: false, } } @@ -93,6 +96,7 @@ impl LdBG { let scores: KmerScores = kmers.keys().map(|k| (k.clone(), 1.0)).collect(); let links: Links = Links::new(); let sources: Sources = Sources::new(); + let noise: KmerNoise = Self::find_noise(&fwd_seqs, kmer_size); LdBG { name, @@ -101,6 +105,7 @@ impl LdBG { scores, links, sources, + noise, verbose: false, } } @@ -139,6 +144,7 @@ impl LdBG { let scores: KmerScores = kmers.keys().map(|k| (k.clone(), 1.0)).collect(); let links: Links = Links::new(); let sources: Sources = Sources::new(); + let noise: KmerNoise = Self::find_noise(&fwd_seqs, kmer_size); LdBG { name, @@ -147,6 +153,7 @@ impl LdBG { scores, links, sources, + noise, verbose: false, } } @@ -174,6 +181,7 @@ impl LdBG { let scores: KmerScores = kmers.keys().map(|k| (k.clone(), 1.0)).collect(); let links: Links = Links::new(); let sources: Sources = Sources::new(); + let noise: KmerNoise = Self::find_noise(&fwd_seqs, kmer_size); LdBG { name, @@ -182,6 +190,7 @@ impl LdBG { scores, links, sources, + noise, verbose: false, } } @@ -207,6 +216,7 @@ impl LdBG { let scores: KmerScores = kmers.keys().map(|k| (k.clone(), 1.0)).collect(); let links: Links = Links::new(); let sources: Sources = Sources::new(); + let noise: KmerNoise = Self::find_noise(&fwd_seqs, kmer_size); LdBG { name, @@ -215,6 +225,7 @@ impl LdBG { scores, links, sources, + noise, verbose: false, } } @@ -235,6 +246,38 @@ impl LdBG { self } + fn find_noise(fwd_seqs: &Vec>, kmer_size: usize) -> HashSet> { + let mut noise: KmerNoise = KmerNoise::new(); + + for seq in fwd_seqs { + let mut kmer_positions = HashMap::new(); + for (i, fw_kmer) in seq.windows(kmer_size).enumerate() { + let cn_kmer = crate::utils::canonicalize_kmer(fw_kmer); + kmer_positions.entry(cn_kmer) + .or_insert_with(Vec::new) + .push(i); + } + + for (cn_kmer, positions) in kmer_positions { + if positions.len() > 1 { + noise.insert(cn_kmer); + } + } + + for range in sdust::symmetric_dust(seq) { + if range.len() >= 1 { + for fw_kmer in seq[range.start..range.end].windows(kmer_size) { + let cn_kmer = crate::utils::canonicalize_kmer(fw_kmer); + + noise.insert(cn_kmer); + } + } + } + } + + noise + } + /// Build a de Bruijn graph from a vector of sequences. /// /// # Arguments @@ -848,20 +891,16 @@ impl LdBG { let kmer_in = g_seq[in_node].as_bytes(); let kmer_out = g_seq[out_node].as_bytes(); - let start_position = seq.windows(self.kmer_size).position(|w| w == kmer_in); - let end_position = seq.windows(self.kmer_size).position(|w| w == kmer_out); - - if start_position.is_none() || end_position.is_none() || start_position.unwrap() > end_position.unwrap() { - continue; - } - - // let paths_fw = petgraph::algo::all_simple_paths::, _>(&g_seq, in_node, out_node, 0, Some(interior.len())); - // let paths_rv = petgraph::algo::all_simple_paths::, _>(&g_seq, out_node, in_node, 0, Some(interior.len())); - // let all_paths = paths_fw.chain(paths_rv).collect::>>(); + // let start_position = seq.windows(self.kmer_size).position(|w| w == kmer_in); + // let end_position = seq.windows(self.kmer_size).position(|w| w == kmer_out); // crate::elog!("Superbubble: {} -> {} ({} interior nodes)", String::from_utf8_lossy(kmer_in), String::from_utf8_lossy(kmer_out), interior.len()); + // crate::elog!(" - start: {:?}", start_position); + // crate::elog!(" - end: {:?}", end_position); if a_kmer_set.contains(&kmer_in.to_vec()) && a_kmer_set.contains(&kmer_out.to_vec()) { + // crate::elog!(" - contains k-mers"); + let paths_fwd = petgraph::algo::all_simple_paths::, _>(&g_seq, in_node, out_node, 0, Some(interior.len())); let paths_rev = petgraph::algo::all_simple_paths::, _>(&g_seq, out_node, in_node, 0, Some(interior.len())); @@ -890,7 +929,6 @@ impl LdBG { paths.sort_by(|a, b| b.2.partial_cmp(&a.2).unwrap()); - /* for (path, score, seen_kmer_count) in &paths { let mut contig = String::new(); @@ -902,9 +940,8 @@ impl LdBG { } } - crate::elog!("Contig: {} (score: {}, seen k-mer count: {})", contig, score, seen_kmer_count); + // crate::elog!("Contig: {} (score: {}, seen k-mer count: {})", contig, score, seen_kmer_count); } - */ let read_path = paths.first().cloned().unwrap(); let read_kmers = read_path.0.iter().map(|node| g_seq[*node].as_bytes().to_vec()).collect::>(); @@ -928,6 +965,8 @@ impl LdBG { let replacement_kmers = replacement_path.0.iter().map(|node| g_seq[*node].as_bytes().to_vec()).collect::>(); replacements.push((read_kmers, replacement_kmers)); + + // crate::elog!(" - replaced"); } } } @@ -956,9 +995,11 @@ impl LdBG { } } - // crate::elog!("Replacing {}-{}\n{}\n{}", start_pos, end_pos, q1, r1); + crate::elog!("Replacing {}-{} in read of length {}\n{}\n{}", start_pos, end_pos, b.len(), q1, r1); - b.splice(start_pos..=end_pos, replacement_path); + if start_pos <= end_pos { + b.splice(start_pos..=end_pos, replacement_path); + } } let mut c = String::new(); @@ -972,7 +1013,7 @@ impl LdBG { } /* - let gfa_output = PathBuf::from("read.gfa"); + let gfa_output = PathBuf::from(format!("read.{}.gfa", 0)); let _ = crate::utils::write_gfa(&mut File::create(gfa_output.clone()).unwrap(), &g_seq); let csv_output = gfa_output.with_extension("csv"); @@ -1946,6 +1987,7 @@ impl LdBG { let kmer_size = self.kmer_size; self + .clean_hairballs() .clean_tangles(0, 500, threshold) .clean_tangles(1, 500, threshold) .clean_tips(3*kmer_size, threshold) @@ -2035,7 +2077,7 @@ impl LdBG { self.scores.remove(cn_kmer); } - crate::elog!(" -- Removed {} bad paths specific to color {} ({} kmers)", bad_paths, color, to_remove.len()); + // crate::elog!(" -- Removed {} bad paths specific to color {} ({} kmers)", bad_paths, color, to_remove.len()); self.infer_edges(); @@ -2123,7 +2165,7 @@ impl LdBG { self.scores.remove(cn_kmer); } - crate::elog!(" -- Removed {} bad paths ({} kmers)", bad_paths, to_remove.len()); + // crate::elog!(" -- Removed {} bad paths ({} kmers)", bad_paths, to_remove.len()); self.infer_edges(); @@ -2188,7 +2230,21 @@ impl LdBG { self.scores.remove(cn_kmer); } - crate::elog!(" -- Removed {} tangles in color {} ({} kmers)", bad_tangles, color, to_remove.len()); + // crate::elog!(" -- Removed {} tangles in color {} ({} kmers)", bad_tangles, color, to_remove.len()); + + self.infer_edges(); + + self + } + + #[must_use] + pub fn clean_hairballs(mut self) -> Self { + for cn_kmer in self.noise.iter() { + self.kmers.remove(cn_kmer); + self.scores.remove(cn_kmer); + } + + // crate::elog!(" -- Removed repetitive noise ({} kmers)", self.noise.len()); self.infer_edges(); @@ -2259,7 +2315,7 @@ impl LdBG { self.scores.remove(cn_kmer); } - crate::elog!(" -- Removed {} bad tips ({} kmers)", bad_paths, to_remove.len()); + // crate::elog!(" -- Removed {} bad tips ({} kmers)", bad_paths, to_remove.len()); self.infer_edges(); @@ -2357,7 +2413,7 @@ impl LdBG { self.scores.remove(cn_kmer); } - crate::elog!(" -- Removed {} bad bubble paths ({} kmers)", num_bad_paths, to_remove.len()); + // crate::elog!(" -- Removed {} bad bubble paths ({} kmers)", num_bad_paths, to_remove.len()); self.infer_edges(); @@ -2417,7 +2473,7 @@ impl LdBG { self.scores.remove(cn_kmer); } - crate::elog!(" -- Removed {} bad contigs ({} kmers)", bad_contigs, to_remove.len()); + // crate::elog!(" -- Removed {} bad contigs ({} kmers)", bad_contigs, to_remove.len()); self.infer_edges(); @@ -3105,12 +3161,12 @@ pub fn find_superbubble(graph: &DiGraph, s: NodeIndex, direction: p } #[must_use] -pub fn find_all_superbubbles(graph: &petgraph::Graph) -> LinkedHashMap<(NodeIndex, NodeIndex), Vec> { +pub fn find_all_superbubbles_old(graph: &petgraph::Graph) -> LinkedHashMap<(NodeIndex, NodeIndex), Vec> { let mut bubbles = LinkedHashMap::new(); let mut visited: HashSet = HashSet::new(); for n in graph.node_indices() { - if !visited.contains(&n) { + // if !visited.contains(&n) { for d in [petgraph::Direction::Outgoing, petgraph::Direction::Incoming] { let bubble = find_superbubble(&graph, n, d); @@ -3125,6 +3181,30 @@ pub fn find_all_superbubbles(graph: &petgraph::Graph) -> LinkedHash } } } + // } + } + + bubbles +} + +pub fn find_all_superbubbles(graph: &petgraph::Graph) -> LinkedHashMap<(NodeIndex, NodeIndex), Vec> { + let mut bubbles = LinkedHashMap::new(); + let mut processed_pairs = HashSet::new(); + + for n in graph.node_indices() { + for d in [petgraph::Direction::Outgoing, petgraph::Direction::Incoming] { + if let Some((start, end, interior)) = find_superbubble(graph, n, d) { + // Create both possible orderings of the pair + let pair_forward = (start.index(), end.index()); + let pair_reverse = (end.index(), start.index()); + + // Only process this bubble if we haven't seen it before + if !processed_pairs.contains(&pair_forward) && !processed_pairs.contains(&pair_reverse) { + bubbles.insert((start, end), interior); + processed_pairs.insert(pair_forward); + processed_pairs.insert(pair_reverse); + } + } } } diff --git a/src/skydive/src/mldbg.rs b/src/skydive/src/mldbg.rs index e0042840..f705a74e 100644 --- a/src/skydive/src/mldbg.rs +++ b/src/skydive/src/mldbg.rs @@ -241,6 +241,12 @@ impl MLdBG { ldbg.scores = self.scores.clone(); + for l in &self.ldbgs { + for cn_kmer in l.noise.iter() { + ldbg.noise.insert(cn_kmer.clone()); + } + } + ldbg.infer_edges(); ldbg diff --git a/src/skydive/src/stage.rs b/src/skydive/src/stage.rs index 72f8cf10..15e2b46d 100644 --- a/src/skydive/src/stage.rs +++ b/src/skydive/src/stage.rs @@ -161,7 +161,7 @@ fn get_sm_name_from_rg(read: &bam::Record, rg_sm_map: &HashMap) } // Function to extract seqs from a BAM file within a specified genomic region. -fn extract_aligned_bam_reads( +pub fn extract_aligned_bam_reads( _basename: &str, bam: &mut IndexedReader, chr: &str, @@ -452,6 +452,28 @@ pub fn stage_data( Ok(all_data.len()) } +pub fn stage_data_in_memory( + loci: &HashSet<(String, u64, u64, String)>, + seq_urls: &HashSet, + unmapped: bool, + cache_path: &PathBuf, +) -> Result> { + let current_dir = env::current_dir()?; + env::set_current_dir(cache_path).unwrap(); + + // Stage data from all BAM files. + let all_data = match stage_data_from_all_files(seq_urls, loci, unmapped) { + Ok(all_data) => all_data, + Err(e) => { + panic!("Error: {e}"); + } + }; + + env::set_current_dir(current_dir).unwrap(); + + Ok(all_data) +} + #[cfg(test)] mod tests { use super::*; diff --git a/src/skydive/src/utils.rs b/src/skydive/src/utils.rs index defaf056..d2eb7765 100644 --- a/src/skydive/src/utils.rs +++ b/src/skydive/src/utils.rs @@ -300,6 +300,8 @@ pub fn read_gfa>(path: P) -> std::io::Result #[cfg(test)] mod tests { + use sdust::symmetric_dust; + use super::*; #[test] @@ -317,4 +319,33 @@ mod tests { assert_eq!(canonicalize_kmer(kmer3), b"AAAA".to_vec()); assert_eq!(canonicalize_kmer(kmer4), b"AAAA".to_vec()); } + + #[test] + fn test_dust() { + let read = b"TGGCAGCCATAGGTTTTCCCTGGAGTTGTGGCATCTGGAACTACAGGGATGAGCATTTGAGTACATATTACAGTGAGGTGGCCACACTGTGACCCGCAGTTCTGCAGACTGGAAGGCACTGAATGCCAGGATTTTTGCAGAGTGTCACTATGAAGTCCTGACTTGGCTCAGAGACCTTCTTAGAGCAGTAATTCGGGACCAGTGGATTTCTGATAAAGTTATTCTAATTTTCTAATAATTGTTTTCTAATAAAAGCCATATGGCAGGTCCTGCTCCCTTGGTAGCATGACCAGTACCTGGCGCAGTGCTAGTGCTGAGCTGACAGGAAGTGCCTCACCTTCATCTCTCACTTGACAGTGGGTGGAAGGTTCTTGGCTCGGTATCCCTCAGTCATGACTGCACACTGTCCTGAGCTTTTCTCCCAACTTCATCCACTTCATACTATTTTAATAAAGCGGTGCTGTGTATTATAACATTGTGCAGCTGAGCATTACACTCATGGCTCCCATTATCAAGCCCCTGCTATATACAGGGCATTTCACAAAGAAGCAAACTTCCAAGCAGTCACTCAGCAACCTCCTCCTAGGAGCATTTGGGGAAGAGAATCTTGGGGCAAGTTTCCTTTACCACCTGCAGTCACCTGGGATGCTGGGAAAAATTTTGATTTCTGTTGTCTTCCCTTCCAGAAAATTATTTGAGAGTGGGGCCAACAAATCTGCACTTGAGTCCATACCTAGGATAGGTTTTTCTGTGCAGTTTTTTAAGTTTAAGAGGTTTTTAAAGTTTAAGACACACTGGTTAGGGTTTTGGGCTCTGGAGGATGAGAAACCTTGCTTGGGTTATCAGATAACAGATTCTTCTCTGGTTTCCCTCCGATGTTATCAGGGGAATTGTTGGTTGTTTCACATTTGGGTGCTCCTGGGCCTTTTAAGAGCCAGGCTGGGAGGGCTGGTGATGGCAACCCTGGCTGGCAACAGAGGCTGTTTCCACCCCTGGGTGGCTCCCCACCTGCTTTCTGCCCTGGTAGGGTTCAAGGCTCCGGGAATTGGCACTCAGTGAAAGAATTTTGATTTCCAGTGGAATTTGTGCTGTCACAAGATTTGACCCATGGGACTAGTGAATAGATAGATGGGTTAGGTGAGCATGTGACTTGGCTGGTGGCCGAGAGAGTGATAAATGTGAGAGTAGCTGGGGAAAATGGAAACGGATTAAGATAGAAGAGGGGCATTGTCCATCTGGCCGATGGCAAGGGCTGGTGGAGCAGCAGTTCTAGACTATTCTGAGGTTAGTTCAGAAACTGACCTAACAACGTGGGAAGTCTCTCCCAAATTGTTTATAGTTTCTCACAGTGGGTGCCTTTTGAAGTGATTGTATTTGACAGCCCAGAGTGTTGGGCACACAGCTTTGTGCTATCTAAGGTCACGGTCCAATTGTGATTCCTAGCAATAGCTTCAAGGCATATTTCATAGCTCTAATAGTTTTCAAGTATAAGGGTGTGAGAATGAGCTTTAAGAATATTTATGCCATGAAATCTTCCAATTGCTCTTCAACACGGGTGCACCATAGTAGGTGTGAATAGAAGTGGTGGCAACAGACCTGAATTCAGGTCTGCCACTGACTATAATACTAGCTTGAGAAGTAACTTGAACTCTGTGAGCCTCAGTTTCCTGTCTGTAGAATGAAGACAATGATACTGCCTTCATAGGATTATTATTAGGATTAAATGAAATATTATAGTGAGGCATTCAGCAAAGTGTTCTATAAATTGGGGTAGGATGTGAGGTAATTGGCATTGTTAGATGCGTCTCTGGGTAAACAACCAAATTTTCTGCTTATTTGGCTGTTTCCCTAGCTGCCTTGTTTAAAACAAAACACCTGAGTTGACCAGAACACCTCTGTTTTTAGAATCTAACTTTGCAGTTGTATTAGTCTCTTCTTGCATTGCCATAAAGAAATACCTGACACCTTCATAAAGAAATGAGGTTTAACTGGCTCACGGTCCTGCAGGCTTTACAGGAAGCATGGTACTGACATCTACTTGGCTTCTAGAGAGGCCTCAGGAAGCTTACAGTCATGGCAGAAAGTGAAGCAGTAGCAGGCACATCACATGGTGAAAGCGGGAGCAAGAGAGACAGTTGGGAGAGGAGGCGCCACACACCTTTTAAACAACCAGATCTCCCAAGAACTCACTCACTATCGTGAAGATAGCACCAAGCCATGAGGGGTCTGTCCCCATGATCCAGACACCTCTCACCAGGCCCCACCTCCAGCACTGGGGATTACAATTGAGCATGAGATTTGGGTAGGGACAACTATCTGAACTGTATCAGCAATAGAGTGTGATTATAAGTTATGCTGTAGGAATAGAATTGTTGTCACTGAAAGATTCCCTTGGCCATGGGAGCCTCCTGGCTCTATGAAGGATCAGCCAATGCTTATCCAGGGAGGTAATGATAAGGTCGAAGTTTGACAAGAAATCTACGTTTTCTTAAGCTAAGTAGTAGGTTAACAGAAGATATGTTGTGTGTTAATAGTTCTATTTACATCTCTTTCTCCAAGGTTATACACACTCTGCATCACTAAGTCAAGACACCATTCTTTGACACTGGCTAATAGTAATAGCAATCATAGCCACTGTGCATTAGCACTTACTCCACATTCCTTGTACTGAGCACTACTTACATTATGTTGGTGTTGTCATTGTCACCATTTCATACATAAAGAAACCAAGGTTTTCAGAGATTGAATAACTTGTGCAAGATCACACATCTGGTAGGGCAGATCCAAGATCTGTTTGTCTCCAAAATCTGCTTCTGTCCTGCCTGGGAGACCTTGGGAATGACGGCAAGTGGTTGTAGGAAGGAGGGCTGATGTCAAGGTGGCTGTGGGGGCAGGAGGCTGAGGGAACTCACTGACCCTTGAGGGACTCCTTAGGTGGGGGATTCTGGGTTTCCTGTTGGCAGCTGGAGGGGGAGTGCCAGTTCCCATAAGTGGTTATTGCCCAGGTTGTGACCTTGGCTTGGCCAGTGATTGGTTCATTTTGGAATTTCATGAGTGACCCCCAGGCAGGGTTCTTACAATCACGCTGGAAGACCACCCAGGAAGTTCCTGTTGGGGTAAAATGATGCAGCAGCCTGCTTTCCTCAGGAGGTCTGAACCCTCCCCATGTACACACACACACACACACAAACACACACACACCCACCCACACACCCTCCACCCCTCTTGGTGTCTTTGGCCTTTTTTCCTAGCTTGTTTGTTTCTATGGTGTCTTCAAGTTCAACTAGAACCTATGGGAATGACTTAGTTTTGAACCTGTAAGAATGAGAAGTAAACAATTCTTGTACTGACTTTGAATTTCCTTTCTTCTGTTGTCCAAAGGTGAAGGGTGACAATGTGTCCCAGATTTTTTTGGATATTCTACAAAAAAATAGATATTTTTTTGTAGAAAAAAGCTTATTCTACAGTGTTGTCCCAATTTTAAAAGCCCTAGAAAACTGGTTAAGGCAAATTATAACCAAATCAAATCACTAATTATTACAATAAAGTGTAACTAGCTACAAAAATCCTAAATTACAATTTTAGGCTTTGAGAAAATATCACTGATGATAGAGGAAGAGTGACAGTCTTTGTTTTGGGTCTTGGGATGGCAGAAAGAAAATATTTAGTAGGGAGTAAAGATCAGTGTACCCCTTGAAGTGTGGTGAAGATGGGTGGGTTTTGATGCTCTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTATGTAGAATTTCCCTAAATTCAAAATGATACTTACATTTTGATAAACCCAGAGAGTTAATTAACATGGCATAAATGTGCTAGCTCATTGTTGTTTTTCAGGGAAAGAACTAGAGAGAGACACAAGTCACCCCGGAATAAGACGGCAGAGGGTCAGAAAAGTCTGTCACCATTCAACCTCCCACTGGAGAGCCCCTGTTGGGAAATGATTCTACTCGGACAGAGGAAGTTCAGGTAAGGATCAAAGGTGGTCTGTAAGCACACTGCCACTTGGCCAAGACTCCTGTCTGTAAATGTTTTCCTTAGGTTCTCTTGGATTTTTGTCTTTATTTTTCTAGCATGCTAATTAGTTTATCTATGTCCCATGGTCTCTTGTTGGTCTAAGGACAGCCTGTCCTGCCCTTTGGTAGCGTGGGGATTCTTCTGAAGTATGATTGGTTGGCCTGCTTTACATGGTGTGGAAAAGTAGCCAGCAAGGTTGACGACAGGGTTGGGAAGGGAAAAGCTGAAGTCTCCCACGACTCATTTCAAAATGGAACAGTATAAGGGGGAGAAGGAAACTCAGAAAAACCTAAGAAGTTTAAAAAACATGGGGCAGCCCAGACTGACCATTACTACTAGAGCTATGCAGGAATTGAGAGGCCGCACGCTCAGATGCCTGGTAGAGAAACGTAAGTTAATTAAGGAGGCCCTGGGTCGAAAAGAGGGGGCAAAAATATATTAACCTAGCTTTGGGTTAACAGCAATCTGTGCAGTGCCTCAGTGTCAGTGCTGTAGTGTGGTGTGAAGGAGCCTATGGCTAACTGGAGAATGCATTTCCTCTGTAAAGGAAACAGCAGCTCCGCAGCTCCAGACACCTACTGTTGCTCAGGAATGCAGGGATTCATTGTTTGAGAAAAGCTGCAAATCAGGATTTTATGTGGAATCCTAACTTTACAGTATTTTTGAAATACTGATACTTTATTAATTTTTTTGAACCATTGAGTGGGTTCTTCTCCAGGTTCCAACTGTCTGGCAGCCCACCTGATTTAAATAAACATCTGAGCTATACACAGAAACATGTCTGCATACCCTCTGCACATCCTGAAGTATATATACACATGTCCAGCCTTGCCCCTCATAAACAAAGTGGTGTATGATACACAGCTGTAAAGATAGATATAGGACTATAGATAAGCATACATCTGTACATACCTGTGCATACACACAGGTAAGCATTTATAATCAAATAGGTGGACTGAAACTGGAATTCCTCAGAGTACACAAGGTGTTCTTGGGACACCAAAACTACAATTGTGGGGTTGAACGTGGGATTCATTGAGCAATGAGCAAATGCCTTTAGTGCTGCCTGCCTTGGCTCTGGATGGCTGATGGTCGGATGGGGCCAGTCTTAGGATTGGATCACCCTGGAGTACTTGAAGGGGTCAGTTTCCTCCTGGATGTGGGTTCAGAGGTGCCAGTGGCCTACAGCAAAGGCTCTTCTTTCTCTGCATCTCCTCTGCACCTCGTAGCTGAGAACACTTTGAGAAGCTCTTGGTGTTGCCCCAGGATGATCTGGTGTGAAAAGCATTGAGATGGGTGTTTGGAGGCTGTATTTTTTAGTAGCTCTGTTACCTTGAGCAGTCACAGCCTTTGTAGGCCTCAATTTCTTTATTGAAAATCTAGGGTTTTGATGAAAGCATCTTAGGTGCTTTTTCTTCTAAGAACCTGAAGCTTAACAGGATCCTTTGTGTATCTACATGTTTTAGGCATACATGTGCACCCCAGGAAATTCTCTCATGCCCTTTCTAGTCAATCTCTGCCCCACCCTCACCTCTCCAAGGCAACCACTGTGTTGATTTCTATCACTGTAGATCAATTTTGCCTGTTTTTGAATTTAGTATAATTAGAATCATATGGTCCATCTCCTGGCCCCCACCCACCCGCCCTGCTTAGCATAATGATTTTGAGATTTATCCATGTTTTGGTATGGTTTCAACAGCTTATTCTTTTTATTTTTGCTGAGTAGTATTCCATTGTATCAGTCTACCACAATTTGTTATCCATTCTCCTAGTGGATGGACATTTGGGTTTTTTTTGTTGTTGTTTGTTTGTTTTTTGAGGCAGAGTCTTGTTCTGTCGCCCAGGCTGGAGTGCAGCGCATGATATCACCTCACTGCAGCCTCTACCTCCCGGGTTCAAGCGATTCTCCTGCCTCAGCCTCCCGAATAGCCGGGGATTACAGGCACGCACCACCACGCCCAGCTAATTTTTGTATTTTCAGTAGAGATGGGCTTTCACCATGTTGGCCAGGCTTGGTCTCGAACTCCTGACCTCAGGTGATTTGCCTGCCTTGGCCTCCCAAAGTGCTAGGATTACAGGCGTGAGCCACCGTGCCTGGCTGGATGTATATTTTTATTAATTTTTGGATAAAACCTGAGAGTGGAATTGCTGGGCCATATAGCTAAGTGTATATTTAGATTTATATGAAACCGCCAGAGTGGTTTTCCAGAGGCACTGTACCACGGTCCACTTCCACCAGCAGTGTTGGAGAGTCCTGGCTGCTTCTGGCCATCGTCTGAAATAGGAATTTCTCTCACTGTAGGTGATACTTCTGACTTTGCAAGTTGAAGGATTATTAGTTTATGGGATTGAGACCTTCACCACCACCACTTCTTACCATAGCCCATACATTTCATAAATCATGGTTTTTTTGGTCATTACTAGATTCGGAGTTATTTGATGATGAGCGATGTCTGTCTTGCTGATTTAGCTACTAACTGAAACTAGCTTTTTCTAAGTTGGTGTCCTAATTTCACCCCCTTTGCCACTGCATCTGACTGTTTTCTTTCCGAGTGAAAGGATACATACAAATTTCAGAGGCAGAAACCTCTTTGGCCTCCTGTGTCTTTTCAGCGCCTTGCTCTTATTGCTTCATTATTGTTGCCAGTTGGTTTTTAAACAACAAAATCCTTTAAAATTCTATCAACTGGGTTTTGCTAAGTGAATAGACTAATTGCTTTAACTAGCAACGGCCTAGAAGTTTAAAAAGAGAGGAAGCTAGAAAGTAAAAGATAACATTTTAATAATCCTGGTTGTTTCTATGCCCTTGATGTTTAGTTCCTCGTGAAAACATGTTTTAGAAAGAATTTTTAAGCCAATCTGGCCATACACGGATTCCTGGATTTGCTTAGCTTGGTCCATGAGAAATATTGTTAAAGAGTGCTTGACACTGATGCTTGTTAAGTGGATCTTGTGAACATCATAAGGAGATTTTTTTTTTTTTTTTTGAGACGGAGTCTCACTCTGTAGCCCAGGCTGGAGTACAGTGGCACGATCTCAGCTCACTGCAACCTCCGACTCCCGGGTTCAAGCGATTCTCCTGCCTCAGCCTCCTGAGTAGCTGGGATTACAGGCATGTGCCACCACGCCTTGCTAATTTTTGTATTTTTAGTAGAGACGGGGGTTTCACCATGGTTGGCCAGGATGTTCTTGATCTCCTGACCTCCTGATCCGTCCGCCTCGGCCTTCCAAAGTACGGGGATTACAGGTGTGAGCCACCGCGCCCCAGTCCATAAAGAGATTTTAAAATGTGGGTCCTAGCTACAGGTAAGCTTGGGTTTGTGTAGTGGTGTAAGTTCCCTTGCTACGCCCTTTGCTCTTCTGGGCTGCTAGAGGGTGTAGTAATACTCCCACCTCCAAAAGTTGGACTTCGTAAGCCTTTATAACCCAGCGTGAATTGGAAAGAAGATGCAGGAGGTTTATCTCTATAGATGAGCTCTCACCAAGATTAGTCTAATACCTGGGTTGCGCATTGCAGGGCAAACAGCTCCAGGCCCTCAGAGCTGCTCAAGGCTTTTCAACCCAGGGGATGATAATCAATGTTATGTCAATGAATCAGCCAAACAGACAGAAAGATCACATTATGTTTTCTCTGTTTGAAAGGTAAATACCTCATACATTTTGAAAATTTCAATGAAAATCGTTTGAGTTAAGAAGTTCTAATATTTAAAGAGTTAAGCCTTTCATTTTCTGGAAGCCTTTGTGAATAGGGCTGGGTAGATGCAGCGGGCCCTGCATGTTCACTGCCCTTTGTAGCTTTTACAAATGACCTGTGTCATGTCATCCTCACTGTCTTCTCCCCACCAGGATGACAACTGGGGAGAGACCACCACGGCCATCACAGGCACCTCGGAGCACAGCATATCCCAAGAGGACATTGCCAGGATCAGCAAGGACATGGAGGACAGCGTGGGGCTGGATTGCAAACGCTACCTGGGCCTCACCGTCGCCTCTTTTCTTGGACTTCTAGTTTTCCTCACCCCTATTGCCTTCATCCTTTTACCTCCGATCCTGTGGAGGATGAGCTGGAGCCTTGTGGCACAATTTGTGAGGGGCTCTTTATCTCCATGGCATTCAAACTCCTCATTCTGCTCATAGGGACCTGGGCACCTTTTTTTCCGCAAGCGGAGAGCTGACATGCCACGGGTGTTTGTGTTTCGTGCCCTTTTGTTGGTCCTCATCTTTCTCTTTGTGGTTTCCTATTGGCTTTTTTACGGGGTCCGCATTTTGGACTCTCGGGACCGGAATTACCAGGGCATTGTGCAATATGCAGTCTCCCTTGTGGATGCCCTCCTCTTCATCCATTACCTGGCATCGTCCTGCTGGAGCTCAGGCCAGCTGCAGCCCATGTTCACGCTGCAGGTGGTCCGCTTCCACCGATGGCGAGTCCCGCTTCTACAGCCTGGACACCTGAGGTAAGAGGCAACATCCAGGAGGCAGAAAGGATGGCTGATGTCTTGCTGGGAGACAGCTGCTCTGTAGCACGTGAGGGGTGGTGACAGATGCCAAGAGCTAGGACCAGAGTCTGACTCTTTTCTGGTTTTGGGGAGGAGATGCGAGGGTGGGGAGGGTTGTCCATGTTCATTGAGTTTCTGGACTTCTAGATGGTGCGGGGCAGTTGCTGGCTCTCACCCAGGTTGAGATTTTGCTGGGCTTGTTCTCAAAGTTATTGGCAGCTCCCAAAAATGATGGAGAAAGGAGATGCATAGTGATGGCTGCCTTCTTTGACTCTGAAATTGGCCAATGGACAACAGATAAAGTGACCAGCAGCTCCATTTTGTCCCAAATGTGACATCTGGTTTACCATGTTGTCCCAGTGGAATAATGAATTGTTCCTTTTTTCCCACTCTCAGAGGCCTGGTTTGGGCAGTAAATTATATGGTCATCCGAGGGACCCTTCCAATAAAGAATCAAGTGCAGGTTAGAGACTCCAAATGTGTAATCCTTGAGTGTTGTGAAAATGTATGCCGTGAGAAAAAGTTAGAAGTCAGTTGGGTTGTCATACTTACATCTTTGCATAAAATCTCATTATTTTGTGGTTAAATAAGAGTGATTACCATCATTTTATTTGCTTCAAGGTAAGCACTTTATATATAGATTGTGTATTTAGTCTTCATAGAACCCGTGACCTAGGTATTATTAATCCCTGTGTCACAGATGAAGAAACTAGGGCTTAGGGGATTTAAGTAATTTGCACAAAAACATATGGCTAGCCTCATTTAGGATTCACTCAGATGTCATGAGGCCAGGGCTGAGTGAATGCCCCCATAATGGCATCTCTCACTTTGTGGTTAGTGGCCTATTTTTCCATCTGTTTTCTTCCACAGACTATGAACTCCCTGAGGCCAGGGGCCACCCTTATACCTCATTACATCCTCAGTGCCTGGCATGGAGCATGGCTTGCACTGAGATGTTCTCTGGGTGAATGCAGAGCCTGGGACATTTGACTTCAAAGCCTTTACCCTCTCCCAGGCTCTCTGCCTCCTTAGGCAGTATATGCTGATGTGTGTAGCCTGCTTGGGGCAGGGTAGGCACTTAGTTCATTGCAGCTATTACTGCTGTGATCATGTAGCTGGCAGAGCAGCCAGAATCAGCAAGGGCACACCTTAGTGGGTATCAGAACAATCGGCTTTGTCATAGATTTGGCTGGGCTCCAGGAAGGTGGCTCAGCCTGTATTTGGAGTCAGGCCATGCTGCCAAACCATCTTCATGTTGGTGTGTACCCCCTCCTCCATTCCTCTGGCTTGGCTTGTGCTACGAGAACGGGATGATCTAGCGTTCAAGGTTGCTGCCACCCTAACTGATCCTTGGTGGAAACTGGTGTCCAAGTCACATGTCTGTGCACCAAAAATCTGGGGTTTAGAGTCCTTTCACAGATGCCTGTAGGGCTCTGAAGACAAGTAGGTCACCGCTTTGCTGCATATTCATCTCAGAAGGCTTTCTTTTCCCATGTTTTGCATCAGGGAATGACCAGCAGTTTTGTGTTAAACATCTGCTGTGTGCAGAGCCCTTGGACACACCAGGCTGGCTGCCTTCAGAGCTCTATCTCAGCACCTGTGGCACTCACAGTCACTTGGAAAGAGACCAGTGCACCGCTGTCTGGTGGACAGGTTTCCAGGAAACAGGCCTGGGGGTATAGGTGATAGGAACACAGGAGGACAGAGAATTTCAGATTGTGGCAGCAATAAAGCCGAGCAGGGAGACAGTCTGTCTCAGAACAGGTTTTGCTGCAGTTAAAGTGGTAGAGAAAATCCGGCTGTGGTCTCAGTGGAGATGAATGATATTTGGAACTCTGTATATGTAAGTAGCCAAGACACTTGGCCAGGAGTGAGGTCTATGGTGGTTTTGTTTTTTGGCCCTTAGCCCTAGTTGGTGTGAATTCCACCTGTGTAGGTGGGAAAGGGCAGGGCATCTTCTCACCATAGGTCATGCAGGGTGGTGGGACCGACTTACCCCCATGGGCTCCCACATCGCTCCTCCCTGTACGACTGGTTGAGCTGCACACTGCATCTGAGTGGGAGTGGAGAGGGGACAAACCAAACAGCCCGAGGAAAGTATGCCTGTGGCATGTTCAGGAAAGCATGATTAGCAGCAGGCCCTCGCCTCCCACCACACAGCTCTGCTGGTCAGGGCAGAGCTGGATGGGAGAAGCCAGACTGATTGTGCTGCATGGCTCCCAGGCTTGCTGCAAACCTTTCAGTCTGCTCTTACCATGACCAACAACTGTCCAGGCTTTTAAAAAACTCAAGTCAGTCACCCCAGCTCCCCAGGGAGAACTGAAAGGTGGCAAGTGCCCATCTGCCCTGGGGAGAGCGTTTTGAGGTTGGTCCCCAGCCTCATCCTTTCGGCTTCTTTTTAGGACCATTGGTGTTCCTCCTCTCCCTGCCTTTAATAAGGCCCCCTTTGTCCCTCTCGTGGAGAGCCTGAGTTAGGAGGTGGAAAGAATGGCTGGGGAAAGAGGGACATCTTTACTGACAAATGGAGCCCTCAGGGAGAGCCAGATGCCCAAGTGTCAGCCAGTCTGCCAGAAGCTGGAGCAGGCTTGGCACCTTTCCTCCTGGCATTGTGTGGGCCTGGTCACCTGCCGATCCTTGGGCTAAATCTGGTCTGAACCCAGCAGTGGCTGGAAGAGTTACTAGGCCAGAAATACAACTTCTAAGGCCTTTTGTAAGTGTAGAAACAGACAGGAGGGAAGAGGGAGCGGGAATAGACAAAGCAAGCCTCGGAAATCAGAATAGCAGGTCTCCAATTAGACCCAGCAGAATCACAGGCTGTTGGCTCTCCCTTTATGTAAAGCCTTCACCGTGGCAGCACCCTATTGGGCTTAGGTGCCAAGCGATGGTGAGTTCTTTTTTATGTTTTCAAAGATGATTTTATCGAATTGACTGAGCTATTTTTGAGAGTTGTCTAAAGAATGTCTACTCTTTAGTTTCTTAAAGAAAATAGGCTTCTCATTAGTTCATAAAAGGTGCTTGCTGTGGCCCTGCTTGTTGGCAGGAATGAAGTTTTGGGCTTATTTGAAAACTTTCAAAAATGTAAAAAGTTGTTGCAGAAAGTAAGATACCATAAATAGATTGAGATACTTCCTAACCTCTGCCCAGTGCCCTAGGAGTTATGAAAAGCTTTTCATAGGTTTGGACTCATTTACCCCTCCTTGCTGGCCCTGTCAGAGGTCAGAGCAGTGGGGTAGAGGTGTCCCCTCTTACAGTTGAGGTCCCCAACCCCCAGGGTAAAGGGACCTGCTCAAGGTCACAGGGAATCAGCACCTTCCTGTGCATCACACCGCTTACCCCACCGCACTTTCTACAGCGTCCTGGTGTCTCACACAGTCGCTTTGTCATTTTCCATACACACCTTGCTCGTCACTTTTCTTGGCCCCGTTCTCACAATAAGTTGCTAACTTTTCCAGGATGTTACCAGAGACTAATGACTGTTGATATGACTTTATTTGAGGAGAAACCCAGAAGAATAGAAGAGCCTAAAAATTGGCATTCAATTATCTTAATCATTTTTCAGTTTTGAAACCTCTAAAGGGAAATAAGTGTGAATACTGGTGCACAGGCACTAGTGTAATTGACTGGTTGAATGTGAAATGGTAGAACAACAGAGACAGATAAAGAAAACCTTAACAATAAACAATTACATAGTCTTACTGGGAGCCAGGTACCTTCTGATCTTTTAATGTGCAACTCAGTTTTCACAACAATCATGTGAGGTAGGTTCTGGTAGCCTCCTTTTCAGATGAGCAAACCGAGGCGTGGGAGAGTTAGGTAAGTCTAAGGCCCCACACTTTGTAATTGTGGGAACCAGGATTTGAGCCAGGCTCTCTGGTTCCAGAATGATCTTACCCATTTCACCATACTACCTCTGAATAGATAGTTGCATGTTCACGTCACTGCTTTAGGAACACATGAACAAACCCAGAAGCATTTTTTGAGTGTTTCCCATGTGCCAGCCTCGGTGCCAGATAATTTTTGTATACATTATCCTGTTTCACTTAACACAGAGCTTAGGGCTGGAACAGAGAGAATGTGCAGGATTGTCAAGATGGCTTGCACTCTGAGGTCCATTCCTTAGCTCCACTGGTTATTTTATTCACTCAGTTGACAAAGCCTGGTCCTTAAAGTCATAGCAGTAGGTTTGAAGCCTTTGGTCAGACCTTTTTAACTTCCTATACTTCACATGTCTCAAAGAAAATATATTTTATCTATTGTTGTGTAACAACTACCTCACCACTTAGTAGTTTAAAACAACAATTCAGTATTTCTCACAATTCTGCAGCATGGATTGGGCTCAGCCAGGAGGTTCTTTAGCTGATCCTGCCCATGGTCCTGTGTGATTATACTCACATGGTGAGTCAGCTGAGGCTGACTTTTTTTTCCTGTGTATTCTCTCCTCCTTCTTTCCTGTATATTCTCTCCTTCTCATGGCCTCTCCTTGTTGTTTCCTCATTAGGGTTGCTAGACTTCTTAAATGGCAGCTCAGGGCTCCCAGGAGCACAAAAGCAGAAGCTCCCAGGCCTTTTAAGGTCTGGGCCTGGAACTGTCCCAGTAGTATTTCCACTGCATTCTCTTGGTGAAAGCAAGCCACAGCCCTATCCCACATTTAAGGAGAGAGGACGACACAAGACAGAGGCTACTAGGATGTTAAGTTCAATGTCACAGACCCCCACAACCACCTCGGGCTGTCTTTCTCAGAGTCTGTTTATTCTCATTTTGCATATAGTTCTAACAAATTTAATTATTGATTTCTACATCTTAAAAAGCCCAGAAGTAATATATTTTGAGGGGGGGAAAAAGTGCTGCTTTAAGGAGGTATATGAACATCAATGGAAAAATGATAGCTGATAGTCATCAACAAGGAGGGAGACAGAGAAACCACAAAAGCAGGTATGACTCAGCACTCTGGGAAGCTTTCCACAGTGACCCATTCTATAGGATATTTATATTGCTGAAGCTCCCTTGTACCTAATTCAGCCAGCAGGTTTTAACTGTTTGGGTTTTTAAGCTTCAGGGTCAAAGTTTTGGGGTAAAAAATGCTTCATTCATTGAGACTGAGAGAGAGTAGCTTATAAATTGACACTGACCATAGACCTTGATTTTGTGTCCCCACCCAAATCTCATCTTGAATTATAGCTCCCATAATTCCCACATGTTGGTGGGAGGGACCCAGTGGGAGTTCATTGAATCACGGGGGCGATTTCCCCCATACTGTTCCTGTTGGTAGTGAGTAAGTCTCACAAGATCTGATCGTTTTATAAGAGGAAACCTTATAAAAGGTGGCTCTCATTTTCTTCGTTGTCTGTAAGATGTGCCTTTTGCCTTCTGCCATGATTGTGAGGCCTCCCCAGCCACATGGAACTGTGAGTCCATTAAACCTCTTTTTCTATATAAATTACCCAGTCTTGGGTATGTCTTTATCAGCAGCATAAAAATGGACTAATACAGATCTTTTAATCAAAGATGTGGTAACGAATCACAGAACCACCTGTGCTTTAGAAGAAATGATCCTTGGATTGCTTTTCAAGCAATGGAAGTTTATGATTGTCACATTGTCAATTGTGATTATATTCAAGGAAGTGTTATGGACTGAATGTTTGTGTCCCCCCAAATTCCATATGTGGAAGCCCTAACCTCCAGTGTGGCATATTGGAGATGGAGCCCCTAAGGAAGTAATTGAGGTTAAATGTAGTCATAGTGTGGGGCCCTGATCCCATAGGACTGGCATCCTTACAAGAACAGATACCAGAGAGCTTGCTCTGTCTCTGCACACACCCTTAGAAAAGGCTGTGTGAGGCCCCAGAAAGAAGGTGGCCATCTGTAAGCCCAGAAGAGAGCCCTCAATAGGAACCAGGTTGGCTAGAACGTTGATCTTGGACCCCCAACCTCCAGAACTGTAAGAAAATAAATTTCTGTTGTTTAAGCTACCTAGGCTGTAGCATTTTGATACAGCAAGCCTGAAGCTGAGACAGGAATATTACATACACTGGAGACTTGTGACCCCAAAGACTTTTGACCTGTTGAATAGAGCTCATCTTGTCTCTCTCCAGCTCATGCATGCATCCTCCCAGCTTGCAAGGGGGCCTTGCTTCTCTGGATTGCACTTTGATTTTCTAGTTTTAAGTGACAAAGGGAGAGTCTTCTAGGGATGTTAAAGTTACTCCAGTAATTCCAGGATATTTCCAGCTCCTTTTGAAATCTTATGTTTGTAATTCTGGGTCAAGTAATGTCCAAGCCAGTGATTACATTACTGGTAGGCATGTCTCTCATGCTGGGCCACGCCCTTCCATCCCATGTTCACGATGAGCACCAACGGTTCTCTGAGAGCCCAGAGCCAGTGGCTGCAACGTTGGGAAAATTCTTAAATGACCATCAGTGGTTTTGGCTCATGTTCCTACGATTGTGGGGTTCATATACCATCTCATTTTTAGAAATGTGTGTTTTTGTACTCCTGTAATTACTTTTTAATAAAGATATTTTGCCAGTCCTTAGCTCCACTCCAATAGCAAAGCAAAGGACAAGAACAAGTAAGGGCTGAAACATAGAGCGTGGAGGGTTTTGCTCAGGCCATGCTTTGCTGTGGGAGAATTTTGAAGGCGGGAGTGGAGCTGCCGTTTGTGGTTTGGTGCTGTGGTGCCTGTTAAAAGTGGCTTTAATGAGAGTGTAAGGTGCTGCACACTGAAGCCCTGTGTTTATTCAGCTGCCTCCTGCCAGCGGCTACAGCTGGGATGGCTTCCCTCGCACGGCGTCTGCCCACAGCCTTGCGCCCGGAGCCCAGAGGACTCACAGGAAAGGAGCTGGCAAAGGTGGAAGCTGGTTTTCATGGTCTCCTGAGGGCCCCTGGCCCCTGGGAGATGGGTCACACTCCCTGAATGCTGTGCTGTTGGTTTCCCTGGAGGATTCTTGCTGCAGGCCAGGTCCCGTATTCTCCACACTCACCACAAGTGGCTGGGTGTGACTTGACACGGTGTGAAAGTGGAGGGGCGCGAGCACTCAGGTGGGTGAACAGCCTGCGGGCCTCCTTTCCCTGGCTGCAAAGCCGCCACTCAACTCTGCTCCAGCCCAGGTTTCGGGGAGCCGGGATCCACTTGGGCAGGCCGGGAGCCTCAGACTCCAGACTTTTCATGGTGCGCTCCTTCCTGCTTACTCACGAGGAAGGCGAGGCAGTCCAGCATCCTGGGTGGAGTGGAGGGTGTTTCGAGTCCATATCTAAATCTTTTTCTTAGAGCACCCTAAGCAGGCTGCTGTCTTTGATCCCCATGCCTCTGCTGTTTATCTTGGTGTGATTCATCACTGTAATTTAAGACGTGGAGAGAGCAGAGTTCCCATCCCAGGCAAGGGGAGGCGCAGTGCAGGTTGGACTGTGTAAGGAAATGGCAGCATGGCGAGGTTTGTGCCGCGGCCTACAAGCAGGGCTGCATGCTCCCCAGGCAGACTGTGGCAGAGCCAAGCCCCTCACTTGTAGGGAAGCGGTGTCTCCTAGGTGCCCCAGTAGGGGAGGTCTGCCAGCATCGCTGTGCTGTGGGGAAGGCAGCCAGAGGGCTTCATCCATAACTGGCTCAGCTCCTCAGGAGGAGACCAGCAGTGTGTCTCTGCACTCAGAACTGCCACTGGGTCGTGGTGTTAAGCCCAGGAGGGGTGCATATGTGACAACCTTGTATTGCTTAGTTGCTGAGACCAGAAGATCCAGGTAATTGCATGAGCTATTAGCTACTTCTGGTTCTCACAAACTCCTCCCAGTGTTGATAGAGAATGGTGTCCTCCGGGCATGCTCTGGGTATAGTTTTATTTGTATTATTAGGGACTCATGGAGAAGTGCTCTGGGTGTCCTGCACACTGCACTTTGGAGATCATTCTGTGATTCCCAAGTCCTGCTGATTCCACTTCCTTGGCGCTCTGGGATTAGATATCCTAGGCTGCCAATCTGCATGTTCATCTTTCAGTGGGGATACCCTGCAGGGCTTGTCCACAGCTTGAATTTCAAACCAAAAGCCAGGTACGCTTTCCAAGCCTTCCGATATTGGTTCAAAGAATTTGGCTGCCGAAGCTTTTGTGTAGCTGAGGCACCAGCAGGCCGAGGCACGAGTGAATCCATGTGGCCCGAGGAAGAGCCTTCCCATGGGCCTCAGCAGCCACACAGAGCCTCTGATCTGTTTCCCTTTGCGGGATGGTCAGTCTCCTGTGTCTCAAGACCTCAAGCAGAAACGTGTGGATCTCCCCCTCTATCTTGAAAGTCCAACCAAGTCCAGGCCTTTGTTGTGCAGTTTAAACCAGACCTGTCAGTAAACATGAGCTAATTCCAGTTTTTGTCCCTCTTTGTCCTTCTCAAGTTCCAAGGTGATCATTGCCTGTTATCTATGGGACTTGTGTAAGCTAACTTCCCAAATGCAGCTGTGAGACAAACATTTTAATTAAAAGGCAGAAGGGCCAGGAGATATAAACACTCATGTGCCTGGTTGTCAGTGAAGGCCGGGTGGCGTTCAGCGTCCAGGGGCTAATTATATTCTCTTCTCTGGGACTCACACAAATATTGCCACAAATGTACCTGACTGTCAGACTGAAGTCATTTATCTCCAAGTGTGGGGAGCAGTGAAGCCCACACGTCCAGGTAGATTTAGCTCTTACGGACTCTTCTGGGAAGCGGCAGGTGGGTAAAACTGAAAGCATCAGCTATTGCACCCTAGCTGCAGGTTTTCACAGAAAGCTGAATCAACTTGTATTGGGGATTCTGCATTTTAGAGTTCTCTCAAAGACCTAGGTTTGGGCCCTAAAATGCAGCCACCAGAGCAGGCACACCTTAAAAAGTAGGTAATGAGTGGCCTTAGTGCCTGGGCAGCTGTCAGTACTGGCCTCCTTTGGTTGTCCCTGTCCACTGACCCTCCTTCCTCCCGTTCTCTCACGTTTGCATTCATCTGCAGCCTCCATTACCATTGACCAGCTTTGCCGCTTACCTGCCTCCACCCTTCCTTCCCTAAGTTCGAGTAGTTTCCTAAGTAGCTTCCCCTTAGTTTCCTAAGGCTGAAGTAATAGAGTATAGCACAAACTGGATGGCTTAAAAGTTCTCCTTCTCAGAAAGTACAAAATCTAGGTGTTGGCACAGCTGTGCTCCCCGCAAAGCCTCTAGGGAAGAATCCTTCCTTGCCTCTTCCTAGTTTCTGGTGGCTGCTGGCAACCCTTCGTGTTCCTTTATGGCTGTGTCACTCCAGTTTCTGCCCCCATCATTACATGACCTCCTCTCTGTGGGTCTCTGTATGTCCTCTTTTTTTTTTTTTTTTTTTTTTTGAGACAGAGTCTTACTCTTTCGCCCAGGCTGGAGTGCCTTAGTGCGATCTCAGCTCTCTGCAACCTCTGTCTCCAGGTTCAAGCAATTCTCCTGCCTCAGCCTCCCTAGTAGCTGGGATTACAGATGCATGCCACCACACCCAGCTGCTTTTTGTATTTTTAGTAGAGATGGGGTTTCACTTTGTTGGTCAGGCTGGTCTTGAACTCCTGACCTCAAATGATCTGCCTGTGTCGGCCTCCCAAAGTGCTTGGGATTACAGGCATGAGCCAAAGAGGCCTGGCCACCTCTTTTCTTATACGGATACCAGAATGACTCTACCTTAACTAAACATAACATCTGCAAAGACCCTGTGTCCAAATAAGGTCACATTCTGAAGTTCTGGGTAGATGCAAATTTTGGGAGACACTATTTGCCACTACGGGTTTCTCCAAGTTGTGCTGTCCTGTGACTGAACACTGACCCTGTTGCTCTATTGTGGATGTTTGGGAGGATTAACACCAGCCTGTTTCTGCTCACACTGCTTTCTCTGCCTGGAATTTGCTTTACAGCCTGTTGTCCCTGGTGTGACCATCATTCATCTTGTAAGACTTAACAGCCCAGCTTTGCCCCCACTGTGGATTCCTGGCGGAAACACCGTCCCTCTTCTCCTGGTGTACATGTTCTGTCACGGTCTCTGGATCACTTGACTGTACTTATTGTTTGAGTGGTACTCACCACTAATTGGAGCATAAGCCCTTGAAGGTTTCATTTACATCCGGGTCTCAGTACTAGGGACTTAGCAGGTGCTCATAATTCATAACTGCTTTTGATGGAGTTGGAGAGGCTAGTTATTAAGATTTTTCTGAATGTAGCATCCTTAGCTGGCTTTCAGGATAGTGACTGCATGCTCTAAAAGGAGATCTTTGCAGATTTTACTCTATGATGAGAAAACTTTTTAGTACCTTTTTCTGTCAGCATGTACCTAGGTAATAAGAAGAATGACGTGACATGTATTTGGGGAATTAGCATACAAGAAGTACTTTGACATTTTCCACGTGTGAAGAAAGCTGTTTTTATTGACCTAGCTGAAGGGATCAAATTCATATTTGAAAAGATGTGGCTAAAACTTGAAAAGGACTTGTCCATGGGGGATGTCTTCATCCCTTCTCCCTGCTCAGGGAGAAGTTGCAGGCCATAAAGTGTGAGGGCATCAGCCTGGATGAGCTCTAAAGCTCAATGTTAATTTCGGATTTTAATAACATGTAAGAATACAAGTTTGATTGCAGAAGCCACAATCAAGTTCAAGGGAGGGTGAATGAATGGGAAGTAGAAGGGACCATGTGTACGTCTGTGTGTGTGTGTGCATGTATGCACACATTTTTTTAGGCTTTGGTTCCTTTAGCTATAAAATAAGGGAGTTAAAATGGGGGTGGTTTATGGTCACGATTGTTTCCTTCAAGTATCTAAAAGGCTTTCACCAGCAAGGGGGACTTGGGTTCAGAACAAGGCATTGGGTGAACCAGTGGTAGAACTGAAAGGTCGTTTCAGTGTGAAAGTGAACTTTCTTTTCAAGCAAAGGCTGGATGTCATTTGGGAGCTTGTAAAAAGAATTGTTATATGGCTGCCAGGCGCAATGGCTCACACCTGTAATCCTAGCACTTTGGTAGGCTGAAATGGATGGATCCCTTAAGGCCAGAAGTTTGAGCCAGCCTGAGCAACATGGTGAAACCCCGTCTCTACTAAAATACAAAAAATGAGCCAGGCGTGTGGCATGTGCCTGTAGTCCCAGCTACTCCGGTAGGCCCGAGGCACGAGAATTGCTTGAACCTGGGAAGCAGAGGTTGCAGTTAGCTGAGATTGCACCGCTGCACTCCATCCTGGACGACCAAAGGAGACTGTCTCAAAAATAAAAATAAAAATTGGCCAGGTTTGGTGACTCACACCTGTAATCCCAGCACTTTGGGAGGCCAAGGCAGGTGGATCATCTGAGGTCAGGAGTGCAAGACCAGCCTGGCCAACATGGCGAAACCCCATCTCCACTAAAAATCAGCCAGGCATTGGTGGCGTGCGCCTGTAATCCCAGCTACTTGGGATTTGAGGCAGGAGAATTGCTCGGACCTGGGAGACAGAGGTGCAGTGAGCTGAGATACGCCACTGCACTCCAGCTTGGGCGACAGAGCGAGACTCCATCCCTAAATAAATAAAATAATTAAAAAAAACAATTGGTATATGGATTGGGAGATTGGACTGATGGACTCTTGATTTCTCTGATTCTTGAATCTCTGCTTGGGTGAACACTAAGATTCCTTCCATTTCTAAAATTTTGAGGTTCAGTATAAATAGAAATGGGGTCATGCTTTTGTGGCTAGGAGCTGTGTGTATATTCTAGCCACCAA"; + + let ranges = symmetric_dust(read); + let expected_ranges = vec![ + 742..808, + 3169..3223, + 3406..3413, + 3424..3431, + 3437..3444, + 3729..3764, + 4729..4736, + 5831..5862, + 6449..6456, + 7014..7031, + 8194..8201, + 12955..12963, + 12971..13033, + 13369..13376, + 17841..17864, + 19193..19207, + 19221..19228, + 19746..19763, + 20037..20063 + ]; + assert_eq!(ranges, expected_ranges); + } } diff --git a/wdl/HidiveCorrect.wdl b/wdl/HidiveCorrect.wdl index 929541f4..80d0d42c 100644 --- a/wdl/HidiveCorrect.wdl +++ b/wdl/HidiveCorrect.wdl @@ -30,8 +30,9 @@ workflow Hidive { call Correct { input: + locus = locus, model = model, - long_read_fasta = Fetch.fasta, + long_reads_bam = long_reads_bam, short_read_fasta = Rescue.fasta, } @@ -71,7 +72,7 @@ task Fetch { } runtime { - docker: "us.gcr.io/broad-dsp-lrma/lr-hidive:0.1.77" + docker: "us.gcr.io/broad-dsp-lrma/lr-hidive:kvg_phase" memory: "2 GB" cpu: num_cpus disks: "local-disk ~{disk_size_gb} SSD" @@ -120,7 +121,7 @@ task Rescue { } runtime { - docker: "us.gcr.io/broad-dsp-lrma/lr-hidive:0.1.77" + docker: "us.gcr.io/broad-dsp-lrma/lr-hidive:kvg_phase" memory: "~{num_cpus} GB" cpu: num_cpus disks: "local-disk ~{disk_size_gb} SSD" @@ -130,29 +131,29 @@ task Rescue { task Correct { input { File model - File long_read_fasta + String long_reads_bam File short_read_fasta + String locus String prefix = "out" Int num_cpus = 4 } - Int disk_size_gb = 1 + 2*ceil(size([model, long_read_fasta, short_read_fasta], "GB")) + Int disk_size_gb = 1 + 2*ceil(size([model, short_read_fasta], "GB")) command <<< set -euxo pipefail - hidive correct -m ~{model} -g {prefix}.gfa ~{long_read_fasta} ~{short_read_fasta} > ~{prefix}.fa + hidive correct -l "~{locus}" -m ~{model} ~{long_reads_bam} ~{short_read_fasta} > ~{prefix}.fa >>> output { File fasta = "~{prefix}.fa" - File gfa = "~{prefix}.gfa" } runtime { - docker: "us.gcr.io/broad-dsp-lrma/lr-hidive:0.1.85" + docker: "us.gcr.io/broad-dsp-lrma/lr-hidive:kvg_phase" memory: "4 GB" cpu: num_cpus disks: "local-disk ~{disk_size_gb} SSD" @@ -182,7 +183,7 @@ task Align { } runtime { - docker: "us.gcr.io/broad-dsp-lrma/lr-hidive:0.1.78" + docker: "us.gcr.io/broad-dsp-lrma/lr-hidive:kvg_phase" memory: "32 GB" cpu: num_cpus disks: "local-disk 100 SSD"