From 250c0291149b19ca1501355ebf7e5587cce74eda Mon Sep 17 00:00:00 2001 From: Eoghan Harrington Date: Tue, 10 Jan 2023 15:32:28 -0500 Subject: [PATCH 01/11] chore(lib.rs): apply cargo fix --- src/lib.rs | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 7f65e0e..d662108 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,5 @@ use std::cell::RefCell; -use std::io::BufRead; + use std::io::Read; use std::mem::MaybeUninit; use std::num::NonZeroI32; @@ -456,7 +456,7 @@ impl Aligner { let mut idx: MaybeUninit<*mut mm_idx_t> = MaybeUninit::uninit(); - let mut idx_reader = unsafe { idx_reader.assume_init() }; + let idx_reader = unsafe { idx_reader.assume_init() }; unsafe { // Just a test read? Just following: https://github.com/lh3/minimap2/blob/master/python/mappy.pyx#L147 @@ -480,8 +480,8 @@ impl Aligner { /// Map a single sequence to an index /// not implemented yet! - pub fn with_seq(mut self, seq: &[u8]) -> Result { - let seq = match std::ffi::CString::new(seq) { + pub fn with_seq(self, seq: &[u8]) -> Result { + let _seq = match std::ffi::CString::new(seq) { Ok(seq) => seq, Err(_) => return Err("Invalid sequence"), }; @@ -625,7 +625,7 @@ impl Aligner { let mut m_cs_string: libc::c_int = 0i32; let cs_str = if cs { - let cs_len = mm_gen_cs( + let _cs_len = mm_gen_cs( km, &mut cs_string, &mut m_cs_string, @@ -644,7 +644,7 @@ impl Aligner { }; let md_str = if md { - let md_len = mm_gen_MD( + let _md_len = mm_gen_MD( km, &mut cs_string, &mut m_cs_string, @@ -762,7 +762,7 @@ impl Aligner { } // If gzipped, open it with a reader... - let mut reader: Box = if compression_type == CompressionType::GZIP { + let reader: Box = if compression_type == CompressionType::GZIP { Box::new(GzDecoder::new(std::fs::File::open(file).unwrap())) } else { Box::new(std::fs::File::open(file).unwrap()) @@ -909,7 +909,7 @@ mod tests { #[test] fn test_builder() { - let mut aligner = Aligner::builder().preset(Preset::MapOnt); + let _aligner = Aligner::builder().preset(Preset::MapOnt); } #[test] @@ -930,7 +930,7 @@ mod tests { let mappings = aligner.map("ACGGTAGAGAGGAAGAAGAAGGAATAGCGGACTTGTGTATTTTATCGTCATTCGTGGTTATCATATAGTTTATTGATTTGAAGACTACGTAAGTAATTTGAGGACTGATTAAAATTTTCTTTTTTAGCTTAGAGTCAATTAAAGAGGGCAAAATTTTCTCAAAAGACCATGGTGCATATGACGATAGCTTTAGTAGTATGGATTGGGCTCTTCTTTCATGGATGTTATTCAGAAGGAGTGATATATCGAGGTGTTTGAAACACCAGCGACACCAGAAGGCTGTGGATGTTAAATCGTAGAACCTATAGACGAGTTCTAAAATATACTTTGGGGTTTTCAGCGATGCAAAA".as_bytes(), false, false, None, None).unwrap(); println!("{:#?}", mappings); - let mut aligner = aligner.with_cigar(); + let aligner = aligner.with_cigar(); aligner .map( From 44326fcfbe8fa181c25d4cb2a4d4bf7562b136d6 Mon Sep 17 00:00:00 2001 From: Eoghan Harrington Date: Tue, 10 Jan 2023 17:08:07 -0500 Subject: [PATCH 02/11] feat(htslib): [wip] convert Mapping -> bam::Record WIP to convert Mapping to a bam record, can be enabled with the `htslib` feature flag. In order to get it to compile on linux I needed to move the flate2-dependent code behind a `map-file` feature flag. --- Cargo.toml | 24 ++++++++++----- src/htslib.rs | 78 +++++++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 12 +++++++- tests/htslib.rs | 58 ++++++++++++++++++++++++++++++++++++ 4 files changed, 163 insertions(+), 9 deletions(-) create mode 100644 src/htslib.rs create mode 100644 tests/htslib.rs diff --git a/Cargo.toml b/Cargo.toml index fe0398a..b15819a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,25 +9,33 @@ repository = "https://github.com/jguhlin/minimap2-rs" categories = ["science"] keywords = ["bioinformatics", "fasta", "alignment", "fastq"] exclude = [ - "**/*.fasta", - "libsfasta/test_data/", - "*.profdata", - "*.mmi", - "**/*.mmi", - "minimap2-sys/" + "**/*.fasta", + "libsfasta/test_data/", + "*.profdata", + "*.mmi", + "**/*.mmi", + "minimap2-sys/", ] [dependencies] libc = "0.2.134" bytelines = "2.4.0" -simdutf8 = "0.1.4" -flate2 = { version = "1.0.17", features = ["zlib-ng"], default-features = false } +simdutf8 = { version = "0.1.4", optional = true } +flate2 = { version = "1.0.17", features = [ + "zlib-ng", +], default-features = false, optional = true } # Dep for development #minimap2-sys = { path = "./minimap2-sys" } minimap2-sys = "0.1.7" fffx = "0.1.1" +rust-htslib = { version = "0.40.2", optional = true } + +[features] +default = [] +htslib = ['rust-htslib'] +map-file = ['flate2', 'simdutf8'] # [profile.release] # opt-level = 3 diff --git a/src/htslib.rs b/src/htslib.rs new file mode 100644 index 0000000..84e4df0 --- /dev/null +++ b/src/htslib.rs @@ -0,0 +1,78 @@ +use rust_htslib::bam::record::{Cigar, CigarString}; +use rust_htslib::bam::{Header, Record}; +// bam, bam::header::HeaderRecord, bam::record::Aux, bam::CompressionLevel, bam::Format, +// bam::Header, bam::HeaderView, bam::Read, errors, tpool::ThreadPool, +//}; +use crate::{Alignment, Mapping}; + +fn cigar_to_cigarstr(cigar: &Vec<(u32, u8)>) -> CigarString { + let op_vec: Vec = cigar + .to_owned() + .iter() + .map(|(len, op)| match op { + 0 => Cigar::Match(*len), + 1 => Cigar::Ins(*len), + 2 => Cigar::Del(*len), + 3 => Cigar::RefSkip(*len), + 4 => Cigar::SoftClip(*len), + 5 => Cigar::HardClip(*len), + 6 => Cigar::Pad(*len), + 7 => Cigar::Equal(*len), + 8 => Cigar::Diff(*len), + _ => panic!("Unexpected cigar operation"), + }) + .collect(); + CigarString(op_vec) +} + +pub fn mapping_to_record( + mapping: Option<&Mapping>, + seq: &[u8], + header: Header, + qual: Option<&[u8]>, + query_name: Option<&[u8]>, +) -> Record { + let mut rec = Record::new(); + let qname = query_name.unwrap_or(b"query"); + // FIXFIX: there's probably a better way of setting a default value + // for the quality string + let qual = match qual { + Some(q) => Vec::from(q), + None => { + let q = vec![255; seq.len()]; // Vec::with_capacity(seq.len()); + q + } + }; + + let cigar: Option = mapping + .and_then(|m| m.alignment.clone()) // FIXFIX: we probably don't need a clone here + .and_then(|a| a.cigar) + .map(|c| cigar_to_cigarstr(&c)); + + match mapping { + Some(m) => { + // TODO: set strand + // TODO: set secondary/supplementary flags + rec.set_pos(m.target_start as i64); + rec.set_mapq(m.mapq as u8); + rec.set_mpos(-1); + // TODO: set tid from sequences listed in header + rec.set_mtid(-1); + rec.set_insert_size(0); + } + None => { + rec.set_unmapped(); + rec.set_tid(-1); + rec.set_pos(-1); + rec.set_mapq(255); + rec.set_mpos(-1); + rec.set_mtid(-1); + rec.set_insert_size(-1); + } + }; + rec.set(qname, cigar.as_ref(), seq, &qual[..]); + // TODO: set AUX flags for cs/md if available + rec +} + + diff --git a/src/lib.rs b/src/lib.rs index d662108..31c303f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -5,10 +5,17 @@ use std::mem::MaybeUninit; use std::num::NonZeroI32; use std::path::Path; -use flate2::read::GzDecoder; use minimap2_sys::*; + + +#[cfg(feature="map-file")] +use flate2::read::GzDecoder; +#[cfg(feature="map-file")] use simdutf8::basic::from_utf8; +#[cfg(feature="htslib")] +pub mod htslib; + /// Alias for mm_mapop_t pub type MapOpt = mm_mapopt_t; @@ -716,6 +723,7 @@ impl Aligner { /// /// TODO: Remove cs and md and make them options on the struct /// + #[cfg(feature="map-file")] pub fn map_file(&self, file: &str, cs: bool, md: bool) -> Result, &'static str> { // Make sure index is set if self.idx.is_none() { @@ -827,7 +835,9 @@ pub enum FileFormat { FASTQ, } + #[allow(dead_code)] +#[cfg(feature="map-file")] pub fn detect_file_format(buffer: &[u8]) -> Result { let buffer = from_utf8(&buffer).expect("Unable to parse file as UTF-8"); if buffer.starts_with(">") { diff --git a/tests/htslib.rs b/tests/htslib.rs new file mode 100644 index 0000000..f3488b9 --- /dev/null +++ b/tests/htslib.rs @@ -0,0 +1,58 @@ +#[cfg(test)] +#[cfg(feature="htslib")] +mod tests { + use minimap2::{Aligner, Preset}; + use minimap2::htslib::{mapping_to_record}; + + use rust_htslib::bam::header::{Header, HeaderRecord}; + use rust_htslib::bam::{Format, Read, Reader, Writer}; + + #[test] + fn test_mappy_output() { + let seq = b"atCCTACACTGCATAAACTATTTTGcaccataaaaaaaagGGACatgtgtgGGTCTAAAATAATTTGCTGAGCAATTAATGATTTCTAAATGATGCTAAAGTGAACCATTGTAatgttatatgaaaaataaatacacaattaagATCAACACAGTGAAATAACATTGATTGGGTGATTTCAAATGGGGTCTATctgaataatgttttatttaacagtaatttttatttctatcaatttttagtaatatctacaaatattttgttttaggcTGCCAGAAGATCGGCGGTGCAAGGTCAGAGGTGAGATGTTAGGTGGTTCCACCAACTGCACGGAAGAGCTGCCCTCTGTCATTCAAAATTTGACAGGTACAAACAGactatattaaataagaaaaacaaactttttaaaggCTTGACCATTAGTGAATAGGTTATATGCTTATTATTTCCATTTAGCTTTTTGAGACTAGTATGATTAGACAAATCTGCTTAGttcattttcatataatattgaGGAACAAAATTTGTGAGATTTTGCTAAAATAACTTGCTTTGCTTGTTTATAGAGGCacagtaaatcttttttattattattataattttagattttttaatttttaaat"; + + let aligner = Aligner::builder() + .preset(Preset::MapOnt) + .with_threads(1) + .with_index("test_data/test_data.fasta", None) + .unwrap() + .with_cigar(); + + let mut mappings = aligner.map(seq, true, true, None, None).unwrap(); + assert_eq!(mappings.len(), 1); + + let mut header = Header::new(); + // TODO: would be nice to get this from the aligner index + header.push_record( + HeaderRecord::new(b"SQ") + .push_tag(b"SN", &String::from("contig4")) + .push_tag(b"LN", &3360), + ); + + let observed = mappings.pop().unwrap(); + let o = mapping_to_record(Some(&observed), seq, header.clone(), None, Some(b"q1")); + + let mut sam_reader = Reader::from_path("../test_data/query_vs_test_data.sam").unwrap(); + let e = sam_reader.records().next().unwrap().unwrap(); + + assert_eq!(o.cigar(), e.cigar()); + assert_eq!(o.inner().core.pos, e.inner().core.pos); + assert_eq!(o.inner().core.mpos, e.inner().core.mpos); + assert_eq!(o.inner().core.mtid, e.inner().core.mtid); + assert_eq!(o.inner().core.tid, e.inner().core.tid); + // the bin attribute is associated with BAM format, so I don't think we need to set it + // assert_eq!(o.inner().core.bin, e.inner().core.bin); + assert_eq!(o.inner().core.qual, e.inner().core.qual); + assert_eq!(o.inner().core.l_extranul, e.inner().core.l_extranul); + assert_eq!(o.inner().core.flag, e.inner().core.flag); + assert_eq!(o.inner().core.l_qname, e.inner().core.l_qname); + assert_eq!(o.inner().core.n_cigar, e.inner().core.n_cigar); + assert_eq!(o.inner().core.l_qseq, e.inner().core.l_qseq); + assert_eq!(o.inner().core.isize, e.inner().core.isize); + + let mut writer = + Writer::from_path("test_data/query_vs_target.bam", &header, Format::Bam).unwrap(); + writer.write(&o).unwrap(); + } +} + From 6b90f083f8da8905efd5d00841d94da585f9877c Mon Sep 17 00:00:00 2001 From: Joseph Guhlin Date: Wed, 11 Jan 2023 12:47:51 +1300 Subject: [PATCH 03/11] Compiles --- Cargo.toml | 11 +++++++---- src/lib.rs | 1 + 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index b15819a..53c62c2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,22 +20,25 @@ exclude = [ [dependencies] libc = "0.2.134" -bytelines = "2.4.0" + simdutf8 = { version = "0.1.4", optional = true } + flate2 = { version = "1.0.17", features = [ - "zlib-ng", + "zlib" ], default-features = false, optional = true } # Dep for development #minimap2-sys = { path = "./minimap2-sys" } minimap2-sys = "0.1.7" -fffx = "0.1.1" + +fffx = {version = "0.1.2", optional = true } +#fffx = { path = "../fffx", optional=true } rust-htslib = { version = "0.40.2", optional = true } [features] default = [] htslib = ['rust-htslib'] -map-file = ['flate2', 'simdutf8'] +map-file = ['flate2', 'simdutf8', 'fffx'] # [profile.release] # opt-level = 3 diff --git a/src/lib.rs b/src/lib.rs index 31c303f..4d3c175 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -22,6 +22,7 @@ pub type MapOpt = mm_mapopt_t; /// Alias for mm_idxopt_t pub type IdxOpt = mm_idxopt_t; +#[cfg(feature="map-file")] pub use fffx::{Fasta, Fastq, Sequence}; // TODO: Probably a better way to handle this... From 05893dade78ca81358017cd381c4f6e356688013 Mon Sep 17 00:00:00 2001 From: Joseph Guhlin Date: Wed, 11 Jan 2023 12:53:06 +1300 Subject: [PATCH 04/11] Compiles --- Cargo.toml | 11 +++++++---- src/lib.rs | 1 + 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index b15819a..53c62c2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,22 +20,25 @@ exclude = [ [dependencies] libc = "0.2.134" -bytelines = "2.4.0" + simdutf8 = { version = "0.1.4", optional = true } + flate2 = { version = "1.0.17", features = [ - "zlib-ng", + "zlib" ], default-features = false, optional = true } # Dep for development #minimap2-sys = { path = "./minimap2-sys" } minimap2-sys = "0.1.7" -fffx = "0.1.1" + +fffx = {version = "0.1.2", optional = true } +#fffx = { path = "../fffx", optional=true } rust-htslib = { version = "0.40.2", optional = true } [features] default = [] htslib = ['rust-htslib'] -map-file = ['flate2', 'simdutf8'] +map-file = ['flate2', 'simdutf8', 'fffx'] # [profile.release] # opt-level = 3 diff --git a/src/lib.rs b/src/lib.rs index 31c303f..4d3c175 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -22,6 +22,7 @@ pub type MapOpt = mm_mapopt_t; /// Alias for mm_idxopt_t pub type IdxOpt = mm_idxopt_t; +#[cfg(feature="map-file")] pub use fffx::{Fasta, Fastq, Sequence}; // TODO: Probably a better way to handle this... From b175d4efb97fd58195d1b663492b503a60a1f3c1 Mon Sep 17 00:00:00 2001 From: Joseph Guhlin Date: Wed, 11 Jan 2023 12:56:25 +1300 Subject: [PATCH 05/11] Fmt, fix, update --- src/htslib.rs | 2 -- src/lib.rs | 14 ++++++-------- tests/htslib.rs | 5 ++--- 3 files changed, 8 insertions(+), 13 deletions(-) diff --git a/src/htslib.rs b/src/htslib.rs index 84e4df0..b440415 100644 --- a/src/htslib.rs +++ b/src/htslib.rs @@ -74,5 +74,3 @@ pub fn mapping_to_record( // TODO: set AUX flags for cs/md if available rec } - - diff --git a/src/lib.rs b/src/lib.rs index 4d3c175..2fa2edf 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -7,13 +7,12 @@ use std::path::Path; use minimap2_sys::*; - -#[cfg(feature="map-file")] +#[cfg(feature = "map-file")] use flate2::read::GzDecoder; -#[cfg(feature="map-file")] +#[cfg(feature = "map-file")] use simdutf8::basic::from_utf8; -#[cfg(feature="htslib")] +#[cfg(feature = "htslib")] pub mod htslib; /// Alias for mm_mapop_t @@ -22,7 +21,7 @@ pub type MapOpt = mm_mapopt_t; /// Alias for mm_idxopt_t pub type IdxOpt = mm_idxopt_t; -#[cfg(feature="map-file")] +#[cfg(feature = "map-file")] pub use fffx::{Fasta, Fastq, Sequence}; // TODO: Probably a better way to handle this... @@ -724,7 +723,7 @@ impl Aligner { /// /// TODO: Remove cs and md and make them options on the struct /// - #[cfg(feature="map-file")] + #[cfg(feature = "map-file")] pub fn map_file(&self, file: &str, cs: bool, md: bool) -> Result, &'static str> { // Make sure index is set if self.idx.is_none() { @@ -836,9 +835,8 @@ pub enum FileFormat { FASTQ, } - #[allow(dead_code)] -#[cfg(feature="map-file")] +#[cfg(feature = "map-file")] pub fn detect_file_format(buffer: &[u8]) -> Result { let buffer = from_utf8(&buffer).expect("Unable to parse file as UTF-8"); if buffer.starts_with(">") { diff --git a/tests/htslib.rs b/tests/htslib.rs index f3488b9..b24caa6 100644 --- a/tests/htslib.rs +++ b/tests/htslib.rs @@ -1,8 +1,8 @@ #[cfg(test)] -#[cfg(feature="htslib")] +#[cfg(feature = "htslib")] mod tests { + use minimap2::htslib::mapping_to_record; use minimap2::{Aligner, Preset}; - use minimap2::htslib::{mapping_to_record}; use rust_htslib::bam::header::{Header, HeaderRecord}; use rust_htslib::bam::{Format, Read, Reader, Writer}; @@ -55,4 +55,3 @@ mod tests { writer.write(&o).unwrap(); } } - From 21b4897d42d861fb62146b0d9d4bd9bbb05fb7a1 Mon Sep 17 00:00:00 2001 From: Eoghan Harrington Date: Tue, 10 Jan 2023 19:48:47 -0500 Subject: [PATCH 06/11] add missing test file --- test_data/query_vs_test_data.sam | 3 +++ tests/htslib.rs | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) create mode 100644 test_data/query_vs_test_data.sam diff --git a/test_data/query_vs_test_data.sam b/test_data/query_vs_test_data.sam new file mode 100644 index 0000000..1aaf50e --- /dev/null +++ b/test_data/query_vs_test_data.sam @@ -0,0 +1,3 @@ +@SQ SN:contig4 LN:3360 +@PG ID:minimap2 PN:minimap2 VN:2.24-r1122 CL:minimap2 -ax map-ont -cs --MD test_data.fasta query.fa +q1 0 contig4 1529 60 39M2I582M * 0 0 atCCTACACTGCATAAACTATTTTGcaccataaaaaaaagGGACatgtgtgGGTCTAAAATAATTTGCTGAGCAATTAATGATTTCTAAATGATGCTAAAGTGAACCATTGTAatgttatatgaaaaataaatacacaattaagATCAACACAGTGAAATAACATTGATTGGGTGATTTCAAATGGGGTCTATctgaataatgttttatttaacagtaatttttatttctatcaatttttagtaatatctacaaatattttgttttaggcTGCCAGAAGATCGGCGGTGCAAGGTCAGAGGTGAGATGTTAGGTGGTTCCACCAACTGCACGGAAGAGCTGCCCTCTGTCATTCAAAATTTGACAGGTACAAACAGactatattaaataagaaaaacaaactttttaaaggCTTGACCATTAGTGAATAGGTTATATGCTTATTATTTCCATTTAGCTTTTTGAGACTAGTATGATTAGACAAATCTGCTTAGttcattttcatataatattgaGGAACAAAATTTGTGAGATTTTGCTAAAATAACTTGCTTTGCTTGTTTATAGAGGCacagtaaatcttttttattattattataattttagattttttaatttttaaat * NM:i:4 ms:i:1223 AS:i:1222 nn:i:0 tp:A:P cm:i:114 s1:i:590 s2:i:0 de:f:0.0048 rl:i:0 diff --git a/tests/htslib.rs b/tests/htslib.rs index b24caa6..d593a31 100644 --- a/tests/htslib.rs +++ b/tests/htslib.rs @@ -32,7 +32,7 @@ mod tests { let observed = mappings.pop().unwrap(); let o = mapping_to_record(Some(&observed), seq, header.clone(), None, Some(b"q1")); - let mut sam_reader = Reader::from_path("../test_data/query_vs_test_data.sam").unwrap(); + let mut sam_reader = Reader::from_path("test_data/query_vs_test_data.sam").unwrap(); let e = sam_reader.records().next().unwrap().unwrap(); assert_eq!(o.cigar(), e.cigar()); From 7364af6795ab87b65c9784b0b3ab5a5e2eb76899 Mon Sep 17 00:00:00 2001 From: Eoghan Harrington Date: Wed, 11 Jan 2023 08:15:22 -0500 Subject: [PATCH 07/11] build(Cargo.toml): make map-file default feature --- .gitignore | 3 ++- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 24a359c..c53f643 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,5 @@ /Cargo.lock test.mmi */target -/*.mmi \ No newline at end of file +/*.mmi +test_data/*.bam diff --git a/Cargo.toml b/Cargo.toml index 53c62c2..fb7dbc1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -36,7 +36,7 @@ fffx = {version = "0.1.2", optional = true } rust-htslib = { version = "0.40.2", optional = true } [features] -default = [] +default = ['map-file'] htslib = ['rust-htslib'] map-file = ['flate2', 'simdutf8', 'fffx'] From a5817fd275eb2dbe2937e68cbfccacd32ebfb480 Mon Sep 17 00:00:00 2001 From: Eoghan Harrington Date: Thu, 12 Jan 2023 12:43:13 -0500 Subject: [PATCH 08/11] WIP --- Cargo.toml | 3 +++ src/htslib.rs | 46 ++++++++++++++++++------------------ test_data/cDNA_reads.fa | 4 ++++ test_data/cDNA_vs_genome.sam | 6 +++++ test_data/gDNA_reads.fa | 12 ++++++++++ test_data/gDNA_vs_genome.sam | 12 ++++++++++ test_data/genome.fa | 2 ++ test_data/genome.fasta | 1 + test_data/query.fa | 9 ++++++- tests/htslib.rs | 24 +++++++++++++++++-- 10 files changed, 93 insertions(+), 26 deletions(-) create mode 100644 test_data/cDNA_reads.fa create mode 100644 test_data/cDNA_vs_genome.sam create mode 100644 test_data/gDNA_reads.fa create mode 100644 test_data/gDNA_vs_genome.sam create mode 100644 test_data/genome.fa create mode 100644 test_data/genome.fasta diff --git a/Cargo.toml b/Cargo.toml index fb7dbc1..ada0f22 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -50,3 +50,6 @@ debug = true [profile.dev.package."*"] opt-level = 3 + +[dev-dependencies] +cargo-nextest = "0.9.48" diff --git a/src/htslib.rs b/src/htslib.rs index b440415..6f76992 100644 --- a/src/htslib.rs +++ b/src/htslib.rs @@ -1,30 +1,7 @@ use rust_htslib::bam::record::{Cigar, CigarString}; use rust_htslib::bam::{Header, Record}; -// bam, bam::header::HeaderRecord, bam::record::Aux, bam::CompressionLevel, bam::Format, -// bam::Header, bam::HeaderView, bam::Read, errors, tpool::ThreadPool, -//}; use crate::{Alignment, Mapping}; -fn cigar_to_cigarstr(cigar: &Vec<(u32, u8)>) -> CigarString { - let op_vec: Vec = cigar - .to_owned() - .iter() - .map(|(len, op)| match op { - 0 => Cigar::Match(*len), - 1 => Cigar::Ins(*len), - 2 => Cigar::Del(*len), - 3 => Cigar::RefSkip(*len), - 4 => Cigar::SoftClip(*len), - 5 => Cigar::HardClip(*len), - 6 => Cigar::Pad(*len), - 7 => Cigar::Equal(*len), - 8 => Cigar::Diff(*len), - _ => panic!("Unexpected cigar operation"), - }) - .collect(); - CigarString(op_vec) -} - pub fn mapping_to_record( mapping: Option<&Mapping>, seq: &[u8], @@ -74,3 +51,26 @@ pub fn mapping_to_record( // TODO: set AUX flags for cs/md if available rec } + + +fn cigar_to_cigarstr(cigar: &Vec<(u32, u8)>) -> CigarString { + let op_vec: Vec = cigar + .to_owned() + .iter() + .map(|(len, op)| match op { + 0 => Cigar::Match(*len), + 1 => Cigar::Ins(*len), + 2 => Cigar::Del(*len), + 3 => Cigar::RefSkip(*len), + 4 => Cigar::SoftClip(*len), + 5 => Cigar::HardClip(*len), + 6 => Cigar::Pad(*len), + 7 => Cigar::Equal(*len), + 8 => Cigar::Diff(*len), + _ => panic!("Unexpected cigar operation"), + }) + .collect(); + CigarString(op_vec) +} + + diff --git a/test_data/cDNA_reads.fa b/test_data/cDNA_reads.fa new file mode 100644 index 0000000..d4b8f75 --- /dev/null +++ b/test_data/cDNA_reads.fa @@ -0,0 +1,4 @@ +>cdna.fwd el:Z:chr1:0:1720:1 +ATGCCTAGAAGTGTGTGATCGCATTGCTGCCAAGTATTCGATGCATCTGTTACCCAGAGGTGCTCCTCACTACAGCCAGGTCATGGACTTCTTCTCAGGAAAGTCTGGTCGCGGTCGTGGTTGTCCGAGAAACGCATCACCCACAGATAAAATCAGTTATTACAGTTGGACCTTCATGTCAAACCAGAGACCCGTATTTCAAATCGCACATACTGCGTCGTGCAATGCCGGGCGCTAACGGCTCAATATCACGCTGCGTCACTATGGCTACCCCAAAGCGGGGGGGGCATCGACGGGCTGCATAGTGGCAGTATCGAGACGACGACCGTTAAAGAATTTCGGAGGCTGGGTGCTGTACTGTAATCGCCTATTGCAGCACTCAGATTTTAACCTGGTGCCG +>cdna.rev el:Z:chr1:360:1720:0 +CGGCACCAGGTTAAAATCTGAGTGCTGCAATAGGCGATTACAGTACAGCACCCAGCCTCCGAAATTCTTTAACGGTCGTCGTCTCGATACTGCCACTATGGTTGGTAGGTCCCAATCTTCGAGCACAGCGGCTCTAACGTGGCCGAAAGCAGTGCCACGATCCGCATGATCTTATGAGCCATACCACATGTAGTTTAGAGCAGCCCGTCGATGCCCCCCCCGCTTTGGGGTAGCCATAGTGACGCAGCGTGATATTGAGCCGTTAGCGCCCGGCATTGCACGACGCAGTATGTGCGATTTTACTCCTGAAGCAGCCGTGAGTACTGTACTCTAATGCACACGTACTTGTTTATGCGAGAAGGTTACCGTCTAATTTTAATGGCCTAGGCGATGGCTATAA \ No newline at end of file diff --git a/test_data/cDNA_vs_genome.sam b/test_data/cDNA_vs_genome.sam new file mode 100644 index 0000000..0a0c365 --- /dev/null +++ b/test_data/cDNA_vs_genome.sam @@ -0,0 +1,6 @@ +@HD VN:1.6 SO:queryname +@SQ SN:chr1 LN:1720 +@PG ID:minimap2 PN:minimap2 VN:2.24-r1122 CL:minimap2 -ay -x splice --MD --cs test_data/genome.fa test_data/cDNA_reads.fa +@PG ID:samtools PN:samtools PP:minimap2 VN:1.16.1 CL:samtools sort -n -o test_data/cDNA_vs_genome.sam +cdna.fwd 0 chr1 1 60 100M620N100M80N100M620N100M * 0 0 ATGCCTAGAAGTGTGTGATCGCATTGCTGCCAAGTATTCGATGCATCTGTTACCCAGAGGTGCTCCTCACTACAGCCAGGTCATGGACTTCTTCTCAGGAAAGTCTGGTCGCGGTCGTGGTTGTCCGAGAAACGCATCACCCACAGATAAAATCAGTTATTACAGTTGGACCTTCATGTCAAACCAGAGACCCGTATTTCAAATCGCACATACTGCGTCGTGCAATGCCGGGCGCTAACGGCTCAATATCACGCTGCGTCACTATGGCTACCCCAAAGCGGGGGGGGCATCGACGGGCTGCATAGTGGCAGTATCGAGACGACGACCGTTAAAGAATTTCGGAGGCTGGGTGCTGTACTGTAATCGCCTATTGCAGCACTCAGATTTTAACCTGGTGCCG * NM:i:0 ms:i:400 AS:i:304 nn:i:0 ts:A:+ tp:A:P cm:i:124 s1:i:372 s2:i:87 de:f:0 MD:Z:400 rl:i:0 el:Z:chr1:0:1720:1 +cdna.rev 16 chr1 361 60 100M440N100M260N100M260N100M * 0 0 TTATAGCCATCGCCTAGGCCATTAAAATTAGACGGTAACCTTCTCGCATAAACAAGTACGTGTGCATTAGAGTACAGTACTCACGGCTGCTTCAGGAGTAAAATCGCACATACTGCGTCGTGCAATGCCGGGCGCTAACGGCTCAATATCACGCTGCGTCACTATGGCTACCCCAAAGCGGGGGGGGCATCGACGGGCTGCTCTAAACTACATGTGGTATGGCTCATAAGATCATGCGGATCGTGGCACTGCTTTCGGCCACGTTAGAGCCGCTGTGCTCGAAGATTGGGACCTACCAACCATAGTGGCAGTATCGAGACGACGACCGTTAAAGAATTTCGGAGGCTGGGTGCTGTACTGTAATCGCCTATTGCAGCACTCAGATTTTAACCTGGTGCCG * NM:i:0 ms:i:400 AS:i:304 nn:i:0 ts:A:- tp:A:P cm:i:120 s1:i:369 s2:i:0 de:f:0 MD:Z:400 rl:i:0 el:Z:chr1:360:1720:0 diff --git a/test_data/gDNA_reads.fa b/test_data/gDNA_reads.fa new file mode 100644 index 0000000..3498263 --- /dev/null +++ b/test_data/gDNA_reads.fa @@ -0,0 +1,12 @@ +>perfect_read.fwd el:Z:chr1:180:280:1 em:i:0 +TACGCCACACGGGCTACACTCTCGCCTTCTCGTCGCAACTACGAGCTGGACTATCGGCCGAGAGGATCTAACACGAGAAGTACTTGCCGGCAATCCCTAA +>perfect_read.rev el:Z:chr1:180:280:0 em:i:0 +TTAGGGATTGCCGGCAAGTACTTCTCGTGTTAGATCCTCTCGGCCGATAGTCCAGCTCGTAGTTGCGACGAGAAGGCGAGAGTGTAGCCCGTGTGGCGTA +>imperfect_read.fwd el:Z:chr1:180:280:1 em:i:5 +TACGCCACACAGGCTACACTCTCGCCTTCTCGTCGCAACTACGCGCTGGACTATCCGCCGAGAGGATCTAACACGTGAAGTACTTGCCGGCATTCCCTAA +>unmappable_read el:Z:chr1:180:280:1 em:i:99 +GTTTGTGAGGATCGCGGCGATGGTTTAGGGACGATAAGTATTATCGGAAGGGGCATATTCGTGTCGGAAGGTGAAGATGCCGAAAAAGCAATCGAAGCCG +>perfect_inv_duplicate el:Z:chr1:540:640:1 em:i:0 +GAAATACGGGTCTCTGGTTTGACATAAAGGTCCAACTGTAATAACTGATTTTATCTGTGGGTGATGCGTTTCTCGGACAACCACGACCGCGCCCAGACTT +>split_read +ATGCCTAGAAGTGTGTGATCGCATTGCTGCCAAGTATTCGATGCATCTGTTACCCAGAGGTGCTCCTCACTACAGCCAGGTCATGGACTTCTTCTCAGGACTACCCACCTGTTTCATGATCCCCCCTTTGTGAACAATAAACTTAGTAAACATTTTTACGATTAAATGTTTAACTCCTAC \ No newline at end of file diff --git a/test_data/gDNA_vs_genome.sam b/test_data/gDNA_vs_genome.sam new file mode 100644 index 0000000..5f351ef --- /dev/null +++ b/test_data/gDNA_vs_genome.sam @@ -0,0 +1,12 @@ +@HD VN:1.6 SO:queryname +@SQ SN:chr1 LN:1720 +@PG ID:minimap2 PN:minimap2 VN:2.24-r1122 CL:minimap2 -ay --MD --cs test_data/genome.fa test_data/gDNA_reads.fa +@PG ID:samtools PN:samtools PP:minimap2 VN:1.16.1 CL:samtools sort -n -o test_data/gDNA_vs_genome.sam +imperfect_read.fwd 0 chr1 181 27 100M * 0 0 TACGCCACACAGGCTACACTCTCGCCTTCTCGTCGCAACTACGCGCTGGACTATCCGCCGAGAGGATCTAACACGTGAAGTACTTGCCGGCATTCCCTAA * NM:i:5 ms:i:170 AS:i:170 nn:i:0 tp:A:P cm:i:6 s1:i:56 s2:i:0 de:f:0.05 MD:Z:10G32A11G19A16A7 rl:i:0 el:Z:chr1:180:280:1 em:i:5 +perfect_inv_duplicate 0 chr1 541 36 100M * 0 0 GAAATACGGGTCTCTGGTTTGACATAAAGGTCCAACTGTAATAACTGATTTTATCTGTGGGTGATGCGTTTCTCGGACAACCACGACCGCGCCCAGACTT * NM:i:0 ms:i:200 AS:i:200 nn:i:0 tp:A:P cm:i:15 s1:i:85 s2:i:71 de:f:0 MD:Z:100 rl:i:0 el:Z:chr1:540:640:1 em:i:0 +perfect_inv_duplicate 272 chr1 721 0 100M * 0 0 * * NM:i:2 ms:i:188 AS:i:188 nn:i:0 tp:A:S cm:i:11 s1:i:71 de:f:0.02 MD:Z:8T65C25 rl:i:0 el:Z:chr1:540:640:1 em:i:0 +perfect_read.fwd 0 chr1 181 60 100M * 0 0 TACGCCACACGGGCTACACTCTCGCCTTCTCGTCGCAACTACGAGCTGGACTATCGGCCGAGAGGATCTAACACGAGAAGTACTTGCCGGCAATCCCTAA * NM:i:0 ms:i:200 AS:i:200 nn:i:0 tp:A:P cm:i:17 s1:i:84 s2:i:0 de:f:0 MD:Z:100 rl:i:0 el:Z:chr1:180:280:1 em:i:0 +perfect_read.rev 16 chr1 181 60 100M * 0 0 TACGCCACACGGGCTACACTCTCGCCTTCTCGTCGCAACTACGAGCTGGACTATCGGCCGAGAGGATCTAACACGAGAAGTACTTGCCGGCAATCCCTAA * NM:i:0 ms:i:200 AS:i:200 nn:i:0 tp:A:P cm:i:17 s1:i:84 s2:i:0 de:f:0 MD:Z:100 rl:i:0 el:Z:chr1:180:280:0 em:i:0 +split_read 0 chr1 1 60 100M80S * 0 0 ATGCCTAGAAGTGTGTGATCGCATTGCTGCCAAGTATTCGATGCATCTGTTACCCAGAGGTGCTCCTCACTACAGCCAGGTCATGGACTTCTTCTCAGGACTACCCACCTGTTTCATGATCCCCCCTTTGTGAACAATAAACTTAGTAAACATTTTTACGATTAAATGTTTAACTCCTAC * NM:i:0 ms:i:200 AS:i:200 nn:i:0 tp:A:P cm:i:13 s1:i:90 s2:i:0 de:f:0 SA:Z:chr1,821,-,80M100S,52,0; MD:Z:100 rl:i:0 +split_read 2064 chr1 821 52 80M100H * 0 0 GTAGGAGTTAAACATTTAATCGTAAAAATGTTTACTAAGTTTATTGTTCACAAAGGGGGGATCATGAAACAGGTGGGTAG * NM:i:0 ms:i:160 AS:i:160 nn:i:0 tp:A:P cm:i:11 s1:i:70 s2:i:0 de:f:0 SA:Z:chr1,1,+,100M80S,60,0; MD:Z:80 rl:i:0 +unmappable_read 4 * 0 0 * * 0 0 GTTTGTGAGGATCGCGGCGATGGTTTAGGGACGATAAGTATTATCGGAAGGGGCATATTCGTGTCGGAAGGTGAAGATGCCGAAAAAGCAATCGAAGCCG * rl:i:0 el:Z:chr1:180:280:1 em:i:99 diff --git a/test_data/genome.fa b/test_data/genome.fa new file mode 100644 index 0000000..27c6b80 --- /dev/null +++ b/test_data/genome.fa @@ -0,0 +1,2 @@ +>chr1 +ATGCCTAGAAGTGTGTGATCGCATTGCTGCCAAGTATTCGATGCATCTGTTACCCAGAGGTGCTCCTCACTACAGCCAGGTCATGGACTTCTTCTCAGGAGTAATTTGCGCTGCGGAAAACGGCTGATGGGGAGTCGACCTACCTTAATATCTCCGAGGTTGCCCTCACAAATGGCGTAGTACGCCACACGGGCTACACTCTCGCCTTCTCGTCGCAACTACGAGCTGGACTATCGGCCGAGAGGATCTAACACGAGAAGTACTTGCCGGCAATCCCTAAGTACCTAGGCTCTCGGTCCACTATGACGCAGGACAGGGTTCAGTTAAAAGGCCTCTTCATGCGGTCTTAAGACCTTATAGTTATAGCCATCGCCTAGGCCATTAAAATTAGACGGTAACCTTCTCGCATAAACAAGTACGTGTGCATTAGAGTACAGTACTCACGGCTGCTTCAGGAGTAGTACATAGATGTCCGTTAGACTCTCACGTGTTTGCCACTCAGTTCCGACATGTTTATACGCAATCCTATTGAACGACTAGGAAATACGGGTCTCTGGTTTGACATAAAGGTCCAACTGTAATAACTGATTTTATCTGTGGGTGATGCGTTTCTCGGACAACCACGACCGCGCCCAGACTTGTACTCCGTTCGATGGTGACAGAAGGGTTATGGAGGGAGTGATTATCGCTTTTTACGGAAGGAAAGAGCCGGGTTTGTAGAAGTCTGGTCGCGGTCGTGGTTGTCCGAGAAACGCATCACCCACAGATAAAATCAGTTATTACAGTTGGACCTTCATGTCAAACCAGAGACCCGTATTTCGTAGGAGTTAAACATTTAATCGTAAAAATGTTTACTAAGTTTATTGTTCACAAAGGGGGGATCATGAAACAGGTGGGTAGAAATCGCACATACTGCGTCGTGCAATGCCGGGCGCTAACGGCTCAATATCACGCTGCGTCACTATGGCTACCCCAAAGCGGGGGGGGCATCGACGGGCTGGTATATTGGGCTACGTCCAAAATTTAACGCCAGCAGTACTGTCCGAAGGAGCGTATCTCCTGAGTCTGCAGATGCAGTAGTTTGATTTGAGCTCCATTACCCTACAATTAGAACACTGGCAACATTTGGGCGTTGAGCGGTCTTCCGTGTCGCTCGATCCGCTGGAACTTGGCAACCACAGTACCCAGGACCAGGGGTTGTCTTGTGGCTGTCTAAGGTCGGTGCTATACTGTGGCAACAGTTCTTTCCTACATGAATAGCTCTAAACTACATGTGGTATGGCTCATAAGATCATGCGGATCGTGGCACTGCTTTCGGCCACGTTAGAGCCGCTGTGCTCGAAGATTGGGACCTACCAACGTAAGACCTGTCCCTTTTGGTCCTCAGCTTCGTGAGGGACTACCAACTACAGCTGTGCTCTTGGCTGTCTCAAGCACTAGTGTCAATTGACAAAGATCCGTCAATAGCTATGCATCCCACAACACTAAAGCCACCCGTCTGCACGATCTGCAGCATCACCTGAAGACAATAATATAAAGGGTAGAGGTCACTGGCATTTTCCATGCTGTTACGTACGTAGCAGCGCTATAGCCAGAGGTGCGTCATAGAATAGGAATTAGCATAGTGGCAGTATCGAGACGACGACCGTTAAAGAATTTCGGAGGCTGGGTGCTGTACTGTAATCGCCTATTGCAGCACTCAGATTTTAACCTGGTGCCG \ No newline at end of file diff --git a/test_data/genome.fasta b/test_data/genome.fasta new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/test_data/genome.fasta @@ -0,0 +1 @@ + diff --git a/test_data/query.fa b/test_data/query.fa index cabc7c5..9d37096 100644 --- a/test_data/query.fa +++ b/test_data/query.fa @@ -1,2 +1,9 @@ >q1 -atCCTACACTGCATAAACTATTTTGcaccataaaaaaaagGGACatgtgtgGGTCTAAAATAATTTGCTGAGCAATTAATGATTTCTAAATGATGCTAAAGTGAACCATTGTAatgttatatgaaaaataaatacacaattaagATCAACACAGTGAAATAACATTGATTGGGTGATTTCAAATGGGGTCTATctgaataatgttttatttaacagtaatttttatttctatcaatttttagtaatatctacaaatattttgttttaggcTGCCAGAAGATCGGCGGTGCAAGGTCAGAGGTGAGATGTTAGGTGGTTCCACCAACTGCACGGAAGAGCTGCCCTCTGTCATTCAAAATTTGACAGGTACAAACAGactatattaaataagaaaaacaaactttttaaaggCTTGACCATTAGTGAATAGGTTATATGCTTATTATTTCCATTTAGCTTTTTGAGACTAGTATGATTAGACAAATCTGCTTAGttcattttcatataatattgaGGAACAAAATTTGTGAGATTTTGCTAAAATAACTTGCTTTGCTTGTTTATAGAGGCacagtaaatcttttttattattattataattttagattttttaatttttaaat +atCCTACACTGCATAAACTATTTTGcaccataaaaaaaagGGACatgtgtgGGTCTAAAATAATTTGCTGAGCAATTAAT +GATTTCTAAATGATGCTAAAGTGAACCATTGTAatgttatatgaaaaataaatacacaattaagATCAACACAGTGAAAT +AACATTGATTGGGTGATTTCAAATGGGGTCTATctgaataatgttttatttaacagtaatttttatttctatcaattttt +agtaatatctacaaatattttgttttaggcTGCCAGAAGATCGGCGGTGCAAGGTCAGAGGTGAGATGTTAGGTGGTTCC +ACCAACTGCACGGAAGAGCTGCCCTCTGTCATTCAAAATTTGACAGGTACAAACAGactatattaaataagaaaaacaaa +ctttttaaaggCTTGACCATTAGTGAATAGGTTATATGCTTATTATTTCCATTTAGCTTTTTGAGACTAGTATGATTAGA +CAAATCTGCTTAGttcattttcatataatattgaGGAACAAAATTTGTGAGATTTTGCTAAAATAACTTGCTTTGCTTGT +TTATAGAGGCacagtaaatcttttttattattattataattttagattttttaatttttaaat diff --git a/tests/htslib.rs b/tests/htslib.rs index d593a31..e55e058 100644 --- a/tests/htslib.rs +++ b/tests/htslib.rs @@ -1,11 +1,31 @@ #[cfg(test)] -#[cfg(feature = "htslib")] +// #[cfg(feature = "htslib")] mod tests { use minimap2::htslib::mapping_to_record; use minimap2::{Aligner, Preset}; + use std::collections::HashMap; + use std::rc::Rc; + use std::borrow::Cow; + use rust_htslib::bam::header::{Header, HeaderRecord}; - use rust_htslib::bam::{Format, Read, Reader, Writer}; + use rust_htslib::bam::{Format, Read, Reader, Writer, Record}; + + fn gdna_records() -> Vec { + let mut reader = Reader::from_path("test_data/gDNA_vs_genome.sam").unwrap(); + let records = reader.records().into_iter().filter_map(|r| Some(r.unwrap())).collect(); + // for rec in reader.records() { + // let rec = rec.unwrap(); + // if rec.qname() == q + // } + records + } + + #[test] + fn test_unspliced_alignments() { + let truth = gdna_records(); + println!("{truth:?}"); + } #[test] fn test_mappy_output() { From 270be634a8198ca1199a9fbb8ccb4c4cf3b0609f Mon Sep 17 00:00:00 2001 From: Eoghan Harrington Date: Thu, 12 Jan 2023 16:56:50 -0500 Subject: [PATCH 09/11] test(htslib): acceptance tests sam outout Added a synthetic dataset to test some of the different aligment types: forward, reverse, primary, secondary, supplementary, unmapped, spliced --- Cargo.toml | 1 + src/htslib.rs | 12 ++-- src/lib.rs | 1 - test_data/genome.fasta | 1 - tests/htslib.rs | 141 ++++++++++++++++++++++++++++++++++++++--- 5 files changed, 142 insertions(+), 14 deletions(-) delete mode 100644 test_data/genome.fasta diff --git a/Cargo.toml b/Cargo.toml index ada0f22..97a6c48 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -52,4 +52,5 @@ debug = true opt-level = 3 [dev-dependencies] +bitflags = "1.3.2" cargo-nextest = "0.9.48" diff --git a/src/htslib.rs b/src/htslib.rs index 6f76992..1096694 100644 --- a/src/htslib.rs +++ b/src/htslib.rs @@ -1,6 +1,6 @@ use rust_htslib::bam::record::{Cigar, CigarString}; use rust_htslib::bam::{Header, Record}; -use crate::{Alignment, Mapping}; +use crate::{Alignment, Mapping, Strand}; pub fn mapping_to_record( mapping: Option<&Mapping>, @@ -16,7 +16,7 @@ pub fn mapping_to_record( let qual = match qual { Some(q) => Vec::from(q), None => { - let q = vec![255; seq.len()]; // Vec::with_capacity(seq.len()); + let q = vec![255; seq.len()]; q } }; @@ -26,9 +26,14 @@ pub fn mapping_to_record( .and_then(|a| a.cigar) .map(|c| cigar_to_cigarstr(&c)); + rec.set(qname, cigar.as_ref(), seq, &qual[..]); match mapping { Some(m) => { - // TODO: set strand + println!("Strand {m:?}"); + if m.strand == Strand::Reverse { + println!("here"); + rec.set_reverse(); + } // TODO: set secondary/supplementary flags rec.set_pos(m.target_start as i64); rec.set_mapq(m.mapq as u8); @@ -47,7 +52,6 @@ pub fn mapping_to_record( rec.set_insert_size(-1); } }; - rec.set(qname, cigar.as_ref(), seq, &qual[..]); // TODO: set AUX flags for cs/md if available rec } diff --git a/src/lib.rs b/src/lib.rs index 2fa2edf..ad78621 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -682,7 +682,6 @@ impl Aligner { } else { None }; - mappings.push(Mapping { target_name: Some( std::ffi::CStr::from_ptr(contig) diff --git a/test_data/genome.fasta b/test_data/genome.fasta deleted file mode 100644 index 8b13789..0000000 --- a/test_data/genome.fasta +++ /dev/null @@ -1 +0,0 @@ - diff --git a/tests/htslib.rs b/tests/htslib.rs index e55e058..3a1a6bf 100644 --- a/tests/htslib.rs +++ b/tests/htslib.rs @@ -1,19 +1,63 @@ #[cfg(test)] -// #[cfg(feature = "htslib")] +#[cfg(feature = "htslib")] mod tests { use minimap2::htslib::mapping_to_record; use minimap2::{Aligner, Preset}; + use std::borrow::Cow; use std::collections::HashMap; use std::rc::Rc; - use std::borrow::Cow; + // #[derive(Debug, Clone, PartialEq, Eq)] + // pub enum AlignType { + // Primary, + // Secondary, + // Supplementary, + // } + + // impl From<&u16> for AlignType { + // fn from(flags: &u16) -> Self { + // if (*flags & 256 as u16) { + // Self::Secondary + // } else if (*flags & 2048 as u16) { + // Self::Supplementary + // } else { + // Self::Primary + // } + // } + // } + + // use bitflags::bitflags; + // bitflags! { + // struct SamFlags: u16 { + // const PAIRED = 0x1; + // const PROPER_PAIR = 0x2; + // const UNMAP = 0x4; + // const MUNMAP = 0x8; + // const REVERSE = 0x16; + // const MREVERSE = 0x32; + // const READ1 = 0x64; + // const READ2 = 0x128; + // const SECONDARY = 0x256; + // const QCFAIL = 0x512; + + // const DUP = 0x1024; + // const SUPPLEMENTARY = 0x2048; + // const PRIMARY = !(Self::SECONDARY | Self::SUPPLEMENTARY); + // } + // } use rust_htslib::bam::header::{Header, HeaderRecord}; - use rust_htslib::bam::{Format, Read, Reader, Writer, Record}; + use rust_htslib::bam::{Format, Read, Reader, Record, Writer}; - fn gdna_records() -> Vec { + fn gdna_records(query_name: &str) -> Vec { let mut reader = Reader::from_path("test_data/gDNA_vs_genome.sam").unwrap(); - let records = reader.records().into_iter().filter_map(|r| Some(r.unwrap())).collect(); + let records: Vec = reader + .records() + .into_iter() + .filter_map(|r| Some(r.unwrap())) + .filter(|r| String::from_utf8_lossy(r.qname()) == *query_name) + .collect(); + // for rec in reader.records() { // let rec = rec.unwrap(); // if rec.qname() == q @@ -21,12 +65,93 @@ mod tests { records } + fn get_query_sequence(records: &Vec) -> Vec { + let seq = records + .iter() + .find(|r| !(r.is_secondary() | r.is_supplementary())) + .map(|r| r.seq().as_bytes()) + .unwrap(); + seq + } + + fn get_test_case(query_name: &str) -> (Vec, Vec) { + let aligner = Aligner::builder() + .with_threads(1) + .with_index("test_data/genome.fa", None) + .unwrap() + .with_cigar(); + let mut header = Header::new(); + // TODO: would be nice to get this from the aligner index + header.push_record( + HeaderRecord::new(b"SQ") + .push_tag(b"SN", &String::from("chr1")) + .push_tag(b"LN", &1720), + ); + + // truth set from cli minimap + let expected_recs = gdna_records(query_name); + + // extract the query sequence from the primary record for this query + let query = get_query_sequence(&expected_recs); + let mappings = aligner.map(&query, true, true, None, None).unwrap(); + let observed_recs: Vec = mappings + .iter() + .filter_map(|m| { + let m = mapping_to_record( + Some(&m), + &query, + header.clone(), + None, + Some(query_name.as_bytes()), + ); + Some(m) + }) + .collect(); + (expected_recs, observed_recs) + } + + fn check_single_mapper(expected: &Vec, observed: &Vec) { + assert_eq!(expected.len(), observed.len()); + let e = expected.first().unwrap(); + let o = observed.first().unwrap(); + assert_eq!(o.seq().as_bytes(), e.seq().as_bytes()); + + assert_eq!(o.cigar(), e.cigar()); + assert_eq!(o.inner().core.pos, e.inner().core.pos); + assert_eq!(o.inner().core.mpos, e.inner().core.mpos); + assert_eq!(o.inner().core.mtid, e.inner().core.mtid); + assert_eq!(o.inner().core.tid, e.inner().core.tid); + // the bin attribute is associated with BAM format, so I don't think we need to set it + // assert_eq!(o.inner().core.bin, e.inner().core.bin); + assert_eq!(o.inner().core.qual, e.inner().core.qual); + assert_eq!(o.inner().core.l_extranul, e.inner().core.l_extranul); + // FIXFIX: renable this + //assert_eq!(o.inner().core.flag, e.inner().core.flag); + assert_eq!(o.inner().core.l_qname, e.inner().core.l_qname); + assert_eq!(o.inner().core.n_cigar, e.inner().core.n_cigar); + assert_eq!(o.inner().core.l_qseq, e.inner().core.l_qseq); + assert_eq!(o.inner().core.isize, e.inner().core.isize); + + } + #[test] - fn test_unspliced_alignments() { - let truth = gdna_records(); - println!("{truth:?}"); + fn test_perfect_fwd() { + let (expected, observed) = get_test_case(&"perfect_read.fwd".to_string()); + check_single_mapper(&expected, &observed); } + #[test] + fn test_perfect_rev() { + let (expected, observed) = get_test_case(&"perfect_read.rev".to_string()); + check_single_mapper(&expected, &observed); + let e: &Record = expected.first().unwrap(); + let o: &Record = observed.first().unwrap(); + assert_eq!(e.is_reverse(), true); + assert_eq!(o.is_reverse(), true); + } + + + #[test] fn test_mappy_output() { let seq = b"atCCTACACTGCATAAACTATTTTGcaccataaaaaaaagGGACatgtgtgGGTCTAAAATAATTTGCTGAGCAATTAATGATTTCTAAATGATGCTAAAGTGAACCATTGTAatgttatatgaaaaataaatacacaattaagATCAACACAGTGAAATAACATTGATTGGGTGATTTCAAATGGGGTCTATctgaataatgttttatttaacagtaatttttatttctatcaatttttagtaatatctacaaatattttgttttaggcTGCCAGAAGATCGGCGGTGCAAGGTCAGAGGTGAGATGTTAGGTGGTTCCACCAACTGCACGGAAGAGCTGCCCTCTGTCATTCAAAATTTGACAGGTACAAACAGactatattaaataagaaaaacaaactttttaaaggCTTGACCATTAGTGAATAGGTTATATGCTTATTATTTCCATTTAGCTTTTTGAGACTAGTATGATTAGACAAATCTGCTTAGttcattttcatataatattgaGGAACAAAATTTGTGAGATTTTGCTAAAATAACTTGCTTTGCTTGTTTATAGAGGCacagtaaatcttttttattattattataattttagattttttaatttttaaat"; From de4b68322ce1f34741b9320632b248fc83633b76 Mon Sep 17 00:00:00 2001 From: Eoghan Harrington Date: Fri, 13 Jan 2023 16:26:57 -0500 Subject: [PATCH 10/11] feat(htslib): add MMIndex stuct to get sequence lengths --- .gitignore | 1 + Cargo.toml | 3 -- src/htslib.rs | 77 +++++++++++++++++++++++++++++++++++++++++++++++++-- src/lib.rs | 1 + 4 files changed, 77 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index c53f643..9ef9162 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ test.mmi */target /*.mmi test_data/*.bam +test_data/*.mmi diff --git a/Cargo.toml b/Cargo.toml index 97a6c48..be7022c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -51,6 +51,3 @@ debug = true [profile.dev.package."*"] opt-level = 3 -[dev-dependencies] -bitflags = "1.3.2" -cargo-nextest = "0.9.48" diff --git a/src/htslib.rs b/src/htslib.rs index 1096694..1a4cf0d 100644 --- a/src/htslib.rs +++ b/src/htslib.rs @@ -1,6 +1,8 @@ +use crate::{Aligner, Mapping, Strand}; +use core::ffi; +use minimap2_sys::mm_idx_t; use rust_htslib::bam::record::{Cigar, CigarString}; use rust_htslib::bam::{Header, Record}; -use crate::{Alignment, Mapping, Strand}; pub fn mapping_to_record( mapping: Option<&Mapping>, @@ -56,7 +58,6 @@ pub fn mapping_to_record( rec } - fn cigar_to_cigarstr(cigar: &Vec<(u32, u8)>) -> CigarString { let op_vec: Vec = cigar .to_owned() @@ -77,4 +78,76 @@ fn cigar_to_cigarstr(cigar: &Vec<(u32, u8)>) -> CigarString { CigarString(op_vec) } +#[derive(Debug,PartialEq,Eq)] +pub struct SeqMetaData { + pub name: String, + pub length: u32, + pub is_alt: bool, +} + +#[derive(Debug)] +pub struct MMIndex { + pub inner: mm_idx_t, +} + +impl MMIndex { + pub fn n_seq(&self) -> u32 { + self.inner.n_seq + } + pub fn seqs(&self) -> Vec { + let mut seqs: Vec = Vec::with_capacity(self.n_seq() as usize); + for i in 0..self.n_seq() { + let _seq = unsafe { *(self.inner.seq).offset(i as isize) }; + let c_str = unsafe { ffi::CStr::from_ptr(_seq.name) }; + let rust_str = c_str.to_str().unwrap().to_string(); + seqs.push(SeqMetaData { + name: rust_str, + length: _seq.len, + is_alt: _seq.is_alt != 0, + }); + } + seqs + } +} + +impl From for MMIndex { + fn from(aligner: Aligner) -> Self { + MMIndex { + inner: aligner.idx.unwrap(), + } + } +} + +#[cfg(test)] +#[cfg(feature = "htslib")] +mod tests { + use super::*; + + #[test] + fn test_index() { + let aligner = Aligner::builder() + .with_threads(1) + .with_index("test_data/MT-human.fa", Some("test_data/MT-human.mmi")) + .unwrap(); + + let idx = MMIndex::from(aligner); + + let seqs = idx.seqs(); + + assert_eq!( + seqs, + vec![SeqMetaData { + name: "MT_human".to_string(), + length: 16569u32, + is_alt: false + }] + ); + + //for i in 0..idx.n_seq { + // unsafe { + // println!("{:?}", *(idx.seq).offset(i as isize)); + // } + //} + } +} diff --git a/src/lib.rs b/src/lib.rs index ad78621..e9b3a0f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1053,4 +1053,5 @@ b"GTTTATGTAGCTTATTCTATCCAAAGCAATGCACTGAAAATGTCTCGACGGGCCCACACGCCCCATAAACAAATAGGT assert_eq!(align.cs.is_some(), *cs); } } + } From dce1851af8d9092429792385f1dcabe247d3d55f Mon Sep 17 00:00:00 2001 From: Eoghan Harrington Date: Fri, 13 Jan 2023 20:49:47 -0500 Subject: [PATCH 11/11] feat(htslib): generate sam header from index --- src/htslib.rs | 42 +++++++++++++++++++++++++------------ tests/htslib.rs | 55 +++---------------------------------------------- 2 files changed, 32 insertions(+), 65 deletions(-) diff --git a/src/htslib.rs b/src/htslib.rs index 1a4cf0d..b041dfa 100644 --- a/src/htslib.rs +++ b/src/htslib.rs @@ -1,6 +1,7 @@ use crate::{Aligner, Mapping, Strand}; use core::ffi; use minimap2_sys::mm_idx_t; +use rust_htslib::bam::header::HeaderRecord; use rust_htslib::bam::record::{Cigar, CigarString}; use rust_htslib::bam::{Header, Record}; @@ -78,7 +79,7 @@ fn cigar_to_cigarstr(cigar: &Vec<(u32, u8)>) -> CigarString { CigarString(op_vec) } -#[derive(Debug,PartialEq,Eq)] +#[derive(Debug, PartialEq, Eq)] pub struct SeqMetaData { pub name: String, pub length: u32, @@ -109,10 +110,22 @@ impl MMIndex { } seqs } + + pub fn get_header(&self) -> Header { + let mut header = Header::new(); + for seq in self.seqs() { + header.push_record( + HeaderRecord::new(b"SQ") + .push_tag(b"SN", &seq.name) + .push_tag(b"LN", &seq.length), + ); + } + header + } } -impl From for MMIndex { - fn from(aligner: Aligner) -> Self { +impl From<&Aligner> for MMIndex { + fn from(aligner: &Aligner) -> Self { MMIndex { inner: aligner.idx.unwrap(), } @@ -124,17 +137,19 @@ impl From for MMIndex { mod tests { use super::*; - #[test] - fn test_index() { + fn get_aligner() -> Aligner { let aligner = Aligner::builder() .with_threads(1) .with_index("test_data/MT-human.fa", Some("test_data/MT-human.mmi")) .unwrap(); + aligner + } - let idx = MMIndex::from(aligner); - + #[test] + fn test_index() { + let aligner = get_aligner(); + let idx = MMIndex::from(&aligner); let seqs = idx.seqs(); - assert_eq!( seqs, vec![SeqMetaData { @@ -144,10 +159,11 @@ mod tests { }] ); - //for i in 0..idx.n_seq { - // unsafe { - // println!("{:?}", *(idx.seq).offset(i as isize)); - // } - //} + let header = idx.get_header(); + + let records = header.to_hashmap(); + let observed = records.get("SQ").unwrap().first().unwrap(); + assert_eq!(observed.get("SN").unwrap(), "MT_human"); + assert_eq!(observed.get("LN").unwrap(), "16569"); } } diff --git a/tests/htslib.rs b/tests/htslib.rs index 3a1a6bf..0b8da41 100644 --- a/tests/htslib.rs +++ b/tests/htslib.rs @@ -1,51 +1,8 @@ #[cfg(test)] #[cfg(feature = "htslib")] mod tests { - use minimap2::htslib::mapping_to_record; + use minimap2::htslib::{mapping_to_record, MMIndex}; use minimap2::{Aligner, Preset}; - use std::borrow::Cow; - use std::collections::HashMap; - use std::rc::Rc; - - // #[derive(Debug, Clone, PartialEq, Eq)] - // pub enum AlignType { - // Primary, - // Secondary, - // Supplementary, - // } - - // impl From<&u16> for AlignType { - // fn from(flags: &u16) -> Self { - // if (*flags & 256 as u16) { - // Self::Secondary - // } else if (*flags & 2048 as u16) { - // Self::Supplementary - // } else { - // Self::Primary - // } - // } - // } - - // use bitflags::bitflags; - // bitflags! { - // struct SamFlags: u16 { - // const PAIRED = 0x1; - // const PROPER_PAIR = 0x2; - // const UNMAP = 0x4; - // const MUNMAP = 0x8; - // const REVERSE = 0x16; - // const MREVERSE = 0x32; - // const READ1 = 0x64; - // const READ2 = 0x128; - // const SECONDARY = 0x256; - // const QCFAIL = 0x512; - - // const DUP = 0x1024; - // const SUPPLEMENTARY = 0x2048; - // const PRIMARY = !(Self::SECONDARY | Self::SUPPLEMENTARY); - // } - // } - use rust_htslib::bam::header::{Header, HeaderRecord}; use rust_htslib::bam::{Format, Read, Reader, Record, Writer}; @@ -80,14 +37,8 @@ mod tests { .with_index("test_data/genome.fa", None) .unwrap() .with_cigar(); - let mut header = Header::new(); - // TODO: would be nice to get this from the aligner index - header.push_record( - HeaderRecord::new(b"SQ") - .push_tag(b"SN", &String::from("chr1")) - .push_tag(b"LN", &1720), - ); - + let idx = MMIndex::from(&aligner); + let header = idx.get_header(); // truth set from cli minimap let expected_recs = gdna_records(query_name);