diff --git a/.gitignore b/.gitignore index 24a359c..9ef9162 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,6 @@ /Cargo.lock test.mmi */target -/*.mmi \ No newline at end of file +/*.mmi +test_data/*.bam +test_data/*.mmi diff --git a/Cargo.toml b/Cargo.toml index fe0398a..be7022c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,25 +9,36 @@ repository = "https://github.com/jguhlin/minimap2-rs" categories = ["science"] keywords = ["bioinformatics", "fasta", "alignment", "fastq"] exclude = [ - "**/*.fasta", - "libsfasta/test_data/", - "*.profdata", - "*.mmi", - "**/*.mmi", - "minimap2-sys/" + "**/*.fasta", + "libsfasta/test_data/", + "*.profdata", + "*.mmi", + "**/*.mmi", + "minimap2-sys/", ] [dependencies] libc = "0.2.134" -bytelines = "2.4.0" -simdutf8 = "0.1.4" -flate2 = { version = "1.0.17", features = ["zlib-ng"], default-features = false } + +simdutf8 = { version = "0.1.4", optional = true } + +flate2 = { version = "1.0.17", features = [ + "zlib" +], default-features = false, optional = true } # Dep for development #minimap2-sys = { path = "./minimap2-sys" } minimap2-sys = "0.1.7" -fffx = "0.1.1" + +fffx = {version = "0.1.2", optional = true } +#fffx = { path = "../fffx", optional=true } +rust-htslib = { version = "0.40.2", optional = true } + +[features] +default = ['map-file'] +htslib = ['rust-htslib'] +map-file = ['flate2', 'simdutf8', 'fffx'] # [profile.release] # opt-level = 3 @@ -39,3 +50,4 @@ debug = true [profile.dev.package."*"] opt-level = 3 + diff --git a/src/htslib.rs b/src/htslib.rs new file mode 100644 index 0000000..b041dfa --- /dev/null +++ b/src/htslib.rs @@ -0,0 +1,169 @@ +use crate::{Aligner, Mapping, Strand}; +use core::ffi; +use minimap2_sys::mm_idx_t; +use rust_htslib::bam::header::HeaderRecord; +use rust_htslib::bam::record::{Cigar, CigarString}; +use rust_htslib::bam::{Header, Record}; + +pub fn mapping_to_record( + mapping: Option<&Mapping>, + seq: &[u8], + header: Header, + qual: Option<&[u8]>, + query_name: Option<&[u8]>, +) -> Record { + let mut rec = Record::new(); + let qname = query_name.unwrap_or(b"query"); + // FIXFIX: there's probably a better way of setting a default value + // for the quality string + let qual = match qual { + Some(q) => Vec::from(q), + None => { + let q = vec![255; seq.len()]; + q + } + }; + + let cigar: Option = mapping + .and_then(|m| m.alignment.clone()) // FIXFIX: we probably don't need a clone here + .and_then(|a| a.cigar) + .map(|c| cigar_to_cigarstr(&c)); + + rec.set(qname, cigar.as_ref(), seq, &qual[..]); + match mapping { + Some(m) => { + println!("Strand {m:?}"); + if m.strand == Strand::Reverse { + println!("here"); + rec.set_reverse(); + } + // TODO: set secondary/supplementary flags + rec.set_pos(m.target_start as i64); + rec.set_mapq(m.mapq as u8); + rec.set_mpos(-1); + // TODO: set tid from sequences listed in header + rec.set_mtid(-1); + rec.set_insert_size(0); + } + None => { + rec.set_unmapped(); + rec.set_tid(-1); + rec.set_pos(-1); + rec.set_mapq(255); + rec.set_mpos(-1); + rec.set_mtid(-1); + rec.set_insert_size(-1); + } + }; + // TODO: set AUX flags for cs/md if available + rec +} + +fn cigar_to_cigarstr(cigar: &Vec<(u32, u8)>) -> CigarString { + let op_vec: Vec = cigar + .to_owned() + .iter() + .map(|(len, op)| match op { + 0 => Cigar::Match(*len), + 1 => Cigar::Ins(*len), + 2 => Cigar::Del(*len), + 3 => Cigar::RefSkip(*len), + 4 => Cigar::SoftClip(*len), + 5 => Cigar::HardClip(*len), + 6 => Cigar::Pad(*len), + 7 => Cigar::Equal(*len), + 8 => Cigar::Diff(*len), + _ => panic!("Unexpected cigar operation"), + }) + .collect(); + CigarString(op_vec) +} + +#[derive(Debug, PartialEq, Eq)] +pub struct SeqMetaData { + pub name: String, + pub length: u32, + pub is_alt: bool, +} + +#[derive(Debug)] +pub struct MMIndex { + pub inner: mm_idx_t, +} + +impl MMIndex { + pub fn n_seq(&self) -> u32 { + self.inner.n_seq + } + + pub fn seqs(&self) -> Vec { + let mut seqs: Vec = Vec::with_capacity(self.n_seq() as usize); + for i in 0..self.n_seq() { + let _seq = unsafe { *(self.inner.seq).offset(i as isize) }; + let c_str = unsafe { ffi::CStr::from_ptr(_seq.name) }; + let rust_str = c_str.to_str().unwrap().to_string(); + seqs.push(SeqMetaData { + name: rust_str, + length: _seq.len, + is_alt: _seq.is_alt != 0, + }); + } + seqs + } + + pub fn get_header(&self) -> Header { + let mut header = Header::new(); + for seq in self.seqs() { + header.push_record( + HeaderRecord::new(b"SQ") + .push_tag(b"SN", &seq.name) + .push_tag(b"LN", &seq.length), + ); + } + header + } +} + +impl From<&Aligner> for MMIndex { + fn from(aligner: &Aligner) -> Self { + MMIndex { + inner: aligner.idx.unwrap(), + } + } +} + +#[cfg(test)] +#[cfg(feature = "htslib")] +mod tests { + use super::*; + + fn get_aligner() -> Aligner { + let aligner = Aligner::builder() + .with_threads(1) + .with_index("test_data/MT-human.fa", Some("test_data/MT-human.mmi")) + .unwrap(); + aligner + } + + #[test] + fn test_index() { + let aligner = get_aligner(); + let idx = MMIndex::from(&aligner); + let seqs = idx.seqs(); + assert_eq!( + seqs, + vec![SeqMetaData { + name: "MT_human".to_string(), + length: 16569u32, + is_alt: false + }] + ); + + let header = idx.get_header(); + + let records = header.to_hashmap(); + let observed = records.get("SQ").unwrap().first().unwrap(); + assert_eq!(observed.get("SN").unwrap(), "MT_human"); + assert_eq!(observed.get("LN").unwrap(), "16569"); + } +} diff --git a/src/lib.rs b/src/lib.rs index 7f65e0e..e9b3a0f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,20 +1,27 @@ use std::cell::RefCell; -use std::io::BufRead; + use std::io::Read; use std::mem::MaybeUninit; use std::num::NonZeroI32; use std::path::Path; -use flate2::read::GzDecoder; use minimap2_sys::*; + +#[cfg(feature = "map-file")] +use flate2::read::GzDecoder; +#[cfg(feature = "map-file")] use simdutf8::basic::from_utf8; +#[cfg(feature = "htslib")] +pub mod htslib; + /// Alias for mm_mapop_t pub type MapOpt = mm_mapopt_t; /// Alias for mm_idxopt_t pub type IdxOpt = mm_idxopt_t; +#[cfg(feature = "map-file")] pub use fffx::{Fasta, Fastq, Sequence}; // TODO: Probably a better way to handle this... @@ -456,7 +463,7 @@ impl Aligner { let mut idx: MaybeUninit<*mut mm_idx_t> = MaybeUninit::uninit(); - let mut idx_reader = unsafe { idx_reader.assume_init() }; + let idx_reader = unsafe { idx_reader.assume_init() }; unsafe { // Just a test read? Just following: https://github.com/lh3/minimap2/blob/master/python/mappy.pyx#L147 @@ -480,8 +487,8 @@ impl Aligner { /// Map a single sequence to an index /// not implemented yet! - pub fn with_seq(mut self, seq: &[u8]) -> Result { - let seq = match std::ffi::CString::new(seq) { + pub fn with_seq(self, seq: &[u8]) -> Result { + let _seq = match std::ffi::CString::new(seq) { Ok(seq) => seq, Err(_) => return Err("Invalid sequence"), }; @@ -625,7 +632,7 @@ impl Aligner { let mut m_cs_string: libc::c_int = 0i32; let cs_str = if cs { - let cs_len = mm_gen_cs( + let _cs_len = mm_gen_cs( km, &mut cs_string, &mut m_cs_string, @@ -644,7 +651,7 @@ impl Aligner { }; let md_str = if md { - let md_len = mm_gen_MD( + let _md_len = mm_gen_MD( km, &mut cs_string, &mut m_cs_string, @@ -675,7 +682,6 @@ impl Aligner { } else { None }; - mappings.push(Mapping { target_name: Some( std::ffi::CStr::from_ptr(contig) @@ -716,6 +722,7 @@ impl Aligner { /// /// TODO: Remove cs and md and make them options on the struct /// + #[cfg(feature = "map-file")] pub fn map_file(&self, file: &str, cs: bool, md: bool) -> Result, &'static str> { // Make sure index is set if self.idx.is_none() { @@ -762,7 +769,7 @@ impl Aligner { } // If gzipped, open it with a reader... - let mut reader: Box = if compression_type == CompressionType::GZIP { + let reader: Box = if compression_type == CompressionType::GZIP { Box::new(GzDecoder::new(std::fs::File::open(file).unwrap())) } else { Box::new(std::fs::File::open(file).unwrap()) @@ -828,6 +835,7 @@ pub enum FileFormat { } #[allow(dead_code)] +#[cfg(feature = "map-file")] pub fn detect_file_format(buffer: &[u8]) -> Result { let buffer = from_utf8(&buffer).expect("Unable to parse file as UTF-8"); if buffer.starts_with(">") { @@ -909,7 +917,7 @@ mod tests { #[test] fn test_builder() { - let mut aligner = Aligner::builder().preset(Preset::MapOnt); + let _aligner = Aligner::builder().preset(Preset::MapOnt); } #[test] @@ -930,7 +938,7 @@ mod tests { let mappings = aligner.map("ACGGTAGAGAGGAAGAAGAAGGAATAGCGGACTTGTGTATTTTATCGTCATTCGTGGTTATCATATAGTTTATTGATTTGAAGACTACGTAAGTAATTTGAGGACTGATTAAAATTTTCTTTTTTAGCTTAGAGTCAATTAAAGAGGGCAAAATTTTCTCAAAAGACCATGGTGCATATGACGATAGCTTTAGTAGTATGGATTGGGCTCTTCTTTCATGGATGTTATTCAGAAGGAGTGATATATCGAGGTGTTTGAAACACCAGCGACACCAGAAGGCTGTGGATGTTAAATCGTAGAACCTATAGACGAGTTCTAAAATATACTTTGGGGTTTTCAGCGATGCAAAA".as_bytes(), false, false, None, None).unwrap(); println!("{:#?}", mappings); - let mut aligner = aligner.with_cigar(); + let aligner = aligner.with_cigar(); aligner .map( @@ -1045,4 +1053,5 @@ b"GTTTATGTAGCTTATTCTATCCAAAGCAATGCACTGAAAATGTCTCGACGGGCCCACACGCCCCATAAACAAATAGGT assert_eq!(align.cs.is_some(), *cs); } } + } diff --git a/test_data/cDNA_reads.fa b/test_data/cDNA_reads.fa new file mode 100644 index 0000000..d4b8f75 --- /dev/null +++ b/test_data/cDNA_reads.fa @@ -0,0 +1,4 @@ +>cdna.fwd el:Z:chr1:0:1720:1 +ATGCCTAGAAGTGTGTGATCGCATTGCTGCCAAGTATTCGATGCATCTGTTACCCAGAGGTGCTCCTCACTACAGCCAGGTCATGGACTTCTTCTCAGGAAAGTCTGGTCGCGGTCGTGGTTGTCCGAGAAACGCATCACCCACAGATAAAATCAGTTATTACAGTTGGACCTTCATGTCAAACCAGAGACCCGTATTTCAAATCGCACATACTGCGTCGTGCAATGCCGGGCGCTAACGGCTCAATATCACGCTGCGTCACTATGGCTACCCCAAAGCGGGGGGGGCATCGACGGGCTGCATAGTGGCAGTATCGAGACGACGACCGTTAAAGAATTTCGGAGGCTGGGTGCTGTACTGTAATCGCCTATTGCAGCACTCAGATTTTAACCTGGTGCCG +>cdna.rev el:Z:chr1:360:1720:0 +CGGCACCAGGTTAAAATCTGAGTGCTGCAATAGGCGATTACAGTACAGCACCCAGCCTCCGAAATTCTTTAACGGTCGTCGTCTCGATACTGCCACTATGGTTGGTAGGTCCCAATCTTCGAGCACAGCGGCTCTAACGTGGCCGAAAGCAGTGCCACGATCCGCATGATCTTATGAGCCATACCACATGTAGTTTAGAGCAGCCCGTCGATGCCCCCCCCGCTTTGGGGTAGCCATAGTGACGCAGCGTGATATTGAGCCGTTAGCGCCCGGCATTGCACGACGCAGTATGTGCGATTTTACTCCTGAAGCAGCCGTGAGTACTGTACTCTAATGCACACGTACTTGTTTATGCGAGAAGGTTACCGTCTAATTTTAATGGCCTAGGCGATGGCTATAA \ No newline at end of file diff --git a/test_data/cDNA_vs_genome.sam b/test_data/cDNA_vs_genome.sam new file mode 100644 index 0000000..0a0c365 --- /dev/null +++ b/test_data/cDNA_vs_genome.sam @@ -0,0 +1,6 @@ +@HD VN:1.6 SO:queryname +@SQ SN:chr1 LN:1720 +@PG ID:minimap2 PN:minimap2 VN:2.24-r1122 CL:minimap2 -ay -x splice --MD --cs test_data/genome.fa test_data/cDNA_reads.fa +@PG ID:samtools PN:samtools PP:minimap2 VN:1.16.1 CL:samtools sort -n -o test_data/cDNA_vs_genome.sam +cdna.fwd 0 chr1 1 60 100M620N100M80N100M620N100M * 0 0 ATGCCTAGAAGTGTGTGATCGCATTGCTGCCAAGTATTCGATGCATCTGTTACCCAGAGGTGCTCCTCACTACAGCCAGGTCATGGACTTCTTCTCAGGAAAGTCTGGTCGCGGTCGTGGTTGTCCGAGAAACGCATCACCCACAGATAAAATCAGTTATTACAGTTGGACCTTCATGTCAAACCAGAGACCCGTATTTCAAATCGCACATACTGCGTCGTGCAATGCCGGGCGCTAACGGCTCAATATCACGCTGCGTCACTATGGCTACCCCAAAGCGGGGGGGGCATCGACGGGCTGCATAGTGGCAGTATCGAGACGACGACCGTTAAAGAATTTCGGAGGCTGGGTGCTGTACTGTAATCGCCTATTGCAGCACTCAGATTTTAACCTGGTGCCG * NM:i:0 ms:i:400 AS:i:304 nn:i:0 ts:A:+ tp:A:P cm:i:124 s1:i:372 s2:i:87 de:f:0 MD:Z:400 rl:i:0 el:Z:chr1:0:1720:1 +cdna.rev 16 chr1 361 60 100M440N100M260N100M260N100M * 0 0 TTATAGCCATCGCCTAGGCCATTAAAATTAGACGGTAACCTTCTCGCATAAACAAGTACGTGTGCATTAGAGTACAGTACTCACGGCTGCTTCAGGAGTAAAATCGCACATACTGCGTCGTGCAATGCCGGGCGCTAACGGCTCAATATCACGCTGCGTCACTATGGCTACCCCAAAGCGGGGGGGGCATCGACGGGCTGCTCTAAACTACATGTGGTATGGCTCATAAGATCATGCGGATCGTGGCACTGCTTTCGGCCACGTTAGAGCCGCTGTGCTCGAAGATTGGGACCTACCAACCATAGTGGCAGTATCGAGACGACGACCGTTAAAGAATTTCGGAGGCTGGGTGCTGTACTGTAATCGCCTATTGCAGCACTCAGATTTTAACCTGGTGCCG * NM:i:0 ms:i:400 AS:i:304 nn:i:0 ts:A:- tp:A:P cm:i:120 s1:i:369 s2:i:0 de:f:0 MD:Z:400 rl:i:0 el:Z:chr1:360:1720:0 diff --git a/test_data/gDNA_reads.fa b/test_data/gDNA_reads.fa new file mode 100644 index 0000000..3498263 --- /dev/null +++ b/test_data/gDNA_reads.fa @@ -0,0 +1,12 @@ +>perfect_read.fwd el:Z:chr1:180:280:1 em:i:0 +TACGCCACACGGGCTACACTCTCGCCTTCTCGTCGCAACTACGAGCTGGACTATCGGCCGAGAGGATCTAACACGAGAAGTACTTGCCGGCAATCCCTAA +>perfect_read.rev el:Z:chr1:180:280:0 em:i:0 +TTAGGGATTGCCGGCAAGTACTTCTCGTGTTAGATCCTCTCGGCCGATAGTCCAGCTCGTAGTTGCGACGAGAAGGCGAGAGTGTAGCCCGTGTGGCGTA +>imperfect_read.fwd el:Z:chr1:180:280:1 em:i:5 +TACGCCACACAGGCTACACTCTCGCCTTCTCGTCGCAACTACGCGCTGGACTATCCGCCGAGAGGATCTAACACGTGAAGTACTTGCCGGCATTCCCTAA +>unmappable_read el:Z:chr1:180:280:1 em:i:99 +GTTTGTGAGGATCGCGGCGATGGTTTAGGGACGATAAGTATTATCGGAAGGGGCATATTCGTGTCGGAAGGTGAAGATGCCGAAAAAGCAATCGAAGCCG +>perfect_inv_duplicate el:Z:chr1:540:640:1 em:i:0 +GAAATACGGGTCTCTGGTTTGACATAAAGGTCCAACTGTAATAACTGATTTTATCTGTGGGTGATGCGTTTCTCGGACAACCACGACCGCGCCCAGACTT +>split_read +ATGCCTAGAAGTGTGTGATCGCATTGCTGCCAAGTATTCGATGCATCTGTTACCCAGAGGTGCTCCTCACTACAGCCAGGTCATGGACTTCTTCTCAGGACTACCCACCTGTTTCATGATCCCCCCTTTGTGAACAATAAACTTAGTAAACATTTTTACGATTAAATGTTTAACTCCTAC \ No newline at end of file diff --git a/test_data/gDNA_vs_genome.sam b/test_data/gDNA_vs_genome.sam new file mode 100644 index 0000000..5f351ef --- /dev/null +++ b/test_data/gDNA_vs_genome.sam @@ -0,0 +1,12 @@ +@HD VN:1.6 SO:queryname +@SQ SN:chr1 LN:1720 +@PG ID:minimap2 PN:minimap2 VN:2.24-r1122 CL:minimap2 -ay --MD --cs test_data/genome.fa test_data/gDNA_reads.fa +@PG ID:samtools PN:samtools PP:minimap2 VN:1.16.1 CL:samtools sort -n -o test_data/gDNA_vs_genome.sam +imperfect_read.fwd 0 chr1 181 27 100M * 0 0 TACGCCACACAGGCTACACTCTCGCCTTCTCGTCGCAACTACGCGCTGGACTATCCGCCGAGAGGATCTAACACGTGAAGTACTTGCCGGCATTCCCTAA * NM:i:5 ms:i:170 AS:i:170 nn:i:0 tp:A:P cm:i:6 s1:i:56 s2:i:0 de:f:0.05 MD:Z:10G32A11G19A16A7 rl:i:0 el:Z:chr1:180:280:1 em:i:5 +perfect_inv_duplicate 0 chr1 541 36 100M * 0 0 GAAATACGGGTCTCTGGTTTGACATAAAGGTCCAACTGTAATAACTGATTTTATCTGTGGGTGATGCGTTTCTCGGACAACCACGACCGCGCCCAGACTT * NM:i:0 ms:i:200 AS:i:200 nn:i:0 tp:A:P cm:i:15 s1:i:85 s2:i:71 de:f:0 MD:Z:100 rl:i:0 el:Z:chr1:540:640:1 em:i:0 +perfect_inv_duplicate 272 chr1 721 0 100M * 0 0 * * NM:i:2 ms:i:188 AS:i:188 nn:i:0 tp:A:S cm:i:11 s1:i:71 de:f:0.02 MD:Z:8T65C25 rl:i:0 el:Z:chr1:540:640:1 em:i:0 +perfect_read.fwd 0 chr1 181 60 100M * 0 0 TACGCCACACGGGCTACACTCTCGCCTTCTCGTCGCAACTACGAGCTGGACTATCGGCCGAGAGGATCTAACACGAGAAGTACTTGCCGGCAATCCCTAA * NM:i:0 ms:i:200 AS:i:200 nn:i:0 tp:A:P cm:i:17 s1:i:84 s2:i:0 de:f:0 MD:Z:100 rl:i:0 el:Z:chr1:180:280:1 em:i:0 +perfect_read.rev 16 chr1 181 60 100M * 0 0 TACGCCACACGGGCTACACTCTCGCCTTCTCGTCGCAACTACGAGCTGGACTATCGGCCGAGAGGATCTAACACGAGAAGTACTTGCCGGCAATCCCTAA * NM:i:0 ms:i:200 AS:i:200 nn:i:0 tp:A:P cm:i:17 s1:i:84 s2:i:0 de:f:0 MD:Z:100 rl:i:0 el:Z:chr1:180:280:0 em:i:0 +split_read 0 chr1 1 60 100M80S * 0 0 ATGCCTAGAAGTGTGTGATCGCATTGCTGCCAAGTATTCGATGCATCTGTTACCCAGAGGTGCTCCTCACTACAGCCAGGTCATGGACTTCTTCTCAGGACTACCCACCTGTTTCATGATCCCCCCTTTGTGAACAATAAACTTAGTAAACATTTTTACGATTAAATGTTTAACTCCTAC * NM:i:0 ms:i:200 AS:i:200 nn:i:0 tp:A:P cm:i:13 s1:i:90 s2:i:0 de:f:0 SA:Z:chr1,821,-,80M100S,52,0; MD:Z:100 rl:i:0 +split_read 2064 chr1 821 52 80M100H * 0 0 GTAGGAGTTAAACATTTAATCGTAAAAATGTTTACTAAGTTTATTGTTCACAAAGGGGGGATCATGAAACAGGTGGGTAG * NM:i:0 ms:i:160 AS:i:160 nn:i:0 tp:A:P cm:i:11 s1:i:70 s2:i:0 de:f:0 SA:Z:chr1,1,+,100M80S,60,0; MD:Z:80 rl:i:0 +unmappable_read 4 * 0 0 * * 0 0 GTTTGTGAGGATCGCGGCGATGGTTTAGGGACGATAAGTATTATCGGAAGGGGCATATTCGTGTCGGAAGGTGAAGATGCCGAAAAAGCAATCGAAGCCG * rl:i:0 el:Z:chr1:180:280:1 em:i:99 diff --git a/test_data/genome.fa b/test_data/genome.fa new file mode 100644 index 0000000..27c6b80 --- /dev/null +++ b/test_data/genome.fa @@ -0,0 +1,2 @@ +>chr1 +ATGCCTAGAAGTGTGTGATCGCATTGCTGCCAAGTATTCGATGCATCTGTTACCCAGAGGTGCTCCTCACTACAGCCAGGTCATGGACTTCTTCTCAGGAGTAATTTGCGCTGCGGAAAACGGCTGATGGGGAGTCGACCTACCTTAATATCTCCGAGGTTGCCCTCACAAATGGCGTAGTACGCCACACGGGCTACACTCTCGCCTTCTCGTCGCAACTACGAGCTGGACTATCGGCCGAGAGGATCTAACACGAGAAGTACTTGCCGGCAATCCCTAAGTACCTAGGCTCTCGGTCCACTATGACGCAGGACAGGGTTCAGTTAAAAGGCCTCTTCATGCGGTCTTAAGACCTTATAGTTATAGCCATCGCCTAGGCCATTAAAATTAGACGGTAACCTTCTCGCATAAACAAGTACGTGTGCATTAGAGTACAGTACTCACGGCTGCTTCAGGAGTAGTACATAGATGTCCGTTAGACTCTCACGTGTTTGCCACTCAGTTCCGACATGTTTATACGCAATCCTATTGAACGACTAGGAAATACGGGTCTCTGGTTTGACATAAAGGTCCAACTGTAATAACTGATTTTATCTGTGGGTGATGCGTTTCTCGGACAACCACGACCGCGCCCAGACTTGTACTCCGTTCGATGGTGACAGAAGGGTTATGGAGGGAGTGATTATCGCTTTTTACGGAAGGAAAGAGCCGGGTTTGTAGAAGTCTGGTCGCGGTCGTGGTTGTCCGAGAAACGCATCACCCACAGATAAAATCAGTTATTACAGTTGGACCTTCATGTCAAACCAGAGACCCGTATTTCGTAGGAGTTAAACATTTAATCGTAAAAATGTTTACTAAGTTTATTGTTCACAAAGGGGGGATCATGAAACAGGTGGGTAGAAATCGCACATACTGCGTCGTGCAATGCCGGGCGCTAACGGCTCAATATCACGCTGCGTCACTATGGCTACCCCAAAGCGGGGGGGGCATCGACGGGCTGGTATATTGGGCTACGTCCAAAATTTAACGCCAGCAGTACTGTCCGAAGGAGCGTATCTCCTGAGTCTGCAGATGCAGTAGTTTGATTTGAGCTCCATTACCCTACAATTAGAACACTGGCAACATTTGGGCGTTGAGCGGTCTTCCGTGTCGCTCGATCCGCTGGAACTTGGCAACCACAGTACCCAGGACCAGGGGTTGTCTTGTGGCTGTCTAAGGTCGGTGCTATACTGTGGCAACAGTTCTTTCCTACATGAATAGCTCTAAACTACATGTGGTATGGCTCATAAGATCATGCGGATCGTGGCACTGCTTTCGGCCACGTTAGAGCCGCTGTGCTCGAAGATTGGGACCTACCAACGTAAGACCTGTCCCTTTTGGTCCTCAGCTTCGTGAGGGACTACCAACTACAGCTGTGCTCTTGGCTGTCTCAAGCACTAGTGTCAATTGACAAAGATCCGTCAATAGCTATGCATCCCACAACACTAAAGCCACCCGTCTGCACGATCTGCAGCATCACCTGAAGACAATAATATAAAGGGTAGAGGTCACTGGCATTTTCCATGCTGTTACGTACGTAGCAGCGCTATAGCCAGAGGTGCGTCATAGAATAGGAATTAGCATAGTGGCAGTATCGAGACGACGACCGTTAAAGAATTTCGGAGGCTGGGTGCTGTACTGTAATCGCCTATTGCAGCACTCAGATTTTAACCTGGTGCCG \ No newline at end of file diff --git a/test_data/query.fa b/test_data/query.fa index cabc7c5..9d37096 100644 --- a/test_data/query.fa +++ b/test_data/query.fa @@ -1,2 +1,9 @@ >q1 -atCCTACACTGCATAAACTATTTTGcaccataaaaaaaagGGACatgtgtgGGTCTAAAATAATTTGCTGAGCAATTAATGATTTCTAAATGATGCTAAAGTGAACCATTGTAatgttatatgaaaaataaatacacaattaagATCAACACAGTGAAATAACATTGATTGGGTGATTTCAAATGGGGTCTATctgaataatgttttatttaacagtaatttttatttctatcaatttttagtaatatctacaaatattttgttttaggcTGCCAGAAGATCGGCGGTGCAAGGTCAGAGGTGAGATGTTAGGTGGTTCCACCAACTGCACGGAAGAGCTGCCCTCTGTCATTCAAAATTTGACAGGTACAAACAGactatattaaataagaaaaacaaactttttaaaggCTTGACCATTAGTGAATAGGTTATATGCTTATTATTTCCATTTAGCTTTTTGAGACTAGTATGATTAGACAAATCTGCTTAGttcattttcatataatattgaGGAACAAAATTTGTGAGATTTTGCTAAAATAACTTGCTTTGCTTGTTTATAGAGGCacagtaaatcttttttattattattataattttagattttttaatttttaaat +atCCTACACTGCATAAACTATTTTGcaccataaaaaaaagGGACatgtgtgGGTCTAAAATAATTTGCTGAGCAATTAAT +GATTTCTAAATGATGCTAAAGTGAACCATTGTAatgttatatgaaaaataaatacacaattaagATCAACACAGTGAAAT +AACATTGATTGGGTGATTTCAAATGGGGTCTATctgaataatgttttatttaacagtaatttttatttctatcaattttt +agtaatatctacaaatattttgttttaggcTGCCAGAAGATCGGCGGTGCAAGGTCAGAGGTGAGATGTTAGGTGGTTCC +ACCAACTGCACGGAAGAGCTGCCCTCTGTCATTCAAAATTTGACAGGTACAAACAGactatattaaataagaaaaacaaa +ctttttaaaggCTTGACCATTAGTGAATAGGTTATATGCTTATTATTTCCATTTAGCTTTTTGAGACTAGTATGATTAGA +CAAATCTGCTTAGttcattttcatataatattgaGGAACAAAATTTGTGAGATTTTGCTAAAATAACTTGCTTTGCTTGT +TTATAGAGGCacagtaaatcttttttattattattataattttagattttttaatttttaaat diff --git a/test_data/query_vs_test_data.sam b/test_data/query_vs_test_data.sam new file mode 100644 index 0000000..1aaf50e --- /dev/null +++ b/test_data/query_vs_test_data.sam @@ -0,0 +1,3 @@ +@SQ SN:contig4 LN:3360 +@PG ID:minimap2 PN:minimap2 VN:2.24-r1122 CL:minimap2 -ax map-ont -cs --MD test_data.fasta query.fa +q1 0 contig4 1529 60 39M2I582M * 0 0 atCCTACACTGCATAAACTATTTTGcaccataaaaaaaagGGACatgtgtgGGTCTAAAATAATTTGCTGAGCAATTAATGATTTCTAAATGATGCTAAAGTGAACCATTGTAatgttatatgaaaaataaatacacaattaagATCAACACAGTGAAATAACATTGATTGGGTGATTTCAAATGGGGTCTATctgaataatgttttatttaacagtaatttttatttctatcaatttttagtaatatctacaaatattttgttttaggcTGCCAGAAGATCGGCGGTGCAAGGTCAGAGGTGAGATGTTAGGTGGTTCCACCAACTGCACGGAAGAGCTGCCCTCTGTCATTCAAAATTTGACAGGTACAAACAGactatattaaataagaaaaacaaactttttaaaggCTTGACCATTAGTGAATAGGTTATATGCTTATTATTTCCATTTAGCTTTTTGAGACTAGTATGATTAGACAAATCTGCTTAGttcattttcatataatattgaGGAACAAAATTTGTGAGATTTTGCTAAAATAACTTGCTTTGCTTGTTTATAGAGGCacagtaaatcttttttattattattataattttagattttttaatttttaaat * NM:i:4 ms:i:1223 AS:i:1222 nn:i:0 tp:A:P cm:i:114 s1:i:590 s2:i:0 de:f:0.0048 rl:i:0 diff --git a/tests/htslib.rs b/tests/htslib.rs new file mode 100644 index 0000000..0b8da41 --- /dev/null +++ b/tests/htslib.rs @@ -0,0 +1,153 @@ +#[cfg(test)] +#[cfg(feature = "htslib")] +mod tests { + use minimap2::htslib::{mapping_to_record, MMIndex}; + use minimap2::{Aligner, Preset}; + use rust_htslib::bam::header::{Header, HeaderRecord}; + use rust_htslib::bam::{Format, Read, Reader, Record, Writer}; + + fn gdna_records(query_name: &str) -> Vec { + let mut reader = Reader::from_path("test_data/gDNA_vs_genome.sam").unwrap(); + let records: Vec = reader + .records() + .into_iter() + .filter_map(|r| Some(r.unwrap())) + .filter(|r| String::from_utf8_lossy(r.qname()) == *query_name) + .collect(); + + // for rec in reader.records() { + // let rec = rec.unwrap(); + // if rec.qname() == q + // } + records + } + + fn get_query_sequence(records: &Vec) -> Vec { + let seq = records + .iter() + .find(|r| !(r.is_secondary() | r.is_supplementary())) + .map(|r| r.seq().as_bytes()) + .unwrap(); + seq + } + + fn get_test_case(query_name: &str) -> (Vec, Vec) { + let aligner = Aligner::builder() + .with_threads(1) + .with_index("test_data/genome.fa", None) + .unwrap() + .with_cigar(); + let idx = MMIndex::from(&aligner); + let header = idx.get_header(); + // truth set from cli minimap + let expected_recs = gdna_records(query_name); + + // extract the query sequence from the primary record for this query + let query = get_query_sequence(&expected_recs); + let mappings = aligner.map(&query, true, true, None, None).unwrap(); + let observed_recs: Vec = mappings + .iter() + .filter_map(|m| { + let m = mapping_to_record( + Some(&m), + &query, + header.clone(), + None, + Some(query_name.as_bytes()), + ); + Some(m) + }) + .collect(); + (expected_recs, observed_recs) + } + + fn check_single_mapper(expected: &Vec, observed: &Vec) { + assert_eq!(expected.len(), observed.len()); + let e = expected.first().unwrap(); + let o = observed.first().unwrap(); + assert_eq!(o.seq().as_bytes(), e.seq().as_bytes()); + + assert_eq!(o.cigar(), e.cigar()); + assert_eq!(o.inner().core.pos, e.inner().core.pos); + assert_eq!(o.inner().core.mpos, e.inner().core.mpos); + assert_eq!(o.inner().core.mtid, e.inner().core.mtid); + assert_eq!(o.inner().core.tid, e.inner().core.tid); + // the bin attribute is associated with BAM format, so I don't think we need to set it + // assert_eq!(o.inner().core.bin, e.inner().core.bin); + assert_eq!(o.inner().core.qual, e.inner().core.qual); + assert_eq!(o.inner().core.l_extranul, e.inner().core.l_extranul); + // FIXFIX: renable this + //assert_eq!(o.inner().core.flag, e.inner().core.flag); + assert_eq!(o.inner().core.l_qname, e.inner().core.l_qname); + assert_eq!(o.inner().core.n_cigar, e.inner().core.n_cigar); + assert_eq!(o.inner().core.l_qseq, e.inner().core.l_qseq); + assert_eq!(o.inner().core.isize, e.inner().core.isize); + + } + + #[test] + fn test_perfect_fwd() { + let (expected, observed) = get_test_case(&"perfect_read.fwd".to_string()); + check_single_mapper(&expected, &observed); + } + + #[test] + fn test_perfect_rev() { + let (expected, observed) = get_test_case(&"perfect_read.rev".to_string()); + check_single_mapper(&expected, &observed); + let e: &Record = expected.first().unwrap(); + let o: &Record = observed.first().unwrap(); + assert_eq!(e.is_reverse(), true); + assert_eq!(o.is_reverse(), true); + } + + + + #[test] + fn test_mappy_output() { + let seq = b"atCCTACACTGCATAAACTATTTTGcaccataaaaaaaagGGACatgtgtgGGTCTAAAATAATTTGCTGAGCAATTAATGATTTCTAAATGATGCTAAAGTGAACCATTGTAatgttatatgaaaaataaatacacaattaagATCAACACAGTGAAATAACATTGATTGGGTGATTTCAAATGGGGTCTATctgaataatgttttatttaacagtaatttttatttctatcaatttttagtaatatctacaaatattttgttttaggcTGCCAGAAGATCGGCGGTGCAAGGTCAGAGGTGAGATGTTAGGTGGTTCCACCAACTGCACGGAAGAGCTGCCCTCTGTCATTCAAAATTTGACAGGTACAAACAGactatattaaataagaaaaacaaactttttaaaggCTTGACCATTAGTGAATAGGTTATATGCTTATTATTTCCATTTAGCTTTTTGAGACTAGTATGATTAGACAAATCTGCTTAGttcattttcatataatattgaGGAACAAAATTTGTGAGATTTTGCTAAAATAACTTGCTTTGCTTGTTTATAGAGGCacagtaaatcttttttattattattataattttagattttttaatttttaaat"; + + let aligner = Aligner::builder() + .preset(Preset::MapOnt) + .with_threads(1) + .with_index("test_data/test_data.fasta", None) + .unwrap() + .with_cigar(); + + let mut mappings = aligner.map(seq, true, true, None, None).unwrap(); + assert_eq!(mappings.len(), 1); + + let mut header = Header::new(); + // TODO: would be nice to get this from the aligner index + header.push_record( + HeaderRecord::new(b"SQ") + .push_tag(b"SN", &String::from("contig4")) + .push_tag(b"LN", &3360), + ); + + let observed = mappings.pop().unwrap(); + let o = mapping_to_record(Some(&observed), seq, header.clone(), None, Some(b"q1")); + + let mut sam_reader = Reader::from_path("test_data/query_vs_test_data.sam").unwrap(); + let e = sam_reader.records().next().unwrap().unwrap(); + + assert_eq!(o.cigar(), e.cigar()); + assert_eq!(o.inner().core.pos, e.inner().core.pos); + assert_eq!(o.inner().core.mpos, e.inner().core.mpos); + assert_eq!(o.inner().core.mtid, e.inner().core.mtid); + assert_eq!(o.inner().core.tid, e.inner().core.tid); + // the bin attribute is associated with BAM format, so I don't think we need to set it + // assert_eq!(o.inner().core.bin, e.inner().core.bin); + assert_eq!(o.inner().core.qual, e.inner().core.qual); + assert_eq!(o.inner().core.l_extranul, e.inner().core.l_extranul); + assert_eq!(o.inner().core.flag, e.inner().core.flag); + assert_eq!(o.inner().core.l_qname, e.inner().core.l_qname); + assert_eq!(o.inner().core.n_cigar, e.inner().core.n_cigar); + assert_eq!(o.inner().core.l_qseq, e.inner().core.l_qseq); + assert_eq!(o.inner().core.isize, e.inner().core.isize); + + let mut writer = + Writer::from_path("test_data/query_vs_target.bam", &header, Format::Bam).unwrap(); + writer.write(&o).unwrap(); + } +}