diff --git a/src/core/src/encodings.rs b/src/core/src/encodings.rs index 5214bb321d..d87c220aa0 100644 --- a/src/core/src/encodings.rs +++ b/src/core/src/encodings.rs @@ -96,216 +96,6 @@ impl TryFrom<&str> for HashFunctions { } } -#[derive(Debug)] -pub struct ReadingFrame { - fw: Vec, // Forward frame - rc: Option>, // Reverse complement (optional, not used for protein input) - hash_function: HashFunctions, -} - -impl std::fmt::Display for ReadingFrame { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let fw_str = String::from_utf8(self.fw.clone()).expect("Invalid UTF-8 sequence in fw"); - if let Some(rc) = &self.rc { - let rc_str = String::from_utf8(rc.clone()).expect("Invalid UTF-8 sequence in rc"); - write!(f, "Forward: {}, Reverse Complement: {}", fw_str, rc_str) - } else { - write!(f, "Forward: {}, Reverse Complement: None", fw_str) - } - } -} - -impl ReadingFrame { - /// Create a k-mer iterator for this reading frame - pub fn kmer_iter(&self, ksize: usize, seed: u64, force: bool) -> KmerIterator { - KmerIterator::new( - &self.fw, - self.rc.as_deref(), - ksize, - seed, - force, - self.hash_function.clone(), - ) - } -} - -#[derive(Debug)] -pub struct ReadingFrames(Vec); - -impl std::fmt::Display for ReadingFrames { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - for (index, frame) in self.0.iter().enumerate() { - writeln!(f, "Frame {}: {}", index + 1, frame)?; - } - Ok(()) - } -} - -impl ReadingFrames { - /// Create ReadingFrames based on the input sequence, moltype, and is_protein flag - pub fn new(sequence: &[u8], is_protein: bool, hash_function: &HashFunctions) -> Self { - if is_protein { - // for protein input, return one forward frame - let frames = vec![ReadingFrame { - fw: sequence.to_vec(), - rc: None, - hash_function: hash_function.clone(), - }]; - Self(frames) - } else if hash_function.dna() { - // DNA: return forward + rc frames - let dna_rc = revcomp(sequence); - let frames = vec![ReadingFrame { - fw: sequence.to_vec(), - rc: Some(dna_rc), - hash_function: hash_function.clone(), - }]; - Self(frames) - } else if hash_function.protein() || hash_function.dayhoff() || hash_function.hp() { - // translation: build 6 frames - let dna_rc = revcomp(sequence); // Compute reverse complement for translation - let dayhoff = hash_function.dayhoff(); - let hp = hash_function.hp(); - Self::translate_frames(sequence, &dna_rc, dayhoff, hp, hash_function) - } else if hash_function.skipm1n3() || hash_function.skipm2n3() { - // skipmers: build 6 frames, following skip pattern - let (m, n) = if hash_function.skipm1n3() { - (1, 3) - } else { - (2, 3) - }; - Self::skipmer_frames(sequence, n, m, hash_function) - } else { - panic!("Unsupported moltype: {}", hash_function); - } - } - - /// Generate translated frames - fn translate_frames( - sequence: &[u8], - dna_rc: &[u8], - dayhoff: bool, - hp: bool, - hash_function: &HashFunctions, - ) -> Self { - let frames: Vec = (0..3) - .map(|frame_number| ReadingFrame { - fw: translated_frame(sequence, frame_number, dayhoff, hp), - rc: Some(translated_frame(dna_rc, frame_number, dayhoff, hp)), - hash_function: hash_function.clone(), - }) - .collect(); - Self(frames) - } - - /// Generate skipmer frames - fn skipmer_frames(sequence: &[u8], n: usize, m: usize, hash_function: &HashFunctions) -> Self { - let frames: Vec = (0..3) - .map(|frame_number| { - let fw = skipmer_frame(sequence, frame_number, n, m); - ReadingFrame { - fw: fw.clone(), - rc: Some(revcomp(&fw)), - hash_function: hash_function.clone(), - } - }) - .collect(); - Self(frames) - } - - /// Access the frames - pub fn frames(&self) -> &Vec { - &self.0 - } -} - -pub struct KmerIterator<'a> { - fw: &'a [u8], - rc: Option<&'a [u8]>, - ksize: usize, - index: usize, - len: usize, - seed: u64, - force: bool, - hash_function: HashFunctions, -} - -impl<'a> KmerIterator<'a> { - pub fn new( - fw: &'a [u8], - rc: Option<&'a [u8]>, - ksize: usize, - seed: u64, - force: bool, - hash_function: HashFunctions, - ) -> Self { - Self { - fw, - rc, - ksize, - index: 0, - len: fw.len(), - seed, - force, - hash_function: hash_function, - } - } -} - -impl<'a> Iterator for KmerIterator<'a> { - type Item = Result; - - fn next(&mut self) -> Option { - if self.index + self.ksize > self.len { - return None; // End of iteration - } - - // Forward k-mer - let kmer = &self.fw[self.index..self.index + self.ksize]; - - // Validate the k-mer only if the hash function is DNA-based - if self.hash_function.dna() { - for j in self.index..self.index + self.ksize { - if !VALID[self.fw[j] as usize] { - if !self.force { - return Some(Err(Error::InvalidDNA { - message: String::from_utf8(kmer.to_vec()).unwrap(), - })); - } else { - self.index += 1; - return Some(Ok(0)); // Skip invalid k-mer - } - } - } - } - // keeping note from prior implementation :) - // ... and then while moving the k-mer window forward for the sequence - // we move another window backwards for the RC. - // For a ksize = 3, and a sequence AGTCGT (len = 6): - // +-+---------+---------------+-------+ - // seq RC |i|i + ksize|len - ksize - i|len - i| - // AGTCGT ACGACT +-+---------+---------------+-------+ - // +-> +-> |0| 2 | 3 | 6 | - // +-> +-> |1| 3 | 2 | 5 | - // +-> +-> |2| 4 | 1 | 4 | - // +-> +-> |3| 5 | 0 | 3 | - // +-+---------+---------------+-------+ - // (leaving this table here because I had to draw to - // get the indices correctly) - - // Reverse complement k-mer (if rc exists) - let hash = if let Some(rc) = self.rc { - let krc = &rc[self.len - self.ksize - self.index..self.len - self.index]; - crate::_hash_murmur(std::cmp::min(kmer, krc), self.seed) - } else { - crate::_hash_murmur(kmer, self.seed) // Use only forward k-mer if rc is None - }; - - self.index += 1; - Some(Ok(hash)) - } -} - const COMPLEMENT: [u8; 256] = { let mut lookup = [0; 256]; lookup[b'A' as usize] = b'T'; @@ -832,300 +622,4 @@ mod test { assert_eq!(colors.len(), 2); } - - #[test] - fn test_reading_frames_new_dna() { - let sequence = b"AGTCGT"; - let hash_function = HashFunctions::Murmur64Dna; - - let frames = ReadingFrames::new(sequence, false, &hash_function); - - assert_eq!(frames.frames().len(), 1); // Only one fw/rc readingframe for DNA - assert_eq!(frames.frames()[0].fw, sequence.to_vec()); - assert_eq!(frames.frames()[0].rc, Some(b"ACGACT".to_vec())); - } - - #[test] - fn test_reading_frames_new_is_protein() { - let sequence = b"MVLSPADKTNVKAAW"; - let hash_function = HashFunctions::Murmur64Protein; - - let frames = ReadingFrames::new(sequence, true, &hash_function); - - assert_eq!(frames.frames().len(), 1); // Only one frame for protein - assert_eq!(frames.frames()[0].fw, sequence.to_vec()); - assert_eq!(frames.frames()[0].rc, None); // No reverse complement for protein - } - - #[test] - fn test_reading_frames_translate_frames() { - let sequence = b"AGTCGTCGAGCT"; - let hash_function = HashFunctions::Murmur64Protein; - - let frames = ReadingFrames::new(sequence, false, &hash_function); - eprintln!("frames: {}", frames); - - assert_eq!(frames.frames().len(), 3); // Three translated frames - assert_eq!(frames.frames()[0].fw, b"SRRA".to_vec()); - assert_eq!(frames.frames()[0].rc, Some(b"SSTT".to_vec())); - assert_eq!(frames.frames()[1].fw, b"VVE".to_vec()); - assert_eq!(frames.frames()[1].rc, Some(b"ARR".to_vec())); - assert_eq!(frames.frames()[2].fw, b"SSS".to_vec()); - assert_eq!(frames.frames()[2].rc, Some(b"LDD".to_vec())); - } - - #[test] - fn test_reading_frames_skipmer_frames_m1n3() { - let sequence = b"AGTCGTCGAGCT"; - let hash_function = HashFunctions::Murmur64Skipm1n3; - - let frames = ReadingFrames::new(sequence, false, &hash_function); - - assert_eq!(frames.frames().len(), 3); // Three skipmer frames - - // Expected skipmer sequences for m=1, n=3 (keep-1, skip-2) - let expected_fw_frame_0 = b"ACCG".to_vec(); // Frame 0: Keep 1 base, skip 2 bases, starting from index 0 - let expected_fw_frame_1 = b"GGGC".to_vec(); // Frame 1: Keep 1 base, skip 2 bases, starting from index 1 - let expected_fw_frame_2 = b"TTAT".to_vec(); // Frame 2: Keep 1 base, skip 2 bases, starting from index 2 - - let expected_rc_frame_0 = revcomp(&expected_fw_frame_0); - let expected_rc_frame_1 = revcomp(&expected_fw_frame_1); - let expected_rc_frame_2 = revcomp(&expected_fw_frame_2); - - // Check forward and reverse complement sequences for each frame - assert_eq!(frames.frames()[0].fw, expected_fw_frame_0); - assert_eq!(frames.frames()[0].rc, Some(expected_rc_frame_0)); - - assert_eq!(frames.frames()[1].fw, expected_fw_frame_1); - assert_eq!(frames.frames()[1].rc, Some(expected_rc_frame_1)); - - assert_eq!(frames.frames()[2].fw, expected_fw_frame_2); - assert_eq!(frames.frames()[2].rc, Some(expected_rc_frame_2)); - } - - #[test] - fn test_reading_frames_skipmer_frames_m2n3() { - let sequence = b"AGTCGTCGAGCT"; - let hash_function = HashFunctions::Murmur64Skipm2n3; - - let frames = ReadingFrames::new(sequence, false, &hash_function); - - assert_eq!(frames.frames().len(), 3); // Three skipmer frames - - // Expected skipmer sequences for m=1, n=3 (keep-1, skip-2) - let expected_fw_frame_0 = b"AGCGCGGC".to_vec(); - let expected_fw_frame_1 = b"GTGTGACT".to_vec(); - let expected_fw_frame_2 = b"TCTCAGT".to_vec(); - - let expected_rc_frame_0 = revcomp(&expected_fw_frame_0); - let expected_rc_frame_1 = revcomp(&expected_fw_frame_1); - let expected_rc_frame_2 = revcomp(&expected_fw_frame_2); - - // Check forward and reverse complement sequences for each frame - assert_eq!(frames.frames()[0].fw, expected_fw_frame_0); - assert_eq!(frames.frames()[0].rc, Some(expected_rc_frame_0)); - - assert_eq!(frames.frames()[1].fw, expected_fw_frame_1); - assert_eq!(frames.frames()[1].rc, Some(expected_rc_frame_1)); - - assert_eq!(frames.frames()[2].fw, expected_fw_frame_2); - assert_eq!(frames.frames()[2].rc, Some(expected_rc_frame_2)); - } - - #[test] - fn test_reading_frame_kmer_iter() { - let sequence = b"AGTCGT"; - let rc = revcomp(sequence); - let hash_function = HashFunctions::Murmur64Dna; - let frame = ReadingFrame { - fw: sequence.to_vec(), - rc: Some(rc.clone()), - hash_function, - }; - eprintln!("frame: {}", frame); - assert_eq!(frame.fw, sequence.to_vec()); - assert_eq!(frame.rc, Some(b"ACGACT".to_vec())); - - let kmer_iter = frame.kmer_iter(3, 42, false); - - // Expected k-mers from the forward and reverse complement sequence - let expected_kmers = vec![ - (b"AGT".to_vec(), b"ACT".to_vec()), - (b"GTC".to_vec(), b"GAC".to_vec()), - (b"TCG".to_vec(), b"CGA".to_vec()), - (b"CGT".to_vec(), b"ACG".to_vec()), - ]; - - // Compute hashes for expected k-mers - let expected_hashes: Vec = expected_kmers - .iter() - .map(|(fw_kmer, rc_kmer)| crate::_hash_murmur(std::cmp::min(fw_kmer, rc_kmer), 42)) - .collect(); - - // Collect hashes produced by the kmer_iter - let produced_hashes: Vec = kmer_iter.map(|result| result.unwrap()).collect(); - - // Check that produced hashes match expected hashes in order - assert_eq!( - produced_hashes, expected_hashes, - "Hashes do not match in order" - ); - } - - #[test] - fn test_kmer_iter_is_protein() { - let sequence = b"MVLSPADKTNVKAAW"; - let hash_function = HashFunctions::Murmur64Protein; - - let frames = ReadingFrames::new(sequence, true, &hash_function); - - // Only one forward frame for protein - assert_eq!(frames.frames().len(), 1); - let frame = &frames.frames()[0]; - - let kmer_iter = frame.kmer_iter(3, 42, false); - - // Expected k-mers for protein sequence - let expected_kmers = vec![ - b"MVL".to_vec(), - b"VLS".to_vec(), - b"LSP".to_vec(), - b"SPA".to_vec(), - b"PAD".to_vec(), - b"ADK".to_vec(), - b"DKT".to_vec(), - b"KTN".to_vec(), - b"TNV".to_vec(), - b"NVK".to_vec(), - b"VKA".to_vec(), - b"KAA".to_vec(), - b"AAW".to_vec(), - ]; - - // Compute hashes for expected k-mers - let expected_hashes: Vec = expected_kmers - .iter() - .map(|fw_kmer| crate::_hash_murmur(fw_kmer, 42)) - .collect(); - - // Collect hashes produced by the kmer_iter - let produced_hashes: Vec = kmer_iter.map(|result| result.unwrap()).collect(); - - // Check that produced hashes match expected hashes in order - assert_eq!( - produced_hashes, expected_hashes, - "Hashes do not match in order" - ); - } - - // #[test] - // fn test_kmer_iter_translate_frames() { - // let sequence = b"AGTCGTCGAGCT"; - // let hash_function = HashFunctions::Murmur64Protein; - - // let frames = ReadingFrames::new(sequence, false, &hash_function); - - // // Three translated frames - // assert_eq!(frames.frames().len(), 3); - - // // Expected k-mers for translated frames - // let expected_kmers = vec![ - // vec![b"SRR".to_vec(), b"RRA".to_vec()], - // vec![b"VVE".to_vec(), b"VE".to_vec()], - // vec![b"SSS".to_vec(), b"SS".to_vec()], - // ]; - - // for (frame, expected_frame_kmers) in frames.frames().iter().zip(expected_kmers.iter()) { - // let kmer_iter = frame.kmer_iter(3, 42, false); - - // // Compute hashes for expected k-mers - // let expected_hashes: Vec = expected_frame_kmers - // .iter() - // .map(|fw_kmer| crate::_hash_murmur(fw_kmer, 42)) - // .collect(); - - // // Collect hashes produced by the kmer_iter - // let produced_hashes: Vec = kmer_iter.map(|result| result.unwrap()).collect(); - - // // Check that produced hashes match expected hashes in order - // assert_eq!( - // produced_hashes, expected_hashes, - // "Hashes do not match in order for frame" - // ); - // } - // } - - // #[test] - // fn test_kmer_iter_skipmer_m1n3() { - // let sequence = b"AGTCGTCGAGCT"; - // let hash_function = HashFunctions::Murmur64Skipm1n3; - - // let frames = ReadingFrames::new(sequence, false, &hash_function); - - // // Three skipmer frames - // assert_eq!(frames.frames().len(), 3); - - // // Expected k-mers for skipmer (m=1, n=3) - // let expected_kmers = vec![ - // vec![b"ACC".to_vec()], - // vec![b"GTG".to_vec()], - // vec![b"TCG".to_vec()], - // ]; - - // for (frame, expected_frame_kmers) in frames.frames().iter().zip(expected_kmers.iter()) { - // let kmer_iter = frame.kmer_iter(3, 42, false); - - // // Compute hashes for expected k-mers - // let expected_hashes: Vec = expected_frame_kmers - // .iter() - // .map(|fw_kmer| crate::_hash_murmur(fw_kmer, 42)) - // .collect(); - - // // Collect hashes produced by the kmer_iter - // let produced_hashes: Vec = kmer_iter.map(|result| result.unwrap()).collect(); - - // // Check that produced hashes match expected hashes in order - // assert_eq!( - // produced_hashes, expected_hashes, - // "Hashes do not match in order for frame" - // ); - // } - // } - - // #[test] - // fn test_kmer_iter_skipmer_m2n3() { - // let sequence = b"AGTCGTCGAGCT"; - // let hash_function = HashFunctions::Murmur64Skipm2n3; - - // let frames = ReadingFrames::new(sequence, false, &hash_function); - - // // Three skipmer frames - // assert_eq!(frames.frames().len(), 3); - - // // Expected k-mers for skipmer (m=2, n=3) - // let expected_kmers = vec![ - // vec![b"AGC".to_vec()], - // vec![b"GTG".to_vec()], - // vec![b"TCA".to_vec()], - // ]; - - // for (frame, expected_frame_kmers) in frames.frames().iter().zip(expected_kmers.iter()) { - // let kmer_iter = frame.kmer_iter(3, 42, false); - - // // Compute hashes for expected k-mers - // let expected_hashes: Vec = expected_frame_kmers - // .iter() - // .map(|fw_kmer| crate::_hash_murmur(fw_kmer, 42)) - // .collect(); - - // // Collect hashes produced by the kmer_iter - // let produced_hashes: Vec = kmer_iter.map(|result| result.unwrap()).collect(); - - // // Check that produced hashes match expected hashes in order - // assert_eq!( - // produced_hashes, expected_hashes, - // "Hashes do not match in order for frame" - // ); - // } - // } } diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs index c0ea0c4c3c..c7f99786e1 100644 --- a/src/core/src/signature.rs +++ b/src/core/src/signature.rs @@ -10,12 +10,13 @@ use std::path::Path; use std::str; use cfg_if::cfg_if; +use needletail::sequence; #[cfg(feature = "parallel")] use rayon::prelude::*; use serde::{Deserialize, Serialize}; use typed_builder::TypedBuilder; -use crate::encodings::{HashFunctions, ReadingFrames}; +use crate::encodings::{revcomp, to_aa, HashFunctions, VALID}; use crate::prelude::*; use crate::sketch::minhash::KmerMinHash; use crate::sketch::Sketch; @@ -163,17 +164,202 @@ impl SigsTrait for Sketch { } } +#[derive(Debug)] +pub enum ReadingFrame { + DNA { + fw: Vec, + rc: Vec, // Reverse complement is required + len: usize, + }, + Protein { + fw: Vec, // Only forward frame + len: usize, + }, +} + +impl std::fmt::Display for ReadingFrame { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + ReadingFrame::DNA { fw, rc, len } => { + let fw_str = String::from_utf8(fw.clone()).expect("Invalid UTF-8 sequence in fw"); + let rc_str = String::from_utf8(rc.clone()).expect("Invalid UTF-8 sequence in rc"); + write!( + f, + "Type: DNA ({}bp), Forward: {}, Reverse Complement: {}", + len, fw_str, rc_str + ) + } + ReadingFrame::Protein { fw, len } => { + let fw_str = String::from_utf8(fw.clone()).expect("Invalid UTF-8 sequence in fw"); + write!(f, "Type: Protein ({}aa), Forward: {}", len, fw_str) + } + } + } +} + +impl ReadingFrame { + pub fn new_dna(sequence: &[u8]) -> Self { + let fw = sequence.to_vec(); + let rc = revcomp(sequence); + let len = sequence.len(); + ReadingFrame::DNA { fw, rc, len } + } + + pub fn new_protein(sequence: &[u8]) -> Self { + let fw = sequence.to_vec(); + let len = fw.len(); + ReadingFrame::Protein { fw, len } + } + + pub fn new_skipmer(seq: &[u8], start: usize, m: usize, n: usize) -> Self { + if start > n { + panic!("Skipmer frame number must be <= n ({})", n); + } + // Generate forward skipmer frame + let fw: Vec = seq + .iter() + .skip(start) + .enumerate() + .filter_map(|(i, &base)| if i % n < m { Some(base) } else { None }) + .collect(); + + let len = fw.len(); + let rc = revcomp(&fw); + ReadingFrame::DNA { fw, rc, len } + } + + pub fn new_translated(sequence: &[u8], frame_number: usize, dayhoff: bool, hp: bool) -> Self { + if frame_number > 2 { + panic!("Frame number must be 0, 1, or 2"); + } + + // Translate the forward frame + let fw: Vec = sequence + .iter() + .cloned() + .skip(frame_number) // Skip the initial bases for the frame + .take(sequence.len() - frame_number) // Adjust length based on skipped bases + .collect::>() // Collect the DNA subsequence + .chunks(3) // Group into codons (triplets) + .filter_map(|codon| to_aa(codon, dayhoff, hp).ok()) // Translate each codon + .flatten() // Flatten the nested results into a single sequence + .collect(); + + let len = fw.len(); + + // Return a Protein reading frame (no reverse complement for translated frames) + ReadingFrame::Protein { fw, len } + } + + pub fn kmer_iter<'a>(&'a self, ksize: usize, seed: u64, force: bool) -> KmerIterator<'a> { + match self { + ReadingFrame::DNA { fw, rc, len } => KmerIterator::new(self, ksize, seed, force), + ReadingFrame::Protein { fw, len } => KmerIterator::new(self, ksize, seed, force), + } + } +} + +pub struct KmerIterator<'a> { + frame: &'a ReadingFrame, // Reference to the ReadingFrame + ksize: usize, + index: usize, + seed: u64, + force: bool, +} + +impl<'a> KmerIterator<'a> { + pub fn new(frame: &'a ReadingFrame, ksize: usize, seed: u64, force: bool) -> Self { + Self { + frame, + ksize, + index: 0, + seed, + force, + } + } + + fn out_of_bounds(&self, length: usize) -> bool { + self.index + self.ksize > length + } + + fn validate_dna_kmer(&self, kmer: &[u8]) -> Result<(), Error> { + for &nt in kmer { + if !VALID[nt as usize] { + return Err(Error::InvalidDNA { + message: String::from_utf8_lossy(kmer).to_string(), + }); + } + } + Ok(()) + } + + fn get_dna_hash(&mut self, fw: &[u8], rc: &[u8]) -> Option> { + let kmer = &fw[self.index..self.index + self.ksize]; + + if !self.force { + if let Err(e) = self.validate_dna_kmer(kmer) { + self.index += 1; + return Some(Err(e)); + } + } + + let krc = &rc[rc.len() - self.ksize - self.index..rc.len() - self.index]; + let hash = crate::_hash_murmur(std::cmp::min(kmer, krc), self.seed); + // NTP TESTING + eprintln!( + "Forward DNA k-mer: {}, Reverse Complement k-mer: {}, hash: {}", + String::from_utf8_lossy(kmer), + String::from_utf8_lossy(krc), + hash, + ); + + self.index += 1; + Some(Ok(hash)) + } + + fn get_protein_hash(&mut self, fw: &[u8]) -> Option> { + let kmer = &fw[self.index..self.index + self.ksize]; + + let hash = crate::_hash_murmur(kmer, self.seed); + // NTP TESTING + eprintln!( + "Protein k-mer: {}, hash: {}", + String::from_utf8_lossy(kmer), + hash + ); + + self.index += 1; + Some(Ok(hash)) + } +} + +impl<'a> Iterator for KmerIterator<'a> { + type Item = Result; + + fn next(&mut self) -> Option { + match self.frame { + ReadingFrame::DNA { fw, rc, len } => { + if self.out_of_bounds(*len) { + return None; + } + self.get_dna_hash(fw, rc) + } + ReadingFrame::Protein { fw, len } => { + if self.out_of_bounds(*len) { + return None; + } + self.get_protein_hash(fw) + } + } + } +} + pub struct SeqToHashes { - sequence: Vec, - kmer_index: usize, k_size: usize, - max_index: usize, force: bool, - is_protein: bool, - hash_function: HashFunctions, seed: u64, - hashes_buffer: Vec, - reading_frames: ReadingFrames, + frames: Vec, + frame_index: usize, // Index of the current frame } impl SeqToHashes { @@ -184,51 +370,84 @@ impl SeqToHashes { is_protein: bool, hash_function: HashFunctions, seed: u64, - ) -> SeqToHashes { + ) -> Self { + let mut ksize: usize = k_size; + // Adjust kmer size for protein-based hash functions - let adjusted_k_size = - if hash_function.protein() || hash_function.dayhoff() || hash_function.hp() { - k_size / 3 - } else { - k_size - }; + if is_protein || hash_function.protein() || hash_function.dayhoff() || hash_function.hp() { + ksize = k_size / 3; + } - // Determine the maximum index for k-mer generation - let max_index = if seq.len() >= adjusted_k_size { - seq.len() - adjusted_k_size + 1 + // uppercase the sequence + let sequence = seq.to_ascii_uppercase(); + + // Generate frames based on sequence type and hash function + let frames = if is_protein { + Self::protein_frames(&sequence) + } else if hash_function.protein() || hash_function.dayhoff() || hash_function.hp() { + Self::translated_frames(&sequence, &hash_function) + } else if hash_function.skipm1n3() || hash_function.skipm2n3() { + Self::skipmer_frames(&sequence, &hash_function) } else { - 0 + Self::dna_frames(&sequence) }; - // Initialize ReadingFrames based on the sequence and hash function - let reading_frames = - ReadingFrames::new(&seq.to_ascii_uppercase(), is_protein, &hash_function); - SeqToHashes { - sequence: seq.to_ascii_uppercase(), - kmer_index: 0, - k_size: adjusted_k_size, - max_index, + k_size: ksize, force, - is_protein, - hash_function, seed, - hashes_buffer: Vec::with_capacity(1000), - reading_frames, + frames, + frame_index: 0, } } - // some helper functions. If we remove, we could probably just rm - // these fields from SeqToHashes, since ReadingFrames handles this now - pub fn get_sequence(&self) -> &[u8] { - &self.sequence - } - pub fn get_hash_function(&self) -> &HashFunctions { - &self.hash_function + /// Generate DNA frames (forward + reverse complement) + fn dna_frames(seq: &[u8]) -> Vec { + vec![ReadingFrame::new_dna(&seq.to_ascii_uppercase())] + } + + fn protein_frames(seq: &[u8]) -> Vec { + vec![ReadingFrame::new_protein(&seq.to_ascii_uppercase())] + } + + fn translated_frames(seq: &[u8], hash_function: &HashFunctions) -> Vec { + let revcomp_sequence = revcomp(&seq.to_ascii_uppercase()); + (0..3) + .flat_map(|frame_number| { + vec![ + ReadingFrame::new_translated( + &seq.to_ascii_uppercase(), + frame_number, + hash_function.dayhoff(), + hash_function.hp(), + ), + ReadingFrame::new_translated( + &revcomp_sequence, + frame_number, + hash_function.dayhoff(), + hash_function.hp(), + ), + ] + }) + .collect() } - pub fn is_protein(&self) -> bool { - self.is_protein + fn skipmer_frames(seq: &[u8], hash_function: &HashFunctions) -> Vec { + let (m, n) = if hash_function.skipm1n3() { + (1, 3) + } else { + (2, 3) + }; + (0..3) + .flat_map(|frame_number| { + vec![ReadingFrame::new_skipmer( + &seq.to_ascii_uppercase(), + frame_number, + m, + n, + )] + }) + .collect() } } @@ -236,28 +455,24 @@ impl Iterator for SeqToHashes { type Item = Result; fn next(&mut self) -> Option { - // Check if we've processed all k-mers - if self.kmer_index >= self.max_index { - return None; - } + // Iterate over the frames using frame_index + while self.frame_index < self.frames.len() { + let frame = &self.frames[self.frame_index]; - // Iterate over the frames - for frame in self.reading_frames.frames().iter() { - let kmer_iter = frame.kmer_iter(self.k_size, self.seed, self.force); - for result in kmer_iter { - match result { - Ok(hash) => self.hashes_buffer.push(hash), - Err(e) => { - if !self.force { - return Some(Err(e)); - } - } - } + // Create a KmerIterator for the current frame + let mut kmer_iter = frame.kmer_iter(self.k_size, self.seed, self.force); + + // Process k-mers in the current frame + if let Some(hash_result) = kmer_iter.next() { + return Some(hash_result); // Return the next hash } + + // Move to the next frame if the current one is exhausted + self.frame_index += 1; } - self.kmer_index += 1; - Some(Ok(0)) // Return 0 to indicate progress; actual hashes are stored in `hashes_buffer` + // All frames exhausted + None } } @@ -1288,4 +1503,321 @@ mod test { assert_eq!(hash, expected_hash, "Mismatch in skipmer hash"); } } + + // #[test] + // fn test_reading_frames_new_dna() { + // let sequence = b"AGTCGT"; + // let hash_function = HashFunctions::Murmur64Dna; + + // let frames = ReadingFrames::new(sequence, false, &hash_function); + + // assert_eq!(frames.frames().len(), 1); // Only one fw/rc readingframe for DNA + // assert_eq!(frames.frames()[0].fw, sequence.to_vec()); + // assert_eq!(frames.frames()[0].rc, Some(b"ACGACT".to_vec())); + // } + + // #[test] + // fn test_reading_frames_new_is_protein() { + // let sequence = b"MVLSPADKTNVKAAW"; + // let hash_function = HashFunctions::Murmur64Protein; + + // let frames = ReadingFrames::new(sequence, true, &hash_function); + + // assert_eq!(frames.frames().len(), 1); // Only one frame for protein + // assert_eq!(frames.frames()[0].fw, sequence.to_vec()); + // assert_eq!(frames.frames()[0].rc, None); // No reverse complement for protein + // } + + // #[test] + // fn test_reading_frames_translate_frames() { + // let sequence = b"AGTCGTCGAGCT"; + // let hash_function = HashFunctions::Murmur64Protein; + + // let frames = ReadingFrames::new(sequence, false, &hash_function); + // eprintln!("frames: {}", frames); + + // assert_eq!(frames.frames().len(), 3); // Three translated frames + // assert_eq!(frames.frames()[0].fw, b"SRRA".to_vec()); + // assert_eq!(frames.frames()[0].rc, Some(b"SSTT".to_vec())); + // assert_eq!(frames.frames()[1].fw, b"VVE".to_vec()); + // assert_eq!(frames.frames()[1].rc, Some(b"ARR".to_vec())); + // assert_eq!(frames.frames()[2].fw, b"SSS".to_vec()); + // assert_eq!(frames.frames()[2].rc, Some(b"LDD".to_vec())); + // } + + // #[test] + // fn test_reading_frames_skipmer_frames_m1n3() { + // let sequence = b"AGTCGTCGAGCT"; + // let hash_function = HashFunctions::Murmur64Skipm1n3; + + // let frames = ReadingFrames::new(sequence, false, &hash_function); + + // assert_eq!(frames.frames().len(), 3); // Three skipmer frames + + // // Expected skipmer sequences for m=1, n=3 (keep-1, skip-2) + // let expected_fw_frame_0 = b"ACCG".to_vec(); // Frame 0: Keep 1 base, skip 2 bases, starting from index 0 + // let expected_fw_frame_1 = b"GGGC".to_vec(); // Frame 1: Keep 1 base, skip 2 bases, starting from index 1 + // let expected_fw_frame_2 = b"TTAT".to_vec(); // Frame 2: Keep 1 base, skip 2 bases, starting from index 2 + + // let expected_rc_frame_0 = revcomp(&expected_fw_frame_0); + // let expected_rc_frame_1 = revcomp(&expected_fw_frame_1); + // let expected_rc_frame_2 = revcomp(&expected_fw_frame_2); + + // // Check forward and reverse complement sequences for each frame + // assert_eq!(frames.frames()[0].fw, expected_fw_frame_0); + // assert_eq!(frames.frames()[0].rc, Some(expected_rc_frame_0)); + + // assert_eq!(frames.frames()[1].fw, expected_fw_frame_1); + // assert_eq!(frames.frames()[1].rc, Some(expected_rc_frame_1)); + + // assert_eq!(frames.frames()[2].fw, expected_fw_frame_2); + // assert_eq!(frames.frames()[2].rc, Some(expected_rc_frame_2)); + // } + + // #[test] + // fn test_reading_frames_skipmer_frames_m2n3() { + // let sequence = b"AGTCGTCGAGCT"; + // let hash_function = HashFunctions::Murmur64Skipm2n3; + + // let frames = ReadingFrames::new(sequence, false, &hash_function); + + // assert_eq!(frames.frames().len(), 3); // Three skipmer frames + + // // Expected skipmer sequences for m=1, n=3 (keep-1, skip-2) + // let expected_fw_frame_0 = b"AGCGCGGC".to_vec(); + // let expected_fw_frame_1 = b"GTGTGACT".to_vec(); + // let expected_fw_frame_2 = b"TCTCAGT".to_vec(); + + // let expected_rc_frame_0 = revcomp(&expected_fw_frame_0); + // let expected_rc_frame_1 = revcomp(&expected_fw_frame_1); + // let expected_rc_frame_2 = revcomp(&expected_fw_frame_2); + + // // Check forward and reverse complement sequences for each frame + // assert_eq!(frames.frames()[0].fw, expected_fw_frame_0); + // assert_eq!(frames.frames()[0].rc, Some(expected_rc_frame_0)); + + // assert_eq!(frames.frames()[1].fw, expected_fw_frame_1); + // assert_eq!(frames.frames()[1].rc, Some(expected_rc_frame_1)); + + // assert_eq!(frames.frames()[2].fw, expected_fw_frame_2); + // assert_eq!(frames.frames()[2].rc, Some(expected_rc_frame_2)); + // } + + // #[test] + // fn test_reading_frame_kmer_iter() { + // let sequence = b"AGTCGT"; + // let rc = revcomp(sequence); + // let hash_function = HashFunctions::Murmur64Dna; + // let frame = ReadingFrame { + // fw: sequence.to_vec(), + // rc: Some(rc.clone()), + // hash_function, + // is_translated: false, + // }; + // eprintln!("frame: {}", frame); + // assert_eq!(frame.fw, sequence.to_vec()); + // assert_eq!(frame.rc, Some(b"ACGACT".to_vec())); + + // let kmer_iter = frame.kmer_iter(3, 42, false); + + // // Expected k-mers from the forward and reverse complement sequence + // let expected_kmers = vec![ + // (b"AGT".to_vec(), b"ACT".to_vec()), + // (b"GTC".to_vec(), b"GAC".to_vec()), + // (b"TCG".to_vec(), b"CGA".to_vec()), + // (b"CGT".to_vec(), b"ACG".to_vec()), + // ]; + + // // Compute hashes for expected k-mers + // let expected_hashes: Vec = expected_kmers + // .iter() + // .map(|(fw_kmer, rc_kmer)| crate::_hash_murmur(std::cmp::min(fw_kmer, rc_kmer), 42)) + // .collect(); + + // // Collect hashes produced by the kmer_iter + // let produced_hashes: Vec = kmer_iter.map(|result| result.unwrap()).collect(); + + // // Check that produced hashes match expected hashes in order + // assert_eq!( + // produced_hashes, expected_hashes, + // "Hashes do not match in order" + // ); + // } + + // #[test] + // fn test_kmer_iter_is_protein() { + // let sequence = b"MVLSPADKTNVKAAW"; + // let hash_function = HashFunctions::Murmur64Protein; + + // let frames = ReadingFrames::new(sequence, true, &hash_function); + + // // Only one forward frame for protein + // assert_eq!(frames.frames().len(), 1); + // let frame = &frames.frames()[0]; + + // let kmer_iter = frame.kmer_iter(3, 42, false); + + // // Expected k-mers for protein sequence + // let expected_kmers = vec![ + // b"MVL".to_vec(), + // b"VLS".to_vec(), + // b"LSP".to_vec(), + // b"SPA".to_vec(), + // b"PAD".to_vec(), + // b"ADK".to_vec(), + // b"DKT".to_vec(), + // b"KTN".to_vec(), + // b"TNV".to_vec(), + // b"NVK".to_vec(), + // b"VKA".to_vec(), + // b"KAA".to_vec(), + // b"AAW".to_vec(), + // ]; + + // // Compute hashes for expected k-mers + // let expected_hashes: Vec = expected_kmers + // .iter() + // .map(|fw_kmer| crate::_hash_murmur(fw_kmer, 42)) + // .collect(); + + // // Collect hashes produced by the kmer_iter + // let produced_hashes: Vec = kmer_iter.map(|result| result.unwrap()).collect(); + + // // Check that produced hashes match expected hashes in order + // assert_eq!( + // produced_hashes, expected_hashes, + // "Hashes do not match in order" + // ); + // } + + // #[test] + // fn test_kmer_iter_translate_frames() { + // let sequence = b"AGTCGTCGAGCT"; + // let hash_function = HashFunctions::Murmur64Protein; + // let ksize =3; + // let seed = 42; + + // let frames = ReadingFrames::new(sequence, false, &hash_function); + + // // Three translated frames + // assert_eq!(frames.frames().len(), 3); + // eprintln!("frames: {}", frames); + + // // Expected k-mers for translated frames + // let frame1_kmers = vec![ + // vec![b"SRR".to_vec(), b"SST".to_vec()], + // vec![b"RRA".to_vec(), b"STT".to_vec()], + // ]; + // let frame2_kmers = vec![vec![b"VVE".to_vec(), b"ARR".to_vec()]]; + // let frame3_kmers = vec![vec![b"SSS".to_vec(), b"LDD".to_vec()]]; + // let expected_kmers = vec![frame1_kmers, frame2_kmers, frame3_kmers]; + // // let expected_kmers = vec![ + // // vec![b"SRR".to_vec(), b"SST".to_vec()], + // // vec![b"RRA".to_vec(), b"STT".to_vec()], + // // vec![b"VVE".to_vec(), b"ARR".to_vec()], + // // vec![b"SSS".to_vec(), b"LDD".to_vec()], + // // ]; + + // for (frame, expected_frame_kmers) in frames.frames().iter().zip(expected_kmers.iter()) { + // let kmer_iter = frame.kmer_iter(ksize, seed, false); + + // // Compute hashes for expected k-mers + // // let expected_hashes: Vec = expected_frame_kmers + // // .iter() + // // .map(|fw_kmer| crate::_hash_murmur(fw_kmer, 42)) + // // .collect(); + // let expected_hashes: Vec = expected_frame_kmers + // .iter() + // .flat_map(|kmers| { + // kmers.iter().map(|fw_kmer| { + // let rc_kmer = revcomp(fw_kmer); // Compute reverse complement + // crate::_hash_murmur(std::cmp::min(fw_kmer, &rc_kmer), seed) + // }) + // }) + // .collect(); + + // // Collect hashes produced by the kmer_iter + // let produced_hashes: Vec = kmer_iter.map(|result| result.unwrap()).collect(); + + // // Check that produced hashes match expected hashes in order + // assert_eq!( + // produced_hashes, expected_hashes, + // "Hashes do not match in order for frame" + // ); + // } + // } + + // #[test] + // fn test_kmer_iter_skipmer_m1n3() { + // let sequence = b"AGTCGTCGAGCT"; + // let hash_function = HashFunctions::Murmur64Skipm1n3; + + // let frames = ReadingFrames::new(sequence, false, &hash_function); + + // // Three skipmer frames + // assert_eq!(frames.frames().len(), 3); + + // // Expected k-mers for skipmer (m=1, n=3) + // let expected_kmers = vec![ + // vec![b"ACC".to_vec()], + // vec![b"GTG".to_vec()], + // vec![b"TCG".to_vec()], + // ]; + + // for (frame, expected_frame_kmers) in frames.frames().iter().zip(expected_kmers.iter()) { + // let kmer_iter = frame.kmer_iter(3, 42, false); + + // // Compute hashes for expected k-mers + // let expected_hashes: Vec = expected_frame_kmers + // .iter() + // .map(|fw_kmer| crate::_hash_murmur(fw_kmer, 42)) + // .collect(); + + // // Collect hashes produced by the kmer_iter + // let produced_hashes: Vec = kmer_iter.map(|result| result.unwrap()).collect(); + + // // Check that produced hashes match expected hashes in order + // assert_eq!( + // produced_hashes, expected_hashes, + // "Hashes do not match in order for frame" + // ); + // } + // } + + // #[test] + // fn test_kmer_iter_skipmer_m2n3() { + // let sequence = b"AGTCGTCGAGCT"; + // let hash_function = HashFunctions::Murmur64Skipm2n3; + + // let frames = ReadingFrames::new(sequence, false, &hash_function); + + // // Three skipmer frames + // assert_eq!(frames.frames().len(), 3); + + // // Expected k-mers for skipmer (m=2, n=3) + // let expected_kmers = vec![ + // vec![b"AGC".to_vec()], + // vec![b"GTG".to_vec()], + // vec![b"TCA".to_vec()], + // ]; + + // for (frame, expected_frame_kmers) in frames.frames().iter().zip(expected_kmers.iter()) { + // let kmer_iter = frame.kmer_iter(3, 42, false); + + // // Compute hashes for expected k-mers + // let expected_hashes: Vec = expected_frame_kmers + // .iter() + // .map(|fw_kmer| crate::_hash_murmur(fw_kmer, 42)) + // .collect(); + + // // Collect hashes produced by the kmer_iter + // let produced_hashes: Vec = kmer_iter.map(|result| result.unwrap()).collect(); + + // // Check that produced hashes match expected hashes in order + // assert_eq!( + // produced_hashes, expected_hashes, + // "Hashes do not match in order for frame" + // ); + // } + // } }