From 1868ab297501aa89d48ff6411296b1ac737b078e Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Thu, 21 Nov 2024 14:57:37 -0800 Subject: [PATCH 01/16] init m2n3, m1n3 skipmer options --- src/core/src/cmd.rs | 27 ++++++++- src/core/src/encodings.rs | 16 ++++-- src/core/src/signature.rs | 100 +++++++++++++++++++++++++-------- src/core/src/sketch/minhash.rs | 3 +- 4 files changed, 115 insertions(+), 31 deletions(-) diff --git a/src/core/src/cmd.rs b/src/core/src/cmd.rs index 115d2672af..d5bfb092ba 100644 --- a/src/core/src/cmd.rs +++ b/src/core/src/cmd.rs @@ -44,7 +44,11 @@ pub struct ComputeParameters { #[getset(get_copy = "pub", set = "pub")] #[builder(default = false)] - skipmer: bool, + skipm1n3: bool, + + #[getset(get_copy = "pub", set = "pub")] + #[builder(default = false)] + skipm2n3: bool, #[getset(get_copy = "pub", set = "pub")] #[builder(default = false)] @@ -169,12 +173,29 @@ pub fn build_template(params: &ComputeParameters) -> Vec { )); } - if params.skipmer { + if params.skipm1n3 { + ksigs.push(Sketch::LargeMinHash( + KmerMinHashBTree::builder() + .num(params.num_hashes) + .ksize(*k) + .hash_function(HashFunctions::Murmur64Skipm1n3) + .max_hash(max_hash) + .seed(params.seed) + .abunds(if params.track_abundance { + Some(Default::default()) + } else { + None + }) + .build(), + )); + } + + if params.skipm2n3 { ksigs.push(Sketch::LargeMinHash( KmerMinHashBTree::builder() .num(params.num_hashes) .ksize(*k) - .hash_function(HashFunctions::Murmur64Skipmer) + .hash_function(HashFunctions::Murmur64Skipm2n3) .max_hash(max_hash) .seed(params.seed) .abunds(if params.track_abundance { diff --git a/src/core/src/encodings.rs b/src/core/src/encodings.rs index 0aba272a5c..a7c2f3cef2 100644 --- a/src/core/src/encodings.rs +++ b/src/core/src/encodings.rs @@ -31,7 +31,8 @@ pub enum HashFunctions { Murmur64Protein, Murmur64Dayhoff, Murmur64Hp, - Murmur64Skipmer, + Murmur64Skipm1n3, + Murmur64Skipm2n3, Custom(String), } @@ -51,8 +52,11 @@ impl HashFunctions { pub fn hp(&self) -> bool { *self == HashFunctions::Murmur64Hp } - pub fn skipmer(&self) -> bool { - *self == HashFunctions::Murmur64Skipmer + pub fn skipm1n3(&self) -> bool { + *self == HashFunctions::Murmur64Skipm1n3 + } + pub fn skipm2n3(&self) -> bool { + *self == HashFunctions::Murmur64Skipm2n3 } } @@ -66,7 +70,8 @@ impl std::fmt::Display for HashFunctions { HashFunctions::Murmur64Protein => "protein", HashFunctions::Murmur64Dayhoff => "dayhoff", HashFunctions::Murmur64Hp => "hp", - HashFunctions::Murmur64Skipmer => "skipmer", + HashFunctions::Murmur64Skipm1n3 => "skipm1n3", + HashFunctions::Murmur64Skipm2n3 => "skipm2n3", HashFunctions::Custom(v) => v, } ) @@ -82,7 +87,8 @@ impl TryFrom<&str> for HashFunctions { "dayhoff" => Ok(HashFunctions::Murmur64Dayhoff), "hp" => Ok(HashFunctions::Murmur64Hp), "protein" => Ok(HashFunctions::Murmur64Protein), - "skipmer" => Ok(HashFunctions::Murmur64Skipmer), + "skipm1n3" => Ok(HashFunctions::Murmur64Skipm1n3), + "skipm2n3" => Ok(HashFunctions::Murmur64Skipm2n3), v => unimplemented!("{v}"), } } diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs index 93580170a2..b84c593f6e 100644 --- a/src/core/src/signature.rs +++ b/src/core/src/signature.rs @@ -184,6 +184,11 @@ pub struct SeqToHashes { prot_configured: bool, aa_seq: Vec, translate_iter_step: usize, + + skipmer_configured: bool, + skip_m: usize, + skip_n: usize, + skip_len: usize, } impl SeqToHashes { @@ -228,6 +233,10 @@ impl SeqToHashes { prot_configured: false, aa_seq: Vec::new(), translate_iter_step: 0, + skipmer_configured: false, + skip_m: 2, + skip_n: 3, + skip_len: 0, } } @@ -267,13 +276,33 @@ impl Iterator for SeqToHashes { if !self.dna_configured { self.dna_ksize = self.k_size; self.dna_len = self.sequence.len(); + + if self.hash_function.skipm1n3() + || self.hash_function.skipm2n3() && !self.skipmer_configured + { + if self.hash_function.skipm1n3() { + self.skip_m = 1 + }; + if self.hash_function.skipm2n3() { + self.skip_m = 2 + }; + // eqn from skipmer paper. might want to add one to dna_ksize to round up? + self.skip_len = + self.skip_n * (((self.dna_ksize) / self.skip_m) - 1) + self.skip_m; + // my prior eqn + // self.skip_len = self.dna_ksize + ((self.dna_ksize + 1) / self.skip_m) - 1; // add 1 to round up rather than down + + // check that we can actually build skipmers + if self.k_size < self.skip_n { + unimplemented!() + } + } + // have enough sequence to kmerize? if self.dna_len < self.dna_ksize || (self.hash_function.protein() && self.dna_len < self.k_size * 3) || (self.hash_function.dayhoff() && self.dna_len < self.k_size * 3) || (self.hash_function.hp() && self.dna_len < self.k_size * 3) - || (self.hash_function.skipmer() - // add 1 to round up rather than down - && self.dna_len < (self.k_size + ((self.k_size + 1) / 2) - 1)) + || (self.skipmer_configured && self.dna_len < self.skip_len) { return None; } @@ -323,26 +352,19 @@ impl Iterator for SeqToHashes { let hash = crate::_hash_murmur(std::cmp::min(kmer, krc), self.seed); self.kmer_index += 1; Some(Ok(hash)) - } else if self.hash_function.skipmer() { - // check that we can actually build skipmers - if self.k_size < 3 { - unimplemented!() - // return None - } - let extended_length = self.dna_ksize + ((self.dna_ksize + 1) / 2) - 1; // add 1 to round up rather than down - + } else if self.skipmer_configured { // Check bounds to ensure we don't exceed the sequence length - if self.kmer_index + extended_length > self.sequence.len() { + if self.kmer_index + self.skip_len > self.sequence.len() { return None; } // Build skipmer with DNA base validation let mut kmer: Vec = Vec::with_capacity(self.dna_ksize); for (_i, &base) in self.sequence - [self.kmer_index..self.kmer_index + extended_length] + [self.kmer_index..self.kmer_index + self.skip_len] .iter() .enumerate() - .filter(|&(i, _)| i % 3 != 2) + .filter(|&(i, _)| i % self.skip_n < self.skip_m) .take(self.dna_ksize) { // Use the validate_base method to check the base @@ -354,11 +376,11 @@ impl Iterator for SeqToHashes { } // Generate reverse complement skipmer - let krc: Vec = self.dna_rc[self.dna_len - extended_length - self.kmer_index + let krc: Vec = self.dna_rc[self.dna_len - self.skip_len - self.kmer_index ..self.dna_len - self.kmer_index] .iter() .enumerate() - .filter(|&(i, _)| i % 3 != 2) + .filter(|&(i, _)| i % self.skip_n < self.skip_m) .take(self.dna_ksize) .map(|(_, &base)| base) .collect(); @@ -1103,12 +1125,12 @@ mod test { } #[test] - fn signature_skipmer_add_sequence() { + fn signature_skipm2n3_add_sequence() { let params = ComputeParameters::builder() .ksizes(vec![3, 4, 5, 6]) .num_hashes(3u32) .dna(false) - .skipmer(true) + .skipm2n3(true) .build(); let mut sig = Signature::from_params(¶ms); @@ -1122,14 +1144,48 @@ mod test { assert_eq!(sig.signatures[3].size(), 1); } + #[test] + fn signature_skipm1n3_add_sequence() { + let params = ComputeParameters::builder() + .ksizes(vec![3, 4, 5, 6]) + .num_hashes(3u32) + .dna(false) + .skipm1n3(true) + .build(); + + let mut sig = Signature::from_params(¶ms); + sig.add_sequence(b"ATGCATGA", false).unwrap(); + + assert_eq!(sig.signatures.len(), 4); + dbg!(&sig.signatures); + assert_eq!(sig.signatures[0].size(), 3); + assert_eq!(sig.signatures[1].size(), 3); + assert_eq!(sig.signatures[2].size(), 2); + assert_eq!(sig.signatures[3].size(), 1); + } + + #[test] + #[should_panic(expected = "not implemented")] + fn signature_skipm2n3_add_sequence_too_small() { + let params = ComputeParameters::builder() + .ksizes(vec![2]) + .num_hashes(3u32) + .dna(false) + .skipm2n3(true) + .build(); + + let mut sig = Signature::from_params(¶ms); + sig.add_sequence(b"ATGCATGA", false).unwrap(); + } + #[test] #[should_panic(expected = "not implemented")] - fn signature_skipmer_add_sequence_too_small() { + fn signature_skipm1n3_add_sequence_too_small() { let params = ComputeParameters::builder() .ksizes(vec![2]) .num_hashes(3u32) .dna(false) - .skipmer(true) + .skipm1n3(true) .build(); let mut sig = Signature::from_params(¶ms); @@ -1414,7 +1470,7 @@ mod test { } #[test] - fn test_seqtohashes_skipmer() { + fn test_seqtohashes_skipm2n3() { let sequence = b"AGTCGTCA"; // let rc_seq = b"TGACGACT"; let k_size = 5; @@ -1427,7 +1483,7 @@ mod test { k_size, force, false, - HashFunctions::Murmur64Skipmer, + HashFunctions::Murmur64Skipm2n3, seed, ); diff --git a/src/core/src/sketch/minhash.rs b/src/core/src/sketch/minhash.rs index 3c8f3b4240..d8ca2337a0 100644 --- a/src/core/src/sketch/minhash.rs +++ b/src/core/src/sketch/minhash.rs @@ -153,7 +153,8 @@ impl<'de> Deserialize<'de> for KmerMinHash { "dayhoff" => HashFunctions::Murmur64Dayhoff, "hp" => HashFunctions::Murmur64Hp, "dna" => HashFunctions::Murmur64Dna, - "skipmer" => HashFunctions::Murmur64Skipmer, + "skipm1n3" => HashFunctions::Murmur64Skipm1n3, + "skipm2n3" => HashFunctions::Murmur64Skipm2n3, _ => unimplemented!(), // TODO: throw error here }; From 2ba40b0e4c5696d3589030b2edfed75333646b9f Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Mon, 2 Dec 2024 17:18:46 -0800 Subject: [PATCH 02/16] init ReadingFrames --- src/core/src/encodings.rs | 93 +++++++++++++++++++++++++++++++++++++++ src/core/src/signature.rs | 17 ++++++- 2 files changed, 108 insertions(+), 2 deletions(-) diff --git a/src/core/src/encodings.rs b/src/core/src/encodings.rs index a7c2f3cef2..5a215a3e9a 100644 --- a/src/core/src/encodings.rs +++ b/src/core/src/encodings.rs @@ -94,6 +94,21 @@ impl TryFrom<&str> for HashFunctions { } } +#[derive(Debug)] +pub struct ReadingFrames { + forward: [Vec; 3], + revcomp: [Vec; 3], +} + +impl ReadingFrames { + pub fn new(forward: [Vec; 3], rc: [Vec; 3]) -> Self { + ReadingFrames { + forward, + revcomp: rc, + } + } +} + const COMPLEMENT: [u8; 256] = { let mut lookup = [0; 256]; lookup[b'A' as usize] = b'T'; @@ -112,6 +127,84 @@ pub fn revcomp(seq: &[u8]) -> Vec { .collect() } +/// Generate a single translated frame from a DNA sequence +/// +/// * `sequence`: The DNA sequence as a slice of bytes. +/// * `frame_number`: The frame to translate (0, 1, or 2). +/// * `dayhoff`: Whether to use the Dayhoff amino acid alphabet. +/// * `hp`: Whether to use the hydrophobic-polar amino acid alphabet. +/// +/// Returns a translated frame as a `Vec`. +pub fn translated_frame(sequence: &[u8], frame_number: usize, dayhoff: bool, hp: bool) -> Vec { + if frame_number > 2 { + panic!("Frame number must be 0, 1, or 2"); + } + + sequence + .iter() + .cloned() + .skip(frame_number) // Skip the initial bases for the frame + .take(sequence.len() - frame_number) // Adjust length based on skipped bases + .collect::>() // Collect the DNA subsequence + .chunks(3) // Group into codons (triplets) + .filter_map(|codon| to_aa(codon, dayhoff, hp).ok()) // Translate each codon + .flatten() // Flatten the nested results into a single sequence + .collect() +} + +pub fn make_translated_frames( + sequence: &[u8], + dna_rc: &[u8], + dayhoff: bool, + hp: bool, +) -> ReadingFrames { + // Generate forward frames + let forward = [ + translated_frame(sequence, 0, dayhoff, hp), + translated_frame(sequence, 1, dayhoff, hp), + translated_frame(sequence, 2, dayhoff, hp), + ]; + + // Generate reverse complement frames + let revcomp = [ + translated_frame(dna_rc, 0, dayhoff, hp), + translated_frame(dna_rc, 1, dayhoff, hp), + translated_frame(dna_rc, 2, dayhoff, hp), + ]; + + // Return a ReadingFrames object + ReadingFrames::new(forward, revcomp) +} + +fn skipmer_frame(seq: &[u8], start: usize, n: usize, m: usize) -> Vec { + seq.iter() + .skip(start) + .enumerate() + .filter_map(|(i, &base)| if i % n < m { Some(base) } else { None }) + .collect() +} + +pub fn make_skipmer_frames(seq: &[u8], n: usize, m: usize) -> ReadingFrames { + if m >= n { + panic!("m must be less than n"); + } + // Generate the first three forward frames + let forward: [Vec; 3] = [ + skipmer_frame(seq, 0, n, m), + skipmer_frame(seq, 1, n, m), + skipmer_frame(seq, 2, n, m), + ]; + // Generate the reverse complement frames + let reverse_complement: [Vec; 3] = [ + revcomp(&forward[0]), + revcomp(&forward[1]), + revcomp(&forward[2]), + ]; + + // Return the frames in a structured format + ReadingFrames::new(forward, reverse_complement) +} + static CODONTABLE: Lazy> = Lazy::new(|| { [ // F diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs index b84c593f6e..ac580ed0a7 100644 --- a/src/core/src/signature.rs +++ b/src/core/src/signature.rs @@ -287,8 +287,12 @@ impl Iterator for SeqToHashes { self.skip_m = 2 }; // eqn from skipmer paper. might want to add one to dna_ksize to round up? - self.skip_len = - self.skip_n * (((self.dna_ksize) / self.skip_m) - 1) + self.skip_m; + // to do - check if we need to enforce k = multiple of n for revcomp k-mers to work + // , or if we can keep the round up trick I'm using here. + eprintln!("setting skipmer extended length"); + self.skip_len = (self.skip_n * (((self.dna_ksize + 1) / self.skip_m) - 1)) + + self.skip_m; + eprintln!("skipmer extended length: {}", self.skip_len); // my prior eqn // self.skip_len = self.dna_ksize + ((self.dna_ksize + 1) / self.skip_m) - 1; // add 1 to round up rather than down @@ -296,8 +300,10 @@ impl Iterator for SeqToHashes { if self.k_size < self.skip_n { unimplemented!() } + self.skipmer_configured = true; } // have enough sequence to kmerize? + eprintln!("checking seq len"); if self.dna_len < self.dna_ksize || (self.hash_function.protein() && self.dna_len < self.k_size * 3) || (self.hash_function.dayhoff() && self.dna_len < self.k_size * 3) @@ -307,12 +313,15 @@ impl Iterator for SeqToHashes { return None; } // pre-calculate the reverse complement for the full sequence... + eprintln!("precalculating revcomp"); + // NOTE: Shall we precalc skipmer seq here too? + maybe translated frames? self.dna_rc = revcomp(&self.sequence); self.dna_configured = true; } // Processing DNA if self.hash_function.dna() { + eprintln!("processing DNA"); let kmer = &self.sequence[self.kmer_index..self.kmer_index + self.dna_ksize]; // validate the bases @@ -353,6 +362,7 @@ impl Iterator for SeqToHashes { self.kmer_index += 1; Some(Ok(hash)) } else if self.skipmer_configured { + eprintln!("processing skipmer"); // Check bounds to ensure we don't exceed the sequence length if self.kmer_index + self.skip_len > self.sequence.len() { return None; @@ -372,8 +382,10 @@ impl Iterator for SeqToHashes { self.kmer_index += 1; // Move to the next position if skipping is forced return Some(result); } + eprintln!("base {}", base); kmer.push(base); } + // eprintln!("skipmer kmer: {:?}", kmer); // Generate reverse complement skipmer let krc: Vec = self.dna_rc[self.dna_len - self.skip_len - self.kmer_index @@ -387,6 +399,7 @@ impl Iterator for SeqToHashes { let hash = crate::_hash_murmur(std::cmp::min(&kmer, &krc), self.seed); self.kmer_index += 1; + eprintln!("built skipmer hash"); Some(Ok(hash)) } else if self.hashes_buffer.is_empty() && self.translate_iter_step == 0 { // Processing protein by translating DNA From 31f301aedc02907612ddfb5dde4cbf473508e464 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Tue, 3 Dec 2024 13:50:39 -0800 Subject: [PATCH 03/16] allow single fwd frame for prot input --- src/core/src/encodings.rs | 204 +++++++++++++++++------ src/core/src/signature.rs | 333 ++++++-------------------------------- 2 files changed, 202 insertions(+), 335 deletions(-) diff --git a/src/core/src/encodings.rs b/src/core/src/encodings.rs index 5a215a3e9a..671f58310c 100644 --- a/src/core/src/encodings.rs +++ b/src/core/src/encodings.rs @@ -95,20 +95,165 @@ impl TryFrom<&str> for HashFunctions { } #[derive(Debug)] -pub struct ReadingFrames { - forward: [Vec; 3], - revcomp: [Vec; 3], +pub struct ReadingFrame { + fw: Vec, // Forward frame + rc: Option>, // Reverse complement (optional, not used for protein input) } +impl ReadingFrame { + /// Create a k-mer iterator for this reading frame + pub fn kmer_iter(&self, ksize: usize, seed: u64, force: bool) -> KmerIterator { + KmerIterator::new(&self.fw, self.rc.as_deref(), ksize, seed, force) + } +} + +#[derive(Debug)] +pub struct ReadingFrames(Vec); + impl ReadingFrames { - pub fn new(forward: [Vec; 3], rc: [Vec; 3]) -> Self { - ReadingFrames { - forward, - revcomp: rc, + /// Create ReadingFrames based on the input sequence, moltype, and protein flag + pub fn new(sequence: &[u8], is_protein: bool, hash_function: &HashFunctions) -> Self { + if is_protein { + // for protein input, return one forward frame + let frames = vec![ReadingFrame { + fw: sequence.to_vec(), + rc: None, + }]; + Self(frames) + } else if hash_function.dna() { + // DNA: just forward + rc + let dna_rc = revcomp(sequence); + let frames = vec![ReadingFrame { + fw: sequence.to_vec(), + rc: Some(dna_rc), + }]; + Self(frames) + } else if hash_function.protein() || hash_function.dayhoff() || hash_function.hp() { + // translation: build 6 frames + let dna_rc = revcomp(sequence); // Compute reverse complement for translation + let dayhoff = hash_function.dayhoff(); + let hp = hash_function.hp(); + Self::translate_frames(sequence, &dna_rc, dayhoff, hp) + } else if hash_function.skipm1n3() || hash_function.skipm2n3() { + // Skipmers: build 6 frames, following skip pattern + let (m, n) = if hash_function.skipm1n3() { + (1, 3) + } else { + (2, 3) + }; + Self::skipmer_frames(sequence, n, m) + } else { + panic!("Unsupported moltype: {}", hash_function); + } + } + + /// Generate translated frames + fn translate_frames(sequence: &[u8], dna_rc: &[u8], dayhoff: bool, hp: bool) -> Self { + let frames: Vec = (0..3) + .map(|frame_number| ReadingFrame { + fw: translated_frame(sequence, frame_number, dayhoff, hp), + rc: Some(translated_frame(dna_rc, frame_number, dayhoff, hp)), + }) + .collect(); + Self(frames) + } + + /// Generate skipmer frames + fn skipmer_frames(sequence: &[u8], n: usize, m: usize) -> Self { + let frames: Vec = (0..3) + .map(|frame_number| { + let fw = skipmer_frame(sequence, frame_number, n, m); + ReadingFrame { + fw: fw.clone(), + rc: Some(revcomp(&fw)), + } + }) + .collect(); + Self(frames) + } + + /// Access the frames + pub fn frames(&self) -> &Vec { + &self.0 + } +} + +pub struct KmerIterator<'a> { + fw: &'a [u8], + rc: Option<&'a [u8]>, + ksize: usize, + index: usize, + len: usize, + seed: u64, + force: bool, +} + +impl<'a> KmerIterator<'a> { + pub fn new(fw: &'a [u8], rc: Option<&'a [u8]>, ksize: usize, seed: u64, force: bool) -> Self { + Self { + fw, + rc, + ksize, + index: 0, + len: fw.len(), + seed, + force, } } } +impl<'a> Iterator for KmerIterator<'a> { + type Item = Result; + + fn next(&mut self) -> Option { + if self.index + self.ksize > self.len { + return None; // End of iteration + } + + // Forward k-mer + let kmer = &self.fw[self.index..self.index + self.ksize]; + + // Validate the k-mer + for j in self.index..self.index + self.ksize { + if !VALID[self.fw[j] as usize] { + if !self.force { + return Some(Err(Error::InvalidDNA { + message: String::from_utf8(kmer.to_vec()).unwrap(), + })); + } else { + self.index += 1; + return Some(Ok(0)); // Skip invalid k-mer + } + } + } + + // Reverse complement k-mer (if rc exists) + + // ... and then while moving the k-mer window forward for the sequence + // we move another window backwards for the RC. + // For a ksize = 3, and a sequence AGTCGT (len = 6): + // +-+---------+---------------+-------+ + // seq RC |i|i + ksize|len - ksize - i|len - i| + // AGTCGT ACGACT +-+---------+---------------+-------+ + // +-> +-> |0| 2 | 3 | 6 | + // +-> +-> |1| 3 | 2 | 5 | + // +-> +-> |2| 4 | 1 | 4 | + // +-> +-> |3| 5 | 0 | 3 | + // +-+---------+---------------+-------+ + // (leaving this table here because I had to draw to + // get the indices correctly) + let hash = if let Some(rc) = self.rc { + let krc = &rc[self.len - self.ksize - self.index..self.len - self.index]; + crate::_hash_murmur(std::cmp::min(kmer, krc), self.seed) + } else { + crate::_hash_murmur(kmer, self.seed) // Use only forward k-mer if rc is None + }; + + self.index += 1; + Some(Ok(hash)) + } +} + const COMPLEMENT: [u8; 256] = { let mut lookup = [0; 256]; lookup[b'A' as usize] = b'T'; @@ -152,30 +297,6 @@ pub fn translated_frame(sequence: &[u8], frame_number: usize, dayhoff: bool, hp: .collect() } -pub fn make_translated_frames( - sequence: &[u8], - dna_rc: &[u8], - dayhoff: bool, - hp: bool, -) -> ReadingFrames { - // Generate forward frames - let forward = [ - translated_frame(sequence, 0, dayhoff, hp), - translated_frame(sequence, 1, dayhoff, hp), - translated_frame(sequence, 2, dayhoff, hp), - ]; - - // Generate reverse complement frames - let revcomp = [ - translated_frame(dna_rc, 0, dayhoff, hp), - translated_frame(dna_rc, 1, dayhoff, hp), - translated_frame(dna_rc, 2, dayhoff, hp), - ]; - - // Return a ReadingFrames object - ReadingFrames::new(forward, revcomp) -} - fn skipmer_frame(seq: &[u8], start: usize, n: usize, m: usize) -> Vec { seq.iter() .skip(start) @@ -184,27 +305,6 @@ fn skipmer_frame(seq: &[u8], start: usize, n: usize, m: usize) -> Vec { .collect() } -pub fn make_skipmer_frames(seq: &[u8], n: usize, m: usize) -> ReadingFrames { - if m >= n { - panic!("m must be less than n"); - } - // Generate the first three forward frames - let forward: [Vec; 3] = [ - skipmer_frame(seq, 0, n, m), - skipmer_frame(seq, 1, n, m), - skipmer_frame(seq, 2, n, m), - ]; - // Generate the reverse complement frames - let reverse_complement: [Vec; 3] = [ - revcomp(&forward[0]), - revcomp(&forward[1]), - revcomp(&forward[2]), - ]; - - // Return the frames in a structured format - ReadingFrames::new(forward, reverse_complement) -} - static CODONTABLE: Lazy> = Lazy::new(|| { [ // F diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs index ac580ed0a7..c0ea0c4c3c 100644 --- a/src/core/src/signature.rs +++ b/src/core/src/signature.rs @@ -15,7 +15,7 @@ use rayon::prelude::*; use serde::{Deserialize, Serialize}; use typed_builder::TypedBuilder; -use crate::encodings::{aa_to_dayhoff, aa_to_hp, revcomp, to_aa, HashFunctions, VALID}; +use crate::encodings::{HashFunctions, ReadingFrames}; use crate::prelude::*; use crate::sketch::minhash::KmerMinHash; use crate::sketch::Sketch; @@ -163,7 +163,6 @@ impl SigsTrait for Sketch { } } -// Iterator for converting sequence to hashes pub struct SeqToHashes { sequence: Vec, kmer_index: usize, @@ -174,21 +173,7 @@ pub struct SeqToHashes { hash_function: HashFunctions, seed: u64, hashes_buffer: Vec, - - dna_configured: bool, - dna_rc: Vec, - dna_ksize: usize, - dna_len: usize, - dna_last_position_check: usize, - - prot_configured: bool, - aa_seq: Vec, - translate_iter_step: usize, - - skipmer_configured: bool, - skip_m: usize, - skip_n: usize, - skip_len: usize, + reading_frames: ReadingFrames, } impl SeqToHashes { @@ -200,298 +185,79 @@ impl SeqToHashes { hash_function: HashFunctions, seed: u64, ) -> SeqToHashes { - let mut ksize: usize = k_size; - - // Divide the kmer size by 3 if protein - if is_protein || hash_function.protein() || hash_function.dayhoff() || hash_function.hp() { - ksize = k_size / 3; - } + // Adjust kmer size for protein-based hash functions + let adjusted_k_size = + if hash_function.protein() || hash_function.dayhoff() || hash_function.hp() { + k_size / 3 + } else { + k_size + }; - // By setting _max_index to 0, the iterator will return None and exit - let _max_index = if seq.len() >= ksize { - seq.len() - ksize + 1 + // Determine the maximum index for k-mer generation + let max_index = if seq.len() >= adjusted_k_size { + seq.len() - adjusted_k_size + 1 } else { 0 }; + // Initialize ReadingFrames based on the sequence and hash function + let reading_frames = + ReadingFrames::new(&seq.to_ascii_uppercase(), is_protein, &hash_function); + SeqToHashes { - // Here we convert the sequence to upper case sequence: seq.to_ascii_uppercase(), - k_size: ksize, kmer_index: 0, - max_index: _max_index, + k_size: adjusted_k_size, + max_index, force, is_protein, hash_function, seed, hashes_buffer: Vec::with_capacity(1000), - dna_configured: false, - dna_rc: Vec::with_capacity(1000), - dna_ksize: 0, - dna_len: 0, - dna_last_position_check: 0, - prot_configured: false, - aa_seq: Vec::new(), - translate_iter_step: 0, - skipmer_configured: false, - skip_m: 2, - skip_n: 3, - skip_len: 0, + reading_frames, } } + // some helper functions. If we remove, we could probably just rm + // these fields from SeqToHashes, since ReadingFrames handles this now + pub fn get_sequence(&self) -> &[u8] { + &self.sequence + } - fn validate_base(&self, base: u8, kmer: &[u8]) -> Option> { - if !VALID[base as usize] { - if !self.force { - return Some(Err(Error::InvalidDNA { - message: String::from_utf8(kmer.to_owned()).unwrap_or_default(), - })); - } else { - return Some(Ok(0)); // Skip this position if forced - } - } - None // Base is valid, so return None to continue + pub fn get_hash_function(&self) -> &HashFunctions { + &self.hash_function } -} -/* -Iterator that return a kmer hash for all modes except translate. -In translate mode: - - all the frames are processed at once and converted to hashes. - - all the hashes are stored in `hashes_buffer` - - after processing all the kmers, `translate_iter_step` is incremented - per iteration to iterate over all the indeces of the `hashes_buffer`. - - the iterator will die once `translate_iter_step` == length(hashes_buffer) -More info https://github.com/sourmash-bio/sourmash/pull/1946 -*/ + pub fn is_protein(&self) -> bool { + self.is_protein + } +} impl Iterator for SeqToHashes { type Item = Result; fn next(&mut self) -> Option { - if (self.kmer_index < self.max_index) || !self.hashes_buffer.is_empty() { - // Processing DNA or Translated DNA - if !self.is_protein { - // Setting the parameters only in the first iteration - if !self.dna_configured { - self.dna_ksize = self.k_size; - self.dna_len = self.sequence.len(); - - if self.hash_function.skipm1n3() - || self.hash_function.skipm2n3() && !self.skipmer_configured - { - if self.hash_function.skipm1n3() { - self.skip_m = 1 - }; - if self.hash_function.skipm2n3() { - self.skip_m = 2 - }; - // eqn from skipmer paper. might want to add one to dna_ksize to round up? - // to do - check if we need to enforce k = multiple of n for revcomp k-mers to work - // , or if we can keep the round up trick I'm using here. - eprintln!("setting skipmer extended length"); - self.skip_len = (self.skip_n * (((self.dna_ksize + 1) / self.skip_m) - 1)) - + self.skip_m; - eprintln!("skipmer extended length: {}", self.skip_len); - // my prior eqn - // self.skip_len = self.dna_ksize + ((self.dna_ksize + 1) / self.skip_m) - 1; // add 1 to round up rather than down - - // check that we can actually build skipmers - if self.k_size < self.skip_n { - unimplemented!() - } - self.skipmer_configured = true; - } - // have enough sequence to kmerize? - eprintln!("checking seq len"); - if self.dna_len < self.dna_ksize - || (self.hash_function.protein() && self.dna_len < self.k_size * 3) - || (self.hash_function.dayhoff() && self.dna_len < self.k_size * 3) - || (self.hash_function.hp() && self.dna_len < self.k_size * 3) - || (self.skipmer_configured && self.dna_len < self.skip_len) - { - return None; - } - // pre-calculate the reverse complement for the full sequence... - eprintln!("precalculating revcomp"); - // NOTE: Shall we precalc skipmer seq here too? + maybe translated frames? - self.dna_rc = revcomp(&self.sequence); - self.dna_configured = true; - } - - // Processing DNA - if self.hash_function.dna() { - eprintln!("processing DNA"); - let kmer = &self.sequence[self.kmer_index..self.kmer_index + self.dna_ksize]; - - // validate the bases - for j in std::cmp::max(self.kmer_index, self.dna_last_position_check) - ..self.kmer_index + self.dna_ksize - { - if !VALID[self.sequence[j] as usize] { - if !self.force { - return Some(Err(Error::InvalidDNA { - message: String::from_utf8(kmer.to_vec()).unwrap(), - })); - } else { - self.kmer_index += 1; - // Move the iterator to the next step - return Some(Ok(0)); - } - } - self.dna_last_position_check += 1; - } - - // ... and then while moving the k-mer window forward for the sequence - // we move another window backwards for the RC. - // For a ksize = 3, and a sequence AGTCGT (len = 6): - // +-+---------+---------------+-------+ - // seq RC |i|i + ksize|len - ksize - i|len - i| - // AGTCGT ACGACT +-+---------+---------------+-------+ - // +-> +-> |0| 2 | 3 | 6 | - // +-> +-> |1| 3 | 2 | 5 | - // +-> +-> |2| 4 | 1 | 4 | - // +-> +-> |3| 5 | 0 | 3 | - // +-+---------+---------------+-------+ - // (leaving this table here because I had to draw to - // get the indices correctly) - - let krc = &self.dna_rc[self.dna_len - self.dna_ksize - self.kmer_index - ..self.dna_len - self.kmer_index]; - let hash = crate::_hash_murmur(std::cmp::min(kmer, krc), self.seed); - self.kmer_index += 1; - Some(Ok(hash)) - } else if self.skipmer_configured { - eprintln!("processing skipmer"); - // Check bounds to ensure we don't exceed the sequence length - if self.kmer_index + self.skip_len > self.sequence.len() { - return None; - } + // Check if we've processed all k-mers + if self.kmer_index >= self.max_index { + return None; + } - // Build skipmer with DNA base validation - let mut kmer: Vec = Vec::with_capacity(self.dna_ksize); - for (_i, &base) in self.sequence - [self.kmer_index..self.kmer_index + self.skip_len] - .iter() - .enumerate() - .filter(|&(i, _)| i % self.skip_n < self.skip_m) - .take(self.dna_ksize) - { - // Use the validate_base method to check the base - if let Some(result) = self.validate_base(base, &kmer) { - self.kmer_index += 1; // Move to the next position if skipping is forced - return Some(result); + // Iterate over the frames + for frame in self.reading_frames.frames().iter() { + let kmer_iter = frame.kmer_iter(self.k_size, self.seed, self.force); + for result in kmer_iter { + match result { + Ok(hash) => self.hashes_buffer.push(hash), + Err(e) => { + if !self.force { + return Some(Err(e)); } - eprintln!("base {}", base); - kmer.push(base); - } - // eprintln!("skipmer kmer: {:?}", kmer); - - // Generate reverse complement skipmer - let krc: Vec = self.dna_rc[self.dna_len - self.skip_len - self.kmer_index - ..self.dna_len - self.kmer_index] - .iter() - .enumerate() - .filter(|&(i, _)| i % self.skip_n < self.skip_m) - .take(self.dna_ksize) - .map(|(_, &base)| base) - .collect(); - - let hash = crate::_hash_murmur(std::cmp::min(&kmer, &krc), self.seed); - self.kmer_index += 1; - eprintln!("built skipmer hash"); - Some(Ok(hash)) - } else if self.hashes_buffer.is_empty() && self.translate_iter_step == 0 { - // Processing protein by translating DNA - // TODO: Implement iterator over frames instead of hashes_buffer. - - for frame_number in 0..3 { - let substr: Vec = self - .sequence - .iter() - .cloned() - .skip(frame_number) - .take(self.sequence.len() - frame_number) - .collect(); - - let aa = to_aa( - &substr, - self.hash_function.dayhoff(), - self.hash_function.hp(), - ) - .unwrap(); - - aa.windows(self.k_size).for_each(|n| { - let hash = crate::_hash_murmur(n, self.seed); - self.hashes_buffer.push(hash); - }); - - let rc_substr: Vec = self - .dna_rc - .iter() - .cloned() - .skip(frame_number) - .take(self.dna_rc.len() - frame_number) - .collect(); - let aa_rc = to_aa( - &rc_substr, - self.hash_function.dayhoff(), - self.hash_function.hp(), - ) - .unwrap(); - - aa_rc.windows(self.k_size).for_each(|n| { - let hash = crate::_hash_murmur(n, self.seed); - self.hashes_buffer.push(hash); - }); } - Some(Ok(0)) - } else { - if self.translate_iter_step == self.hashes_buffer.len() { - self.hashes_buffer.clear(); - self.kmer_index = self.max_index; - return Some(Ok(0)); - } - let curr_idx = self.translate_iter_step; - self.translate_iter_step += 1; - Some(Ok(self.hashes_buffer[curr_idx])) - } - } else { - // Processing protein - // The kmer size is already divided by 3 - - if self.hash_function.protein() { - let aa_kmer = &self.sequence[self.kmer_index..self.kmer_index + self.k_size]; - let hash = crate::_hash_murmur(aa_kmer, self.seed); - self.kmer_index += 1; - Some(Ok(hash)) - } else { - if !self.prot_configured { - self.aa_seq = match &self.hash_function { - HashFunctions::Murmur64Dayhoff => { - self.sequence.iter().cloned().map(aa_to_dayhoff).collect() - } - HashFunctions::Murmur64Hp => { - self.sequence.iter().cloned().map(aa_to_hp).collect() - } - invalid => { - return Some(Err(Error::InvalidHashFunction { - function: format!("{}", invalid), - })); - } - }; - } - - let aa_kmer = &self.aa_seq[self.kmer_index..self.kmer_index + self.k_size]; - let hash = crate::_hash_murmur(aa_kmer, self.seed); - self.kmer_index += 1; - Some(Ok(hash)) } } - } else { - // End the iterator - None } + + self.kmer_index += 1; + Some(Ok(0)) // Return 0 to indicate progress; actual hashes are stored in `hashes_buffer` } } @@ -1450,13 +1216,13 @@ mod test { let k_size = 7; let seed = 42; let force = true; // Force skip over invalid bases if needed - + let is_protein = false; // Initialize SeqToHashes iterator using the new constructor let mut seq_to_hashes = SeqToHashes::new( sequence, k_size, force, - false, + is_protein, HashFunctions::Murmur64Dna, seed, ); @@ -1489,13 +1255,14 @@ mod test { let k_size = 5; let seed = 42; let force = true; // Force skip over invalid bases if needed + let is_protein = false; // Initialize SeqToHashes iterator using the new constructor let mut seq_to_hashes = SeqToHashes::new( sequence, k_size, force, - false, + is_protein, HashFunctions::Murmur64Skipm2n3, seed, ); From 9c28580af0f11dd4a74b3e74660caaa2d5cc6a12 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Wed, 4 Dec 2024 15:23:48 -0800 Subject: [PATCH 04/16] init testing ReadingFrame --- src/core/src/encodings.rs | 394 +++++++++++++++++++++++++++++++++++--- 1 file changed, 372 insertions(+), 22 deletions(-) diff --git a/src/core/src/encodings.rs b/src/core/src/encodings.rs index 671f58310c..5214bb321d 100644 --- a/src/core/src/encodings.rs +++ b/src/core/src/encodings.rs @@ -52,9 +52,11 @@ impl HashFunctions { pub fn hp(&self) -> bool { *self == HashFunctions::Murmur64Hp } + pub fn skipm1n3(&self) -> bool { *self == HashFunctions::Murmur64Skipm1n3 } + pub fn skipm2n3(&self) -> bool { *self == HashFunctions::Murmur64Skipm2n3 } @@ -98,34 +100,65 @@ impl TryFrom<&str> for HashFunctions { pub struct ReadingFrame { fw: Vec, // Forward frame rc: Option>, // Reverse complement (optional, not used for protein input) + hash_function: HashFunctions, +} + +impl std::fmt::Display for ReadingFrame { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let fw_str = String::from_utf8(self.fw.clone()).expect("Invalid UTF-8 sequence in fw"); + if let Some(rc) = &self.rc { + let rc_str = String::from_utf8(rc.clone()).expect("Invalid UTF-8 sequence in rc"); + write!(f, "Forward: {}, Reverse Complement: {}", fw_str, rc_str) + } else { + write!(f, "Forward: {}, Reverse Complement: None", fw_str) + } + } } impl ReadingFrame { /// Create a k-mer iterator for this reading frame pub fn kmer_iter(&self, ksize: usize, seed: u64, force: bool) -> KmerIterator { - KmerIterator::new(&self.fw, self.rc.as_deref(), ksize, seed, force) + KmerIterator::new( + &self.fw, + self.rc.as_deref(), + ksize, + seed, + force, + self.hash_function.clone(), + ) } } #[derive(Debug)] pub struct ReadingFrames(Vec); +impl std::fmt::Display for ReadingFrames { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + for (index, frame) in self.0.iter().enumerate() { + writeln!(f, "Frame {}: {}", index + 1, frame)?; + } + Ok(()) + } +} + impl ReadingFrames { - /// Create ReadingFrames based on the input sequence, moltype, and protein flag + /// Create ReadingFrames based on the input sequence, moltype, and is_protein flag pub fn new(sequence: &[u8], is_protein: bool, hash_function: &HashFunctions) -> Self { if is_protein { // for protein input, return one forward frame let frames = vec![ReadingFrame { fw: sequence.to_vec(), rc: None, + hash_function: hash_function.clone(), }]; Self(frames) } else if hash_function.dna() { - // DNA: just forward + rc + // DNA: return forward + rc frames let dna_rc = revcomp(sequence); let frames = vec![ReadingFrame { fw: sequence.to_vec(), rc: Some(dna_rc), + hash_function: hash_function.clone(), }]; Self(frames) } else if hash_function.protein() || hash_function.dayhoff() || hash_function.hp() { @@ -133,39 +166,47 @@ impl ReadingFrames { let dna_rc = revcomp(sequence); // Compute reverse complement for translation let dayhoff = hash_function.dayhoff(); let hp = hash_function.hp(); - Self::translate_frames(sequence, &dna_rc, dayhoff, hp) + Self::translate_frames(sequence, &dna_rc, dayhoff, hp, hash_function) } else if hash_function.skipm1n3() || hash_function.skipm2n3() { - // Skipmers: build 6 frames, following skip pattern + // skipmers: build 6 frames, following skip pattern let (m, n) = if hash_function.skipm1n3() { (1, 3) } else { (2, 3) }; - Self::skipmer_frames(sequence, n, m) + Self::skipmer_frames(sequence, n, m, hash_function) } else { panic!("Unsupported moltype: {}", hash_function); } } /// Generate translated frames - fn translate_frames(sequence: &[u8], dna_rc: &[u8], dayhoff: bool, hp: bool) -> Self { + fn translate_frames( + sequence: &[u8], + dna_rc: &[u8], + dayhoff: bool, + hp: bool, + hash_function: &HashFunctions, + ) -> Self { let frames: Vec = (0..3) .map(|frame_number| ReadingFrame { fw: translated_frame(sequence, frame_number, dayhoff, hp), rc: Some(translated_frame(dna_rc, frame_number, dayhoff, hp)), + hash_function: hash_function.clone(), }) .collect(); Self(frames) } /// Generate skipmer frames - fn skipmer_frames(sequence: &[u8], n: usize, m: usize) -> Self { + fn skipmer_frames(sequence: &[u8], n: usize, m: usize, hash_function: &HashFunctions) -> Self { let frames: Vec = (0..3) .map(|frame_number| { let fw = skipmer_frame(sequence, frame_number, n, m); ReadingFrame { fw: fw.clone(), rc: Some(revcomp(&fw)), + hash_function: hash_function.clone(), } }) .collect(); @@ -186,10 +227,18 @@ pub struct KmerIterator<'a> { len: usize, seed: u64, force: bool, + hash_function: HashFunctions, } impl<'a> KmerIterator<'a> { - pub fn new(fw: &'a [u8], rc: Option<&'a [u8]>, ksize: usize, seed: u64, force: bool) -> Self { + pub fn new( + fw: &'a [u8], + rc: Option<&'a [u8]>, + ksize: usize, + seed: u64, + force: bool, + hash_function: HashFunctions, + ) -> Self { Self { fw, rc, @@ -198,6 +247,7 @@ impl<'a> KmerIterator<'a> { len: fw.len(), seed, force, + hash_function: hash_function, } } } @@ -213,22 +263,22 @@ impl<'a> Iterator for KmerIterator<'a> { // Forward k-mer let kmer = &self.fw[self.index..self.index + self.ksize]; - // Validate the k-mer - for j in self.index..self.index + self.ksize { - if !VALID[self.fw[j] as usize] { - if !self.force { - return Some(Err(Error::InvalidDNA { - message: String::from_utf8(kmer.to_vec()).unwrap(), - })); - } else { - self.index += 1; - return Some(Ok(0)); // Skip invalid k-mer + // Validate the k-mer only if the hash function is DNA-based + if self.hash_function.dna() { + for j in self.index..self.index + self.ksize { + if !VALID[self.fw[j] as usize] { + if !self.force { + return Some(Err(Error::InvalidDNA { + message: String::from_utf8(kmer.to_vec()).unwrap(), + })); + } else { + self.index += 1; + return Some(Ok(0)); // Skip invalid k-mer + } } } } - - // Reverse complement k-mer (if rc exists) - + // keeping note from prior implementation :) // ... and then while moving the k-mer window forward for the sequence // we move another window backwards for the RC. // For a ksize = 3, and a sequence AGTCGT (len = 6): @@ -242,6 +292,8 @@ impl<'a> Iterator for KmerIterator<'a> { // +-+---------+---------------+-------+ // (leaving this table here because I had to draw to // get the indices correctly) + + // Reverse complement k-mer (if rc exists) let hash = if let Some(rc) = self.rc { let krc = &rc[self.len - self.ksize - self.index..self.len - self.index]; crate::_hash_murmur(std::cmp::min(kmer, krc), self.seed) @@ -710,6 +762,8 @@ impl<'a> Iterator for Indices<'a> { #[cfg(test)] mod test { + use proptest::collection::vec; + use super::*; #[test] @@ -778,4 +832,300 @@ mod test { assert_eq!(colors.len(), 2); } + + #[test] + fn test_reading_frames_new_dna() { + let sequence = b"AGTCGT"; + let hash_function = HashFunctions::Murmur64Dna; + + let frames = ReadingFrames::new(sequence, false, &hash_function); + + assert_eq!(frames.frames().len(), 1); // Only one fw/rc readingframe for DNA + assert_eq!(frames.frames()[0].fw, sequence.to_vec()); + assert_eq!(frames.frames()[0].rc, Some(b"ACGACT".to_vec())); + } + + #[test] + fn test_reading_frames_new_is_protein() { + let sequence = b"MVLSPADKTNVKAAW"; + let hash_function = HashFunctions::Murmur64Protein; + + let frames = ReadingFrames::new(sequence, true, &hash_function); + + assert_eq!(frames.frames().len(), 1); // Only one frame for protein + assert_eq!(frames.frames()[0].fw, sequence.to_vec()); + assert_eq!(frames.frames()[0].rc, None); // No reverse complement for protein + } + + #[test] + fn test_reading_frames_translate_frames() { + let sequence = b"AGTCGTCGAGCT"; + let hash_function = HashFunctions::Murmur64Protein; + + let frames = ReadingFrames::new(sequence, false, &hash_function); + eprintln!("frames: {}", frames); + + assert_eq!(frames.frames().len(), 3); // Three translated frames + assert_eq!(frames.frames()[0].fw, b"SRRA".to_vec()); + assert_eq!(frames.frames()[0].rc, Some(b"SSTT".to_vec())); + assert_eq!(frames.frames()[1].fw, b"VVE".to_vec()); + assert_eq!(frames.frames()[1].rc, Some(b"ARR".to_vec())); + assert_eq!(frames.frames()[2].fw, b"SSS".to_vec()); + assert_eq!(frames.frames()[2].rc, Some(b"LDD".to_vec())); + } + + #[test] + fn test_reading_frames_skipmer_frames_m1n3() { + let sequence = b"AGTCGTCGAGCT"; + let hash_function = HashFunctions::Murmur64Skipm1n3; + + let frames = ReadingFrames::new(sequence, false, &hash_function); + + assert_eq!(frames.frames().len(), 3); // Three skipmer frames + + // Expected skipmer sequences for m=1, n=3 (keep-1, skip-2) + let expected_fw_frame_0 = b"ACCG".to_vec(); // Frame 0: Keep 1 base, skip 2 bases, starting from index 0 + let expected_fw_frame_1 = b"GGGC".to_vec(); // Frame 1: Keep 1 base, skip 2 bases, starting from index 1 + let expected_fw_frame_2 = b"TTAT".to_vec(); // Frame 2: Keep 1 base, skip 2 bases, starting from index 2 + + let expected_rc_frame_0 = revcomp(&expected_fw_frame_0); + let expected_rc_frame_1 = revcomp(&expected_fw_frame_1); + let expected_rc_frame_2 = revcomp(&expected_fw_frame_2); + + // Check forward and reverse complement sequences for each frame + assert_eq!(frames.frames()[0].fw, expected_fw_frame_0); + assert_eq!(frames.frames()[0].rc, Some(expected_rc_frame_0)); + + assert_eq!(frames.frames()[1].fw, expected_fw_frame_1); + assert_eq!(frames.frames()[1].rc, Some(expected_rc_frame_1)); + + assert_eq!(frames.frames()[2].fw, expected_fw_frame_2); + assert_eq!(frames.frames()[2].rc, Some(expected_rc_frame_2)); + } + + #[test] + fn test_reading_frames_skipmer_frames_m2n3() { + let sequence = b"AGTCGTCGAGCT"; + let hash_function = HashFunctions::Murmur64Skipm2n3; + + let frames = ReadingFrames::new(sequence, false, &hash_function); + + assert_eq!(frames.frames().len(), 3); // Three skipmer frames + + // Expected skipmer sequences for m=1, n=3 (keep-1, skip-2) + let expected_fw_frame_0 = b"AGCGCGGC".to_vec(); + let expected_fw_frame_1 = b"GTGTGACT".to_vec(); + let expected_fw_frame_2 = b"TCTCAGT".to_vec(); + + let expected_rc_frame_0 = revcomp(&expected_fw_frame_0); + let expected_rc_frame_1 = revcomp(&expected_fw_frame_1); + let expected_rc_frame_2 = revcomp(&expected_fw_frame_2); + + // Check forward and reverse complement sequences for each frame + assert_eq!(frames.frames()[0].fw, expected_fw_frame_0); + assert_eq!(frames.frames()[0].rc, Some(expected_rc_frame_0)); + + assert_eq!(frames.frames()[1].fw, expected_fw_frame_1); + assert_eq!(frames.frames()[1].rc, Some(expected_rc_frame_1)); + + assert_eq!(frames.frames()[2].fw, expected_fw_frame_2); + assert_eq!(frames.frames()[2].rc, Some(expected_rc_frame_2)); + } + + #[test] + fn test_reading_frame_kmer_iter() { + let sequence = b"AGTCGT"; + let rc = revcomp(sequence); + let hash_function = HashFunctions::Murmur64Dna; + let frame = ReadingFrame { + fw: sequence.to_vec(), + rc: Some(rc.clone()), + hash_function, + }; + eprintln!("frame: {}", frame); + assert_eq!(frame.fw, sequence.to_vec()); + assert_eq!(frame.rc, Some(b"ACGACT".to_vec())); + + let kmer_iter = frame.kmer_iter(3, 42, false); + + // Expected k-mers from the forward and reverse complement sequence + let expected_kmers = vec![ + (b"AGT".to_vec(), b"ACT".to_vec()), + (b"GTC".to_vec(), b"GAC".to_vec()), + (b"TCG".to_vec(), b"CGA".to_vec()), + (b"CGT".to_vec(), b"ACG".to_vec()), + ]; + + // Compute hashes for expected k-mers + let expected_hashes: Vec = expected_kmers + .iter() + .map(|(fw_kmer, rc_kmer)| crate::_hash_murmur(std::cmp::min(fw_kmer, rc_kmer), 42)) + .collect(); + + // Collect hashes produced by the kmer_iter + let produced_hashes: Vec = kmer_iter.map(|result| result.unwrap()).collect(); + + // Check that produced hashes match expected hashes in order + assert_eq!( + produced_hashes, expected_hashes, + "Hashes do not match in order" + ); + } + + #[test] + fn test_kmer_iter_is_protein() { + let sequence = b"MVLSPADKTNVKAAW"; + let hash_function = HashFunctions::Murmur64Protein; + + let frames = ReadingFrames::new(sequence, true, &hash_function); + + // Only one forward frame for protein + assert_eq!(frames.frames().len(), 1); + let frame = &frames.frames()[0]; + + let kmer_iter = frame.kmer_iter(3, 42, false); + + // Expected k-mers for protein sequence + let expected_kmers = vec![ + b"MVL".to_vec(), + b"VLS".to_vec(), + b"LSP".to_vec(), + b"SPA".to_vec(), + b"PAD".to_vec(), + b"ADK".to_vec(), + b"DKT".to_vec(), + b"KTN".to_vec(), + b"TNV".to_vec(), + b"NVK".to_vec(), + b"VKA".to_vec(), + b"KAA".to_vec(), + b"AAW".to_vec(), + ]; + + // Compute hashes for expected k-mers + let expected_hashes: Vec = expected_kmers + .iter() + .map(|fw_kmer| crate::_hash_murmur(fw_kmer, 42)) + .collect(); + + // Collect hashes produced by the kmer_iter + let produced_hashes: Vec = kmer_iter.map(|result| result.unwrap()).collect(); + + // Check that produced hashes match expected hashes in order + assert_eq!( + produced_hashes, expected_hashes, + "Hashes do not match in order" + ); + } + + // #[test] + // fn test_kmer_iter_translate_frames() { + // let sequence = b"AGTCGTCGAGCT"; + // let hash_function = HashFunctions::Murmur64Protein; + + // let frames = ReadingFrames::new(sequence, false, &hash_function); + + // // Three translated frames + // assert_eq!(frames.frames().len(), 3); + + // // Expected k-mers for translated frames + // let expected_kmers = vec![ + // vec![b"SRR".to_vec(), b"RRA".to_vec()], + // vec![b"VVE".to_vec(), b"VE".to_vec()], + // vec![b"SSS".to_vec(), b"SS".to_vec()], + // ]; + + // for (frame, expected_frame_kmers) in frames.frames().iter().zip(expected_kmers.iter()) { + // let kmer_iter = frame.kmer_iter(3, 42, false); + + // // Compute hashes for expected k-mers + // let expected_hashes: Vec = expected_frame_kmers + // .iter() + // .map(|fw_kmer| crate::_hash_murmur(fw_kmer, 42)) + // .collect(); + + // // Collect hashes produced by the kmer_iter + // let produced_hashes: Vec = kmer_iter.map(|result| result.unwrap()).collect(); + + // // Check that produced hashes match expected hashes in order + // assert_eq!( + // produced_hashes, expected_hashes, + // "Hashes do not match in order for frame" + // ); + // } + // } + + // #[test] + // fn test_kmer_iter_skipmer_m1n3() { + // let sequence = b"AGTCGTCGAGCT"; + // let hash_function = HashFunctions::Murmur64Skipm1n3; + + // let frames = ReadingFrames::new(sequence, false, &hash_function); + + // // Three skipmer frames + // assert_eq!(frames.frames().len(), 3); + + // // Expected k-mers for skipmer (m=1, n=3) + // let expected_kmers = vec![ + // vec![b"ACC".to_vec()], + // vec![b"GTG".to_vec()], + // vec![b"TCG".to_vec()], + // ]; + + // for (frame, expected_frame_kmers) in frames.frames().iter().zip(expected_kmers.iter()) { + // let kmer_iter = frame.kmer_iter(3, 42, false); + + // // Compute hashes for expected k-mers + // let expected_hashes: Vec = expected_frame_kmers + // .iter() + // .map(|fw_kmer| crate::_hash_murmur(fw_kmer, 42)) + // .collect(); + + // // Collect hashes produced by the kmer_iter + // let produced_hashes: Vec = kmer_iter.map(|result| result.unwrap()).collect(); + + // // Check that produced hashes match expected hashes in order + // assert_eq!( + // produced_hashes, expected_hashes, + // "Hashes do not match in order for frame" + // ); + // } + // } + + // #[test] + // fn test_kmer_iter_skipmer_m2n3() { + // let sequence = b"AGTCGTCGAGCT"; + // let hash_function = HashFunctions::Murmur64Skipm2n3; + + // let frames = ReadingFrames::new(sequence, false, &hash_function); + + // // Three skipmer frames + // assert_eq!(frames.frames().len(), 3); + + // // Expected k-mers for skipmer (m=2, n=3) + // let expected_kmers = vec![ + // vec![b"AGC".to_vec()], + // vec![b"GTG".to_vec()], + // vec![b"TCA".to_vec()], + // ]; + + // for (frame, expected_frame_kmers) in frames.frames().iter().zip(expected_kmers.iter()) { + // let kmer_iter = frame.kmer_iter(3, 42, false); + + // // Compute hashes for expected k-mers + // let expected_hashes: Vec = expected_frame_kmers + // .iter() + // .map(|fw_kmer| crate::_hash_murmur(fw_kmer, 42)) + // .collect(); + + // // Collect hashes produced by the kmer_iter + // let produced_hashes: Vec = kmer_iter.map(|result| result.unwrap()).collect(); + + // // Check that produced hashes match expected hashes in order + // assert_eq!( + // produced_hashes, expected_hashes, + // "Hashes do not match in order for frame" + // ); + // } + // } } From f16f7378a1831b2a40fe7a88cab7919846556d27 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Thu, 5 Dec 2024 16:29:50 -0800 Subject: [PATCH 05/16] simplify and move into SeqToHashes --- src/core/src/encodings.rs | 506 ----------------------------- src/core/src/signature.rs | 648 ++++++++++++++++++++++++++++++++++---- 2 files changed, 590 insertions(+), 564 deletions(-) diff --git a/src/core/src/encodings.rs b/src/core/src/encodings.rs index 5214bb321d..d87c220aa0 100644 --- a/src/core/src/encodings.rs +++ b/src/core/src/encodings.rs @@ -96,216 +96,6 @@ impl TryFrom<&str> for HashFunctions { } } -#[derive(Debug)] -pub struct ReadingFrame { - fw: Vec, // Forward frame - rc: Option>, // Reverse complement (optional, not used for protein input) - hash_function: HashFunctions, -} - -impl std::fmt::Display for ReadingFrame { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let fw_str = String::from_utf8(self.fw.clone()).expect("Invalid UTF-8 sequence in fw"); - if let Some(rc) = &self.rc { - let rc_str = String::from_utf8(rc.clone()).expect("Invalid UTF-8 sequence in rc"); - write!(f, "Forward: {}, Reverse Complement: {}", fw_str, rc_str) - } else { - write!(f, "Forward: {}, Reverse Complement: None", fw_str) - } - } -} - -impl ReadingFrame { - /// Create a k-mer iterator for this reading frame - pub fn kmer_iter(&self, ksize: usize, seed: u64, force: bool) -> KmerIterator { - KmerIterator::new( - &self.fw, - self.rc.as_deref(), - ksize, - seed, - force, - self.hash_function.clone(), - ) - } -} - -#[derive(Debug)] -pub struct ReadingFrames(Vec); - -impl std::fmt::Display for ReadingFrames { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - for (index, frame) in self.0.iter().enumerate() { - writeln!(f, "Frame {}: {}", index + 1, frame)?; - } - Ok(()) - } -} - -impl ReadingFrames { - /// Create ReadingFrames based on the input sequence, moltype, and is_protein flag - pub fn new(sequence: &[u8], is_protein: bool, hash_function: &HashFunctions) -> Self { - if is_protein { - // for protein input, return one forward frame - let frames = vec![ReadingFrame { - fw: sequence.to_vec(), - rc: None, - hash_function: hash_function.clone(), - }]; - Self(frames) - } else if hash_function.dna() { - // DNA: return forward + rc frames - let dna_rc = revcomp(sequence); - let frames = vec![ReadingFrame { - fw: sequence.to_vec(), - rc: Some(dna_rc), - hash_function: hash_function.clone(), - }]; - Self(frames) - } else if hash_function.protein() || hash_function.dayhoff() || hash_function.hp() { - // translation: build 6 frames - let dna_rc = revcomp(sequence); // Compute reverse complement for translation - let dayhoff = hash_function.dayhoff(); - let hp = hash_function.hp(); - Self::translate_frames(sequence, &dna_rc, dayhoff, hp, hash_function) - } else if hash_function.skipm1n3() || hash_function.skipm2n3() { - // skipmers: build 6 frames, following skip pattern - let (m, n) = if hash_function.skipm1n3() { - (1, 3) - } else { - (2, 3) - }; - Self::skipmer_frames(sequence, n, m, hash_function) - } else { - panic!("Unsupported moltype: {}", hash_function); - } - } - - /// Generate translated frames - fn translate_frames( - sequence: &[u8], - dna_rc: &[u8], - dayhoff: bool, - hp: bool, - hash_function: &HashFunctions, - ) -> Self { - let frames: Vec = (0..3) - .map(|frame_number| ReadingFrame { - fw: translated_frame(sequence, frame_number, dayhoff, hp), - rc: Some(translated_frame(dna_rc, frame_number, dayhoff, hp)), - hash_function: hash_function.clone(), - }) - .collect(); - Self(frames) - } - - /// Generate skipmer frames - fn skipmer_frames(sequence: &[u8], n: usize, m: usize, hash_function: &HashFunctions) -> Self { - let frames: Vec = (0..3) - .map(|frame_number| { - let fw = skipmer_frame(sequence, frame_number, n, m); - ReadingFrame { - fw: fw.clone(), - rc: Some(revcomp(&fw)), - hash_function: hash_function.clone(), - } - }) - .collect(); - Self(frames) - } - - /// Access the frames - pub fn frames(&self) -> &Vec { - &self.0 - } -} - -pub struct KmerIterator<'a> { - fw: &'a [u8], - rc: Option<&'a [u8]>, - ksize: usize, - index: usize, - len: usize, - seed: u64, - force: bool, - hash_function: HashFunctions, -} - -impl<'a> KmerIterator<'a> { - pub fn new( - fw: &'a [u8], - rc: Option<&'a [u8]>, - ksize: usize, - seed: u64, - force: bool, - hash_function: HashFunctions, - ) -> Self { - Self { - fw, - rc, - ksize, - index: 0, - len: fw.len(), - seed, - force, - hash_function: hash_function, - } - } -} - -impl<'a> Iterator for KmerIterator<'a> { - type Item = Result; - - fn next(&mut self) -> Option { - if self.index + self.ksize > self.len { - return None; // End of iteration - } - - // Forward k-mer - let kmer = &self.fw[self.index..self.index + self.ksize]; - - // Validate the k-mer only if the hash function is DNA-based - if self.hash_function.dna() { - for j in self.index..self.index + self.ksize { - if !VALID[self.fw[j] as usize] { - if !self.force { - return Some(Err(Error::InvalidDNA { - message: String::from_utf8(kmer.to_vec()).unwrap(), - })); - } else { - self.index += 1; - return Some(Ok(0)); // Skip invalid k-mer - } - } - } - } - // keeping note from prior implementation :) - // ... and then while moving the k-mer window forward for the sequence - // we move another window backwards for the RC. - // For a ksize = 3, and a sequence AGTCGT (len = 6): - // +-+---------+---------------+-------+ - // seq RC |i|i + ksize|len - ksize - i|len - i| - // AGTCGT ACGACT +-+---------+---------------+-------+ - // +-> +-> |0| 2 | 3 | 6 | - // +-> +-> |1| 3 | 2 | 5 | - // +-> +-> |2| 4 | 1 | 4 | - // +-> +-> |3| 5 | 0 | 3 | - // +-+---------+---------------+-------+ - // (leaving this table here because I had to draw to - // get the indices correctly) - - // Reverse complement k-mer (if rc exists) - let hash = if let Some(rc) = self.rc { - let krc = &rc[self.len - self.ksize - self.index..self.len - self.index]; - crate::_hash_murmur(std::cmp::min(kmer, krc), self.seed) - } else { - crate::_hash_murmur(kmer, self.seed) // Use only forward k-mer if rc is None - }; - - self.index += 1; - Some(Ok(hash)) - } -} - const COMPLEMENT: [u8; 256] = { let mut lookup = [0; 256]; lookup[b'A' as usize] = b'T'; @@ -832,300 +622,4 @@ mod test { assert_eq!(colors.len(), 2); } - - #[test] - fn test_reading_frames_new_dna() { - let sequence = b"AGTCGT"; - let hash_function = HashFunctions::Murmur64Dna; - - let frames = ReadingFrames::new(sequence, false, &hash_function); - - assert_eq!(frames.frames().len(), 1); // Only one fw/rc readingframe for DNA - assert_eq!(frames.frames()[0].fw, sequence.to_vec()); - assert_eq!(frames.frames()[0].rc, Some(b"ACGACT".to_vec())); - } - - #[test] - fn test_reading_frames_new_is_protein() { - let sequence = b"MVLSPADKTNVKAAW"; - let hash_function = HashFunctions::Murmur64Protein; - - let frames = ReadingFrames::new(sequence, true, &hash_function); - - assert_eq!(frames.frames().len(), 1); // Only one frame for protein - assert_eq!(frames.frames()[0].fw, sequence.to_vec()); - assert_eq!(frames.frames()[0].rc, None); // No reverse complement for protein - } - - #[test] - fn test_reading_frames_translate_frames() { - let sequence = b"AGTCGTCGAGCT"; - let hash_function = HashFunctions::Murmur64Protein; - - let frames = ReadingFrames::new(sequence, false, &hash_function); - eprintln!("frames: {}", frames); - - assert_eq!(frames.frames().len(), 3); // Three translated frames - assert_eq!(frames.frames()[0].fw, b"SRRA".to_vec()); - assert_eq!(frames.frames()[0].rc, Some(b"SSTT".to_vec())); - assert_eq!(frames.frames()[1].fw, b"VVE".to_vec()); - assert_eq!(frames.frames()[1].rc, Some(b"ARR".to_vec())); - assert_eq!(frames.frames()[2].fw, b"SSS".to_vec()); - assert_eq!(frames.frames()[2].rc, Some(b"LDD".to_vec())); - } - - #[test] - fn test_reading_frames_skipmer_frames_m1n3() { - let sequence = b"AGTCGTCGAGCT"; - let hash_function = HashFunctions::Murmur64Skipm1n3; - - let frames = ReadingFrames::new(sequence, false, &hash_function); - - assert_eq!(frames.frames().len(), 3); // Three skipmer frames - - // Expected skipmer sequences for m=1, n=3 (keep-1, skip-2) - let expected_fw_frame_0 = b"ACCG".to_vec(); // Frame 0: Keep 1 base, skip 2 bases, starting from index 0 - let expected_fw_frame_1 = b"GGGC".to_vec(); // Frame 1: Keep 1 base, skip 2 bases, starting from index 1 - let expected_fw_frame_2 = b"TTAT".to_vec(); // Frame 2: Keep 1 base, skip 2 bases, starting from index 2 - - let expected_rc_frame_0 = revcomp(&expected_fw_frame_0); - let expected_rc_frame_1 = revcomp(&expected_fw_frame_1); - let expected_rc_frame_2 = revcomp(&expected_fw_frame_2); - - // Check forward and reverse complement sequences for each frame - assert_eq!(frames.frames()[0].fw, expected_fw_frame_0); - assert_eq!(frames.frames()[0].rc, Some(expected_rc_frame_0)); - - assert_eq!(frames.frames()[1].fw, expected_fw_frame_1); - assert_eq!(frames.frames()[1].rc, Some(expected_rc_frame_1)); - - assert_eq!(frames.frames()[2].fw, expected_fw_frame_2); - assert_eq!(frames.frames()[2].rc, Some(expected_rc_frame_2)); - } - - #[test] - fn test_reading_frames_skipmer_frames_m2n3() { - let sequence = b"AGTCGTCGAGCT"; - let hash_function = HashFunctions::Murmur64Skipm2n3; - - let frames = ReadingFrames::new(sequence, false, &hash_function); - - assert_eq!(frames.frames().len(), 3); // Three skipmer frames - - // Expected skipmer sequences for m=1, n=3 (keep-1, skip-2) - let expected_fw_frame_0 = b"AGCGCGGC".to_vec(); - let expected_fw_frame_1 = b"GTGTGACT".to_vec(); - let expected_fw_frame_2 = b"TCTCAGT".to_vec(); - - let expected_rc_frame_0 = revcomp(&expected_fw_frame_0); - let expected_rc_frame_1 = revcomp(&expected_fw_frame_1); - let expected_rc_frame_2 = revcomp(&expected_fw_frame_2); - - // Check forward and reverse complement sequences for each frame - assert_eq!(frames.frames()[0].fw, expected_fw_frame_0); - assert_eq!(frames.frames()[0].rc, Some(expected_rc_frame_0)); - - assert_eq!(frames.frames()[1].fw, expected_fw_frame_1); - assert_eq!(frames.frames()[1].rc, Some(expected_rc_frame_1)); - - assert_eq!(frames.frames()[2].fw, expected_fw_frame_2); - assert_eq!(frames.frames()[2].rc, Some(expected_rc_frame_2)); - } - - #[test] - fn test_reading_frame_kmer_iter() { - let sequence = b"AGTCGT"; - let rc = revcomp(sequence); - let hash_function = HashFunctions::Murmur64Dna; - let frame = ReadingFrame { - fw: sequence.to_vec(), - rc: Some(rc.clone()), - hash_function, - }; - eprintln!("frame: {}", frame); - assert_eq!(frame.fw, sequence.to_vec()); - assert_eq!(frame.rc, Some(b"ACGACT".to_vec())); - - let kmer_iter = frame.kmer_iter(3, 42, false); - - // Expected k-mers from the forward and reverse complement sequence - let expected_kmers = vec![ - (b"AGT".to_vec(), b"ACT".to_vec()), - (b"GTC".to_vec(), b"GAC".to_vec()), - (b"TCG".to_vec(), b"CGA".to_vec()), - (b"CGT".to_vec(), b"ACG".to_vec()), - ]; - - // Compute hashes for expected k-mers - let expected_hashes: Vec = expected_kmers - .iter() - .map(|(fw_kmer, rc_kmer)| crate::_hash_murmur(std::cmp::min(fw_kmer, rc_kmer), 42)) - .collect(); - - // Collect hashes produced by the kmer_iter - let produced_hashes: Vec = kmer_iter.map(|result| result.unwrap()).collect(); - - // Check that produced hashes match expected hashes in order - assert_eq!( - produced_hashes, expected_hashes, - "Hashes do not match in order" - ); - } - - #[test] - fn test_kmer_iter_is_protein() { - let sequence = b"MVLSPADKTNVKAAW"; - let hash_function = HashFunctions::Murmur64Protein; - - let frames = ReadingFrames::new(sequence, true, &hash_function); - - // Only one forward frame for protein - assert_eq!(frames.frames().len(), 1); - let frame = &frames.frames()[0]; - - let kmer_iter = frame.kmer_iter(3, 42, false); - - // Expected k-mers for protein sequence - let expected_kmers = vec![ - b"MVL".to_vec(), - b"VLS".to_vec(), - b"LSP".to_vec(), - b"SPA".to_vec(), - b"PAD".to_vec(), - b"ADK".to_vec(), - b"DKT".to_vec(), - b"KTN".to_vec(), - b"TNV".to_vec(), - b"NVK".to_vec(), - b"VKA".to_vec(), - b"KAA".to_vec(), - b"AAW".to_vec(), - ]; - - // Compute hashes for expected k-mers - let expected_hashes: Vec = expected_kmers - .iter() - .map(|fw_kmer| crate::_hash_murmur(fw_kmer, 42)) - .collect(); - - // Collect hashes produced by the kmer_iter - let produced_hashes: Vec = kmer_iter.map(|result| result.unwrap()).collect(); - - // Check that produced hashes match expected hashes in order - assert_eq!( - produced_hashes, expected_hashes, - "Hashes do not match in order" - ); - } - - // #[test] - // fn test_kmer_iter_translate_frames() { - // let sequence = b"AGTCGTCGAGCT"; - // let hash_function = HashFunctions::Murmur64Protein; - - // let frames = ReadingFrames::new(sequence, false, &hash_function); - - // // Three translated frames - // assert_eq!(frames.frames().len(), 3); - - // // Expected k-mers for translated frames - // let expected_kmers = vec![ - // vec![b"SRR".to_vec(), b"RRA".to_vec()], - // vec![b"VVE".to_vec(), b"VE".to_vec()], - // vec![b"SSS".to_vec(), b"SS".to_vec()], - // ]; - - // for (frame, expected_frame_kmers) in frames.frames().iter().zip(expected_kmers.iter()) { - // let kmer_iter = frame.kmer_iter(3, 42, false); - - // // Compute hashes for expected k-mers - // let expected_hashes: Vec = expected_frame_kmers - // .iter() - // .map(|fw_kmer| crate::_hash_murmur(fw_kmer, 42)) - // .collect(); - - // // Collect hashes produced by the kmer_iter - // let produced_hashes: Vec = kmer_iter.map(|result| result.unwrap()).collect(); - - // // Check that produced hashes match expected hashes in order - // assert_eq!( - // produced_hashes, expected_hashes, - // "Hashes do not match in order for frame" - // ); - // } - // } - - // #[test] - // fn test_kmer_iter_skipmer_m1n3() { - // let sequence = b"AGTCGTCGAGCT"; - // let hash_function = HashFunctions::Murmur64Skipm1n3; - - // let frames = ReadingFrames::new(sequence, false, &hash_function); - - // // Three skipmer frames - // assert_eq!(frames.frames().len(), 3); - - // // Expected k-mers for skipmer (m=1, n=3) - // let expected_kmers = vec![ - // vec![b"ACC".to_vec()], - // vec![b"GTG".to_vec()], - // vec![b"TCG".to_vec()], - // ]; - - // for (frame, expected_frame_kmers) in frames.frames().iter().zip(expected_kmers.iter()) { - // let kmer_iter = frame.kmer_iter(3, 42, false); - - // // Compute hashes for expected k-mers - // let expected_hashes: Vec = expected_frame_kmers - // .iter() - // .map(|fw_kmer| crate::_hash_murmur(fw_kmer, 42)) - // .collect(); - - // // Collect hashes produced by the kmer_iter - // let produced_hashes: Vec = kmer_iter.map(|result| result.unwrap()).collect(); - - // // Check that produced hashes match expected hashes in order - // assert_eq!( - // produced_hashes, expected_hashes, - // "Hashes do not match in order for frame" - // ); - // } - // } - - // #[test] - // fn test_kmer_iter_skipmer_m2n3() { - // let sequence = b"AGTCGTCGAGCT"; - // let hash_function = HashFunctions::Murmur64Skipm2n3; - - // let frames = ReadingFrames::new(sequence, false, &hash_function); - - // // Three skipmer frames - // assert_eq!(frames.frames().len(), 3); - - // // Expected k-mers for skipmer (m=2, n=3) - // let expected_kmers = vec![ - // vec![b"AGC".to_vec()], - // vec![b"GTG".to_vec()], - // vec![b"TCA".to_vec()], - // ]; - - // for (frame, expected_frame_kmers) in frames.frames().iter().zip(expected_kmers.iter()) { - // let kmer_iter = frame.kmer_iter(3, 42, false); - - // // Compute hashes for expected k-mers - // let expected_hashes: Vec = expected_frame_kmers - // .iter() - // .map(|fw_kmer| crate::_hash_murmur(fw_kmer, 42)) - // .collect(); - - // // Collect hashes produced by the kmer_iter - // let produced_hashes: Vec = kmer_iter.map(|result| result.unwrap()).collect(); - - // // Check that produced hashes match expected hashes in order - // assert_eq!( - // produced_hashes, expected_hashes, - // "Hashes do not match in order for frame" - // ); - // } - // } } diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs index c0ea0c4c3c..c7f99786e1 100644 --- a/src/core/src/signature.rs +++ b/src/core/src/signature.rs @@ -10,12 +10,13 @@ use std::path::Path; use std::str; use cfg_if::cfg_if; +use needletail::sequence; #[cfg(feature = "parallel")] use rayon::prelude::*; use serde::{Deserialize, Serialize}; use typed_builder::TypedBuilder; -use crate::encodings::{HashFunctions, ReadingFrames}; +use crate::encodings::{revcomp, to_aa, HashFunctions, VALID}; use crate::prelude::*; use crate::sketch::minhash::KmerMinHash; use crate::sketch::Sketch; @@ -163,17 +164,202 @@ impl SigsTrait for Sketch { } } +#[derive(Debug)] +pub enum ReadingFrame { + DNA { + fw: Vec, + rc: Vec, // Reverse complement is required + len: usize, + }, + Protein { + fw: Vec, // Only forward frame + len: usize, + }, +} + +impl std::fmt::Display for ReadingFrame { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + ReadingFrame::DNA { fw, rc, len } => { + let fw_str = String::from_utf8(fw.clone()).expect("Invalid UTF-8 sequence in fw"); + let rc_str = String::from_utf8(rc.clone()).expect("Invalid UTF-8 sequence in rc"); + write!( + f, + "Type: DNA ({}bp), Forward: {}, Reverse Complement: {}", + len, fw_str, rc_str + ) + } + ReadingFrame::Protein { fw, len } => { + let fw_str = String::from_utf8(fw.clone()).expect("Invalid UTF-8 sequence in fw"); + write!(f, "Type: Protein ({}aa), Forward: {}", len, fw_str) + } + } + } +} + +impl ReadingFrame { + pub fn new_dna(sequence: &[u8]) -> Self { + let fw = sequence.to_vec(); + let rc = revcomp(sequence); + let len = sequence.len(); + ReadingFrame::DNA { fw, rc, len } + } + + pub fn new_protein(sequence: &[u8]) -> Self { + let fw = sequence.to_vec(); + let len = fw.len(); + ReadingFrame::Protein { fw, len } + } + + pub fn new_skipmer(seq: &[u8], start: usize, m: usize, n: usize) -> Self { + if start > n { + panic!("Skipmer frame number must be <= n ({})", n); + } + // Generate forward skipmer frame + let fw: Vec = seq + .iter() + .skip(start) + .enumerate() + .filter_map(|(i, &base)| if i % n < m { Some(base) } else { None }) + .collect(); + + let len = fw.len(); + let rc = revcomp(&fw); + ReadingFrame::DNA { fw, rc, len } + } + + pub fn new_translated(sequence: &[u8], frame_number: usize, dayhoff: bool, hp: bool) -> Self { + if frame_number > 2 { + panic!("Frame number must be 0, 1, or 2"); + } + + // Translate the forward frame + let fw: Vec = sequence + .iter() + .cloned() + .skip(frame_number) // Skip the initial bases for the frame + .take(sequence.len() - frame_number) // Adjust length based on skipped bases + .collect::>() // Collect the DNA subsequence + .chunks(3) // Group into codons (triplets) + .filter_map(|codon| to_aa(codon, dayhoff, hp).ok()) // Translate each codon + .flatten() // Flatten the nested results into a single sequence + .collect(); + + let len = fw.len(); + + // Return a Protein reading frame (no reverse complement for translated frames) + ReadingFrame::Protein { fw, len } + } + + pub fn kmer_iter<'a>(&'a self, ksize: usize, seed: u64, force: bool) -> KmerIterator<'a> { + match self { + ReadingFrame::DNA { fw, rc, len } => KmerIterator::new(self, ksize, seed, force), + ReadingFrame::Protein { fw, len } => KmerIterator::new(self, ksize, seed, force), + } + } +} + +pub struct KmerIterator<'a> { + frame: &'a ReadingFrame, // Reference to the ReadingFrame + ksize: usize, + index: usize, + seed: u64, + force: bool, +} + +impl<'a> KmerIterator<'a> { + pub fn new(frame: &'a ReadingFrame, ksize: usize, seed: u64, force: bool) -> Self { + Self { + frame, + ksize, + index: 0, + seed, + force, + } + } + + fn out_of_bounds(&self, length: usize) -> bool { + self.index + self.ksize > length + } + + fn validate_dna_kmer(&self, kmer: &[u8]) -> Result<(), Error> { + for &nt in kmer { + if !VALID[nt as usize] { + return Err(Error::InvalidDNA { + message: String::from_utf8_lossy(kmer).to_string(), + }); + } + } + Ok(()) + } + + fn get_dna_hash(&mut self, fw: &[u8], rc: &[u8]) -> Option> { + let kmer = &fw[self.index..self.index + self.ksize]; + + if !self.force { + if let Err(e) = self.validate_dna_kmer(kmer) { + self.index += 1; + return Some(Err(e)); + } + } + + let krc = &rc[rc.len() - self.ksize - self.index..rc.len() - self.index]; + let hash = crate::_hash_murmur(std::cmp::min(kmer, krc), self.seed); + // NTP TESTING + eprintln!( + "Forward DNA k-mer: {}, Reverse Complement k-mer: {}, hash: {}", + String::from_utf8_lossy(kmer), + String::from_utf8_lossy(krc), + hash, + ); + + self.index += 1; + Some(Ok(hash)) + } + + fn get_protein_hash(&mut self, fw: &[u8]) -> Option> { + let kmer = &fw[self.index..self.index + self.ksize]; + + let hash = crate::_hash_murmur(kmer, self.seed); + // NTP TESTING + eprintln!( + "Protein k-mer: {}, hash: {}", + String::from_utf8_lossy(kmer), + hash + ); + + self.index += 1; + Some(Ok(hash)) + } +} + +impl<'a> Iterator for KmerIterator<'a> { + type Item = Result; + + fn next(&mut self) -> Option { + match self.frame { + ReadingFrame::DNA { fw, rc, len } => { + if self.out_of_bounds(*len) { + return None; + } + self.get_dna_hash(fw, rc) + } + ReadingFrame::Protein { fw, len } => { + if self.out_of_bounds(*len) { + return None; + } + self.get_protein_hash(fw) + } + } + } +} + pub struct SeqToHashes { - sequence: Vec, - kmer_index: usize, k_size: usize, - max_index: usize, force: bool, - is_protein: bool, - hash_function: HashFunctions, seed: u64, - hashes_buffer: Vec, - reading_frames: ReadingFrames, + frames: Vec, + frame_index: usize, // Index of the current frame } impl SeqToHashes { @@ -184,51 +370,84 @@ impl SeqToHashes { is_protein: bool, hash_function: HashFunctions, seed: u64, - ) -> SeqToHashes { + ) -> Self { + let mut ksize: usize = k_size; + // Adjust kmer size for protein-based hash functions - let adjusted_k_size = - if hash_function.protein() || hash_function.dayhoff() || hash_function.hp() { - k_size / 3 - } else { - k_size - }; + if is_protein || hash_function.protein() || hash_function.dayhoff() || hash_function.hp() { + ksize = k_size / 3; + } - // Determine the maximum index for k-mer generation - let max_index = if seq.len() >= adjusted_k_size { - seq.len() - adjusted_k_size + 1 + // uppercase the sequence + let sequence = seq.to_ascii_uppercase(); + + // Generate frames based on sequence type and hash function + let frames = if is_protein { + Self::protein_frames(&sequence) + } else if hash_function.protein() || hash_function.dayhoff() || hash_function.hp() { + Self::translated_frames(&sequence, &hash_function) + } else if hash_function.skipm1n3() || hash_function.skipm2n3() { + Self::skipmer_frames(&sequence, &hash_function) } else { - 0 + Self::dna_frames(&sequence) }; - // Initialize ReadingFrames based on the sequence and hash function - let reading_frames = - ReadingFrames::new(&seq.to_ascii_uppercase(), is_protein, &hash_function); - SeqToHashes { - sequence: seq.to_ascii_uppercase(), - kmer_index: 0, - k_size: adjusted_k_size, - max_index, + k_size: ksize, force, - is_protein, - hash_function, seed, - hashes_buffer: Vec::with_capacity(1000), - reading_frames, + frames, + frame_index: 0, } } - // some helper functions. If we remove, we could probably just rm - // these fields from SeqToHashes, since ReadingFrames handles this now - pub fn get_sequence(&self) -> &[u8] { - &self.sequence - } - pub fn get_hash_function(&self) -> &HashFunctions { - &self.hash_function + /// Generate DNA frames (forward + reverse complement) + fn dna_frames(seq: &[u8]) -> Vec { + vec![ReadingFrame::new_dna(&seq.to_ascii_uppercase())] + } + + fn protein_frames(seq: &[u8]) -> Vec { + vec![ReadingFrame::new_protein(&seq.to_ascii_uppercase())] + } + + fn translated_frames(seq: &[u8], hash_function: &HashFunctions) -> Vec { + let revcomp_sequence = revcomp(&seq.to_ascii_uppercase()); + (0..3) + .flat_map(|frame_number| { + vec![ + ReadingFrame::new_translated( + &seq.to_ascii_uppercase(), + frame_number, + hash_function.dayhoff(), + hash_function.hp(), + ), + ReadingFrame::new_translated( + &revcomp_sequence, + frame_number, + hash_function.dayhoff(), + hash_function.hp(), + ), + ] + }) + .collect() } - pub fn is_protein(&self) -> bool { - self.is_protein + fn skipmer_frames(seq: &[u8], hash_function: &HashFunctions) -> Vec { + let (m, n) = if hash_function.skipm1n3() { + (1, 3) + } else { + (2, 3) + }; + (0..3) + .flat_map(|frame_number| { + vec![ReadingFrame::new_skipmer( + &seq.to_ascii_uppercase(), + frame_number, + m, + n, + )] + }) + .collect() } } @@ -236,28 +455,24 @@ impl Iterator for SeqToHashes { type Item = Result; fn next(&mut self) -> Option { - // Check if we've processed all k-mers - if self.kmer_index >= self.max_index { - return None; - } + // Iterate over the frames using frame_index + while self.frame_index < self.frames.len() { + let frame = &self.frames[self.frame_index]; - // Iterate over the frames - for frame in self.reading_frames.frames().iter() { - let kmer_iter = frame.kmer_iter(self.k_size, self.seed, self.force); - for result in kmer_iter { - match result { - Ok(hash) => self.hashes_buffer.push(hash), - Err(e) => { - if !self.force { - return Some(Err(e)); - } - } - } + // Create a KmerIterator for the current frame + let mut kmer_iter = frame.kmer_iter(self.k_size, self.seed, self.force); + + // Process k-mers in the current frame + if let Some(hash_result) = kmer_iter.next() { + return Some(hash_result); // Return the next hash } + + // Move to the next frame if the current one is exhausted + self.frame_index += 1; } - self.kmer_index += 1; - Some(Ok(0)) // Return 0 to indicate progress; actual hashes are stored in `hashes_buffer` + // All frames exhausted + None } } @@ -1288,4 +1503,321 @@ mod test { assert_eq!(hash, expected_hash, "Mismatch in skipmer hash"); } } + + // #[test] + // fn test_reading_frames_new_dna() { + // let sequence = b"AGTCGT"; + // let hash_function = HashFunctions::Murmur64Dna; + + // let frames = ReadingFrames::new(sequence, false, &hash_function); + + // assert_eq!(frames.frames().len(), 1); // Only one fw/rc readingframe for DNA + // assert_eq!(frames.frames()[0].fw, sequence.to_vec()); + // assert_eq!(frames.frames()[0].rc, Some(b"ACGACT".to_vec())); + // } + + // #[test] + // fn test_reading_frames_new_is_protein() { + // let sequence = b"MVLSPADKTNVKAAW"; + // let hash_function = HashFunctions::Murmur64Protein; + + // let frames = ReadingFrames::new(sequence, true, &hash_function); + + // assert_eq!(frames.frames().len(), 1); // Only one frame for protein + // assert_eq!(frames.frames()[0].fw, sequence.to_vec()); + // assert_eq!(frames.frames()[0].rc, None); // No reverse complement for protein + // } + + // #[test] + // fn test_reading_frames_translate_frames() { + // let sequence = b"AGTCGTCGAGCT"; + // let hash_function = HashFunctions::Murmur64Protein; + + // let frames = ReadingFrames::new(sequence, false, &hash_function); + // eprintln!("frames: {}", frames); + + // assert_eq!(frames.frames().len(), 3); // Three translated frames + // assert_eq!(frames.frames()[0].fw, b"SRRA".to_vec()); + // assert_eq!(frames.frames()[0].rc, Some(b"SSTT".to_vec())); + // assert_eq!(frames.frames()[1].fw, b"VVE".to_vec()); + // assert_eq!(frames.frames()[1].rc, Some(b"ARR".to_vec())); + // assert_eq!(frames.frames()[2].fw, b"SSS".to_vec()); + // assert_eq!(frames.frames()[2].rc, Some(b"LDD".to_vec())); + // } + + // #[test] + // fn test_reading_frames_skipmer_frames_m1n3() { + // let sequence = b"AGTCGTCGAGCT"; + // let hash_function = HashFunctions::Murmur64Skipm1n3; + + // let frames = ReadingFrames::new(sequence, false, &hash_function); + + // assert_eq!(frames.frames().len(), 3); // Three skipmer frames + + // // Expected skipmer sequences for m=1, n=3 (keep-1, skip-2) + // let expected_fw_frame_0 = b"ACCG".to_vec(); // Frame 0: Keep 1 base, skip 2 bases, starting from index 0 + // let expected_fw_frame_1 = b"GGGC".to_vec(); // Frame 1: Keep 1 base, skip 2 bases, starting from index 1 + // let expected_fw_frame_2 = b"TTAT".to_vec(); // Frame 2: Keep 1 base, skip 2 bases, starting from index 2 + + // let expected_rc_frame_0 = revcomp(&expected_fw_frame_0); + // let expected_rc_frame_1 = revcomp(&expected_fw_frame_1); + // let expected_rc_frame_2 = revcomp(&expected_fw_frame_2); + + // // Check forward and reverse complement sequences for each frame + // assert_eq!(frames.frames()[0].fw, expected_fw_frame_0); + // assert_eq!(frames.frames()[0].rc, Some(expected_rc_frame_0)); + + // assert_eq!(frames.frames()[1].fw, expected_fw_frame_1); + // assert_eq!(frames.frames()[1].rc, Some(expected_rc_frame_1)); + + // assert_eq!(frames.frames()[2].fw, expected_fw_frame_2); + // assert_eq!(frames.frames()[2].rc, Some(expected_rc_frame_2)); + // } + + // #[test] + // fn test_reading_frames_skipmer_frames_m2n3() { + // let sequence = b"AGTCGTCGAGCT"; + // let hash_function = HashFunctions::Murmur64Skipm2n3; + + // let frames = ReadingFrames::new(sequence, false, &hash_function); + + // assert_eq!(frames.frames().len(), 3); // Three skipmer frames + + // // Expected skipmer sequences for m=1, n=3 (keep-1, skip-2) + // let expected_fw_frame_0 = b"AGCGCGGC".to_vec(); + // let expected_fw_frame_1 = b"GTGTGACT".to_vec(); + // let expected_fw_frame_2 = b"TCTCAGT".to_vec(); + + // let expected_rc_frame_0 = revcomp(&expected_fw_frame_0); + // let expected_rc_frame_1 = revcomp(&expected_fw_frame_1); + // let expected_rc_frame_2 = revcomp(&expected_fw_frame_2); + + // // Check forward and reverse complement sequences for each frame + // assert_eq!(frames.frames()[0].fw, expected_fw_frame_0); + // assert_eq!(frames.frames()[0].rc, Some(expected_rc_frame_0)); + + // assert_eq!(frames.frames()[1].fw, expected_fw_frame_1); + // assert_eq!(frames.frames()[1].rc, Some(expected_rc_frame_1)); + + // assert_eq!(frames.frames()[2].fw, expected_fw_frame_2); + // assert_eq!(frames.frames()[2].rc, Some(expected_rc_frame_2)); + // } + + // #[test] + // fn test_reading_frame_kmer_iter() { + // let sequence = b"AGTCGT"; + // let rc = revcomp(sequence); + // let hash_function = HashFunctions::Murmur64Dna; + // let frame = ReadingFrame { + // fw: sequence.to_vec(), + // rc: Some(rc.clone()), + // hash_function, + // is_translated: false, + // }; + // eprintln!("frame: {}", frame); + // assert_eq!(frame.fw, sequence.to_vec()); + // assert_eq!(frame.rc, Some(b"ACGACT".to_vec())); + + // let kmer_iter = frame.kmer_iter(3, 42, false); + + // // Expected k-mers from the forward and reverse complement sequence + // let expected_kmers = vec![ + // (b"AGT".to_vec(), b"ACT".to_vec()), + // (b"GTC".to_vec(), b"GAC".to_vec()), + // (b"TCG".to_vec(), b"CGA".to_vec()), + // (b"CGT".to_vec(), b"ACG".to_vec()), + // ]; + + // // Compute hashes for expected k-mers + // let expected_hashes: Vec = expected_kmers + // .iter() + // .map(|(fw_kmer, rc_kmer)| crate::_hash_murmur(std::cmp::min(fw_kmer, rc_kmer), 42)) + // .collect(); + + // // Collect hashes produced by the kmer_iter + // let produced_hashes: Vec = kmer_iter.map(|result| result.unwrap()).collect(); + + // // Check that produced hashes match expected hashes in order + // assert_eq!( + // produced_hashes, expected_hashes, + // "Hashes do not match in order" + // ); + // } + + // #[test] + // fn test_kmer_iter_is_protein() { + // let sequence = b"MVLSPADKTNVKAAW"; + // let hash_function = HashFunctions::Murmur64Protein; + + // let frames = ReadingFrames::new(sequence, true, &hash_function); + + // // Only one forward frame for protein + // assert_eq!(frames.frames().len(), 1); + // let frame = &frames.frames()[0]; + + // let kmer_iter = frame.kmer_iter(3, 42, false); + + // // Expected k-mers for protein sequence + // let expected_kmers = vec![ + // b"MVL".to_vec(), + // b"VLS".to_vec(), + // b"LSP".to_vec(), + // b"SPA".to_vec(), + // b"PAD".to_vec(), + // b"ADK".to_vec(), + // b"DKT".to_vec(), + // b"KTN".to_vec(), + // b"TNV".to_vec(), + // b"NVK".to_vec(), + // b"VKA".to_vec(), + // b"KAA".to_vec(), + // b"AAW".to_vec(), + // ]; + + // // Compute hashes for expected k-mers + // let expected_hashes: Vec = expected_kmers + // .iter() + // .map(|fw_kmer| crate::_hash_murmur(fw_kmer, 42)) + // .collect(); + + // // Collect hashes produced by the kmer_iter + // let produced_hashes: Vec = kmer_iter.map(|result| result.unwrap()).collect(); + + // // Check that produced hashes match expected hashes in order + // assert_eq!( + // produced_hashes, expected_hashes, + // "Hashes do not match in order" + // ); + // } + + // #[test] + // fn test_kmer_iter_translate_frames() { + // let sequence = b"AGTCGTCGAGCT"; + // let hash_function = HashFunctions::Murmur64Protein; + // let ksize =3; + // let seed = 42; + + // let frames = ReadingFrames::new(sequence, false, &hash_function); + + // // Three translated frames + // assert_eq!(frames.frames().len(), 3); + // eprintln!("frames: {}", frames); + + // // Expected k-mers for translated frames + // let frame1_kmers = vec![ + // vec![b"SRR".to_vec(), b"SST".to_vec()], + // vec![b"RRA".to_vec(), b"STT".to_vec()], + // ]; + // let frame2_kmers = vec![vec![b"VVE".to_vec(), b"ARR".to_vec()]]; + // let frame3_kmers = vec![vec![b"SSS".to_vec(), b"LDD".to_vec()]]; + // let expected_kmers = vec![frame1_kmers, frame2_kmers, frame3_kmers]; + // // let expected_kmers = vec![ + // // vec![b"SRR".to_vec(), b"SST".to_vec()], + // // vec![b"RRA".to_vec(), b"STT".to_vec()], + // // vec![b"VVE".to_vec(), b"ARR".to_vec()], + // // vec![b"SSS".to_vec(), b"LDD".to_vec()], + // // ]; + + // for (frame, expected_frame_kmers) in frames.frames().iter().zip(expected_kmers.iter()) { + // let kmer_iter = frame.kmer_iter(ksize, seed, false); + + // // Compute hashes for expected k-mers + // // let expected_hashes: Vec = expected_frame_kmers + // // .iter() + // // .map(|fw_kmer| crate::_hash_murmur(fw_kmer, 42)) + // // .collect(); + // let expected_hashes: Vec = expected_frame_kmers + // .iter() + // .flat_map(|kmers| { + // kmers.iter().map(|fw_kmer| { + // let rc_kmer = revcomp(fw_kmer); // Compute reverse complement + // crate::_hash_murmur(std::cmp::min(fw_kmer, &rc_kmer), seed) + // }) + // }) + // .collect(); + + // // Collect hashes produced by the kmer_iter + // let produced_hashes: Vec = kmer_iter.map(|result| result.unwrap()).collect(); + + // // Check that produced hashes match expected hashes in order + // assert_eq!( + // produced_hashes, expected_hashes, + // "Hashes do not match in order for frame" + // ); + // } + // } + + // #[test] + // fn test_kmer_iter_skipmer_m1n3() { + // let sequence = b"AGTCGTCGAGCT"; + // let hash_function = HashFunctions::Murmur64Skipm1n3; + + // let frames = ReadingFrames::new(sequence, false, &hash_function); + + // // Three skipmer frames + // assert_eq!(frames.frames().len(), 3); + + // // Expected k-mers for skipmer (m=1, n=3) + // let expected_kmers = vec![ + // vec![b"ACC".to_vec()], + // vec![b"GTG".to_vec()], + // vec![b"TCG".to_vec()], + // ]; + + // for (frame, expected_frame_kmers) in frames.frames().iter().zip(expected_kmers.iter()) { + // let kmer_iter = frame.kmer_iter(3, 42, false); + + // // Compute hashes for expected k-mers + // let expected_hashes: Vec = expected_frame_kmers + // .iter() + // .map(|fw_kmer| crate::_hash_murmur(fw_kmer, 42)) + // .collect(); + + // // Collect hashes produced by the kmer_iter + // let produced_hashes: Vec = kmer_iter.map(|result| result.unwrap()).collect(); + + // // Check that produced hashes match expected hashes in order + // assert_eq!( + // produced_hashes, expected_hashes, + // "Hashes do not match in order for frame" + // ); + // } + // } + + // #[test] + // fn test_kmer_iter_skipmer_m2n3() { + // let sequence = b"AGTCGTCGAGCT"; + // let hash_function = HashFunctions::Murmur64Skipm2n3; + + // let frames = ReadingFrames::new(sequence, false, &hash_function); + + // // Three skipmer frames + // assert_eq!(frames.frames().len(), 3); + + // // Expected k-mers for skipmer (m=2, n=3) + // let expected_kmers = vec![ + // vec![b"AGC".to_vec()], + // vec![b"GTG".to_vec()], + // vec![b"TCA".to_vec()], + // ]; + + // for (frame, expected_frame_kmers) in frames.frames().iter().zip(expected_kmers.iter()) { + // let kmer_iter = frame.kmer_iter(3, 42, false); + + // // Compute hashes for expected k-mers + // let expected_hashes: Vec = expected_frame_kmers + // .iter() + // .map(|fw_kmer| crate::_hash_murmur(fw_kmer, 42)) + // .collect(); + + // // Collect hashes produced by the kmer_iter + // let produced_hashes: Vec = kmer_iter.map(|result| result.unwrap()).collect(); + + // // Check that produced hashes match expected hashes in order + // assert_eq!( + // produced_hashes, expected_hashes, + // "Hashes do not match in order for frame" + // ); + // } + // } } From 92373fc316666f575fd95c81ebab37d29fe9f64f Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Thu, 5 Dec 2024 17:48:46 -0800 Subject: [PATCH 06/16] start updating tests --- src/core/src/encodings.rs | 33 ------ src/core/src/signature.rs | 228 +++++++++++++++++++++----------------- 2 files changed, 128 insertions(+), 133 deletions(-) diff --git a/src/core/src/encodings.rs b/src/core/src/encodings.rs index d87c220aa0..9ae33d7873 100644 --- a/src/core/src/encodings.rs +++ b/src/core/src/encodings.rs @@ -114,39 +114,6 @@ pub fn revcomp(seq: &[u8]) -> Vec { .collect() } -/// Generate a single translated frame from a DNA sequence -/// -/// * `sequence`: The DNA sequence as a slice of bytes. -/// * `frame_number`: The frame to translate (0, 1, or 2). -/// * `dayhoff`: Whether to use the Dayhoff amino acid alphabet. -/// * `hp`: Whether to use the hydrophobic-polar amino acid alphabet. -/// -/// Returns a translated frame as a `Vec`. -pub fn translated_frame(sequence: &[u8], frame_number: usize, dayhoff: bool, hp: bool) -> Vec { - if frame_number > 2 { - panic!("Frame number must be 0, 1, or 2"); - } - - sequence - .iter() - .cloned() - .skip(frame_number) // Skip the initial bases for the frame - .take(sequence.len() - frame_number) // Adjust length based on skipped bases - .collect::>() // Collect the DNA subsequence - .chunks(3) // Group into codons (triplets) - .filter_map(|codon| to_aa(codon, dayhoff, hp).ok()) // Translate each codon - .flatten() // Flatten the nested results into a single sequence - .collect() -} - -fn skipmer_frame(seq: &[u8], start: usize, n: usize, m: usize) -> Vec { - seq.iter() - .skip(start) - .enumerate() - .filter_map(|(i, &base)| if i % n < m { Some(base) } else { None }) - .collect() -} - static CODONTABLE: Lazy> = Lazy::new(|| { [ // F diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs index c7f99786e1..7185061a37 100644 --- a/src/core/src/signature.rs +++ b/src/core/src/signature.rs @@ -10,13 +10,12 @@ use std::path::Path; use std::str; use cfg_if::cfg_if; -use needletail::sequence; #[cfg(feature = "parallel")] use rayon::prelude::*; use serde::{Deserialize, Serialize}; use typed_builder::TypedBuilder; -use crate::encodings::{revcomp, to_aa, HashFunctions, VALID}; +use crate::encodings::{aa_to_dayhoff, aa_to_hp, revcomp, to_aa, HashFunctions, VALID}; use crate::prelude::*; use crate::sketch::minhash::KmerMinHash; use crate::sketch::Sketch; @@ -205,8 +204,15 @@ impl ReadingFrame { ReadingFrame::DNA { fw, rc, len } } - pub fn new_protein(sequence: &[u8]) -> Self { - let fw = sequence.to_vec(); + pub fn new_protein(sequence: &[u8], dayhoff: bool, hp: bool) -> Self { + let fw: Vec = if dayhoff { + sequence.iter().map(|&aa| aa_to_dayhoff(aa)).collect() + } else if hp { + sequence.iter().map(|&aa| aa_to_hp(aa)).collect() + } else { + sequence.to_vec() // protein, as-is. + }; + let len = fw.len(); ReadingFrame::Protein { fw, len } } @@ -251,10 +257,24 @@ impl ReadingFrame { ReadingFrame::Protein { fw, len } } + pub fn get_fw(&self) -> Option<&[u8]> { + match self { + ReadingFrame::DNA { fw, .. } => Some(fw), + ReadingFrame::Protein { fw, .. } => Some(fw), + } + } + + pub fn get_rc(&self) -> Option<&[u8]> { + match self { + ReadingFrame::DNA { rc, .. } => Some(rc), + ReadingFrame::Protein { .. } => None, + } + } + pub fn kmer_iter<'a>(&'a self, ksize: usize, seed: u64, force: bool) -> KmerIterator<'a> { match self { - ReadingFrame::DNA { fw, rc, len } => KmerIterator::new(self, ksize, seed, force), - ReadingFrame::Protein { fw, len } => KmerIterator::new(self, ksize, seed, force), + ReadingFrame::DNA { .. } => KmerIterator::new(self, ksize, seed, force), + ReadingFrame::Protein { .. } => KmerIterator::new(self, ksize, seed, force), } } } @@ -383,7 +403,7 @@ impl SeqToHashes { // Generate frames based on sequence type and hash function let frames = if is_protein { - Self::protein_frames(&sequence) + Self::protein_frames(&sequence, &hash_function) } else if hash_function.protein() || hash_function.dayhoff() || hash_function.hp() { Self::translated_frames(&sequence, &hash_function) } else if hash_function.skipm1n3() || hash_function.skipm2n3() { @@ -406,8 +426,12 @@ impl SeqToHashes { vec![ReadingFrame::new_dna(&seq.to_ascii_uppercase())] } - fn protein_frames(seq: &[u8]) -> Vec { - vec![ReadingFrame::new_protein(&seq.to_ascii_uppercase())] + fn protein_frames(seq: &[u8], hash_function: &HashFunctions) -> Vec { + vec![ReadingFrame::new_protein( + &seq.to_ascii_uppercase(), + hash_function.dayhoff(), + hash_function.hp(), + )] } fn translated_frames(seq: &[u8], hash_function: &HashFunctions) -> Vec { @@ -989,6 +1013,8 @@ impl TryInto for Signature { #[cfg(test)] mod test { + + use super::*; use std::fs::File; use std::io::{BufReader, Read}; use std::path::PathBuf; @@ -1603,105 +1629,113 @@ mod test { // assert_eq!(frames.frames()[2].rc, Some(expected_rc_frame_2)); // } - // #[test] - // fn test_reading_frame_kmer_iter() { - // let sequence = b"AGTCGT"; - // let rc = revcomp(sequence); - // let hash_function = HashFunctions::Murmur64Dna; - // let frame = ReadingFrame { - // fw: sequence.to_vec(), - // rc: Some(rc.clone()), - // hash_function, - // is_translated: false, - // }; - // eprintln!("frame: {}", frame); - // assert_eq!(frame.fw, sequence.to_vec()); - // assert_eq!(frame.rc, Some(b"ACGACT".to_vec())); - - // let kmer_iter = frame.kmer_iter(3, 42, false); - - // // Expected k-mers from the forward and reverse complement sequence - // let expected_kmers = vec![ - // (b"AGT".to_vec(), b"ACT".to_vec()), - // (b"GTC".to_vec(), b"GAC".to_vec()), - // (b"TCG".to_vec(), b"CGA".to_vec()), - // (b"CGT".to_vec(), b"ACG".to_vec()), - // ]; - - // // Compute hashes for expected k-mers - // let expected_hashes: Vec = expected_kmers - // .iter() - // .map(|(fw_kmer, rc_kmer)| crate::_hash_murmur(std::cmp::min(fw_kmer, rc_kmer), 42)) - // .collect(); - - // // Collect hashes produced by the kmer_iter - // let produced_hashes: Vec = kmer_iter.map(|result| result.unwrap()).collect(); - - // // Check that produced hashes match expected hashes in order - // assert_eq!( - // produced_hashes, expected_hashes, - // "Hashes do not match in order" - // ); - // } - - // #[test] - // fn test_kmer_iter_is_protein() { - // let sequence = b"MVLSPADKTNVKAAW"; - // let hash_function = HashFunctions::Murmur64Protein; + #[test] + fn test_reading_frame_kmer_iter() { + let sequence = b"AGTCGT"; + let frame = ReadingFrame::new_dna(sequence); + assert_eq!(frame.get_fw(), Some(sequence.as_slice())); + assert_eq!(frame.get_rc(), Some(b"ACGACT".as_slice())); + + // Create a KmerIterator + let mut kmer_iter = KmerIterator::new(&frame, 3, 42, false); + + // Expected k-mers from the forward and reverse complement sequence + let expected_kmers = vec![ + (b"AGT".to_vec(), b"ACT".to_vec()), + (b"GTC".to_vec(), b"GAC".to_vec()), + (b"TCG".to_vec(), b"CGA".to_vec()), + (b"CGT".to_vec(), b"ACG".to_vec()), + ]; + + // Compute expected hashes + let expected_hashes: Vec = expected_kmers + .iter() + .map(|(fw_kmer, rc_kmer)| crate::_hash_murmur(std::cmp::min(fw_kmer, rc_kmer), 42)) + .collect(); - // let frames = ReadingFrames::new(sequence, true, &hash_function); + // Collect hashes produced by the kmer_iter + let mut produced_hashes = Vec::new(); - // // Only one forward frame for protein - // assert_eq!(frames.frames().len(), 1); - // let frame = &frames.frames()[0]; + while let Some(result) = kmer_iter.next() { + match result { + Ok(hash) => produced_hashes.push(hash), + Err(e) => panic!("Error encountered during k-mer iteration: {:?}", e), + } + } - // let kmer_iter = frame.kmer_iter(3, 42, false); + // Assert that produced hashes match expected hashes + assert_eq!( + produced_hashes, expected_hashes, + "Hashes do not match in order" + ); - // // Expected k-mers for protein sequence - // let expected_kmers = vec![ - // b"MVL".to_vec(), - // b"VLS".to_vec(), - // b"LSP".to_vec(), - // b"SPA".to_vec(), - // b"PAD".to_vec(), - // b"ADK".to_vec(), - // b"DKT".to_vec(), - // b"KTN".to_vec(), - // b"TNV".to_vec(), - // b"NVK".to_vec(), - // b"VKA".to_vec(), - // b"KAA".to_vec(), - // b"AAW".to_vec(), - // ]; + // Debugging output for verification + eprintln!( + "Expected hashes: {:?}\nProduced hashes: {:?}", + expected_hashes, produced_hashes + ); + } - // // Compute hashes for expected k-mers - // let expected_hashes: Vec = expected_kmers - // .iter() - // .map(|fw_kmer| crate::_hash_murmur(fw_kmer, 42)) - // .collect(); + #[test] + fn test_kmer_iter_is_protein() { + let sequence = b"MVLSPADKTNVKAAW"; + let hash_function = HashFunctions::Murmur64Protein; + + let frame = + ReadingFrame::new_protein(sequence, hash_function.dayhoff(), hash_function.hp()); + + let kmer_iter = frame.kmer_iter(3, 42, false); + + // Expected k-mers for protein sequence + let expected_kmers = vec![ + b"MVL".to_vec(), + b"VLS".to_vec(), + b"LSP".to_vec(), + b"SPA".to_vec(), + b"PAD".to_vec(), + b"ADK".to_vec(), + b"DKT".to_vec(), + b"KTN".to_vec(), + b"TNV".to_vec(), + b"NVK".to_vec(), + b"VKA".to_vec(), + b"KAA".to_vec(), + b"AAW".to_vec(), + ]; + + // Compute hashes for expected k-mers + let expected_hashes: Vec = expected_kmers + .iter() + .map(|fw_kmer| crate::_hash_murmur(fw_kmer, 42)) + .collect(); - // // Collect hashes produced by the kmer_iter - // let produced_hashes: Vec = kmer_iter.map(|result| result.unwrap()).collect(); + // Collect hashes produced by the kmer_iter + let produced_hashes: Vec = kmer_iter.map(|result| result.unwrap()).collect(); - // // Check that produced hashes match expected hashes in order - // assert_eq!( - // produced_hashes, expected_hashes, - // "Hashes do not match in order" - // ); - // } + // Check that produced hashes match expected hashes in order + assert_eq!( + produced_hashes, expected_hashes, + "Hashes do not match in order" + ); + } // #[test] // fn test_kmer_iter_translate_frames() { // let sequence = b"AGTCGTCGAGCT"; // let hash_function = HashFunctions::Murmur64Protein; - // let ksize =3; + // let k_size =3; // let seed = 42; + // let force = false; + // let is_protein = false; - // let frames = ReadingFrames::new(sequence, false, &hash_function); + // let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed); + // let frames = sth.frames; // // Three translated frames - // assert_eq!(frames.frames().len(), 3); - // eprintln!("frames: {}", frames); + // assert_eq!(frames.len(), 3); + // for frame in frames { + // eprintln!("frames: {}", frame); + // } // // Expected k-mers for translated frames // let frame1_kmers = vec![ @@ -1711,15 +1745,9 @@ mod test { // let frame2_kmers = vec![vec![b"VVE".to_vec(), b"ARR".to_vec()]]; // let frame3_kmers = vec![vec![b"SSS".to_vec(), b"LDD".to_vec()]]; // let expected_kmers = vec![frame1_kmers, frame2_kmers, frame3_kmers]; - // // let expected_kmers = vec![ - // // vec![b"SRR".to_vec(), b"SST".to_vec()], - // // vec![b"RRA".to_vec(), b"STT".to_vec()], - // // vec![b"VVE".to_vec(), b"ARR".to_vec()], - // // vec![b"SSS".to_vec(), b"LDD".to_vec()], - // // ]; - // for (frame, expected_frame_kmers) in frames.frames().iter().zip(expected_kmers.iter()) { - // let kmer_iter = frame.kmer_iter(ksize, seed, false); + // for (frame, expected_frame_kmers) in frames.iter().zip(expected_kmers.iter()) { + // let kmer_iter = frame.kmer_iter(k_size, seed, false); // // Compute hashes for expected k-mers // // let expected_hashes: Vec = expected_frame_kmers From 7535decf7f3f6c405d341fd54ec4aba8a3162dd3 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Tue, 10 Dec 2024 12:47:29 -0800 Subject: [PATCH 07/16] upd comments --- src/core/src/signature.rs | 39 +++++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs index 7185061a37..a2f36e26b3 100644 --- a/src/core/src/signature.rs +++ b/src/core/src/signature.rs @@ -167,8 +167,8 @@ impl SigsTrait for Sketch { pub enum ReadingFrame { DNA { fw: Vec, - rc: Vec, // Reverse complement is required - len: usize, + rc: Vec, + len: usize, // len gives max_index for kmer iterator }, Protein { fw: Vec, // Only forward frame @@ -218,8 +218,8 @@ impl ReadingFrame { } pub fn new_skipmer(seq: &[u8], start: usize, m: usize, n: usize) -> Self { - if start > n { - panic!("Skipmer frame number must be <= n ({})", n); + if start >= n { + panic!("Skipmer frame number must be < n ({})", n); } // Generate forward skipmer frame let fw: Vec = seq @@ -239,21 +239,21 @@ impl ReadingFrame { panic!("Frame number must be 0, 1, or 2"); } - // Translate the forward frame + // translate sequence let fw: Vec = sequence .iter() .cloned() - .skip(frame_number) // Skip the initial bases for the frame - .take(sequence.len() - frame_number) // Adjust length based on skipped bases - .collect::>() // Collect the DNA subsequence - .chunks(3) // Group into codons (triplets) - .filter_map(|codon| to_aa(codon, dayhoff, hp).ok()) // Translate each codon - .flatten() // Flatten the nested results into a single sequence + .skip(frame_number) // skip the initial bases for the frame + .take(sequence.len() - frame_number) // adjust length based on skipped bases + .collect::>() // collect the DNA subsequence + .chunks(3) // group into codons (triplets) + .filter_map(|codon| to_aa(codon, dayhoff, hp).ok()) // translate each codon + .flatten() // flatten the nested results into a single sequence .collect(); let len = fw.len(); - // Return a Protein reading frame (no reverse complement for translated frames) + // return protein reading frame ReadingFrame::Protein { fw, len } } @@ -421,26 +421,28 @@ impl SeqToHashes { } } - /// Generate DNA frames (forward + reverse complement) + /// generate frames from DNA: 1 DNA frame (fw+rc) fn dna_frames(seq: &[u8]) -> Vec { - vec![ReadingFrame::new_dna(&seq.to_ascii_uppercase())] + vec![ReadingFrame::new_dna(&seq)] } + /// generate frames from protein: 1 protein frame fn protein_frames(seq: &[u8], hash_function: &HashFunctions) -> Vec { vec![ReadingFrame::new_protein( - &seq.to_ascii_uppercase(), + &seq, hash_function.dayhoff(), hash_function.hp(), )] } + /// generate translated frames: 6 protein frames fn translated_frames(seq: &[u8], hash_function: &HashFunctions) -> Vec { - let revcomp_sequence = revcomp(&seq.to_ascii_uppercase()); + let revcomp_sequence = revcomp(&seq); (0..3) .flat_map(|frame_number| { vec![ ReadingFrame::new_translated( - &seq.to_ascii_uppercase(), + &seq, frame_number, hash_function.dayhoff(), hash_function.hp(), @@ -456,6 +458,7 @@ impl SeqToHashes { .collect() } + /// generate skipmer frames: 3 DNA frames (each with fw+rc) fn skipmer_frames(seq: &[u8], hash_function: &HashFunctions) -> Vec { let (m, n) = if hash_function.skipm1n3() { (1, 3) @@ -465,7 +468,7 @@ impl SeqToHashes { (0..3) .flat_map(|frame_number| { vec![ReadingFrame::new_skipmer( - &seq.to_ascii_uppercase(), + &seq, frame_number, m, n, From b9e68f0060b35b37950ed7cb8e7f0b7c03697d77 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Tue, 10 Dec 2024 17:18:19 -0800 Subject: [PATCH 08/16] saving intermediate state --- src/core/src/signature.rs | 712 +++++++++++++++++++++----------------- 1 file changed, 396 insertions(+), 316 deletions(-) diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs index a2f36e26b3..8a724e950c 100644 --- a/src/core/src/signature.rs +++ b/src/core/src/signature.rs @@ -163,7 +163,7 @@ impl SigsTrait for Sketch { } } -#[derive(Debug)] +#[derive(Debug, Clone)] pub enum ReadingFrame { DNA { fw: Vec, @@ -271,14 +271,25 @@ impl ReadingFrame { } } - pub fn kmer_iter<'a>(&'a self, ksize: usize, seed: u64, force: bool) -> KmerIterator<'a> { + pub fn kmer_count(&self, k_size: usize) -> usize { match self { - ReadingFrame::DNA { .. } => KmerIterator::new(self, ksize, seed, force), - ReadingFrame::Protein { .. } => KmerIterator::new(self, ksize, seed, force), + ReadingFrame::DNA { len, .. } | ReadingFrame::Protein { len, .. } => { + if *len >= k_size { + len - k_size + 1 + } else { + 0 // No k-mers possible if len is smaller than k_size + } + } } } + + pub fn kmer_iter(&self, ksize: usize, seed: u64, force: bool) -> KmerIterator { + KmerIterator::new(self, ksize, seed, force) + } + } +#[derive(Debug, Clone)] pub struct KmerIterator<'a> { frame: &'a ReadingFrame, // Reference to the ReadingFrame ksize: usize, @@ -312,45 +323,6 @@ impl<'a> KmerIterator<'a> { } Ok(()) } - - fn get_dna_hash(&mut self, fw: &[u8], rc: &[u8]) -> Option> { - let kmer = &fw[self.index..self.index + self.ksize]; - - if !self.force { - if let Err(e) = self.validate_dna_kmer(kmer) { - self.index += 1; - return Some(Err(e)); - } - } - - let krc = &rc[rc.len() - self.ksize - self.index..rc.len() - self.index]; - let hash = crate::_hash_murmur(std::cmp::min(kmer, krc), self.seed); - // NTP TESTING - eprintln!( - "Forward DNA k-mer: {}, Reverse Complement k-mer: {}, hash: {}", - String::from_utf8_lossy(kmer), - String::from_utf8_lossy(krc), - hash, - ); - - self.index += 1; - Some(Ok(hash)) - } - - fn get_protein_hash(&mut self, fw: &[u8]) -> Option> { - let kmer = &fw[self.index..self.index + self.ksize]; - - let hash = crate::_hash_murmur(kmer, self.seed); - // NTP TESTING - eprintln!( - "Protein k-mer: {}, hash: {}", - String::from_utf8_lossy(kmer), - hash - ); - - self.index += 1; - Some(Ok(hash)) - } } impl<'a> Iterator for KmerIterator<'a> { @@ -358,31 +330,60 @@ impl<'a> Iterator for KmerIterator<'a> { fn next(&mut self) -> Option { match self.frame { - ReadingFrame::DNA { fw, rc, len } => { + ReadingFrame::DNA { fw, rc, len, .. } => { if self.out_of_bounds(*len) { return None; } - self.get_dna_hash(fw, rc) + + let kmer = &fw[self.index..self.index + self.ksize]; + if !self.force { + if let Err(e) = self.validate_dna_kmer(kmer) { + self.index += 1; + return Some(Err(e)); + } + } + + let krc = &rc[rc.len() - self.ksize - self.index..rc.len() - self.index]; + let hash = crate::_hash_murmur(std::cmp::min(kmer, krc), self.seed); + // NTP TESTING + eprintln!( + "Forward DNA k-mer: {}, Reverse Complement k-mer: {}, hash: {}", + String::from_utf8_lossy(kmer), + String::from_utf8_lossy(krc), + hash, + ); + self.index += 1; + Some(Ok(hash)) } - ReadingFrame::Protein { fw, len } => { + ReadingFrame::Protein { fw, len, .. } => { if self.out_of_bounds(*len) { return None; } - self.get_protein_hash(fw) + let kmer = &fw[self.index..self.index + self.ksize]; + let hash = crate::_hash_murmur(kmer, self.seed); + // NTP TESTING + eprintln!( + "Protein k-mer: {}, hash: {}", + String::from_utf8_lossy(kmer), + hash + ); + self.index += 1; + Some(Ok(hash)) } } } } -pub struct SeqToHashes { +pub struct SeqToHashes<'a> { k_size: usize, force: bool, seed: u64, frames: Vec, frame_index: usize, // Index of the current frame + current_kmer_iter: Option>, } -impl SeqToHashes { +impl<'a> SeqToHashes<'a> { pub fn new( seq: &[u8], k_size: usize, @@ -418,6 +419,7 @@ impl SeqToHashes { seed, frames, frame_index: 0, + current_kmer_iter: None, } } @@ -466,43 +468,90 @@ impl SeqToHashes { (2, 3) }; (0..3) - .flat_map(|frame_number| { - vec![ReadingFrame::new_skipmer( - &seq, - frame_number, - m, - n, - )] - }) + .flat_map(|frame_number| vec![ReadingFrame::new_skipmer(&seq, frame_number, m, n)]) .collect() } } -impl Iterator for SeqToHashes { +impl<'a> Iterator for SeqToHashes<'a> { type Item = Result; fn next(&mut self) -> Option { - // Iterate over the frames using frame_index while self.frame_index < self.frames.len() { - let frame = &self.frames[self.frame_index]; - - // Create a KmerIterator for the current frame - let mut kmer_iter = frame.kmer_iter(self.k_size, self.seed, self.force); + // Initialize the kmer_iter for the current frame if it is None + if self.current_kmer_iter.is_none() { + let frame = &self.frames[self.frame_index]; + self.current_kmer_iter = Some(frame.kmer_iter(self.k_size, self.seed, self.force)); + } - // Process k-mers in the current frame - if let Some(hash_result) = kmer_iter.next() { - return Some(hash_result); // Return the next hash + // Attempt to get the next k-mer from the current iterator + if let Some(ref mut kmer_iter) = self.current_kmer_iter { + if let Some(hash_result) = kmer_iter.next() { + return Some(hash_result); + } } - // Move to the next frame if the current one is exhausted + // If the current iterator is exhausted, move to the next frame + self.current_kmer_iter = None; self.frame_index += 1; } - // All frames exhausted + // All frames and iterators are exhausted None } } +// impl<'a> Iterator for SeqToHashes<'a> { +// type Item = Result; + +// fn next(&mut self) -> Option { +// while self.frame_index < self.frames.len() { +// // Initialize kmer_iter for the current frame if it is None +// if self.current_kmer_iter.is_none() { +// let frame = &self.frames[self.frame_index]; +// self.current_kmer_iter = Some(frame.kmer_iter(self.k_size, self.seed, self.force)); +// } + +// // Attempt to get the next hash from the current iterator +// if let Some(ref mut kmer_iter) = self.current_kmer_iter { +// if let Some(hash_result) = kmer_iter.next() { +// return Some(hash_result); +// } +// } + +// // If the current iterator is exhausted, move to the next frame +// self.current_kmer_iter = None; +// self.frame_index += 1; +// } + +// // All frames and iterators are exhausted +// None +// } +// } +// impl Iterator for SeqToHashes { +// type Item = Result; + +// fn next(&mut self) -> Option { +// // Iterate over the frames using frame_index +// while self.frame_index < self.frames.len() { +// let frame = &self.frames[self.frame_index]; +// // Create a KmerIterator for the current frame +// let mut kmer_iter = frame.kmer_iter(self.k_size, self.seed, self.force); + +// // Process k-mers in the current frame +// if let Some(hash_result) = kmer_iter.next() { +// return Some(hash_result); // Return the next hash +// } + +// // Move to the next frame if the current one is exhausted +// self.frame_index += 1; +// } + +// // All frames exhausted +// None +// } +// } + #[derive(Serialize, Deserialize, Debug, Clone, TypedBuilder)] #[cfg_attr( feature = "rkyv", @@ -1454,43 +1503,6 @@ mod test { } } - #[test] - fn test_seqtohashes_dna() { - let sequence = b"AGTCGTCA"; - let k_size = 7; - let seed = 42; - let force = true; // Force skip over invalid bases if needed - let is_protein = false; - // Initialize SeqToHashes iterator using the new constructor - let mut seq_to_hashes = SeqToHashes::new( - sequence, - k_size, - force, - is_protein, - HashFunctions::Murmur64Dna, - seed, - ); - - // Define expected hashes for the kmer configuration. - let expected_kmers = ["AGTCGTC", "GTCGTCA"]; - let expected_krc = ["GACGACT", "TGACGAC"]; - - // Compute expected hashes by hashing each k-mer with its reverse complement - let expected_hashes: Vec = expected_kmers - .iter() - .zip(expected_krc.iter()) - .map(|(kmer, krc)| { - // Convert both kmer and krc to byte slices and pass to _hash_murmur - crate::_hash_murmur(std::cmp::min(kmer.as_bytes(), krc.as_bytes()), seed) - }) - .collect(); - - // Compare each produced hash from the iterator with the expected hash - for expected_hash in expected_hashes { - let hash = seq_to_hashes.next().unwrap().ok().unwrap(); - assert_eq!(hash, expected_hash, "Mismatch in DNA hash"); - } - } #[test] fn test_seqtohashes_skipm2n3() { @@ -1533,104 +1545,100 @@ mod test { } } - // #[test] - // fn test_reading_frames_new_dna() { - // let sequence = b"AGTCGT"; - // let hash_function = HashFunctions::Murmur64Dna; - - // let frames = ReadingFrames::new(sequence, false, &hash_function); - - // assert_eq!(frames.frames().len(), 1); // Only one fw/rc readingframe for DNA - // assert_eq!(frames.frames()[0].fw, sequence.to_vec()); - // assert_eq!(frames.frames()[0].rc, Some(b"ACGACT".to_vec())); - // } - - // #[test] - // fn test_reading_frames_new_is_protein() { - // let sequence = b"MVLSPADKTNVKAAW"; - // let hash_function = HashFunctions::Murmur64Protein; - - // let frames = ReadingFrames::new(sequence, true, &hash_function); - - // assert_eq!(frames.frames().len(), 1); // Only one frame for protein - // assert_eq!(frames.frames()[0].fw, sequence.to_vec()); - // assert_eq!(frames.frames()[0].rc, None); // No reverse complement for protein - // } - - // #[test] - // fn test_reading_frames_translate_frames() { - // let sequence = b"AGTCGTCGAGCT"; - // let hash_function = HashFunctions::Murmur64Protein; - - // let frames = ReadingFrames::new(sequence, false, &hash_function); - // eprintln!("frames: {}", frames); + #[test] + fn test_reading_frame_new_dna() { + let sequence = b"AGTCGT"; + let hash_function = HashFunctions::Murmur64Dna; - // assert_eq!(frames.frames().len(), 3); // Three translated frames - // assert_eq!(frames.frames()[0].fw, b"SRRA".to_vec()); - // assert_eq!(frames.frames()[0].rc, Some(b"SSTT".to_vec())); - // assert_eq!(frames.frames()[1].fw, b"VVE".to_vec()); - // assert_eq!(frames.frames()[1].rc, Some(b"ARR".to_vec())); - // assert_eq!(frames.frames()[2].fw, b"SSS".to_vec()); - // assert_eq!(frames.frames()[2].rc, Some(b"LDD".to_vec())); - // } + let frames = ReadingFrame::new_dna(sequence); - // #[test] - // fn test_reading_frames_skipmer_frames_m1n3() { - // let sequence = b"AGTCGTCGAGCT"; - // let hash_function = HashFunctions::Murmur64Skipm1n3; + assert_eq!(frames.get_fw(), Some(sequence.as_slice())); + assert_eq!(frames.get_rc(), Some(b"ACGACT".as_slice())); + } - // let frames = ReadingFrames::new(sequence, false, &hash_function); + #[test] + fn test_reading_frames_new_is_protein() { + let sequence = b"MVLSPADKTNVKAAW"; - // assert_eq!(frames.frames().len(), 3); // Three skipmer frames + let frames = ReadingFrame::new_protein(sequence, false, false); - // // Expected skipmer sequences for m=1, n=3 (keep-1, skip-2) - // let expected_fw_frame_0 = b"ACCG".to_vec(); // Frame 0: Keep 1 base, skip 2 bases, starting from index 0 - // let expected_fw_frame_1 = b"GGGC".to_vec(); // Frame 1: Keep 1 base, skip 2 bases, starting from index 1 - // let expected_fw_frame_2 = b"TTAT".to_vec(); // Frame 2: Keep 1 base, skip 2 bases, starting from index 2 + assert_eq!(frames.get_fw(), Some(sequence.as_slice())); + assert_eq!(frames.get_rc(), None); // No reverse complement for protein + } - // let expected_rc_frame_0 = revcomp(&expected_fw_frame_0); - // let expected_rc_frame_1 = revcomp(&expected_fw_frame_1); - // let expected_rc_frame_2 = revcomp(&expected_fw_frame_2); + #[test] + fn test_reading_frame_translate() { + let sequence = b"AGTCGTCGAGCT"; + let revcomp = revcomp(sequence); + let mut frames = Vec::new(); + for i in 0..3 { + let frame_fw = ReadingFrame::new_translated(sequence, i, false, false); + eprintln!("frame: {}", frame_fw); + frames.push(frame_fw); + let frame_rc = ReadingFrame::new_translated(&revcomp, i, false, false); + eprintln!("frame: {}", frame_rc); + frames.push(frame_rc); + } - // // Check forward and reverse complement sequences for each frame - // assert_eq!(frames.frames()[0].fw, expected_fw_frame_0); - // assert_eq!(frames.frames()[0].rc, Some(expected_rc_frame_0)); + assert_eq!(frames[0].get_fw(), Some(b"SRRA".as_slice())); + assert_eq!(frames[1].get_fw(), Some(b"SSTT".as_slice())); + assert_eq!(frames[2].get_fw(), Some(b"VVE".as_slice())); + assert_eq!(frames[3].get_fw(), Some(b"ARR".as_slice())); + assert_eq!(frames[4].get_fw(), Some(b"SSS".as_slice())); + assert_eq!(frames[5].get_fw(), Some(b"LDD".as_slice())); + } - // assert_eq!(frames.frames()[1].fw, expected_fw_frame_1); - // assert_eq!(frames.frames()[1].rc, Some(expected_rc_frame_1)); + #[test] + fn test_reading_frame_skipmer_m1n3() { + let sequence = b"AGTCGTCGAGCT"; + let m = 1; + let n = 3; + + let mut frames = Vec::new(); + for start in 0..3 { + let frame = ReadingFrame::new_skipmer(sequence, start, m, n); + eprintln!("frame: {}", frame); + frames.push(frame); + } - // assert_eq!(frames.frames()[2].fw, expected_fw_frame_2); - // assert_eq!(frames.frames()[2].rc, Some(expected_rc_frame_2)); - // } + assert_eq!(frames.len(), 3); // Three skipmer frames - // #[test] - // fn test_reading_frames_skipmer_frames_m2n3() { - // let sequence = b"AGTCGTCGAGCT"; - // let hash_function = HashFunctions::Murmur64Skipm2n3; + // Expected skipmer sequences for m=1, n=3 (keep-1, skip-2) + assert_eq!(frames[0].get_fw(), Some(b"ACCG".as_slice())); + assert_eq!(frames[0].get_rc(), Some(b"CGGT".as_slice())); - // let frames = ReadingFrames::new(sequence, false, &hash_function); + assert_eq!(frames[1].get_fw(), Some(b"GGGC".as_slice())); + assert_eq!(frames[1].get_rc(), Some(b"GCCC".as_slice())); - // assert_eq!(frames.frames().len(), 3); // Three skipmer frames + assert_eq!(frames[2].get_fw(), Some(b"TTAT".as_slice())); + assert_eq!(frames[2].get_rc(), Some(b"ATAA".as_slice())); + } - // // Expected skipmer sequences for m=1, n=3 (keep-1, skip-2) - // let expected_fw_frame_0 = b"AGCGCGGC".to_vec(); - // let expected_fw_frame_1 = b"GTGTGACT".to_vec(); - // let expected_fw_frame_2 = b"TCTCAGT".to_vec(); + #[test] + fn test_reading_frame_skipmer_m2n3() { + let sequence = b"AGTCGTCGAGCT"; + let m = 2; + let n = 3; + + let mut frames = Vec::new(); + for start in 0..3 { + let frame = ReadingFrame::new_skipmer(sequence, start, m, n); + eprintln!("frame: {}", frame); + frames.push(frame); + } - // let expected_rc_frame_0 = revcomp(&expected_fw_frame_0); - // let expected_rc_frame_1 = revcomp(&expected_fw_frame_1); - // let expected_rc_frame_2 = revcomp(&expected_fw_frame_2); + assert_eq!(frames.len(), 3); // Three skipmer frames - // // Check forward and reverse complement sequences for each frame - // assert_eq!(frames.frames()[0].fw, expected_fw_frame_0); - // assert_eq!(frames.frames()[0].rc, Some(expected_rc_frame_0)); + // Expected skipmer sequences for m=1, n=3 (keep-1, skip-2) + assert_eq!(frames[0].get_fw(), Some(b"AGCGCGGC".as_slice())); + assert_eq!(frames[0].get_rc(), Some(b"GCCGCGCT".as_slice())); - // assert_eq!(frames.frames()[1].fw, expected_fw_frame_1); - // assert_eq!(frames.frames()[1].rc, Some(expected_rc_frame_1)); + assert_eq!(frames[1].get_fw(), Some(b"GTGTGACT".as_slice())); + assert_eq!(frames[1].get_rc(), Some(b"AGTCACAC".as_slice())); - // assert_eq!(frames.frames()[2].fw, expected_fw_frame_2); - // assert_eq!(frames.frames()[2].rc, Some(expected_rc_frame_2)); - // } + assert_eq!(frames[2].get_fw(), Some(b"TCTCAGT".as_slice())); + assert_eq!(frames[2].get_rc(), Some(b"ACTGAGA".as_slice())); + } #[test] fn test_reading_frame_kmer_iter() { @@ -1722,133 +1730,205 @@ mod test { ); } - // #[test] - // fn test_kmer_iter_translate_frames() { - // let sequence = b"AGTCGTCGAGCT"; - // let hash_function = HashFunctions::Murmur64Protein; - // let k_size =3; - // let seed = 42; - // let force = false; - // let is_protein = false; - - // let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed); - // let frames = sth.frames; - - // // Three translated frames - // assert_eq!(frames.len(), 3); - // for frame in frames { - // eprintln!("frames: {}", frame); - // } - - // // Expected k-mers for translated frames - // let frame1_kmers = vec![ - // vec![b"SRR".to_vec(), b"SST".to_vec()], - // vec![b"RRA".to_vec(), b"STT".to_vec()], - // ]; - // let frame2_kmers = vec![vec![b"VVE".to_vec(), b"ARR".to_vec()]]; - // let frame3_kmers = vec![vec![b"SSS".to_vec(), b"LDD".to_vec()]]; - // let expected_kmers = vec![frame1_kmers, frame2_kmers, frame3_kmers]; - - // for (frame, expected_frame_kmers) in frames.iter().zip(expected_kmers.iter()) { - // let kmer_iter = frame.kmer_iter(k_size, seed, false); - - // // Compute hashes for expected k-mers - // // let expected_hashes: Vec = expected_frame_kmers - // // .iter() - // // .map(|fw_kmer| crate::_hash_murmur(fw_kmer, 42)) - // // .collect(); - // let expected_hashes: Vec = expected_frame_kmers - // .iter() - // .flat_map(|kmers| { - // kmers.iter().map(|fw_kmer| { - // let rc_kmer = revcomp(fw_kmer); // Compute reverse complement - // crate::_hash_murmur(std::cmp::min(fw_kmer, &rc_kmer), seed) - // }) - // }) - // .collect(); - - // // Collect hashes produced by the kmer_iter - // let produced_hashes: Vec = kmer_iter.map(|result| result.unwrap()).collect(); - - // // Check that produced hashes match expected hashes in order - // assert_eq!( - // produced_hashes, expected_hashes, - // "Hashes do not match in order for frame" - // ); - // } - // } - - // #[test] - // fn test_kmer_iter_skipmer_m1n3() { - // let sequence = b"AGTCGTCGAGCT"; - // let hash_function = HashFunctions::Murmur64Skipm1n3; - - // let frames = ReadingFrames::new(sequence, false, &hash_function); - - // // Three skipmer frames - // assert_eq!(frames.frames().len(), 3); - - // // Expected k-mers for skipmer (m=1, n=3) - // let expected_kmers = vec![ - // vec![b"ACC".to_vec()], - // vec![b"GTG".to_vec()], - // vec![b"TCG".to_vec()], - // ]; - - // for (frame, expected_frame_kmers) in frames.frames().iter().zip(expected_kmers.iter()) { - // let kmer_iter = frame.kmer_iter(3, 42, false); - - // // Compute hashes for expected k-mers - // let expected_hashes: Vec = expected_frame_kmers - // .iter() - // .map(|fw_kmer| crate::_hash_murmur(fw_kmer, 42)) - // .collect(); - - // // Collect hashes produced by the kmer_iter - // let produced_hashes: Vec = kmer_iter.map(|result| result.unwrap()).collect(); - - // // Check that produced hashes match expected hashes in order - // assert_eq!( - // produced_hashes, expected_hashes, - // "Hashes do not match in order for frame" - // ); - // } - // } - - // #[test] - // fn test_kmer_iter_skipmer_m2n3() { - // let sequence = b"AGTCGTCGAGCT"; - // let hash_function = HashFunctions::Murmur64Skipm2n3; - - // let frames = ReadingFrames::new(sequence, false, &hash_function); - - // // Three skipmer frames - // assert_eq!(frames.frames().len(), 3); - - // // Expected k-mers for skipmer (m=2, n=3) - // let expected_kmers = vec![ - // vec![b"AGC".to_vec()], - // vec![b"GTG".to_vec()], - // vec![b"TCA".to_vec()], - // ]; - - // for (frame, expected_frame_kmers) in frames.frames().iter().zip(expected_kmers.iter()) { - // let kmer_iter = frame.kmer_iter(3, 42, false); - - // // Compute hashes for expected k-mers - // let expected_hashes: Vec = expected_frame_kmers - // .iter() - // .map(|fw_kmer| crate::_hash_murmur(fw_kmer, 42)) - // .collect(); - - // // Collect hashes produced by the kmer_iter - // let produced_hashes: Vec = kmer_iter.map(|result| result.unwrap()).collect(); - - // // Check that produced hashes match expected hashes in order - // assert_eq!( - // produced_hashes, expected_hashes, - // "Hashes do not match in order for frame" - // ); - // } - // } -} + #[test] + fn test_kmer_iter_translate_frames() { + let sequence = b"AGTCGTCGAGCT"; + let hash_function = HashFunctions::Murmur64Protein; + let k_size =3; + let seed = 42; + let force = false; + let is_protein = false; + + let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed); + let frames = sth.frames; + + assert_eq!(frames[0].get_fw(), Some(b"SRRA".as_slice())); + assert_eq!(frames[1].get_fw(), Some(b"SSTT".as_slice())); + assert_eq!(frames[2].get_fw(), Some(b"VVE".as_slice())); + assert_eq!(frames[3].get_fw(), Some(b"ARR".as_slice())); + assert_eq!(frames[4].get_fw(), Some(b"SSS".as_slice())); + assert_eq!(frames[5].get_fw(), Some(b"LDD".as_slice())); + + // Six translated frames + assert_eq!(frames.len(), 6); + + // Expected k-mers for translated frames + let f1_kmers = vec![b"SRR".as_slice(), b"RRA".as_slice()]; + let f2_kmers = vec![b"SST".as_slice(), b"STT".as_slice()]; + let f3_kmers = vec![b"VVE".as_slice()]; + let f4_kmers = vec![b"ARR".as_slice()]; + let f5_kmers = vec![b"SSS".as_slice()]; + let f6_kmers = vec![b"LDD".as_slice()]; + let expected_kmers = vec![f1_kmers, f2_kmers, f3_kmers, f4_kmers, f5_kmers, f6_kmers]; + + for (frame, expected_frame_kmers) in frames.iter().zip(expected_kmers.iter()) { + let kmer_iter = frame.kmer_iter(k_size, seed, false); + // Compute hashes for expected k-mers + let expected_hashes: Vec = expected_frame_kmers + .iter() + .map(|fw_kmer| crate::_hash_murmur(fw_kmer, 42)) + .collect(); + + // Collect hashes produced by the kmer_iter + let produced_hashes: Vec = kmer_iter.map(|result| result.unwrap()).collect(); + + // Check that produced hashes match expected hashes in order + assert_eq!( + produced_hashes, expected_hashes, + "Hashes do not match in order for frame" + ); + } + } + + #[test] + fn test_seqtohashes_kmer_iter_skipmer_m1n3() { + let sequence = b"AGTCGTCGAGCT"; + let hash_function = HashFunctions::Murmur64Skipm1n3; + let k_size = 3; + let is_protein = false; + let seed = 42; + let force = false; + + let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed); + // get hashes from sth + let frames = sth.frames.clone(); + let sth_hashes: Vec = sth.map(|result| result.unwrap()).collect(); + eprintln!("sth_hashes: {:?}", sth_hashes); + + // Three skipmer frames + assert_eq!(frames.len(), 3); + assert_eq!(frames[0].get_fw(), Some(b"ACCG".as_slice())); + assert_eq!(frames[0].get_rc(), Some(b"CGGT".as_slice())); + let f1_kmers = vec![(b"ACC".as_slice(), b"GGT".as_slice()), (b"CCG".as_slice(), b"CGG".as_slice())]; + + assert_eq!(frames[1].get_fw(), Some(b"GGGC".as_slice())); + assert_eq!(frames[1].get_rc(), Some(b"GCCC".as_slice())); + let f2_kmers = vec![(b"GGG".as_slice(), b"CCC".as_slice()), (b"GGC".as_slice(), b"GCC".as_slice())]; + + assert_eq!(frames[2].get_fw(), Some(b"TTAT".as_slice())); + assert_eq!(frames[2].get_rc(), Some(b"ATAA".as_slice())); + let f3_kmers = vec![(b"TTA".as_slice(), b"TAA".as_slice()), (b"TAT".as_slice(), b"ATA".as_slice())]; + + // Expected k-mers for skipmer (m=1, n=3) + let expected_kmers = vec![f1_kmers, f2_kmers, f3_kmers]; + + let mut all_expected = Vec::new(); + for (frame, expected_frame_kmers) in frames.iter().zip(expected_kmers.iter()) { + let kmer_iter = frame.kmer_iter(3, 42, false); + + // Compute hashes for expected k-mers + let expected_hashes: Vec = expected_frame_kmers + .iter() + .map(|(fw_kmer, rc_kmer)| crate::_hash_murmur(std::cmp::min(fw_kmer, rc_kmer), 42)) + .collect(); + + // Collect hashes produced by the kmer_iter + let produced_hashes: Vec = kmer_iter.map(|result| result.unwrap()).collect(); + + // Check that produced hashes match expected hashes in order + assert_eq!( + produced_hashes, expected_hashes, + "Hashes do not match in order for frame" + ); + // keep track of all expected hashes + all_expected.extend(expected_hashes); + } + + // Check that sth hashes match expected hashes in order + // assert_eq!( + // sth_hashes, all_expected, + // "Hashes do not match in order for SeqToHashes" + // ); + } + + + #[test] + fn test_kmer_iter_skipmer_m2n3() { + let sequence = b"AGTCGTCGAGCT"; + let hash_function = HashFunctions::Murmur64Skipm2n3; + let k_size = 7; + let is_protein = false; + let seed = 42; + let force = false; + + let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed); + let frames = sth.frames; + + // Three skipmer frames + assert_eq!(frames.len(), 3); + assert_eq!(frames[0].get_fw(), Some(b"AGCGCGGC".as_slice())); + assert_eq!(frames[0].get_rc(), Some(b"GCCGCGCT".as_slice())); + let f1_kmers = vec![(b"AGCGCGG".as_slice(), b"CCGCGCT".as_slice()), + (b"GCGCGGC".as_slice(), b"GCCGCGC".as_slice())]; + + assert_eq!(frames[1].get_fw(), Some(b"GTGTGACT".as_slice())); + assert_eq!(frames[1].get_rc(), Some(b"AGTCACAC".as_slice())); + let f2_kmers = vec![(b"GTGTGAC".as_slice(), b"GTCACAC".as_slice()), + (b"TGTGACT".as_slice(), b"AGTCACA".as_slice())]; + + assert_eq!(frames[2].get_fw(), Some(b"TCTCAGT".as_slice())); + assert_eq!(frames[2].get_rc(), Some(b"ACTGAGA".as_slice())); + let f3_kmers = vec![(b"TCTCAGT".as_slice(), b"ACTGAGA".as_slice())]; + + + // Expected k-mers for skipmer (m=2, n=3) + let expected_kmers = vec![f1_kmers, f2_kmers, f3_kmers]; + + for (frame, expected_frame_kmers) in frames.iter().zip(expected_kmers.iter()) { + // Compute hashes for expected k-mers + let expected_hashes: Vec = expected_frame_kmers + .iter() + .map(|(fw_kmer, rc_kmer)| crate::_hash_murmur(std::cmp::min(fw_kmer, rc_kmer), 42)) + .collect(); + + // Collect hashes produced by the kmer_iter + let kmer_iter = frame.kmer_iter(k_size, seed, force); + let produced_hashes: Vec = kmer_iter.map(|result| result.unwrap()).collect(); + + // Check that produced hashes match expected hashes in order + eprintln!("expected: {:?}, produced: {:?}", expected_hashes, produced_hashes); + assert_eq!( + produced_hashes, expected_hashes, + "Hashes do not match in order for frame" + ); + } + } + +#[test] + fn test_seqtohashes_dna() { + let sequence = b"AGTCGTCA"; + let k_size = 7; + let seed = 42; + let force = true; // Force skip over invalid bases if needed + let is_protein = false; + // Initialize SeqToHashes iterator using the new constructor + let mut seq_to_hashes = SeqToHashes::new( + sequence, + k_size, + force, + is_protein, + HashFunctions::Murmur64Dna, + seed, + ); + + // Define expected hashes for the kmer configuration. + let expected_kmers = ["AGTCGTC", "GTCGTCA"]; + let expected_krc = ["GACGACT", "TGACGAC"]; + + // Compute expected hashes by hashing each k-mer with its reverse complement + let expected_hashes: Vec = expected_kmers + .iter() + .zip(expected_krc.iter()) + .map(|(kmer, krc)| { + // Convert both kmer and krc to byte slices and pass to _hash_murmur + crate::_hash_murmur(std::cmp::min(kmer.as_bytes(), krc.as_bytes()), seed) + }) + .collect(); + + // Compare each produced hash from the iterator with the expected hash + for expected_hash in expected_hashes { + let hash = seq_to_hashes.next().unwrap().ok().unwrap(); + assert_eq!(hash, expected_hash, "Mismatch in DNA hash"); + } + } +} \ No newline at end of file From fe26c2183eb30d93aeff242edbdb6ede2cf68944 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Tue, 10 Dec 2024 17:33:24 -0800 Subject: [PATCH 09/16] back to no lifetimes --- src/core/src/signature.rs | 221 +++++++++++++++----------------------- 1 file changed, 89 insertions(+), 132 deletions(-) diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs index 8a724e950c..f257cd1338 100644 --- a/src/core/src/signature.rs +++ b/src/core/src/signature.rs @@ -271,25 +271,14 @@ impl ReadingFrame { } } - pub fn kmer_count(&self, k_size: usize) -> usize { + pub fn kmer_iter<'a>(&'a self, ksize: usize, seed: u64, force: bool) -> KmerIterator<'a> { match self { - ReadingFrame::DNA { len, .. } | ReadingFrame::Protein { len, .. } => { - if *len >= k_size { - len - k_size + 1 - } else { - 0 // No k-mers possible if len is smaller than k_size - } - } + ReadingFrame::DNA { .. } => KmerIterator::new(self, ksize, seed, force), + ReadingFrame::Protein { .. } => KmerIterator::new(self, ksize, seed, force), } } - - pub fn kmer_iter(&self, ksize: usize, seed: u64, force: bool) -> KmerIterator { - KmerIterator::new(self, ksize, seed, force) - } - } -#[derive(Debug, Clone)] pub struct KmerIterator<'a> { frame: &'a ReadingFrame, // Reference to the ReadingFrame ksize: usize, @@ -323,6 +312,45 @@ impl<'a> KmerIterator<'a> { } Ok(()) } + + fn get_dna_hash(&mut self, fw: &[u8], rc: &[u8]) -> Option> { + let kmer = &fw[self.index..self.index + self.ksize]; + + if !self.force { + if let Err(e) = self.validate_dna_kmer(kmer) { + self.index += 1; + return Some(Err(e)); + } + } + + let krc = &rc[rc.len() - self.ksize - self.index..rc.len() - self.index]; + let hash = crate::_hash_murmur(std::cmp::min(kmer, krc), self.seed); + // NTP TESTING + eprintln!( + "Forward DNA k-mer: {}, Reverse Complement k-mer: {}, hash: {}", + String::from_utf8_lossy(kmer), + String::from_utf8_lossy(krc), + hash, + ); + + self.index += 1; + Some(Ok(hash)) + } + + fn get_protein_hash(&mut self, fw: &[u8]) -> Option> { + let kmer = &fw[self.index..self.index + self.ksize]; + + let hash = crate::_hash_murmur(kmer, self.seed); + // NTP TESTING + eprintln!( + "Protein k-mer: {}, hash: {}", + String::from_utf8_lossy(kmer), + hash + ); + + self.index += 1; + Some(Ok(hash)) + } } impl<'a> Iterator for KmerIterator<'a> { @@ -330,60 +358,31 @@ impl<'a> Iterator for KmerIterator<'a> { fn next(&mut self) -> Option { match self.frame { - ReadingFrame::DNA { fw, rc, len, .. } => { + ReadingFrame::DNA { fw, rc, len } => { if self.out_of_bounds(*len) { return None; } - - let kmer = &fw[self.index..self.index + self.ksize]; - if !self.force { - if let Err(e) = self.validate_dna_kmer(kmer) { - self.index += 1; - return Some(Err(e)); - } - } - - let krc = &rc[rc.len() - self.ksize - self.index..rc.len() - self.index]; - let hash = crate::_hash_murmur(std::cmp::min(kmer, krc), self.seed); - // NTP TESTING - eprintln!( - "Forward DNA k-mer: {}, Reverse Complement k-mer: {}, hash: {}", - String::from_utf8_lossy(kmer), - String::from_utf8_lossy(krc), - hash, - ); - self.index += 1; - Some(Ok(hash)) + self.get_dna_hash(fw, rc) } - ReadingFrame::Protein { fw, len, .. } => { + ReadingFrame::Protein { fw, len } => { if self.out_of_bounds(*len) { return None; } - let kmer = &fw[self.index..self.index + self.ksize]; - let hash = crate::_hash_murmur(kmer, self.seed); - // NTP TESTING - eprintln!( - "Protein k-mer: {}, hash: {}", - String::from_utf8_lossy(kmer), - hash - ); - self.index += 1; - Some(Ok(hash)) + self.get_protein_hash(fw) } } } } -pub struct SeqToHashes<'a> { +pub struct SeqToHashes { k_size: usize, force: bool, seed: u64, frames: Vec, frame_index: usize, // Index of the current frame - current_kmer_iter: Option>, } -impl<'a> SeqToHashes<'a> { +impl SeqToHashes { pub fn new( seq: &[u8], k_size: usize, @@ -419,7 +418,6 @@ impl<'a> SeqToHashes<'a> { seed, frames, frame_index: 0, - current_kmer_iter: None, } } @@ -473,85 +471,31 @@ impl<'a> SeqToHashes<'a> { } } -impl<'a> Iterator for SeqToHashes<'a> { +impl Iterator for SeqToHashes { type Item = Result; fn next(&mut self) -> Option { + // Iterate over the frames using frame_index while self.frame_index < self.frames.len() { - // Initialize the kmer_iter for the current frame if it is None - if self.current_kmer_iter.is_none() { - let frame = &self.frames[self.frame_index]; - self.current_kmer_iter = Some(frame.kmer_iter(self.k_size, self.seed, self.force)); - } + let frame = &self.frames[self.frame_index]; - // Attempt to get the next k-mer from the current iterator - if let Some(ref mut kmer_iter) = self.current_kmer_iter { - if let Some(hash_result) = kmer_iter.next() { - return Some(hash_result); - } + // Create a KmerIterator for the current frame + let mut kmer_iter = frame.kmer_iter(self.k_size, self.seed, self.force); + + // Process k-mers in the current frame + if let Some(hash_result) = kmer_iter.next() { + return Some(hash_result); // Return the next hash } - // If the current iterator is exhausted, move to the next frame - self.current_kmer_iter = None; + // Move to the next frame if the current one is exhausted self.frame_index += 1; } - // All frames and iterators are exhausted + // All frames exhausted None } } -// impl<'a> Iterator for SeqToHashes<'a> { -// type Item = Result; - -// fn next(&mut self) -> Option { -// while self.frame_index < self.frames.len() { -// // Initialize kmer_iter for the current frame if it is None -// if self.current_kmer_iter.is_none() { -// let frame = &self.frames[self.frame_index]; -// self.current_kmer_iter = Some(frame.kmer_iter(self.k_size, self.seed, self.force)); -// } - -// // Attempt to get the next hash from the current iterator -// if let Some(ref mut kmer_iter) = self.current_kmer_iter { -// if let Some(hash_result) = kmer_iter.next() { -// return Some(hash_result); -// } -// } - -// // If the current iterator is exhausted, move to the next frame -// self.current_kmer_iter = None; -// self.frame_index += 1; -// } - -// // All frames and iterators are exhausted -// None -// } -// } -// impl Iterator for SeqToHashes { -// type Item = Result; - -// fn next(&mut self) -> Option { -// // Iterate over the frames using frame_index -// while self.frame_index < self.frames.len() { -// let frame = &self.frames[self.frame_index]; -// // Create a KmerIterator for the current frame -// let mut kmer_iter = frame.kmer_iter(self.k_size, self.seed, self.force); - -// // Process k-mers in the current frame -// if let Some(hash_result) = kmer_iter.next() { -// return Some(hash_result); // Return the next hash -// } - -// // Move to the next frame if the current one is exhausted -// self.frame_index += 1; -// } - -// // All frames exhausted -// None -// } -// } - #[derive(Serialize, Deserialize, Debug, Clone, TypedBuilder)] #[cfg_attr( feature = "rkyv", @@ -1503,7 +1447,6 @@ mod test { } } - #[test] fn test_seqtohashes_skipm2n3() { let sequence = b"AGTCGTCA"; @@ -1734,7 +1677,7 @@ mod test { fn test_kmer_iter_translate_frames() { let sequence = b"AGTCGTCGAGCT"; let hash_function = HashFunctions::Murmur64Protein; - let k_size =3; + let k_size = 3; let seed = 42; let force = false; let is_protein = false; @@ -1753,7 +1696,7 @@ mod test { assert_eq!(frames.len(), 6); // Expected k-mers for translated frames - let f1_kmers = vec![b"SRR".as_slice(), b"RRA".as_slice()]; + let f1_kmers = vec![b"SRR".as_slice(), b"RRA".as_slice()]; let f2_kmers = vec![b"SST".as_slice(), b"STT".as_slice()]; let f3_kmers = vec![b"VVE".as_slice()]; let f4_kmers = vec![b"ARR".as_slice()]; @@ -1799,15 +1742,24 @@ mod test { assert_eq!(frames.len(), 3); assert_eq!(frames[0].get_fw(), Some(b"ACCG".as_slice())); assert_eq!(frames[0].get_rc(), Some(b"CGGT".as_slice())); - let f1_kmers = vec![(b"ACC".as_slice(), b"GGT".as_slice()), (b"CCG".as_slice(), b"CGG".as_slice())]; + let f1_kmers = vec![ + (b"ACC".as_slice(), b"GGT".as_slice()), + (b"CCG".as_slice(), b"CGG".as_slice()), + ]; assert_eq!(frames[1].get_fw(), Some(b"GGGC".as_slice())); assert_eq!(frames[1].get_rc(), Some(b"GCCC".as_slice())); - let f2_kmers = vec![(b"GGG".as_slice(), b"CCC".as_slice()), (b"GGC".as_slice(), b"GCC".as_slice())]; + let f2_kmers = vec![ + (b"GGG".as_slice(), b"CCC".as_slice()), + (b"GGC".as_slice(), b"GCC".as_slice()), + ]; assert_eq!(frames[2].get_fw(), Some(b"TTAT".as_slice())); assert_eq!(frames[2].get_rc(), Some(b"ATAA".as_slice())); - let f3_kmers = vec![(b"TTA".as_slice(), b"TAA".as_slice()), (b"TAT".as_slice(), b"ATA".as_slice())]; + let f3_kmers = vec![ + (b"TTA".as_slice(), b"TAA".as_slice()), + (b"TAT".as_slice(), b"ATA".as_slice()), + ]; // Expected k-mers for skipmer (m=1, n=3) let expected_kmers = vec![f1_kmers, f2_kmers, f3_kmers]; @@ -1841,7 +1793,6 @@ mod test { // ); } - #[test] fn test_kmer_iter_skipmer_m2n3() { let sequence = b"AGTCGTCGAGCT"; @@ -1858,22 +1809,25 @@ mod test { assert_eq!(frames.len(), 3); assert_eq!(frames[0].get_fw(), Some(b"AGCGCGGC".as_slice())); assert_eq!(frames[0].get_rc(), Some(b"GCCGCGCT".as_slice())); - let f1_kmers = vec![(b"AGCGCGG".as_slice(), b"CCGCGCT".as_slice()), - (b"GCGCGGC".as_slice(), b"GCCGCGC".as_slice())]; + let f1_kmers = vec![ + (b"AGCGCGG".as_slice(), b"CCGCGCT".as_slice()), + (b"GCGCGGC".as_slice(), b"GCCGCGC".as_slice()), + ]; assert_eq!(frames[1].get_fw(), Some(b"GTGTGACT".as_slice())); assert_eq!(frames[1].get_rc(), Some(b"AGTCACAC".as_slice())); - let f2_kmers = vec![(b"GTGTGAC".as_slice(), b"GTCACAC".as_slice()), - (b"TGTGACT".as_slice(), b"AGTCACA".as_slice())]; + let f2_kmers = vec![ + (b"GTGTGAC".as_slice(), b"GTCACAC".as_slice()), + (b"TGTGACT".as_slice(), b"AGTCACA".as_slice()), + ]; assert_eq!(frames[2].get_fw(), Some(b"TCTCAGT".as_slice())); assert_eq!(frames[2].get_rc(), Some(b"ACTGAGA".as_slice())); - let f3_kmers = vec![(b"TCTCAGT".as_slice(), b"ACTGAGA".as_slice())]; - + let f3_kmers = vec![(b"TCTCAGT".as_slice(), b"ACTGAGA".as_slice())]; // Expected k-mers for skipmer (m=2, n=3) let expected_kmers = vec![f1_kmers, f2_kmers, f3_kmers]; - + for (frame, expected_frame_kmers) in frames.iter().zip(expected_kmers.iter()) { // Compute hashes for expected k-mers let expected_hashes: Vec = expected_frame_kmers @@ -1886,7 +1840,10 @@ mod test { let produced_hashes: Vec = kmer_iter.map(|result| result.unwrap()).collect(); // Check that produced hashes match expected hashes in order - eprintln!("expected: {:?}, produced: {:?}", expected_hashes, produced_hashes); + eprintln!( + "expected: {:?}, produced: {:?}", + expected_hashes, produced_hashes + ); assert_eq!( produced_hashes, expected_hashes, "Hashes do not match in order for frame" @@ -1894,7 +1851,7 @@ mod test { } } -#[test] + #[test] fn test_seqtohashes_dna() { let sequence = b"AGTCGTCA"; let k_size = 7; @@ -1931,4 +1888,4 @@ mod test { assert_eq!(hash, expected_hash, "Mismatch in DNA hash"); } } -} \ No newline at end of file +} From 55237dd4ea76b7b0cd86e7577a8edb1f15876e40 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Tue, 10 Dec 2024 19:00:03 -0800 Subject: [PATCH 10/16] working version! --- src/core/src/signature.rs | 624 +++++++++++++++++++++----------------- 1 file changed, 344 insertions(+), 280 deletions(-) diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs index f257cd1338..a7cc976fcc 100644 --- a/src/core/src/signature.rs +++ b/src/core/src/signature.rs @@ -257,24 +257,34 @@ impl ReadingFrame { ReadingFrame::Protein { fw, len } } - pub fn get_fw(&self) -> Option<&[u8]> { + /// Get the forward sequence. + pub fn fw(&self) -> &[u8] { match self { - ReadingFrame::DNA { fw, .. } => Some(fw), - ReadingFrame::Protein { fw, .. } => Some(fw), + ReadingFrame::DNA { fw, .. } => fw, + ReadingFrame::Protein { fw, .. } => fw, } } - pub fn get_rc(&self) -> Option<&[u8]> { + /// Get the reverse complement sequence (if DNA). + pub fn rc(&self) -> &[u8] { match self { - ReadingFrame::DNA { rc, .. } => Some(rc), - ReadingFrame::Protein { .. } => None, + ReadingFrame::DNA { rc, .. } => rc, + _ => panic!("Reverse complement is only available for DNA frames"), } } - pub fn kmer_iter<'a>(&'a self, ksize: usize, seed: u64, force: bool) -> KmerIterator<'a> { + pub fn length(&self) -> usize { match self { - ReadingFrame::DNA { .. } => KmerIterator::new(self, ksize, seed, force), - ReadingFrame::Protein { .. } => KmerIterator::new(self, ksize, seed, force), + ReadingFrame::DNA { len, .. } => *len, + ReadingFrame::Protein { len, .. } => *len, + } + } + + /// Get the type of the frame as a string. + pub fn frame_type(&self) -> &'static str { + match self { + ReadingFrame::DNA { .. } => "DNA", + ReadingFrame::Protein { .. } => "Protein", } } } @@ -380,6 +390,7 @@ pub struct SeqToHashes { seed: u64, frames: Vec, frame_index: usize, // Index of the current frame + kmer_index: usize, // Current k-mer index within the frame } impl SeqToHashes { @@ -398,7 +409,7 @@ impl SeqToHashes { ksize = k_size / 3; } - // uppercase the sequence + // uppercase the sequence. this clones the data bc &[u8] is immutable. let sequence = seq.to_ascii_uppercase(); // Generate frames based on sequence type and hash function @@ -418,6 +429,7 @@ impl SeqToHashes { seed, frames, frame_index: 0, + kmer_index: 0, } } @@ -469,30 +481,88 @@ impl SeqToHashes { .flat_map(|frame_number| vec![ReadingFrame::new_skipmer(&seq, frame_number, m, n)]) .collect() } + + fn out_of_bounds(&self, frame: &ReadingFrame) -> bool { + self.kmer_index + self.k_size > frame.length() + } + + fn validate_dna_kmer(&self, kmer: &[u8]) -> Result<(), Error> { + for &nt in kmer { + if !VALID[nt as usize] { + return Err(Error::InvalidDNA { + message: String::from_utf8_lossy(kmer).to_string(), + }); + } + } + Ok(()) + } + + /// Process a DNA k-mer, including canonicalization and validation + fn process_dna_kmer(&self, frame: &ReadingFrame) -> Result { + let kmer = &frame.fw()[self.kmer_index..self.kmer_index + self.k_size]; + let rc = frame.rc(); + + // Validate the k-mer if `force` is false + if !self.force { + self.validate_dna_kmer(kmer)?; + } + + let reverse_index = frame.length() - self.k_size - self.kmer_index; + let krc = &rc[reverse_index..reverse_index + self.k_size]; + + // Compute canonical hash + let canonical_kmer = std::cmp::min(kmer, krc); + let hash = crate::_hash_murmur(canonical_kmer, self.seed); + + eprintln!( + "Forward DNA k-mer: {}, Reverse Complement k-mer: {}, hash: {}", + String::from_utf8_lossy(kmer), + String::from_utf8_lossy(krc), + hash, + ); + + Ok(hash) + } + + fn process_protein_kmer(&self, frame: &ReadingFrame) -> u64 { + let kmer = &frame.fw()[self.kmer_index..self.kmer_index + self.k_size]; + let hash = crate::_hash_murmur(kmer, self.seed); + + eprintln!( + "Protein k-mer: {}, hash: {}", + String::from_utf8_lossy(kmer), + hash, + ); + + hash + } } impl Iterator for SeqToHashes { type Item = Result; fn next(&mut self) -> Option { - // Iterate over the frames using frame_index while self.frame_index < self.frames.len() { let frame = &self.frames[self.frame_index]; - // Create a KmerIterator for the current frame - let mut kmer_iter = frame.kmer_iter(self.k_size, self.seed, self.force); - - // Process k-mers in the current frame - if let Some(hash_result) = kmer_iter.next() { - return Some(hash_result); // Return the next hash + // Check bounds using out_of_bounds + if self.out_of_bounds(frame) { + self.frame_index += 1; + self.kmer_index = 0; // Reset for the next frame + continue; } - // Move to the next frame if the current one is exhausted - self.frame_index += 1; + // Delegate to DNA or protein processing + let result = match frame { + ReadingFrame::DNA { .. } => self.process_dna_kmer(frame), + ReadingFrame::Protein { .. } => Ok(self.process_protein_kmer(frame)), + }; + + self.kmer_index += 1; // Advance k-mer index + return Some(result); } - // All frames exhausted - None + None // No more frames or k-mers } } @@ -1450,7 +1520,6 @@ mod test { #[test] fn test_seqtohashes_skipm2n3() { let sequence = b"AGTCGTCA"; - // let rc_seq = b"TGACGACT"; let k_size = 5; let seed = 42; let force = true; // Force skip over invalid bases if needed @@ -1495,150 +1564,150 @@ mod test { let frames = ReadingFrame::new_dna(sequence); - assert_eq!(frames.get_fw(), Some(sequence.as_slice())); - assert_eq!(frames.get_rc(), Some(b"ACGACT".as_slice())); + assert_eq!(frames.fw(), sequence.as_slice()); + assert_eq!(frames.rc(), b"ACGACT".as_slice()); } #[test] fn test_reading_frames_new_is_protein() { + // NTP todo - test panic/err if rc() let sequence = b"MVLSPADKTNVKAAW"; let frames = ReadingFrame::new_protein(sequence, false, false); - assert_eq!(frames.get_fw(), Some(sequence.as_slice())); - assert_eq!(frames.get_rc(), None); // No reverse complement for protein + assert_eq!(frames.fw(), sequence.as_slice()); } #[test] - fn test_reading_frame_translate() { + fn test_seqtohashes_frames_translate_protein() { let sequence = b"AGTCGTCGAGCT"; - let revcomp = revcomp(sequence); - let mut frames = Vec::new(); - for i in 0..3 { - let frame_fw = ReadingFrame::new_translated(sequence, i, false, false); - eprintln!("frame: {}", frame_fw); - frames.push(frame_fw); - let frame_rc = ReadingFrame::new_translated(&revcomp, i, false, false); - eprintln!("frame: {}", frame_rc); - frames.push(frame_rc); - } + let hash_function = HashFunctions::Murmur64Protein; // Represents m=1, n=3 + let k_size = 3; // K-mer size is not directly relevant for skipmer frame validation + let seed = 42; // Seed is also irrelevant for frame structure + let force = false; + let is_protein = false; + + let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed); + let frames = sth.frames.clone(); // Clone frames to inspect them - assert_eq!(frames[0].get_fw(), Some(b"SRRA".as_slice())); - assert_eq!(frames[1].get_fw(), Some(b"SSTT".as_slice())); - assert_eq!(frames[2].get_fw(), Some(b"VVE".as_slice())); - assert_eq!(frames[3].get_fw(), Some(b"ARR".as_slice())); - assert_eq!(frames[4].get_fw(), Some(b"SSS".as_slice())); - assert_eq!(frames[5].get_fw(), Some(b"LDD".as_slice())); + assert_eq!(frames[0].fw(), b"SRRA".as_slice()); + assert_eq!(frames[1].fw(), b"SSTT".as_slice()); + assert_eq!(frames[2].fw(), b"VVE".as_slice()); + assert_eq!(frames[3].fw(), b"ARR".as_slice()); + assert_eq!(frames[4].fw(), b"SSS".as_slice()); + assert_eq!(frames[5].fw(), b"LDD".as_slice()); } #[test] - fn test_reading_frame_skipmer_m1n3() { + fn test_seqtohashes_frames_skipmer_m1n3() { let sequence = b"AGTCGTCGAGCT"; - let m = 1; - let n = 3; - - let mut frames = Vec::new(); - for start in 0..3 { - let frame = ReadingFrame::new_skipmer(sequence, start, m, n); - eprintln!("frame: {}", frame); - frames.push(frame); - } - - assert_eq!(frames.len(), 3); // Three skipmer frames - - // Expected skipmer sequences for m=1, n=3 (keep-1, skip-2) - assert_eq!(frames[0].get_fw(), Some(b"ACCG".as_slice())); - assert_eq!(frames[0].get_rc(), Some(b"CGGT".as_slice())); - - assert_eq!(frames[1].get_fw(), Some(b"GGGC".as_slice())); - assert_eq!(frames[1].get_rc(), Some(b"GCCC".as_slice())); + let hash_function = HashFunctions::Murmur64Skipm1n3; // Represents m=1, n=3 + let k_size = 3; // K-mer size is not directly relevant for skipmer frame validation + let seed = 42; // Seed is also irrelevant for frame structure + let force = false; + let is_protein = false; - assert_eq!(frames[2].get_fw(), Some(b"TTAT".as_slice())); - assert_eq!(frames[2].get_rc(), Some(b"ATAA".as_slice())); - } + let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed); + let frames = sth.frames.clone(); // Clone frames to inspect them - #[test] - fn test_reading_frame_skipmer_m2n3() { - let sequence = b"AGTCGTCGAGCT"; - let m = 2; - let n = 3; - - let mut frames = Vec::new(); - for start in 0..3 { - let frame = ReadingFrame::new_skipmer(sequence, start, m, n); - eprintln!("frame: {}", frame); - frames.push(frame); - } + eprintln!("Frames: {:?}", frames); assert_eq!(frames.len(), 3); // Three skipmer frames // Expected skipmer sequences for m=1, n=3 (keep-1, skip-2) - assert_eq!(frames[0].get_fw(), Some(b"AGCGCGGC".as_slice())); - assert_eq!(frames[0].get_rc(), Some(b"GCCGCGCT".as_slice())); + assert_eq!(frames[0].fw(), b"ACCG".as_slice()); + assert_eq!(frames[0].rc(), b"CGGT".as_slice()); - assert_eq!(frames[1].get_fw(), Some(b"GTGTGACT".as_slice())); - assert_eq!(frames[1].get_rc(), Some(b"AGTCACAC".as_slice())); + assert_eq!(frames[1].fw(), b"GGGC".as_slice()); + assert_eq!(frames[1].rc(), b"GCCC".as_slice()); - assert_eq!(frames[2].get_fw(), Some(b"TCTCAGT".as_slice())); - assert_eq!(frames[2].get_rc(), Some(b"ACTGAGA".as_slice())); + assert_eq!(frames[2].fw(), b"TTAT".as_slice()); + assert_eq!(frames[2].rc(), b"ATAA".as_slice()); } #[test] - fn test_reading_frame_kmer_iter() { - let sequence = b"AGTCGT"; - let frame = ReadingFrame::new_dna(sequence); - assert_eq!(frame.get_fw(), Some(sequence.as_slice())); - assert_eq!(frame.get_rc(), Some(b"ACGACT".as_slice())); - - // Create a KmerIterator - let mut kmer_iter = KmerIterator::new(&frame, 3, 42, false); - - // Expected k-mers from the forward and reverse complement sequence - let expected_kmers = vec![ - (b"AGT".to_vec(), b"ACT".to_vec()), - (b"GTC".to_vec(), b"GAC".to_vec()), - (b"TCG".to_vec(), b"CGA".to_vec()), - (b"CGT".to_vec(), b"ACG".to_vec()), - ]; - - // Compute expected hashes - let expected_hashes: Vec = expected_kmers - .iter() - .map(|(fw_kmer, rc_kmer)| crate::_hash_murmur(std::cmp::min(fw_kmer, rc_kmer), 42)) - .collect(); - - // Collect hashes produced by the kmer_iter - let mut produced_hashes = Vec::new(); + fn test_seqtohashes_frames_skipmer_m2n3() { + let sequence = b"AGTCGTCGAGCT"; + let hash_function = HashFunctions::Murmur64Skipm2n3; // Represents m=2, n=3 + let k_size = 3; + let seed = 42; + let force = false; + let is_protein = false; - while let Some(result) = kmer_iter.next() { - match result { - Ok(hash) => produced_hashes.push(hash), - Err(e) => panic!("Error encountered during k-mer iteration: {:?}", e), - } - } + let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed); + let frames = sth.frames; + eprintln!("Frames: {:?}", frames); - // Assert that produced hashes match expected hashes - assert_eq!( - produced_hashes, expected_hashes, - "Hashes do not match in order" - ); + assert_eq!(frames.len(), 3); // Three skipmer frames - // Debugging output for verification - eprintln!( - "Expected hashes: {:?}\nProduced hashes: {:?}", - expected_hashes, produced_hashes - ); - } + // Expected skipmer sequences for m=1, n=3 (keep-1, skip-2) + assert_eq!(frames[0].fw(), b"AGCGCGGC".as_slice()); + assert_eq!(frames[0].rc(), b"GCCGCGCT".as_slice()); + + assert_eq!(frames[1].fw(), b"GTGTGACT".as_slice()); + assert_eq!(frames[1].rc(), b"AGTCACAC".as_slice()); + + assert_eq!(frames[2].fw(), b"TCTCAGT".as_slice()); + assert_eq!(frames[2].rc(), b"ACTGAGA".as_slice()); + } + + // #[test] + // fn test_seqtohashes_frame_dna() { + // let sequence = b"AGTCGT"; + // let frame = ReadingFrame::new_dna(sequence); + // assert_eq!(frame.fw(), Some(sequence.as_slice())); + // assert_eq!(frame.rc(), Some(b"ACGACT".as_slice())); + + // // Create a KmerIterator + // let mut kmer_iter = KmerIterator::new(&frame, 3, 42, false); + + // // Expected k-mers from the forward and reverse complement sequence + // let expected_kmers = vec![ + // (b"AGT".to_vec(), b"ACT".to_vec()), + // (b"GTC".to_vec(), b"GAC".to_vec()), + // (b"TCG".to_vec(), b"CGA".to_vec()), + // (b"CGT".to_vec(), b"ACG".to_vec()), + // ]; + + // // Compute expected hashes + // let expected_hashes: Vec = expected_kmers + // .iter() + // .map(|(fw_kmer, rc_kmer)| crate::_hash_murmur(std::cmp::min(fw_kmer, rc_kmer), 42)) + // .collect(); + + // // Collect hashes produced by the kmer_iter + // let mut produced_hashes = Vec::new(); + + // while let Some(result) = kmer_iter.next() { + // match result { + // Ok(hash) => produced_hashes.push(hash), + // Err(e) => panic!("Error encountered during k-mer iteration: {:?}", e), + // } + // } + + // // Assert that produced hashes match expected hashes + // assert_eq!( + // produced_hashes, expected_hashes, + // "Hashes do not match in order" + // ); + + // // Debugging output for verification + // eprintln!( + // "Expected hashes: {:?}\nProduced hashes: {:?}", + // expected_hashes, produced_hashes + // ); + // } #[test] - fn test_kmer_iter_is_protein() { + fn test_seqtohashes_is_protein() { let sequence = b"MVLSPADKTNVKAAW"; let hash_function = HashFunctions::Murmur64Protein; + let k_size = 3; + let seed = 42; + let force = false; + let is_protein = true; - let frame = - ReadingFrame::new_protein(sequence, hash_function.dayhoff(), hash_function.hp()); - - let kmer_iter = frame.kmer_iter(3, 42, false); + let sth = SeqToHashes::new(sequence, k_size * 3, force, is_protein, hash_function, seed); // Expected k-mers for protein sequence let expected_kmers = vec![ @@ -1663,68 +1732,55 @@ mod test { .map(|fw_kmer| crate::_hash_murmur(fw_kmer, 42)) .collect(); - // Collect hashes produced by the kmer_iter - let produced_hashes: Vec = kmer_iter.map(|result| result.unwrap()).collect(); + // Collect hashes from SeqToHashes + let sth_hashes: Vec = sth.map(|result| result.unwrap()).collect(); + eprintln!("SeqToHashes hashes: {:?}", sth_hashes); - // Check that produced hashes match expected hashes in order - assert_eq!( - produced_hashes, expected_hashes, - "Hashes do not match in order" - ); + // Check that SeqToHashes matches expected hashes in order + assert_eq!(sth_hashes, expected_hashes, "Hashes do not match in order"); } #[test] - fn test_kmer_iter_translate_frames() { + fn test_seqtohashes_translate() { let sequence = b"AGTCGTCGAGCT"; let hash_function = HashFunctions::Murmur64Protein; - let k_size = 3; + let k_size = 9; // needs to be *3 for protein let seed = 42; let force = false; let is_protein = false; let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed); - let frames = sth.frames; - assert_eq!(frames[0].get_fw(), Some(b"SRRA".as_slice())); - assert_eq!(frames[1].get_fw(), Some(b"SSTT".as_slice())); - assert_eq!(frames[2].get_fw(), Some(b"VVE".as_slice())); - assert_eq!(frames[3].get_fw(), Some(b"ARR".as_slice())); - assert_eq!(frames[4].get_fw(), Some(b"SSS".as_slice())); - assert_eq!(frames[5].get_fw(), Some(b"LDD".as_slice())); - - // Six translated frames - assert_eq!(frames.len(), 6); - - // Expected k-mers for translated frames - let f1_kmers = vec![b"SRR".as_slice(), b"RRA".as_slice()]; - let f2_kmers = vec![b"SST".as_slice(), b"STT".as_slice()]; - let f3_kmers = vec![b"VVE".as_slice()]; - let f4_kmers = vec![b"ARR".as_slice()]; - let f5_kmers = vec![b"SSS".as_slice()]; - let f6_kmers = vec![b"LDD".as_slice()]; - let expected_kmers = vec![f1_kmers, f2_kmers, f3_kmers, f4_kmers, f5_kmers, f6_kmers]; - - for (frame, expected_frame_kmers) in frames.iter().zip(expected_kmers.iter()) { - let kmer_iter = frame.kmer_iter(k_size, seed, false); - // Compute hashes for expected k-mers - let expected_hashes: Vec = expected_frame_kmers - .iter() - .map(|fw_kmer| crate::_hash_murmur(fw_kmer, 42)) - .collect(); + let expected_kmers = vec![ + b"SRR".as_slice(), + b"RRA".as_slice(), + b"SST".as_slice(), + b"STT".as_slice(), + b"VVE".as_slice(), + b"ARR".as_slice(), + b"SSS".as_slice(), + b"LDD".as_slice(), + ]; - // Collect hashes produced by the kmer_iter - let produced_hashes: Vec = kmer_iter.map(|result| result.unwrap()).collect(); + // Compute expected hashes + let expected_hashes: Vec = expected_kmers + .iter() + .map(|fw_kmer| crate::_hash_murmur(fw_kmer, seed)) + .collect(); - // Check that produced hashes match expected hashes in order - assert_eq!( - produced_hashes, expected_hashes, - "Hashes do not match in order for frame" - ); - } + // Collect hashes from SeqToHashes + let sth_hashes: Vec = sth.map(|result| result.unwrap()).collect(); + eprintln!("SeqToHashes hashes: {:?}", sth_hashes); + + // Check that SeqToHashes matches expected hashes in order + assert_eq!( + sth_hashes, expected_hashes, + "Hashes do not match in order for SeqToHashes" + ); } #[test] - fn test_seqtohashes_kmer_iter_skipmer_m1n3() { + fn test_seqtohashes_skipmer_m1n3() { let sequence = b"AGTCGTCGAGCT"; let hash_function = HashFunctions::Murmur64Skipm1n3; let k_size = 3; @@ -1733,123 +1789,90 @@ mod test { let force = false; let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed); - // get hashes from sth - let frames = sth.frames.clone(); - let sth_hashes: Vec = sth.map(|result| result.unwrap()).collect(); - eprintln!("sth_hashes: {:?}", sth_hashes); - - // Three skipmer frames - assert_eq!(frames.len(), 3); - assert_eq!(frames[0].get_fw(), Some(b"ACCG".as_slice())); - assert_eq!(frames[0].get_rc(), Some(b"CGGT".as_slice())); - let f1_kmers = vec![ + // Expected k-mers for skipmer (m=1, n=3) across all frames + let expected_kmers = vec![ (b"ACC".as_slice(), b"GGT".as_slice()), (b"CCG".as_slice(), b"CGG".as_slice()), - ]; - - assert_eq!(frames[1].get_fw(), Some(b"GGGC".as_slice())); - assert_eq!(frames[1].get_rc(), Some(b"GCCC".as_slice())); - let f2_kmers = vec![ (b"GGG".as_slice(), b"CCC".as_slice()), (b"GGC".as_slice(), b"GCC".as_slice()), - ]; - - assert_eq!(frames[2].get_fw(), Some(b"TTAT".as_slice())); - assert_eq!(frames[2].get_rc(), Some(b"ATAA".as_slice())); - let f3_kmers = vec![ (b"TTA".as_slice(), b"TAA".as_slice()), (b"TAT".as_slice(), b"ATA".as_slice()), ]; - // Expected k-mers for skipmer (m=1, n=3) - let expected_kmers = vec![f1_kmers, f2_kmers, f3_kmers]; - - let mut all_expected = Vec::new(); - for (frame, expected_frame_kmers) in frames.iter().zip(expected_kmers.iter()) { - let kmer_iter = frame.kmer_iter(3, 42, false); - - // Compute hashes for expected k-mers - let expected_hashes: Vec = expected_frame_kmers - .iter() - .map(|(fw_kmer, rc_kmer)| crate::_hash_murmur(std::cmp::min(fw_kmer, rc_kmer), 42)) - .collect(); - - // Collect hashes produced by the kmer_iter - let produced_hashes: Vec = kmer_iter.map(|result| result.unwrap()).collect(); + // Compute expected hashes + let expected_hashes: Vec = expected_kmers + .iter() + .map(|(fw_kmer, rc_kmer)| crate::_hash_murmur(std::cmp::min(fw_kmer, rc_kmer), seed)) + .collect(); - // Check that produced hashes match expected hashes in order - assert_eq!( - produced_hashes, expected_hashes, - "Hashes do not match in order for frame" - ); - // keep track of all expected hashes - all_expected.extend(expected_hashes); - } + // Collect hashes from SeqToHashes + let sth_hashes: Vec = sth.map(|result| result.unwrap()).collect(); + eprintln!("SeqToHashes hashes: {:?}", sth_hashes); - // Check that sth hashes match expected hashes in order - // assert_eq!( - // sth_hashes, all_expected, - // "Hashes do not match in order for SeqToHashes" - // ); + // Check that SeqToHashes matches expected hashes in order + assert_eq!( + sth_hashes, expected_hashes, + "Hashes do not match in order for SeqToHashes" + ); } - #[test] - fn test_kmer_iter_skipmer_m2n3() { - let sequence = b"AGTCGTCGAGCT"; - let hash_function = HashFunctions::Murmur64Skipm2n3; - let k_size = 7; - let is_protein = false; - let seed = 42; - let force = false; - - let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed); - let frames = sth.frames; - - // Three skipmer frames - assert_eq!(frames.len(), 3); - assert_eq!(frames[0].get_fw(), Some(b"AGCGCGGC".as_slice())); - assert_eq!(frames[0].get_rc(), Some(b"GCCGCGCT".as_slice())); - let f1_kmers = vec![ - (b"AGCGCGG".as_slice(), b"CCGCGCT".as_slice()), - (b"GCGCGGC".as_slice(), b"GCCGCGC".as_slice()), - ]; - - assert_eq!(frames[1].get_fw(), Some(b"GTGTGACT".as_slice())); - assert_eq!(frames[1].get_rc(), Some(b"AGTCACAC".as_slice())); - let f2_kmers = vec![ - (b"GTGTGAC".as_slice(), b"GTCACAC".as_slice()), - (b"TGTGACT".as_slice(), b"AGTCACA".as_slice()), - ]; - - assert_eq!(frames[2].get_fw(), Some(b"TCTCAGT".as_slice())); - assert_eq!(frames[2].get_rc(), Some(b"ACTGAGA".as_slice())); - let f3_kmers = vec![(b"TCTCAGT".as_slice(), b"ACTGAGA".as_slice())]; - - // Expected k-mers for skipmer (m=2, n=3) - let expected_kmers = vec![f1_kmers, f2_kmers, f3_kmers]; - - for (frame, expected_frame_kmers) in frames.iter().zip(expected_kmers.iter()) { - // Compute hashes for expected k-mers - let expected_hashes: Vec = expected_frame_kmers - .iter() - .map(|(fw_kmer, rc_kmer)| crate::_hash_murmur(std::cmp::min(fw_kmer, rc_kmer), 42)) - .collect(); - - // Collect hashes produced by the kmer_iter - let kmer_iter = frame.kmer_iter(k_size, seed, force); - let produced_hashes: Vec = kmer_iter.map(|result| result.unwrap()).collect(); - - // Check that produced hashes match expected hashes in order - eprintln!( - "expected: {:?}, produced: {:?}", - expected_hashes, produced_hashes - ); - assert_eq!( - produced_hashes, expected_hashes, - "Hashes do not match in order for frame" - ); - } - } + // #[test] + // fn test_kmer_iter_skipmer_m2n3() { + // let sequence = b"AGTCGTCGAGCT"; + // let hash_function = HashFunctions::Murmur64Skipm2n3; + // let k_size = 7; + // let is_protein = false; + // let seed = 42; + // let force = false; + + // let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed); + // let frames = sth.frames; + + // // Three skipmer frames + // assert_eq!(frames.len(), 3); + // assert_eq!(frames[0].fw(), Some(b"AGCGCGGC".as_slice())); + // assert_eq!(frames[0].rc(), Some(b"GCCGCGCT".as_slice())); + // let f1_kmers = vec![ + // (b"AGCGCGG".as_slice(), b"CCGCGCT".as_slice()), + // (b"GCGCGGC".as_slice(), b"GCCGCGC".as_slice()), + // ]; + + // assert_eq!(frames[1].fw(), Some(b"GTGTGACT".as_slice())); + // assert_eq!(frames[1].rc(), Some(b"AGTCACAC".as_slice())); + // let f2_kmers = vec![ + // (b"GTGTGAC".as_slice(), b"GTCACAC".as_slice()), + // (b"TGTGACT".as_slice(), b"AGTCACA".as_slice()), + // ]; + + // assert_eq!(frames[2].fw(), Some(b"TCTCAGT".as_slice())); + // assert_eq!(frames[2].rc(), Some(b"ACTGAGA".as_slice())); + // let f3_kmers = vec![(b"TCTCAGT".as_slice(), b"ACTGAGA".as_slice())]; + + // // Expected k-mers for skipmer (m=2, n=3) + // let expected_kmers = vec![f1_kmers, f2_kmers, f3_kmers]; + + // for (frame, expected_frame_kmers) in frames.iter().zip(expected_kmers.iter()) { + // // Compute hashes for expected k-mers + // let expected_hashes: Vec = expected_frame_kmers + // .iter() + // .map(|(fw_kmer, rc_kmer)| crate::_hash_murmur(std::cmp::min(fw_kmer, rc_kmer), 42)) + // .collect(); + + // // Collect hashes produced by the kmer_iter + // let kmer_iter = frame.kmer_iter(k_size, seed, force); + // let produced_hashes: Vec = kmer_iter.map(|result| result.unwrap()).collect(); + + // // Check that produced hashes match expected hashes in order + // eprintln!( + // "expected: {:?}, produced: {:?}", + // expected_hashes, produced_hashes + // ); + // assert_eq!( + // produced_hashes, expected_hashes, + // "Hashes do not match in order for frame" + // ); + // } + // } #[test] fn test_seqtohashes_dna() { @@ -1888,4 +1911,45 @@ mod test { assert_eq!(hash, expected_hash, "Mismatch in DNA hash"); } } + + #[test] + fn test_seqtohashes_dna_2() { + let sequence = b"AGTCGT"; + let hash_function = HashFunctions::Murmur64Dna; // Specify a DNA-based hash function + let k_size = 3; + let seed = 42; + let force = false; + let is_protein = false; + + let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed); + + // Expected k-mers from the forward and reverse complement sequence + let expected_kmers = vec![ + (b"AGT".to_vec(), b"ACT".to_vec()), + (b"GTC".to_vec(), b"GAC".to_vec()), + (b"TCG".to_vec(), b"CGA".to_vec()), + (b"CGT".to_vec(), b"ACG".to_vec()), + ]; + + // Compute expected hashes + let expected_hashes: Vec = expected_kmers + .iter() + .map(|(fw_kmer, rc_kmer)| crate::_hash_murmur(std::cmp::min(fw_kmer, rc_kmer), seed)) + .collect(); + + // Collect hashes produced by SeqToHashes + let produced_hashes: Vec = sth.map(|result| result.unwrap()).collect(); + + // Assert that produced hashes match expected hashes + assert_eq!( + produced_hashes, expected_hashes, + "Hashes do not match in order" + ); + + // Debugging output for verification + eprintln!( + "Expected hashes: {:?}\nProduced hashes: {:?}", + expected_hashes, produced_hashes + ); + } } From 5a496906acc8f468a33dd903da61dc688a0d0455 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Tue, 10 Dec 2024 19:05:05 -0800 Subject: [PATCH 11/16] rm kmeriterator as it is now back in seqtohashes --- src/core/src/signature.rs | 112 +++++--------------------------------- 1 file changed, 15 insertions(+), 97 deletions(-) diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs index a7cc976fcc..578e39ae0f 100644 --- a/src/core/src/signature.rs +++ b/src/core/src/signature.rs @@ -289,101 +289,6 @@ impl ReadingFrame { } } -pub struct KmerIterator<'a> { - frame: &'a ReadingFrame, // Reference to the ReadingFrame - ksize: usize, - index: usize, - seed: u64, - force: bool, -} - -impl<'a> KmerIterator<'a> { - pub fn new(frame: &'a ReadingFrame, ksize: usize, seed: u64, force: bool) -> Self { - Self { - frame, - ksize, - index: 0, - seed, - force, - } - } - - fn out_of_bounds(&self, length: usize) -> bool { - self.index + self.ksize > length - } - - fn validate_dna_kmer(&self, kmer: &[u8]) -> Result<(), Error> { - for &nt in kmer { - if !VALID[nt as usize] { - return Err(Error::InvalidDNA { - message: String::from_utf8_lossy(kmer).to_string(), - }); - } - } - Ok(()) - } - - fn get_dna_hash(&mut self, fw: &[u8], rc: &[u8]) -> Option> { - let kmer = &fw[self.index..self.index + self.ksize]; - - if !self.force { - if let Err(e) = self.validate_dna_kmer(kmer) { - self.index += 1; - return Some(Err(e)); - } - } - - let krc = &rc[rc.len() - self.ksize - self.index..rc.len() - self.index]; - let hash = crate::_hash_murmur(std::cmp::min(kmer, krc), self.seed); - // NTP TESTING - eprintln!( - "Forward DNA k-mer: {}, Reverse Complement k-mer: {}, hash: {}", - String::from_utf8_lossy(kmer), - String::from_utf8_lossy(krc), - hash, - ); - - self.index += 1; - Some(Ok(hash)) - } - - fn get_protein_hash(&mut self, fw: &[u8]) -> Option> { - let kmer = &fw[self.index..self.index + self.ksize]; - - let hash = crate::_hash_murmur(kmer, self.seed); - // NTP TESTING - eprintln!( - "Protein k-mer: {}, hash: {}", - String::from_utf8_lossy(kmer), - hash - ); - - self.index += 1; - Some(Ok(hash)) - } -} - -impl<'a> Iterator for KmerIterator<'a> { - type Item = Result; - - fn next(&mut self) -> Option { - match self.frame { - ReadingFrame::DNA { fw, rc, len } => { - if self.out_of_bounds(*len) { - return None; - } - self.get_dna_hash(fw, rc) - } - ReadingFrame::Protein { fw, len } => { - if self.out_of_bounds(*len) { - return None; - } - self.get_protein_hash(fw) - } - } - } -} - pub struct SeqToHashes { k_size: usize, force: bool, @@ -1561,6 +1466,13 @@ mod test { fn test_reading_frame_new_dna() { let sequence = b"AGTCGT"; let hash_function = HashFunctions::Murmur64Dna; + let k_size = 3; + let seed = 42; + let force = false; + let is_protein = false; + + let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed); + let frames = sth.frames.clone(); // Clone frames to inspect them let frames = ReadingFrame::new_dna(sequence); @@ -1572,10 +1484,16 @@ mod test { fn test_reading_frames_new_is_protein() { // NTP todo - test panic/err if rc() let sequence = b"MVLSPADKTNVKAAW"; + let hash_function = HashFunctions::Murmur64Protein; + let k_size = 9; + let seed = 42; + let force = false; + let is_protein = true; - let frames = ReadingFrame::new_protein(sequence, false, false); + let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed); + let frames = sth.frames.clone(); // Clone frames to inspect them - assert_eq!(frames.fw(), sequence.as_slice()); + assert_eq!(frames[0].fw(), sequence.as_slice()); } #[test] From 1a0c29f715d8c0e34da4bb184e74772531529983 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Wed, 11 Dec 2024 15:30:25 -0800 Subject: [PATCH 12/16] fix dna validation; clippy fixes --- src/core/src/encodings.rs | 2 - src/core/src/signature.rs | 494 ++++++++++++++++++-------------------- 2 files changed, 239 insertions(+), 257 deletions(-) diff --git a/src/core/src/encodings.rs b/src/core/src/encodings.rs index 9ae33d7873..56077c80e1 100644 --- a/src/core/src/encodings.rs +++ b/src/core/src/encodings.rs @@ -519,8 +519,6 @@ impl<'a> Iterator for Indices<'a> { #[cfg(test)] mod test { - use proptest::collection::vec; - use super::*; #[test] diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs index 578e39ae0f..740df1dc8d 100644 --- a/src/core/src/signature.rs +++ b/src/core/src/signature.rs @@ -180,8 +180,8 @@ impl std::fmt::Display for ReadingFrame { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { ReadingFrame::DNA { fw, rc, len } => { - let fw_str = String::from_utf8(fw.clone()).expect("Invalid UTF-8 sequence in fw"); - let rc_str = String::from_utf8(rc.clone()).expect("Invalid UTF-8 sequence in rc"); + let fw_str = String::from_utf8_lossy(fw).to_string(); + let rc_str = String::from_utf8_lossy(rc).to_string(); write!( f, "Type: DNA ({}bp), Forward: {}, Reverse Complement: {}", @@ -189,7 +189,7 @@ impl std::fmt::Display for ReadingFrame { ) } ReadingFrame::Protein { fw, len } => { - let fw_str = String::from_utf8(fw.clone()).expect("Invalid UTF-8 sequence in fw"); + let fw_str = String::from_utf8_lossy(fw).to_string(); write!(f, "Type: Protein ({}aa), Forward: {}", len, fw_str) } } @@ -314,7 +314,8 @@ impl SeqToHashes { ksize = k_size / 3; } - // uppercase the sequence. this clones the data bc &[u8] is immutable. + // uppercase the sequence. this clones the data bc &[u8] is immutable? + // TODO: could we avoid this by changing revcomp/VALID/etc? let sequence = seq.to_ascii_uppercase(); // Generate frames based on sequence type and hash function @@ -323,7 +324,7 @@ impl SeqToHashes { } else if hash_function.protein() || hash_function.dayhoff() || hash_function.hp() { Self::translated_frames(&sequence, &hash_function) } else if hash_function.skipm1n3() || hash_function.skipm2n3() { - Self::skipmer_frames(&sequence, &hash_function) + Self::skipmer_frames(&sequence, &hash_function, ksize) } else { Self::dna_frames(&sequence) }; @@ -340,13 +341,13 @@ impl SeqToHashes { /// generate frames from DNA: 1 DNA frame (fw+rc) fn dna_frames(seq: &[u8]) -> Vec { - vec![ReadingFrame::new_dna(&seq)] + vec![ReadingFrame::new_dna(seq)] } /// generate frames from protein: 1 protein frame fn protein_frames(seq: &[u8], hash_function: &HashFunctions) -> Vec { vec![ReadingFrame::new_protein( - &seq, + seq, hash_function.dayhoff(), hash_function.hp(), )] @@ -354,12 +355,12 @@ impl SeqToHashes { /// generate translated frames: 6 protein frames fn translated_frames(seq: &[u8], hash_function: &HashFunctions) -> Vec { - let revcomp_sequence = revcomp(&seq); + let revcomp_sequence = revcomp(seq); (0..3) .flat_map(|frame_number| { vec![ ReadingFrame::new_translated( - &seq, + seq, frame_number, hash_function.dayhoff(), hash_function.hp(), @@ -376,14 +377,21 @@ impl SeqToHashes { } /// generate skipmer frames: 3 DNA frames (each with fw+rc) - fn skipmer_frames(seq: &[u8], hash_function: &HashFunctions) -> Vec { + fn skipmer_frames( + seq: &[u8], + hash_function: &HashFunctions, + ksize: usize, + ) -> Vec { let (m, n) = if hash_function.skipm1n3() { (1, 3) } else { (2, 3) }; + if ksize < n { + unimplemented!() + } (0..3) - .flat_map(|frame_number| vec![ReadingFrame::new_skipmer(&seq, frame_number, m, n)]) + .flat_map(|frame_number| vec![ReadingFrame::new_skipmer(seq, frame_number, m, n)]) .collect() } @@ -391,25 +399,32 @@ impl SeqToHashes { self.kmer_index + self.k_size > frame.length() } - fn validate_dna_kmer(&self, kmer: &[u8]) -> Result<(), Error> { + // check all bases are valid + fn validate_dna_kmer(&self, kmer: &[u8]) -> Result { for &nt in kmer { if !VALID[nt as usize] { - return Err(Error::InvalidDNA { - message: String::from_utf8_lossy(kmer).to_string(), - }); + if self.force { + // Return `false` to indicate invalid k-mer, but do not error out + return Ok(false); + } else { + return Err(Error::InvalidDNA { + message: String::from_utf8_lossy(kmer).to_string(), + }); + } } } - Ok(()) + Ok(true) // All bases are valid } /// Process a DNA k-mer, including canonicalization and validation - fn process_dna_kmer(&self, frame: &ReadingFrame) -> Result { + fn dna_hash(&self, frame: &ReadingFrame) -> Result, Error> { let kmer = &frame.fw()[self.kmer_index..self.kmer_index + self.k_size]; let rc = frame.rc(); - // Validate the k-mer if `force` is false - if !self.force { - self.validate_dna_kmer(kmer)?; + // Validate the k-mer. Skip if invalid and force is true + match self.validate_dna_kmer(kmer)? { + false => return Ok(None), // Skip this k-mer + true => {} } let reverse_index = frame.length() - self.k_size - self.kmer_index; @@ -419,27 +434,12 @@ impl SeqToHashes { let canonical_kmer = std::cmp::min(kmer, krc); let hash = crate::_hash_murmur(canonical_kmer, self.seed); - eprintln!( - "Forward DNA k-mer: {}, Reverse Complement k-mer: {}, hash: {}", - String::from_utf8_lossy(kmer), - String::from_utf8_lossy(krc), - hash, - ); - - Ok(hash) + Ok(Some(hash)) } - fn process_protein_kmer(&self, frame: &ReadingFrame) -> u64 { + fn protein_hash(&self, frame: &ReadingFrame) -> u64 { let kmer = &frame.fw()[self.kmer_index..self.kmer_index + self.k_size]; - let hash = crate::_hash_murmur(kmer, self.seed); - - eprintln!( - "Protein k-mer: {}, hash: {}", - String::from_utf8_lossy(kmer), - hash, - ); - - hash + crate::_hash_murmur(kmer, self.seed) // build and return hash } } @@ -450,7 +450,7 @@ impl Iterator for SeqToHashes { while self.frame_index < self.frames.len() { let frame = &self.frames[self.frame_index]; - // Check bounds using out_of_bounds + // Do we need to move to the next frame? if self.out_of_bounds(frame) { self.frame_index += 1; self.kmer_index = 0; // Reset for the next frame @@ -459,8 +459,16 @@ impl Iterator for SeqToHashes { // Delegate to DNA or protein processing let result = match frame { - ReadingFrame::DNA { .. } => self.process_dna_kmer(frame), - ReadingFrame::Protein { .. } => Ok(self.process_protein_kmer(frame)), + ReadingFrame::DNA { .. } => match self.dna_hash(frame) { + Ok(Some(hash)) => Ok(hash), // Valid hash + Ok(None) => { + // Skipped invalid k-mer + self.kmer_index += 1; + continue; + } + Err(err) => Err(err), // Error + }, + ReadingFrame::Protein { .. } => Ok(self.protein_hash(frame)), }; self.kmer_index += 1; // Advance k-mer index @@ -985,7 +993,6 @@ impl TryInto for Signature { #[cfg(test)] mod test { - use super::*; use std::fs::File; use std::io::{BufReader, Read}; use std::path::PathBuf; @@ -1131,7 +1138,8 @@ mod test { dbg!(&sig.signatures); assert_eq!(sig.signatures[0].size(), 3); assert_eq!(sig.signatures[1].size(), 3); - assert_eq!(sig.signatures[2].size(), 2); + eprintln!("{:?}", sig.signatures[2]); + assert_eq!(sig.signatures[2].size(), 3); assert_eq!(sig.signatures[3].size(), 1); } @@ -1139,20 +1147,20 @@ mod test { fn signature_skipm1n3_add_sequence() { let params = ComputeParameters::builder() .ksizes(vec![3, 4, 5, 6]) - .num_hashes(3u32) + .num_hashes(10u32) .dna(false) .skipm1n3(true) .build(); let mut sig = Signature::from_params(¶ms); - sig.add_sequence(b"ATGCATGA", false).unwrap(); + sig.add_sequence(b"ATGCATGAATGAC", false).unwrap(); assert_eq!(sig.signatures.len(), 4); dbg!(&sig.signatures); - assert_eq!(sig.signatures[0].size(), 3); - assert_eq!(sig.signatures[1].size(), 3); - assert_eq!(sig.signatures[2].size(), 2); - assert_eq!(sig.signatures[3].size(), 1); + assert_eq!(sig.signatures[0].size(), 5); + assert_eq!(sig.signatures[1].size(), 4); + assert_eq!(sig.signatures[2].size(), 1); + assert_eq!(sig.signatures[3].size(), 0); } #[test] @@ -1160,7 +1168,7 @@ mod test { fn signature_skipm2n3_add_sequence_too_small() { let params = ComputeParameters::builder() .ksizes(vec![2]) - .num_hashes(3u32) + .num_hashes(10u32) .dna(false) .skipm2n3(true) .build(); @@ -1174,7 +1182,7 @@ mod test { fn signature_skipm1n3_add_sequence_too_small() { let params = ComputeParameters::builder() .ksizes(vec![2]) - .num_hashes(3u32) + .num_hashes(10u32) .dna(false) .skipm1n3(true) .build(); @@ -1423,90 +1431,101 @@ mod test { } #[test] - fn test_seqtohashes_skipm2n3() { - let sequence = b"AGTCGTCA"; - let k_size = 5; + fn test_seqtohashes_frames_dna() { + let sequence = b"AGTCGT"; + let hash_function = HashFunctions::Murmur64Dna; + let k_size = 3; let seed = 42; - let force = true; // Force skip over invalid bases if needed + let force = false; let is_protein = false; - // Initialize SeqToHashes iterator using the new constructor - let mut seq_to_hashes = SeqToHashes::new( - sequence, - k_size, - force, - is_protein, - HashFunctions::Murmur64Skipm2n3, - seed, - ); + let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed); + let frames = sth.frames.clone(); - // Define expected hashes for the skipmer configuration. - let expected_kmers = ["AGCGC", "GTGTA"]; - // rc of the k-mer, not of the sequence, then skipmerized. Correct? - let expected_krc = ["GCGCT", "TACAC"]; + assert_eq!(frames.len(), 1); + assert_eq!(frames[0].fw(), sequence.as_slice()); + assert_eq!(frames[0].rc(), b"ACGACT".as_slice()); + } - // Compute expected hashes by hashing each k-mer with its reverse complement - let expected_hashes: Vec = expected_kmers - .iter() - .zip(expected_krc.iter()) - .map(|(kmer, krc)| { - // Convert both kmer and krc to byte slices and pass to _hash_murmur - crate::_hash_murmur(std::cmp::min(kmer.as_bytes(), krc.as_bytes()), seed) - }) - .collect(); + #[test] + fn test_seqtohashes_frames_is_protein() { + let sequence = b"MVLSPADKTNVKAAW"; + let hash_function = HashFunctions::Murmur64Protein; + let k_size = 3; + let seed = 42; + let force = false; + let is_protein = true; - // Compare each produced hash from the iterator with the expected hash - for expected_hash in expected_hashes { - let hash = seq_to_hashes.next().unwrap().ok().unwrap(); - assert_eq!(hash, expected_hash, "Mismatch in skipmer hash"); - } + let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed); + let frames = sth.frames.clone(); + + assert_eq!(frames.len(), 1); + assert_eq!(frames[0].fw(), sequence.as_slice()); } #[test] - fn test_reading_frame_new_dna() { - let sequence = b"AGTCGT"; - let hash_function = HashFunctions::Murmur64Dna; + #[should_panic] + fn test_seqtohashes_frames_is_protein_try_access_rc() { + // test panic if trying to access rc + let sequence = b"MVLSPADKTNVKAAW"; + let hash_function = HashFunctions::Murmur64Protein; let k_size = 3; let seed = 42; let force = false; - let is_protein = false; + let is_protein = true; let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed); - let frames = sth.frames.clone(); // Clone frames to inspect them + let frames = sth.frames.clone(); - let frames = ReadingFrame::new_dna(sequence); + // protein frame doesn't have rc; this should panic + eprintln!("{:?}", frames[0].rc()); + } - assert_eq!(frames.fw(), sequence.as_slice()); - assert_eq!(frames.rc(), b"ACGACT".as_slice()); + #[test] + fn test_seqtohashes_frames_is_protein_dayhoff() { + let sequence = b"MVLSPADKTNVKAAW"; + let dayhoff_seq = b"eeebbbcdbcedbbf"; + let hash_function = HashFunctions::Murmur64Dayhoff; + let k_size = 3; + let seed = 42; + let force = false; + let is_protein = true; + + let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed); + let frames = sth.frames.clone(); + + assert_eq!(frames.len(), 1); + assert_eq!(frames[0].fw(), dayhoff_seq.as_slice()); } #[test] - fn test_reading_frames_new_is_protein() { - // NTP todo - test panic/err if rc() + fn test_seqtohashes_frames_is_protein_hp() { let sequence = b"MVLSPADKTNVKAAW"; - let hash_function = HashFunctions::Murmur64Protein; - let k_size = 9; + let hp_seq = b"hhhphhpppphphhh"; + let hash_function = HashFunctions::Murmur64Hp; + let k_size = 3; let seed = 42; let force = false; let is_protein = true; let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed); - let frames = sth.frames.clone(); // Clone frames to inspect them + let frames = sth.frames.clone(); - assert_eq!(frames[0].fw(), sequence.as_slice()); + assert_eq!(frames.len(), 1); + assert_eq!(frames[0].fw(), hp_seq.as_slice()); } #[test] fn test_seqtohashes_frames_translate_protein() { let sequence = b"AGTCGTCGAGCT"; - let hash_function = HashFunctions::Murmur64Protein; // Represents m=1, n=3 - let k_size = 3; // K-mer size is not directly relevant for skipmer frame validation - let seed = 42; // Seed is also irrelevant for frame structure + let hash_function = HashFunctions::Murmur64Protein; + let k_size = 3; + let seed = 42; let force = false; let is_protein = false; let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed); - let frames = sth.frames.clone(); // Clone frames to inspect them + let frames = sth.frames.clone(); assert_eq!(frames[0].fw(), b"SRRA".as_slice()); assert_eq!(frames[1].fw(), b"SSTT".as_slice()); @@ -1526,7 +1545,7 @@ mod test { let is_protein = false; let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed); - let frames = sth.frames.clone(); // Clone frames to inspect them + let frames = sth.frames.clone(); eprintln!("Frames: {:?}", frames); @@ -1546,7 +1565,7 @@ mod test { #[test] fn test_seqtohashes_frames_skipmer_m2n3() { let sequence = b"AGTCGTCGAGCT"; - let hash_function = HashFunctions::Murmur64Skipm2n3; // Represents m=2, n=3 + let hash_function = HashFunctions::Murmur64Skipm2n3; let k_size = 3; let seed = 42; let force = false; @@ -1569,52 +1588,79 @@ mod test { assert_eq!(frames[2].rc(), b"ACTGAGA".as_slice()); } - // #[test] - // fn test_seqtohashes_frame_dna() { - // let sequence = b"AGTCGT"; - // let frame = ReadingFrame::new_dna(sequence); - // assert_eq!(frame.fw(), Some(sequence.as_slice())); - // assert_eq!(frame.rc(), Some(b"ACGACT".as_slice())); - - // // Create a KmerIterator - // let mut kmer_iter = KmerIterator::new(&frame, 3, 42, false); - - // // Expected k-mers from the forward and reverse complement sequence - // let expected_kmers = vec![ - // (b"AGT".to_vec(), b"ACT".to_vec()), - // (b"GTC".to_vec(), b"GAC".to_vec()), - // (b"TCG".to_vec(), b"CGA".to_vec()), - // (b"CGT".to_vec(), b"ACG".to_vec()), - // ]; - - // // Compute expected hashes - // let expected_hashes: Vec = expected_kmers - // .iter() - // .map(|(fw_kmer, rc_kmer)| crate::_hash_murmur(std::cmp::min(fw_kmer, rc_kmer), 42)) - // .collect(); - - // // Collect hashes produced by the kmer_iter - // let mut produced_hashes = Vec::new(); - - // while let Some(result) = kmer_iter.next() { - // match result { - // Ok(hash) => produced_hashes.push(hash), - // Err(e) => panic!("Error encountered during k-mer iteration: {:?}", e), - // } - // } - - // // Assert that produced hashes match expected hashes - // assert_eq!( - // produced_hashes, expected_hashes, - // "Hashes do not match in order" - // ); - - // // Debugging output for verification - // eprintln!( - // "Expected hashes: {:?}\nProduced hashes: {:?}", - // expected_hashes, produced_hashes - // ); - // } + #[test] + fn test_seqtohashes_dna() { + let sequence = b"AGTCGT"; + let hash_function = HashFunctions::Murmur64Dna; + let k_size = 3; + let seed = 42; + let force = false; + let is_protein = false; + + let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed); + + // Expected k-mers from the forward and reverse complement sequence + let expected_kmers = vec![ + (b"AGT".to_vec(), b"ACT".to_vec()), + (b"GTC".to_vec(), b"GAC".to_vec()), + (b"TCG".to_vec(), b"CGA".to_vec()), + (b"CGT".to_vec(), b"ACG".to_vec()), + ]; + + // Compute expected hashes from expected kmers + let expected_hashes: Vec = expected_kmers + .iter() + .map(|(fw_kmer, rc_kmer)| crate::_hash_murmur(std::cmp::min(fw_kmer, rc_kmer), seed)) + .collect(); + + // Collect hashes from SeqToHashes + let sth_hashes: Vec = sth.map(|result| result.unwrap()).collect(); + eprintln!("SeqToHashes hashes: {:?}", sth_hashes); + + // Check that SeqToHashes matches expected hashes in order + assert_eq!( + sth_hashes, expected_hashes, + "Hashes do not match in order for SeqToHashes" + ); + } + + #[test] + fn test_seqtohashes_dna_2() { + let sequence = b"AGTCGTCA"; + let k_size = 7; + let seed = 42; + let force = true; // Force skip over invalid bases if needed + let is_protein = false; + // Initialize SeqToHashes iterator using the new constructor + let mut seq_to_hashes = SeqToHashes::new( + sequence, + k_size, + force, + is_protein, + HashFunctions::Murmur64Dna, + seed, + ); + + // Define expected hashes for the kmer configuration. + let expected_kmers = ["AGTCGTC", "GTCGTCA"]; + let expected_krc = ["GACGACT", "TGACGAC"]; + + // Compute expected hashes by hashing each k-mer with its reverse complement + let expected_hashes: Vec = expected_kmers + .iter() + .zip(expected_krc.iter()) + .map(|(kmer, krc)| { + // Convert both kmer and krc to byte slices and pass to _hash_murmur + crate::_hash_murmur(std::cmp::min(kmer.as_bytes(), krc.as_bytes()), seed) + }) + .collect(); + + // Compare each produced hash from the iterator with the expected hash + for expected_hash in expected_hashes { + let hash = seq_to_hashes.next().unwrap().ok().unwrap(); + assert_eq!(hash, expected_hash, "Mismatch in DNA hash"); + } + } #[test] fn test_seqtohashes_is_protein() { @@ -1698,7 +1744,7 @@ mod test { } #[test] - fn test_seqtohashes_skipmer_m1n3() { + fn test_seqtohashes_skipm1n3() { let sequence = b"AGTCGTCGAGCT"; let hash_function = HashFunctions::Murmur64Skipm1n3; let k_size = 3; @@ -1734,119 +1780,62 @@ mod test { ); } - // #[test] - // fn test_kmer_iter_skipmer_m2n3() { - // let sequence = b"AGTCGTCGAGCT"; - // let hash_function = HashFunctions::Murmur64Skipm2n3; - // let k_size = 7; - // let is_protein = false; - // let seed = 42; - // let force = false; - - // let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed); - // let frames = sth.frames; - - // // Three skipmer frames - // assert_eq!(frames.len(), 3); - // assert_eq!(frames[0].fw(), Some(b"AGCGCGGC".as_slice())); - // assert_eq!(frames[0].rc(), Some(b"GCCGCGCT".as_slice())); - // let f1_kmers = vec![ - // (b"AGCGCGG".as_slice(), b"CCGCGCT".as_slice()), - // (b"GCGCGGC".as_slice(), b"GCCGCGC".as_slice()), - // ]; - - // assert_eq!(frames[1].fw(), Some(b"GTGTGACT".as_slice())); - // assert_eq!(frames[1].rc(), Some(b"AGTCACAC".as_slice())); - // let f2_kmers = vec![ - // (b"GTGTGAC".as_slice(), b"GTCACAC".as_slice()), - // (b"TGTGACT".as_slice(), b"AGTCACA".as_slice()), - // ]; - - // assert_eq!(frames[2].fw(), Some(b"TCTCAGT".as_slice())); - // assert_eq!(frames[2].rc(), Some(b"ACTGAGA".as_slice())); - // let f3_kmers = vec![(b"TCTCAGT".as_slice(), b"ACTGAGA".as_slice())]; - - // // Expected k-mers for skipmer (m=2, n=3) - // let expected_kmers = vec![f1_kmers, f2_kmers, f3_kmers]; - - // for (frame, expected_frame_kmers) in frames.iter().zip(expected_kmers.iter()) { - // // Compute hashes for expected k-mers - // let expected_hashes: Vec = expected_frame_kmers - // .iter() - // .map(|(fw_kmer, rc_kmer)| crate::_hash_murmur(std::cmp::min(fw_kmer, rc_kmer), 42)) - // .collect(); - - // // Collect hashes produced by the kmer_iter - // let kmer_iter = frame.kmer_iter(k_size, seed, force); - // let produced_hashes: Vec = kmer_iter.map(|result| result.unwrap()).collect(); - - // // Check that produced hashes match expected hashes in order - // eprintln!( - // "expected: {:?}, produced: {:?}", - // expected_hashes, produced_hashes - // ); - // assert_eq!( - // produced_hashes, expected_hashes, - // "Hashes do not match in order for frame" - // ); - // } - // } - #[test] - fn test_seqtohashes_dna() { - let sequence = b"AGTCGTCA"; + fn test_seq2hashes_skipm2n3() { + let sequence = b"AGTCGTCGAGCT"; + let hash_function = HashFunctions::Murmur64Skipm2n3; let k_size = 7; - let seed = 42; - let force = true; // Force skip over invalid bases if needed let is_protein = false; - // Initialize SeqToHashes iterator using the new constructor - let mut seq_to_hashes = SeqToHashes::new( - sequence, - k_size, - force, - is_protein, - HashFunctions::Murmur64Dna, - seed, - ); + let seed = 42; + let force = false; - // Define expected hashes for the kmer configuration. - let expected_kmers = ["AGTCGTC", "GTCGTCA"]; - let expected_krc = ["GACGACT", "TGACGAC"]; + let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed); - // Compute expected hashes by hashing each k-mer with its reverse complement + // Expected k-mers for skipmer (m=2, n=3) + let expected_kmers = vec![ + (b"AGCGCGG".as_slice(), b"CCGCGCT".as_slice()), + (b"GCGCGGC".as_slice(), b"GCCGCGC".as_slice()), + (b"GTGTGAC".as_slice(), b"GTCACAC".as_slice()), + (b"TGTGACT".as_slice(), b"AGTCACA".as_slice()), + (b"TCTCAGT".as_slice(), b"ACTGAGA".as_slice()), + ]; + + // Compute expected hashes let expected_hashes: Vec = expected_kmers .iter() - .zip(expected_krc.iter()) - .map(|(kmer, krc)| { - // Convert both kmer and krc to byte slices and pass to _hash_murmur - crate::_hash_murmur(std::cmp::min(kmer.as_bytes(), krc.as_bytes()), seed) - }) + .map(|(fw_kmer, rc_kmer)| crate::_hash_murmur(std::cmp::min(fw_kmer, rc_kmer), seed)) .collect(); - // Compare each produced hash from the iterator with the expected hash - for expected_hash in expected_hashes { - let hash = seq_to_hashes.next().unwrap().ok().unwrap(); - assert_eq!(hash, expected_hash, "Mismatch in DNA hash"); - } + // Collect hashes from SeqToHashes + let sth_hashes: Vec = sth.map(|result| result.unwrap()).collect(); + eprintln!("SeqToHashes hashes: {:?}", sth_hashes); + + // Check that SeqToHashes matches expected hashes in order + assert_eq!( + sth_hashes, expected_hashes, + "Hashes do not match in order for SeqToHashes" + ); } #[test] - fn test_seqtohashes_dna_2() { - let sequence = b"AGTCGT"; - let hash_function = HashFunctions::Murmur64Dna; // Specify a DNA-based hash function - let k_size = 3; + fn test_seqtohashes_skipm2n3_2() { + let sequence = b"AGTCGTCA"; + let hash_function = HashFunctions::Murmur64Skipm2n3; + let k_size = 5; let seed = 42; - let force = false; + let force = true; let is_protein = false; let sth = SeqToHashes::new(sequence, k_size, force, is_protein, hash_function, seed); + let frames = sth.frames.clone(); + for fr in frames { + eprintln!("{}", fr); + } - // Expected k-mers from the forward and reverse complement sequence let expected_kmers = vec![ - (b"AGT".to_vec(), b"ACT".to_vec()), - (b"GTC".to_vec(), b"GAC".to_vec()), - (b"TCG".to_vec(), b"CGA".to_vec()), - (b"CGT".to_vec(), b"ACG".to_vec()), + (b"AGCGC".as_slice(), b"GCGCT".as_slice()), + (b"GCGCA".as_slice(), b"TGCGC".as_slice()), + (b"GTGTA".as_slice(), b"TACAC".as_slice()), ]; // Compute expected hashes @@ -1855,19 +1844,14 @@ mod test { .map(|(fw_kmer, rc_kmer)| crate::_hash_murmur(std::cmp::min(fw_kmer, rc_kmer), seed)) .collect(); - // Collect hashes produced by SeqToHashes - let produced_hashes: Vec = sth.map(|result| result.unwrap()).collect(); + // Collect hashes from SeqToHashes + let sth_hashes: Vec = sth.map(|result| result.unwrap()).collect(); + eprintln!("SeqToHashes hashes: {:?}", sth_hashes); - // Assert that produced hashes match expected hashes + // Check that SeqToHashes matches expected hashes in order assert_eq!( - produced_hashes, expected_hashes, - "Hashes do not match in order" - ); - - // Debugging output for verification - eprintln!( - "Expected hashes: {:?}\nProduced hashes: {:?}", - expected_hashes, produced_hashes + sth_hashes, expected_hashes, + "Hashes do not match in order for SeqToHashes" ); } } From 82ff63e54cc62fa9daef6d7d0d348bee19812078 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Wed, 11 Dec 2024 17:04:46 -0800 Subject: [PATCH 13/16] actual fix dna validation; almost there --- src/core/src/signature.rs | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs index 740df1dc8d..cbca38eed9 100644 --- a/src/core/src/signature.rs +++ b/src/core/src/signature.rs @@ -204,6 +204,7 @@ impl ReadingFrame { ReadingFrame::DNA { fw, rc, len } } + // TODO -- handle bad DNA sequence here?? pub fn new_protein(sequence: &[u8], dayhoff: bool, hp: bool) -> Self { let fw: Vec = if dayhoff { sequence.iter().map(|&aa| aa_to_dayhoff(aa)).collect() @@ -417,13 +418,13 @@ impl SeqToHashes { } /// Process a DNA k-mer, including canonicalization and validation - fn dna_hash(&self, frame: &ReadingFrame) -> Result, Error> { + fn dna_hash(&self, frame: &ReadingFrame) -> Result { let kmer = &frame.fw()[self.kmer_index..self.kmer_index + self.k_size]; let rc = frame.rc(); // Validate the k-mer. Skip if invalid and force is true match self.validate_dna_kmer(kmer)? { - false => return Ok(None), // Skip this k-mer + false => return Ok(0), // Skip this k-mer true => {} } @@ -434,7 +435,7 @@ impl SeqToHashes { let canonical_kmer = std::cmp::min(kmer, krc); let hash = crate::_hash_murmur(canonical_kmer, self.seed); - Ok(Some(hash)) + Ok(hash) } fn protein_hash(&self, frame: &ReadingFrame) -> u64 { @@ -460,12 +461,7 @@ impl Iterator for SeqToHashes { // Delegate to DNA or protein processing let result = match frame { ReadingFrame::DNA { .. } => match self.dna_hash(frame) { - Ok(Some(hash)) => Ok(hash), // Valid hash - Ok(None) => { - // Skipped invalid k-mer - self.kmer_index += 1; - continue; - } + Ok(hash) => Ok(hash), // Valid hash Err(err) => Err(err), // Error }, ReadingFrame::Protein { .. } => Ok(self.protein_hash(frame)), From 6c18180ad32fb659debe09ea8fe039de527c7cd8 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Wed, 11 Dec 2024 17:22:21 -0800 Subject: [PATCH 14/16] apply lint suggestion; add back indexing comment --- src/core/src/signature.rs | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs index cbca38eed9..84e9d91477 100644 --- a/src/core/src/signature.rs +++ b/src/core/src/signature.rs @@ -204,7 +204,6 @@ impl ReadingFrame { ReadingFrame::DNA { fw, rc, len } } - // TODO -- handle bad DNA sequence here?? pub fn new_protein(sequence: &[u8], dayhoff: bool, hp: bool) -> Self { let fw: Vec = if dayhoff { sequence.iter().map(|&aa| aa_to_dayhoff(aa)).collect() @@ -423,11 +422,21 @@ impl SeqToHashes { let rc = frame.rc(); // Validate the k-mer. Skip if invalid and force is true - match self.validate_dna_kmer(kmer)? { - false => return Ok(0), // Skip this k-mer - true => {} + if !self.validate_dna_kmer(kmer)? { + return Ok(0); // Skip invalid k-mer } + // For a ksize = 3, and a sequence AGTCGT (len = 6): + // +-+---------+---------------+-------+ + // seq RC |i|i + ksize|len - ksize - i|len - i| + // AGTCGT ACGACT +-+---------+---------------+-------+ + // +-> +-> |0| 2 | 3 | 6 | + // +-> +-> |1| 3 | 2 | 5 | + // +-> +-> |2| 4 | 1 | 4 | + // +-> +-> |3| 5 | 0 | 3 | + // +-+---------+---------------+-------+ + // (leaving this table here because I had to draw to + // get the indices correctly) let reverse_index = frame.length() - self.k_size - self.kmer_index; let krc = &rc[reverse_index..reverse_index + self.k_size]; From af2af553d8c7d20aff5fd4dfc899c8bc8af6e76b Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Thu, 12 Dec 2024 07:12:58 -0800 Subject: [PATCH 15/16] fix uppercasing + validation --- src/core/src/signature.rs | 193 +++++++++++++++++----------------- tests/test_sourmash_sketch.py | 8 +- 2 files changed, 102 insertions(+), 99 deletions(-) diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs index 84e9d91477..abffbc19f7 100644 --- a/src/core/src/signature.rs +++ b/src/core/src/signature.rs @@ -10,6 +10,7 @@ use std::path::Path; use std::str; use cfg_if::cfg_if; +use itertools::Itertools; #[cfg(feature = "parallel")] use rayon::prelude::*; use serde::{Deserialize, Serialize}; @@ -171,7 +172,7 @@ pub enum ReadingFrame { len: usize, // len gives max_index for kmer iterator }, Protein { - fw: Vec, // Only forward frame + fw: Vec, len: usize, }, } @@ -198,58 +199,64 @@ impl std::fmt::Display for ReadingFrame { impl ReadingFrame { pub fn new_dna(sequence: &[u8]) -> Self { - let fw = sequence.to_vec(); - let rc = revcomp(sequence); + let fw = sequence.to_ascii_uppercase(); + let rc = revcomp(&fw); let len = sequence.len(); ReadingFrame::DNA { fw, rc, len } } pub fn new_protein(sequence: &[u8], dayhoff: bool, hp: bool) -> Self { + let seq = sequence.to_ascii_uppercase(); let fw: Vec = if dayhoff { - sequence.iter().map(|&aa| aa_to_dayhoff(aa)).collect() + seq.iter().map(|&aa| aa_to_dayhoff(aa)).collect() } else if hp { - sequence.iter().map(|&aa| aa_to_hp(aa)).collect() + seq.iter().map(|&aa| aa_to_hp(aa)).collect() } else { - sequence.to_vec() // protein, as-is. + seq }; let len = fw.len(); ReadingFrame::Protein { fw, len } } - pub fn new_skipmer(seq: &[u8], start: usize, m: usize, n: usize) -> Self { + pub fn new_skipmer(sequence: &[u8], start: usize, m: usize, n: usize) -> Self { + let seq = sequence.to_ascii_uppercase(); if start >= n { panic!("Skipmer frame number must be < n ({})", n); } - // Generate forward skipmer frame - let fw: Vec = seq - .iter() - .skip(start) - .enumerate() - .filter_map(|(i, &base)| if i % n < m { Some(base) } else { None }) - .collect(); + // do we need to round up? (+1) + let mut fw = Vec::with_capacity(((seq.len() * m) + 1) / n); + seq.iter().skip(start).enumerate().for_each(|(i, &base)| { + if i % n < m { + fw.push(base.to_ascii_uppercase()); + } + }); let len = fw.len(); let rc = revcomp(&fw); ReadingFrame::DNA { fw, rc, len } } + // this is the only one that doesn't uppercase in here b/c more efficient to uppercase externally :/ pub fn new_translated(sequence: &[u8], frame_number: usize, dayhoff: bool, hp: bool) -> Self { if frame_number > 2 { panic!("Frame number must be 0, 1, or 2"); } - // translate sequence - let fw: Vec = sequence + // Translate sequence into amino acids + let mut fw = Vec::with_capacity(sequence.len() / 3); + // NOTE: b/c of chunks(3), we only process full codons and ignore leftover bases (e.g. 1 or 2 at end of frame) + sequence .iter() - .cloned() - .skip(frame_number) // skip the initial bases for the frame - .take(sequence.len() - frame_number) // adjust length based on skipped bases - .collect::>() // collect the DNA subsequence - .chunks(3) // group into codons (triplets) - .filter_map(|codon| to_aa(codon, dayhoff, hp).ok()) // translate each codon - .flatten() // flatten the nested results into a single sequence - .collect(); + .skip(frame_number) // Skip the initial bases for the frame + .take(sequence.len() - frame_number) // Adjust length based on skipped bases + .chunks(3) // Group into codons (triplets) using itertools + .into_iter() + .filter_map(|chunk| { + let codon: Vec = chunk.cloned().collect(); // Collect the chunk into a Vec + to_aa(&codon, dayhoff, hp).ok() // Translate the codon + }) + .for_each(|aa| fw.extend(aa)); // Extend `fw` with amino acids let len = fw.len(); @@ -258,6 +265,7 @@ impl ReadingFrame { } /// Get the forward sequence. + #[inline] pub fn fw(&self) -> &[u8] { match self { ReadingFrame::DNA { fw, .. } => fw, @@ -266,6 +274,7 @@ impl ReadingFrame { } /// Get the reverse complement sequence (if DNA). + #[inline] pub fn rc(&self) -> &[u8] { match self { ReadingFrame::DNA { rc, .. } => rc, @@ -273,6 +282,7 @@ impl ReadingFrame { } } + #[inline] pub fn length(&self) -> usize { match self { ReadingFrame::DNA { len, .. } => *len, @@ -294,8 +304,9 @@ pub struct SeqToHashes { force: bool, seed: u64, frames: Vec, - frame_index: usize, // Index of the current frame - kmer_index: usize, // Current k-mer index within the frame + frame_index: usize, // Index of the current frame + kmer_index: usize, // Current k-mer index within the frame + last_position_check: usize, // Index of last base we validated } impl SeqToHashes { @@ -314,19 +325,17 @@ impl SeqToHashes { ksize = k_size / 3; } - // uppercase the sequence. this clones the data bc &[u8] is immutable? - // TODO: could we avoid this by changing revcomp/VALID/etc? - let sequence = seq.to_ascii_uppercase(); - // Generate frames based on sequence type and hash function - let frames = if is_protein { - Self::protein_frames(&sequence, &hash_function) + let frames = if hash_function.dna() { + Self::dna_frames(&seq) + } else if is_protein { + Self::protein_frames(&seq, &hash_function) } else if hash_function.protein() || hash_function.dayhoff() || hash_function.hp() { - Self::translated_frames(&sequence, &hash_function) + Self::translated_frames(&seq, &hash_function) } else if hash_function.skipm1n3() || hash_function.skipm2n3() { - Self::skipmer_frames(&sequence, &hash_function, ksize) + Self::skipmer_frames(&seq, &hash_function, ksize) } else { - Self::dna_frames(&sequence) + unimplemented!(); }; SeqToHashes { @@ -336,6 +345,7 @@ impl SeqToHashes { frames, frame_index: 0, kmer_index: 0, + last_position_check: 0, } } @@ -355,12 +365,14 @@ impl SeqToHashes { /// generate translated frames: 6 protein frames fn translated_frames(seq: &[u8], hash_function: &HashFunctions) -> Vec { - let revcomp_sequence = revcomp(seq); + // since we need to revcomp BEFORE making ReadingFrames, uppercase the sequence here + let sequence = seq.to_ascii_uppercase(); + let revcomp_sequence = revcomp(&sequence); (0..3) .flat_map(|frame_number| { vec![ ReadingFrame::new_translated( - seq, + &sequence, frame_number, hash_function.dayhoff(), hash_function.hp(), @@ -398,59 +410,6 @@ impl SeqToHashes { fn out_of_bounds(&self, frame: &ReadingFrame) -> bool { self.kmer_index + self.k_size > frame.length() } - - // check all bases are valid - fn validate_dna_kmer(&self, kmer: &[u8]) -> Result { - for &nt in kmer { - if !VALID[nt as usize] { - if self.force { - // Return `false` to indicate invalid k-mer, but do not error out - return Ok(false); - } else { - return Err(Error::InvalidDNA { - message: String::from_utf8_lossy(kmer).to_string(), - }); - } - } - } - Ok(true) // All bases are valid - } - - /// Process a DNA k-mer, including canonicalization and validation - fn dna_hash(&self, frame: &ReadingFrame) -> Result { - let kmer = &frame.fw()[self.kmer_index..self.kmer_index + self.k_size]; - let rc = frame.rc(); - - // Validate the k-mer. Skip if invalid and force is true - if !self.validate_dna_kmer(kmer)? { - return Ok(0); // Skip invalid k-mer - } - - // For a ksize = 3, and a sequence AGTCGT (len = 6): - // +-+---------+---------------+-------+ - // seq RC |i|i + ksize|len - ksize - i|len - i| - // AGTCGT ACGACT +-+---------+---------------+-------+ - // +-> +-> |0| 2 | 3 | 6 | - // +-> +-> |1| 3 | 2 | 5 | - // +-> +-> |2| 4 | 1 | 4 | - // +-> +-> |3| 5 | 0 | 3 | - // +-+---------+---------------+-------+ - // (leaving this table here because I had to draw to - // get the indices correctly) - let reverse_index = frame.length() - self.k_size - self.kmer_index; - let krc = &rc[reverse_index..reverse_index + self.k_size]; - - // Compute canonical hash - let canonical_kmer = std::cmp::min(kmer, krc); - let hash = crate::_hash_murmur(canonical_kmer, self.seed); - - Ok(hash) - } - - fn protein_hash(&self, frame: &ReadingFrame) -> u64 { - let kmer = &frame.fw()[self.kmer_index..self.kmer_index + self.k_size]; - crate::_hash_murmur(kmer, self.seed) // build and return hash - } } impl Iterator for SeqToHashes { @@ -464,22 +423,60 @@ impl Iterator for SeqToHashes { if self.out_of_bounds(frame) { self.frame_index += 1; self.kmer_index = 0; // Reset for the next frame + self.last_position_check = 0; continue; } - // Delegate to DNA or protein processing let result = match frame { - ReadingFrame::DNA { .. } => match self.dna_hash(frame) { - Ok(hash) => Ok(hash), // Valid hash - Err(err) => Err(err), // Error - }, - ReadingFrame::Protein { .. } => Ok(self.protein_hash(frame)), + ReadingFrame::DNA { .. } => { + let kmer = &frame.fw()[self.kmer_index..self.kmer_index + self.k_size]; + let rc = frame.rc(); + + // Validate k-mer bases + for j in std::cmp::max(self.kmer_index, self.last_position_check) + ..self.kmer_index + self.k_size + { + if !VALID[frame.fw()[j] as usize] { + if !self.force { + // Return an error if force is false + return Some(Err(Error::InvalidDNA { + message: String::from_utf8(kmer.to_vec()).unwrap(), + })); + } else { + // Skip the invalid k-mer + self.kmer_index += 1; + return Some(Ok(0)); + } + } + self.last_position_check += 1; + } + + // Compute canonical hash + // For a ksize = 3, and a sequence AGTCGT (len = 6): + // +-+---------+---------------+-------+ + // seq RC |i|i + ksize|len - ksize - i|len - i| + // AGTCGT ACGACT +-+---------+---------------+-------+ + // +-> +-> |0| 2 | 3 | 6 | + // +-> +-> |1| 3 | 2 | 5 | + // +-> +-> |2| 4 | 1 | 4 | + // +-> +-> |3| 5 | 0 | 3 | + // +-+---------+---------------+-------+ + // (leaving this table here because I had to draw to + // get the indices correctly) + let krc = &rc[frame.length() - self.k_size - self.kmer_index + ..frame.length() - self.kmer_index]; + let hash = crate::_hash_murmur(std::cmp::min(kmer, krc), self.seed); + Ok(hash) + } + ReadingFrame::Protein { .. } => { + let kmer = &frame.fw()[self.kmer_index..self.kmer_index + self.k_size]; + Ok(crate::_hash_murmur(kmer, self.seed)) + } }; - self.kmer_index += 1; // Advance k-mer index + self.kmer_index += 1; // Advance k-mer index for valid k-mers return Some(result); } - None // No more frames or k-mers } } diff --git a/tests/test_sourmash_sketch.py b/tests/test_sourmash_sketch.py index 88332b1945..b95b081a75 100644 --- a/tests/test_sourmash_sketch.py +++ b/tests/test_sourmash_sketch.py @@ -345,6 +345,7 @@ def test_protein_override_bad_rust_foo(): siglist = factory() assert len(siglist) == 1 sig = siglist[0] + print(sig.minhash.ksize) # try adding something testdata1 = utils.get_test_data("ecoli.faa") @@ -354,7 +355,12 @@ def test_protein_override_bad_rust_foo(): with pytest.raises(ValueError) as exc: sig.add_protein(record.sequence) - assert 'Invalid hash function: "DNA"' in str(exc) + # assert 'Invalid hash function: "DNA"' in str(exc) + + # this case now ends up in the "DNA" section of SeqToHashes, + # so we run into the invalid k-mer error + # instead of invalid Hash Function. + assert "invalid DNA character in input k-mer: MRVLKFGGTS" in str(exc) def test_dayhoff_defaults(): From 28781bcb2dac0563d2ca3d8b6cc6e8e3511fb7f4 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Thu, 12 Dec 2024 11:53:31 -0800 Subject: [PATCH 16/16] clippy fixes; rm comment --- src/core/src/signature.rs | 8 ++++---- tests/test_sourmash_sketch.py | 5 ----- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs index abffbc19f7..5cc60ca299 100644 --- a/src/core/src/signature.rs +++ b/src/core/src/signature.rs @@ -327,13 +327,13 @@ impl SeqToHashes { // Generate frames based on sequence type and hash function let frames = if hash_function.dna() { - Self::dna_frames(&seq) + Self::dna_frames(seq) } else if is_protein { - Self::protein_frames(&seq, &hash_function) + Self::protein_frames(seq, &hash_function) } else if hash_function.protein() || hash_function.dayhoff() || hash_function.hp() { - Self::translated_frames(&seq, &hash_function) + Self::translated_frames(seq, &hash_function) } else if hash_function.skipm1n3() || hash_function.skipm2n3() { - Self::skipmer_frames(&seq, &hash_function, ksize) + Self::skipmer_frames(seq, &hash_function, ksize) } else { unimplemented!(); }; diff --git a/tests/test_sourmash_sketch.py b/tests/test_sourmash_sketch.py index b95b081a75..a9af0fdd24 100644 --- a/tests/test_sourmash_sketch.py +++ b/tests/test_sourmash_sketch.py @@ -355,11 +355,6 @@ def test_protein_override_bad_rust_foo(): with pytest.raises(ValueError) as exc: sig.add_protein(record.sequence) - # assert 'Invalid hash function: "DNA"' in str(exc) - - # this case now ends up in the "DNA" section of SeqToHashes, - # so we run into the invalid k-mer error - # instead of invalid Hash Function. assert "invalid DNA character in input k-mer: MRVLKFGGTS" in str(exc)