From 2ba40b0e4c5696d3589030b2edfed75333646b9f Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Mon, 2 Dec 2024 17:18:46 -0800 Subject: [PATCH] init ReadingFrames --- src/core/src/encodings.rs | 93 +++++++++++++++++++++++++++++++++++++++ src/core/src/signature.rs | 17 ++++++- 2 files changed, 108 insertions(+), 2 deletions(-) diff --git a/src/core/src/encodings.rs b/src/core/src/encodings.rs index a7c2f3cef..5a215a3e9 100644 --- a/src/core/src/encodings.rs +++ b/src/core/src/encodings.rs @@ -94,6 +94,21 @@ impl TryFrom<&str> for HashFunctions { } } +#[derive(Debug)] +pub struct ReadingFrames { + forward: [Vec; 3], + revcomp: [Vec; 3], +} + +impl ReadingFrames { + pub fn new(forward: [Vec; 3], rc: [Vec; 3]) -> Self { + ReadingFrames { + forward, + revcomp: rc, + } + } +} + const COMPLEMENT: [u8; 256] = { let mut lookup = [0; 256]; lookup[b'A' as usize] = b'T'; @@ -112,6 +127,84 @@ pub fn revcomp(seq: &[u8]) -> Vec { .collect() } +/// Generate a single translated frame from a DNA sequence +/// +/// * `sequence`: The DNA sequence as a slice of bytes. +/// * `frame_number`: The frame to translate (0, 1, or 2). +/// * `dayhoff`: Whether to use the Dayhoff amino acid alphabet. +/// * `hp`: Whether to use the hydrophobic-polar amino acid alphabet. +/// +/// Returns a translated frame as a `Vec`. +pub fn translated_frame(sequence: &[u8], frame_number: usize, dayhoff: bool, hp: bool) -> Vec { + if frame_number > 2 { + panic!("Frame number must be 0, 1, or 2"); + } + + sequence + .iter() + .cloned() + .skip(frame_number) // Skip the initial bases for the frame + .take(sequence.len() - frame_number) // Adjust length based on skipped bases + .collect::>() // Collect the DNA subsequence + .chunks(3) // Group into codons (triplets) + .filter_map(|codon| to_aa(codon, dayhoff, hp).ok()) // Translate each codon + .flatten() // Flatten the nested results into a single sequence + .collect() +} + +pub fn make_translated_frames( + sequence: &[u8], + dna_rc: &[u8], + dayhoff: bool, + hp: bool, +) -> ReadingFrames { + // Generate forward frames + let forward = [ + translated_frame(sequence, 0, dayhoff, hp), + translated_frame(sequence, 1, dayhoff, hp), + translated_frame(sequence, 2, dayhoff, hp), + ]; + + // Generate reverse complement frames + let revcomp = [ + translated_frame(dna_rc, 0, dayhoff, hp), + translated_frame(dna_rc, 1, dayhoff, hp), + translated_frame(dna_rc, 2, dayhoff, hp), + ]; + + // Return a ReadingFrames object + ReadingFrames::new(forward, revcomp) +} + +fn skipmer_frame(seq: &[u8], start: usize, n: usize, m: usize) -> Vec { + seq.iter() + .skip(start) + .enumerate() + .filter_map(|(i, &base)| if i % n < m { Some(base) } else { None }) + .collect() +} + +pub fn make_skipmer_frames(seq: &[u8], n: usize, m: usize) -> ReadingFrames { + if m >= n { + panic!("m must be less than n"); + } + // Generate the first three forward frames + let forward: [Vec; 3] = [ + skipmer_frame(seq, 0, n, m), + skipmer_frame(seq, 1, n, m), + skipmer_frame(seq, 2, n, m), + ]; + // Generate the reverse complement frames + let reverse_complement: [Vec; 3] = [ + revcomp(&forward[0]), + revcomp(&forward[1]), + revcomp(&forward[2]), + ]; + + // Return the frames in a structured format + ReadingFrames::new(forward, reverse_complement) +} + static CODONTABLE: Lazy> = Lazy::new(|| { [ // F diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs index b84c593f6..ac580ed0a 100644 --- a/src/core/src/signature.rs +++ b/src/core/src/signature.rs @@ -287,8 +287,12 @@ impl Iterator for SeqToHashes { self.skip_m = 2 }; // eqn from skipmer paper. might want to add one to dna_ksize to round up? - self.skip_len = - self.skip_n * (((self.dna_ksize) / self.skip_m) - 1) + self.skip_m; + // to do - check if we need to enforce k = multiple of n for revcomp k-mers to work + // , or if we can keep the round up trick I'm using here. + eprintln!("setting skipmer extended length"); + self.skip_len = (self.skip_n * (((self.dna_ksize + 1) / self.skip_m) - 1)) + + self.skip_m; + eprintln!("skipmer extended length: {}", self.skip_len); // my prior eqn // self.skip_len = self.dna_ksize + ((self.dna_ksize + 1) / self.skip_m) - 1; // add 1 to round up rather than down @@ -296,8 +300,10 @@ impl Iterator for SeqToHashes { if self.k_size < self.skip_n { unimplemented!() } + self.skipmer_configured = true; } // have enough sequence to kmerize? + eprintln!("checking seq len"); if self.dna_len < self.dna_ksize || (self.hash_function.protein() && self.dna_len < self.k_size * 3) || (self.hash_function.dayhoff() && self.dna_len < self.k_size * 3) @@ -307,12 +313,15 @@ impl Iterator for SeqToHashes { return None; } // pre-calculate the reverse complement for the full sequence... + eprintln!("precalculating revcomp"); + // NOTE: Shall we precalc skipmer seq here too? + maybe translated frames? self.dna_rc = revcomp(&self.sequence); self.dna_configured = true; } // Processing DNA if self.hash_function.dna() { + eprintln!("processing DNA"); let kmer = &self.sequence[self.kmer_index..self.kmer_index + self.dna_ksize]; // validate the bases @@ -353,6 +362,7 @@ impl Iterator for SeqToHashes { self.kmer_index += 1; Some(Ok(hash)) } else if self.skipmer_configured { + eprintln!("processing skipmer"); // Check bounds to ensure we don't exceed the sequence length if self.kmer_index + self.skip_len > self.sequence.len() { return None; @@ -372,8 +382,10 @@ impl Iterator for SeqToHashes { self.kmer_index += 1; // Move to the next position if skipping is forced return Some(result); } + eprintln!("base {}", base); kmer.push(base); } + // eprintln!("skipmer kmer: {:?}", kmer); // Generate reverse complement skipmer let krc: Vec = self.dna_rc[self.dna_len - self.skip_len - self.kmer_index @@ -387,6 +399,7 @@ impl Iterator for SeqToHashes { let hash = crate::_hash_murmur(std::cmp::min(&kmer, &krc), self.seed); self.kmer_index += 1; + eprintln!("built skipmer hash"); Some(Ok(hash)) } else if self.hashes_buffer.is_empty() && self.translate_iter_step == 0 { // Processing protein by translating DNA