Skip to content

Commit

Permalink
init ReadingFrames
Browse files Browse the repository at this point in the history
  • Loading branch information
bluegenes committed Dec 3, 2024
1 parent 1868ab2 commit 2ba40b0
Show file tree
Hide file tree
Showing 2 changed files with 108 additions and 2 deletions.
93 changes: 93 additions & 0 deletions src/core/src/encodings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,21 @@ impl TryFrom<&str> for HashFunctions {
}
}

#[derive(Debug)]
pub struct ReadingFrames {
forward: [Vec<u8>; 3],

Check warning on line 99 in src/core/src/encodings.rs

View workflow job for this annotation

GitHub Actions / Check

fields `forward` and `revcomp` are never read

Check failure on line 99 in src/core/src/encodings.rs

View workflow job for this annotation

GitHub Actions / Lints (stable)

fields `forward` and `revcomp` are never read

Check failure on line 99 in src/core/src/encodings.rs

View workflow job for this annotation

GitHub Actions / Lints (beta)

fields `forward` and `revcomp` are never read

Check warning on line 99 in src/core/src/encodings.rs

View workflow job for this annotation

GitHub Actions / test (stable)

fields `forward` and `revcomp` are never read

Check warning on line 99 in src/core/src/encodings.rs

View workflow job for this annotation

GitHub Actions / test (beta)

fields `forward` and `revcomp` are never read

Check warning on line 99 in src/core/src/encodings.rs

View workflow job for this annotation

GitHub Actions / test (windows)

fields `forward` and `revcomp` are never read

Check warning on line 99 in src/core/src/encodings.rs

View workflow job for this annotation

GitHub Actions / minimum_rust_version

fields `forward` and `revcomp` are never read

Check warning on line 99 in src/core/src/encodings.rs

View workflow job for this annotation

GitHub Actions / test (macos)

fields `forward` and `revcomp` are never read
revcomp: [Vec<u8>; 3],
}

impl ReadingFrames {
pub fn new(forward: [Vec<u8>; 3], rc: [Vec<u8>; 3]) -> Self {
ReadingFrames {
forward,
revcomp: rc,
}
}
}

const COMPLEMENT: [u8; 256] = {
let mut lookup = [0; 256];
lookup[b'A' as usize] = b'T';
Expand All @@ -112,6 +127,84 @@ pub fn revcomp(seq: &[u8]) -> Vec<u8> {
.collect()
}

/// Generate a single translated frame from a DNA sequence
///
/// * `sequence`: The DNA sequence as a slice of bytes.
/// * `frame_number`: The frame to translate (0, 1, or 2).
/// * `dayhoff`: Whether to use the Dayhoff amino acid alphabet.
/// * `hp`: Whether to use the hydrophobic-polar amino acid alphabet.
///
/// Returns a translated frame as a `Vec<u8>`.
pub fn translated_frame(sequence: &[u8], frame_number: usize, dayhoff: bool, hp: bool) -> Vec<u8> {
if frame_number > 2 {
panic!("Frame number must be 0, 1, or 2");
}

sequence
.iter()
.cloned()
.skip(frame_number) // Skip the initial bases for the frame
.take(sequence.len() - frame_number) // Adjust length based on skipped bases
.collect::<Vec<u8>>() // Collect the DNA subsequence
.chunks(3) // Group into codons (triplets)
.filter_map(|codon| to_aa(codon, dayhoff, hp).ok()) // Translate each codon
.flatten() // Flatten the nested results into a single sequence
.collect()
}

pub fn make_translated_frames(
sequence: &[u8],
dna_rc: &[u8],
dayhoff: bool,
hp: bool,
) -> ReadingFrames {
// Generate forward frames
let forward = [
translated_frame(sequence, 0, dayhoff, hp),
translated_frame(sequence, 1, dayhoff, hp),
translated_frame(sequence, 2, dayhoff, hp),
];

// Generate reverse complement frames
let revcomp = [
translated_frame(dna_rc, 0, dayhoff, hp),
translated_frame(dna_rc, 1, dayhoff, hp),
translated_frame(dna_rc, 2, dayhoff, hp),
];

// Return a ReadingFrames object
ReadingFrames::new(forward, revcomp)
}

fn skipmer_frame(seq: &[u8], start: usize, n: usize, m: usize) -> Vec<u8> {
seq.iter()
.skip(start)
.enumerate()
.filter_map(|(i, &base)| if i % n < m { Some(base) } else { None })
.collect()
}

pub fn make_skipmer_frames(seq: &[u8], n: usize, m: usize) -> ReadingFrames {
if m >= n {
panic!("m must be less than n");
}
// Generate the first three forward frames
let forward: [Vec<u8>; 3] = [
skipmer_frame(seq, 0, n, m),
skipmer_frame(seq, 1, n, m),
skipmer_frame(seq, 2, n, m),
];
// Generate the reverse complement frames
let reverse_complement: [Vec<u8>; 3] = [
revcomp(&forward[0]),
revcomp(&forward[1]),
revcomp(&forward[2]),
];

// Return the frames in a structured format
ReadingFrames::new(forward, reverse_complement)
}

static CODONTABLE: Lazy<HashMap<&'static str, u8>> = Lazy::new(|| {
[
// F
Expand Down
17 changes: 15 additions & 2 deletions src/core/src/signature.rs
Original file line number Diff line number Diff line change
Expand Up @@ -287,17 +287,23 @@ impl Iterator for SeqToHashes {
self.skip_m = 2
};
// eqn from skipmer paper. might want to add one to dna_ksize to round up?
self.skip_len =
self.skip_n * (((self.dna_ksize) / self.skip_m) - 1) + self.skip_m;
// to do - check if we need to enforce k = multiple of n for revcomp k-mers to work
// , or if we can keep the round up trick I'm using here.
eprintln!("setting skipmer extended length");
self.skip_len = (self.skip_n * (((self.dna_ksize + 1) / self.skip_m) - 1))
+ self.skip_m;
eprintln!("skipmer extended length: {}", self.skip_len);
// my prior eqn
// self.skip_len = self.dna_ksize + ((self.dna_ksize + 1) / self.skip_m) - 1; // add 1 to round up rather than down

// check that we can actually build skipmers
if self.k_size < self.skip_n {
unimplemented!()
}
self.skipmer_configured = true;
}
// have enough sequence to kmerize?
eprintln!("checking seq len");
if self.dna_len < self.dna_ksize
|| (self.hash_function.protein() && self.dna_len < self.k_size * 3)
|| (self.hash_function.dayhoff() && self.dna_len < self.k_size * 3)
Expand All @@ -307,12 +313,15 @@ impl Iterator for SeqToHashes {
return None;
}
// pre-calculate the reverse complement for the full sequence...
eprintln!("precalculating revcomp");
// NOTE: Shall we precalc skipmer seq here too? + maybe translated frames?
self.dna_rc = revcomp(&self.sequence);
self.dna_configured = true;
}

// Processing DNA
if self.hash_function.dna() {
eprintln!("processing DNA");
let kmer = &self.sequence[self.kmer_index..self.kmer_index + self.dna_ksize];

// validate the bases
Expand Down Expand Up @@ -353,6 +362,7 @@ impl Iterator for SeqToHashes {
self.kmer_index += 1;
Some(Ok(hash))
} else if self.skipmer_configured {
eprintln!("processing skipmer");
// Check bounds to ensure we don't exceed the sequence length
if self.kmer_index + self.skip_len > self.sequence.len() {
return None;
Expand All @@ -372,8 +382,10 @@ impl Iterator for SeqToHashes {
self.kmer_index += 1; // Move to the next position if skipping is forced
return Some(result);
}
eprintln!("base {}", base);
kmer.push(base);
}
// eprintln!("skipmer kmer: {:?}", kmer);

// Generate reverse complement skipmer
let krc: Vec<u8> = self.dna_rc[self.dna_len - self.skip_len - self.kmer_index
Expand All @@ -387,6 +399,7 @@ impl Iterator for SeqToHashes {

let hash = crate::_hash_murmur(std::cmp::min(&kmer, &krc), self.seed);
self.kmer_index += 1;
eprintln!("built skipmer hash");
Some(Ok(hash))
} else if self.hashes_buffer.is_empty() && self.translate_iter_step == 0 {
// Processing protein by translating DNA
Expand Down

0 comments on commit 2ba40b0

Please sign in to comment.