From 02b1f6b67349a0eb5d037b0a5b87d1b11dd12a57 Mon Sep 17 00:00:00 2001 From: Max Brown Date: Thu, 15 Jul 2021 15:18:40 +0100 Subject: [PATCH] Making sure canonical kmer hashmaps right size. --- src/kmer_maps.rs | 42 ++++++++++++++++++++++++++++++++++++++++-- src/kmeru8.rs | 3 ++- src/main.rs | 9 +++------ 3 files changed, 45 insertions(+), 9 deletions(-) diff --git a/src/kmer_maps.rs b/src/kmer_maps.rs index 08abb91..69e0575 100644 --- a/src/kmer_maps.rs +++ b/src/kmer_maps.rs @@ -3,17 +3,28 @@ pub mod kmer_maps { use std::collections::HashMap; use std::fmt::{Display, Error, Formatter}; + use crate::kmeru8::kmeru8::reverse_complement; + #[derive(Debug, Clone)] pub struct KmerMap { pub len: usize, pub map: HashMap, i32>, } - pub fn generate_kmer_maps() -> Vec { + pub fn generate_kmer_maps(canonical: bool) -> Vec { let kmer_maps: Vec = vec![2, 3, 4] .iter() .map(|i| { - let kmer_i = gen_all_kmers(*i); + let mut kmer_i = gen_all_kmers(*i); + + // if canonical = true, call filter_canonical + match canonical { + true => { + kmer_i = filter_canonical(kmer_i); + } + false => (), + } + let mut kmers_u8: Vec> = Vec::new(); for i in kmer_i { kmers_u8.push(i.as_bytes().to_vec()); @@ -30,6 +41,33 @@ pub mod kmer_maps { kmer_maps } + // filter Vec of kmers for canonical only. + // so the hashmap will only contain canonical kmer keys + + fn filter_canonical(kmers: Vec) -> Vec { + let revcomp_kmers: Vec = kmers + .iter() + .map(|e| { + std::str::from_utf8(&reverse_complement(e.as_bytes())) + .unwrap() + .to_owned() + }) + .collect(); + + let mut canonical_kmers = Vec::new(); + + for (e1, e2) in kmers.into_iter().zip(revcomp_kmers.into_iter()) { + if e1 < e2 { + canonical_kmers.push(e1); + } else { + canonical_kmers.push(e2); + } + } + canonical_kmers.sort_unstable(); + canonical_kmers.dedup(); + canonical_kmers + } + // move out of kmeru8.rs fn gen_all_kmers(k: usize) -> Vec { let mut output: Vec = Vec::new(); diff --git a/src/kmeru8.rs b/src/kmeru8.rs index 95f9465..461aeca 100644 --- a/src/kmeru8.rs +++ b/src/kmeru8.rs @@ -46,6 +46,7 @@ pub mod kmeru8 { // unfortunately this creates a copy // but in place manipulation seems difficult, because rust. let mut kmer_upper = kmer.to_ascii_uppercase(); + // canonical is really slow compared to non-canonical. if canonical { // switch to lexicographically lower kmer let rev_kmer = reverse_complement(&kmer_upper); @@ -137,7 +138,7 @@ pub mod kmeru8 { diversity } - fn reverse_complement(dna: &[u8]) -> Vec { + pub fn reverse_complement(dna: &[u8]) -> Vec { let dna_vec = dna.to_vec(); let mut revcomp = Vec::new(); diff --git a/src/main.rs b/src/main.rs index 77eeb61..506ed6e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -42,9 +42,7 @@ fn main() { Arg::with_name("canonical_kmers") .short("c") .long("canonical_kmers") - .help("Should the canonical kmers be calculated? Boolean, input true or false.") - .takes_value(true) - .default_value("false"), + .help("Should the canonical kmers be calculated?"), ) .arg( Arg::with_name("output") @@ -65,8 +63,7 @@ fn main() { let input_fasta = matches.value_of("fasta").unwrap(); let output = matches.value_of("output").unwrap(); let window_size = value_t!(matches.value_of("window_size"), usize).unwrap_or_else(|e| e.exit()); - let canonical_kmers = - value_t!(matches.value_of("canonical_kmers"), bool).unwrap_or_else(|e| e.exit()); + let canonical_kmers = matches.is_present("canonical_kmers"); let masked = matches.is_present("masked"); // create directory for output @@ -120,7 +117,7 @@ fn main() { } // compute the 2-4mer kmer maps once only - let kmer_maps = kmer_maps::generate_kmer_maps(); + let kmer_maps = kmer_maps::generate_kmer_maps(canonical_kmers); // channel for collecting output let (sender, receiver) = channel();