Skip to content

Commit

Permalink
Making sure canonical kmer hashmaps are the right size.
Browse files Browse the repository at this point in the history
  • Loading branch information
Euphrasiologist committed Jul 15, 2021
1 parent 2558986 commit 02b1f6b
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 9 deletions.
42 changes: 40 additions & 2 deletions src/kmer_maps.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,28 @@ pub mod kmer_maps {
use std::collections::HashMap;
use std::fmt::{Display, Error, Formatter};

use crate::kmeru8::kmeru8::reverse_complement;

/// A hash map keyed by byte-encoded k-mers, bundled with a length value.
///
/// Derives `Debug` and `Clone`, so instances can be printed for diagnostics
/// and duplicated (cloning copies the whole map).
#[derive(Debug, Clone)]
pub struct KmerMap {
/// NOTE(review): presumably the k-mer length (k) shared by every key in
/// `map` -- confirm against the constructor in `generate_kmer_maps`.
pub len: usize,
/// Keys are k-mers stored as raw bytes; each key maps to an `i32`.
/// NOTE(review): the value looks like an occurrence count -- verify where
/// the map is updated.
pub map: HashMap<Vec<u8>, i32>,
}

pub fn generate_kmer_maps() -> Vec<KmerMap> {
pub fn generate_kmer_maps(canonical: bool) -> Vec<KmerMap> {
let kmer_maps: Vec<KmerMap> = vec![2, 3, 4]
.iter()
.map(|i| {
let kmer_i = gen_all_kmers(*i);
let mut kmer_i = gen_all_kmers(*i);

// if canonical = true, call filter_canonical
match canonical {
true => {
kmer_i = filter_canonical(kmer_i);
}
false => (),
}

let mut kmers_u8: Vec<Vec<u8>> = Vec::new();
for i in kmer_i {
kmers_u8.push(i.as_bytes().to_vec());
Expand All @@ -30,6 +41,33 @@ pub mod kmer_maps {
kmer_maps
}

/// Reduce a list of k-mers to its canonical representatives.
///
/// Each k-mer is compared with its reverse complement (as produced by
/// `reverse_complement`) and whichever of the two is lexicographically
/// smaller is kept. The survivors are then sorted and de-duplicated, so a
/// hashmap built from the result only ever holds canonical k-mer keys.
///
/// # Panics
///
/// Panics if the bytes returned by `reverse_complement` are not valid UTF-8.
fn filter_canonical(kmers: Vec<String>) -> Vec<String> {
    let mut canonical: Vec<String> = kmers
        .into_iter()
        .map(|forward| {
            let rc_bytes = reverse_complement(forward.as_bytes());
            let reverse = std::str::from_utf8(&rc_bytes).unwrap().to_owned();

            // On a tie the two strings are identical, so either choice
            // yields the same k-mer.
            if forward < reverse {
                forward
            } else {
                reverse
            }
        })
        .collect();

    // A k-mer and its reverse complement both map to the same canonical
    // form; sorting first places those duplicates next to each other so
    // `dedup` can drop them.
    canonical.sort_unstable();
    canonical.dedup();
    canonical
}

// move out of kmeru8.rs
fn gen_all_kmers(k: usize) -> Vec<String> {
let mut output: Vec<String> = Vec::new();
Expand Down
3 changes: 2 additions & 1 deletion src/kmeru8.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ pub mod kmeru8 {
// unfortunately this creates a copy
// but in place manipulation seems difficult, because rust.
let mut kmer_upper = kmer.to_ascii_uppercase();
// canonical is really slow compared to non-canonical.
if canonical {
// switch to lexicographically lower kmer
let rev_kmer = reverse_complement(&kmer_upper);
Expand Down Expand Up @@ -137,7 +138,7 @@ pub mod kmeru8 {
diversity
}

fn reverse_complement(dna: &[u8]) -> Vec<u8> {
pub fn reverse_complement(dna: &[u8]) -> Vec<u8> {
let dna_vec = dna.to_vec();
let mut revcomp = Vec::new();

Expand Down
9 changes: 3 additions & 6 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,7 @@ fn main() {
Arg::with_name("canonical_kmers")
.short("c")
.long("canonical_kmers")
.help("Should the canonical kmers be calculated? Boolean, input true or false.")
.takes_value(true)
.default_value("false"),
.help("Should the canonical kmers be calculated?"),
)
.arg(
Arg::with_name("output")
Expand All @@ -65,8 +63,7 @@ fn main() {
let input_fasta = matches.value_of("fasta").unwrap();
let output = matches.value_of("output").unwrap();
let window_size = value_t!(matches.value_of("window_size"), usize).unwrap_or_else(|e| e.exit());
let canonical_kmers =
value_t!(matches.value_of("canonical_kmers"), bool).unwrap_or_else(|e| e.exit());
let canonical_kmers = matches.is_present("canonical_kmers");
let masked = matches.is_present("masked");

// create directory for output
Expand Down Expand Up @@ -120,7 +117,7 @@ fn main() {
}

// compute the 2-4mer kmer maps once only
let kmer_maps = kmer_maps::generate_kmer_maps();
let kmer_maps = kmer_maps::generate_kmer_maps(canonical_kmers);

// channel for collecting output
let (sender, receiver) = channel();
Expand Down

0 comments on commit 02b1f6b

Please sign in to comment.