From 05d6d28c6373fe1078a537c17a26065afbd48ed8 Mon Sep 17 00:00:00 2001 From: Max Brown Date: Thu, 15 Jul 2021 18:09:23 +0100 Subject: [PATCH] Fixes #5 - headers for output kmer frequencies done. --- Cargo.lock | 12 ++++- Cargo.toml | 3 +- src/kmer_maps.rs | 21 ++++++++- src/main.rs | 116 ++++++++++++++++++++++++++++++++--------------- 4 files changed, 112 insertions(+), 40 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f8b192d..24bbe21 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -64,7 +64,7 @@ dependencies = [ "fnv", "fxhash", "getset", - "itertools", + "itertools 0.9.0", "itertools-num", "lazy_static", "multimap", @@ -321,6 +321,7 @@ dependencies = [ "bio", "clap", "indicatif", + "itertools 0.10.1", "rayon", ] @@ -428,6 +429,15 @@ dependencies = [ "either", ] +[[package]] +name = "itertools" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69ddb889f9d0d08a67338271fa9b62996bc788c7796a5c18cf057420aaed5eaf" +dependencies = [ + "either", +] + [[package]] name = "itertools-num" version = "0.1.3" diff --git a/Cargo.toml b/Cargo.toml index 405380e..abbcc29 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,4 +8,5 @@ edition = "2018" bio = "*" clap = "~2.27.0" rayon = "1.5.0" -indicatif = "*" \ No newline at end of file +indicatif = "*" +itertools = "0.10.1" \ No newline at end of file diff --git a/src/kmer_maps.rs b/src/kmer_maps.rs index 69e0575..6f816dd 100644 --- a/src/kmer_maps.rs +++ b/src/kmer_maps.rs @@ -87,7 +87,7 @@ pub mod kmer_maps { output } - // display for Vec + // hacky display for Vec #[derive(Clone)] pub struct WriteArray(pub Vec); @@ -104,4 +104,23 @@ pub mod kmer_maps { write!(f, "{}", tab_separated) } } + // hacky display for Vec> + // which is what the kmers are stored as for most of the time. 
+ /* Tuple-struct wrapper around a Vec of borrowed Vecs, declared so the collection can implement Display. NOTE(review): the element type of the inner Vec is not visible in this text; the from_utf8 calls below suggest bytes (confirm against the repository). */ #[derive(Clone)] + pub struct WriteKmerValues<'a>(pub Vec<&'a Vec>); + + /* Renders the wrapped values as a single tab-separated string with no trailing tab. */ impl<'a> Display for WriteKmerValues<'a> { + /* Panics if the wrapped Vec is empty (self.0.len() - 1 has no valid value for length 0) or if any value is not valid UTF-8 (both from_utf8 results are unwrapped). */ fn fmt(&self, f: &mut Formatter) -> Result<(), Error> { + let mut tab_separated = String::new(); + + /* every value except the last is followed by a tab */ for kmer in &self.0[0..self.0.len() - 1] { + let kmer_str = std::str::from_utf8(kmer).unwrap(); + tab_separated.push_str(kmer_str); + tab_separated.push_str("\t"); + } + + /* the last value is appended without a separator */ tab_separated.push_str(std::str::from_utf8(&self.0[self.0.len() - 1]).unwrap()); + write!(f, "{}", tab_separated) + } + } } diff --git a/src/main.rs b/src/main.rs index 506ed6e..a41d042 100644 --- a/src/main.rs +++ b/src/main.rs @@ -9,6 +9,7 @@ use std::sync::mpsc::channel; use bio::io::fasta; use clap::{value_t, App, Arg}; use indicatif::{ProgressBar, ProgressStyle}; +use itertools::Itertools; use rayon::prelude::*; // internal imports @@ -201,6 +202,7 @@ fn main() { // write files // I pass over &res four times here... // pretty inefficient. + // I should really separate these functions from main. It's just lazy. eprintln!("[+]\tWriting output to files"); for i in &res { @@ -228,44 +230,84 @@ fn main() { window_file.flush().unwrap(); // these are the arrays, tab separated bed-like format. 
- for i in &res { - writeln!( - window_file_2, - "{}\t{}\t{}\t{}", - i.id, - i.start, - i.end, - kmer_maps::WriteArray(i.divalues.clone()) - ) - .unwrap_or_else(|_| eprintln!("[-]\tError in writing to file.")); - } - window_file_2.flush().unwrap(); - // - for i in &res { - writeln!( - window_file_3, - "{}\t{}\t{}\t{}", - i.id, - i.start, - i.end, - kmer_maps::WriteArray(i.trivalues.clone()) - ) - .unwrap_or_else(|_| eprintln!("[-]\tError in writing to file.")); - } - window_file_3.flush().unwrap(); - // - for i in &res { - writeln!( - window_file_4, - "{}\t{}\t{}\t{}", - i.id, - i.start, - i.end, - kmer_maps::WriteArray(i.tetravalues.clone()) - ) - .unwrap_or_else(|_| eprintln!("[-]\tError in writing to file.")); + // TODO: add headers for each of these + match kmer_maps.as_slice() { + [two, three, four] => { + // headers for dinucs + let mut dinuc_headers = Vec::new(); + for key in two.map.keys().sorted() { + dinuc_headers.push(key) + } + writeln!( + window_file_2, + "id\tstart\tend\t{}", + kmer_maps::WriteKmerValues(dinuc_headers) + ) + .unwrap_or_else(|_| eprintln!("[-]\tError in writing to file.")); + + for i in &res { + writeln!( + window_file_2, + "{}\t{}\t{}\t{}", + i.id, + i.start, + i.end, + kmer_maps::WriteArray(i.divalues.clone()) + ) + .unwrap_or_else(|_| eprintln!("[-]\tError in writing to file.")); + } + window_file_2.flush().unwrap(); + + // headers for trinucs + let mut trinuc_headers = Vec::new(); + for key in three.map.keys().sorted() { + trinuc_headers.push(key) + } + writeln!( + window_file_3, + "id\tstart\tend\t{}", + kmer_maps::WriteKmerValues(trinuc_headers) + ) + .unwrap_or_else(|_| eprintln!("[-]\tError in writing to file.")); + for i in &res { + writeln!( + window_file_3, + "{}\t{}\t{}\t{}", + i.id, + i.start, + i.end, + kmer_maps::WriteArray(i.trivalues.clone()) + ) + .unwrap_or_else(|_| eprintln!("[-]\tError in writing to file.")); + } + window_file_3.flush().unwrap(); + // + // headers for tetranucs + let mut tetranuc_headers = 
Vec::new(); + for key in four.map.keys().sorted() { + tetranuc_headers.push(key) + } + writeln!( + window_file_4, + "id\tstart\tend\t{}", + kmer_maps::WriteKmerValues(tetranuc_headers) + ) + .unwrap_or_else(|_| eprintln!("[-]\tError in writing to file.")); + for i in &res { + writeln!( + window_file_4, + "{}\t{}\t{}\t{}", + i.id, + i.start, + i.end, + kmer_maps::WriteArray(i.tetravalues.clone()) + ) + .unwrap_or_else(|_| eprintln!("[-]\tError in writing to file.")); + } + window_file_4.flush().unwrap(); + } + [..] => {} // Needed to make the patterns exhaustive } - window_file_4.flush().unwrap(); eprintln!("[+]\tOutput written to directory: ./fw_out/{}", output); }