diff --git a/src/cli.rs b/src/cli.rs index 4a6ca1ac..c86ce035 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -14,6 +14,7 @@ mod footprint_opts; mod nucleosome_opts; mod pileup_opts; mod predict_opts; +mod qc_opts; mod strip_basemods_opts; // include the subcommand modules as top level functions and structs in the cli module @@ -27,6 +28,7 @@ pub use footprint_opts::*; pub use nucleosome_opts::*; pub use pileup_opts::*; pub use predict_opts::*; +pub use qc_opts::*; pub use strip_basemods_opts::*; // diff --git a/src/cli/qc_opts.rs b/src/cli/qc_opts.rs new file mode 100644 index 00000000..3bc71de2 --- /dev/null +++ b/src/cli/qc_opts.rs @@ -0,0 +1,9 @@ +use crate::utils::input_bam::InputBam; +use clap::Args; +use std::fmt::Debug; + +#[derive(Args, Debug)] +pub struct QcOpts { + #[clap(flatten)] + pub input: InputBam, +} diff --git a/src/subcommands.rs b/src/subcommands.rs index 75aed803..e8f4fd59 100644 --- a/src/subcommands.rs +++ b/src/subcommands.rs @@ -16,5 +16,7 @@ pub mod footprint; pub mod pileup; /// m6A prediction pub mod predict_m6a; +/// Collect QC metrics +pub mod qc; /// Remove base modifications from a bam record pub mod strip_basemods; diff --git a/src/subcommands/qc.rs b/src/subcommands/qc.rs new file mode 100644 index 00000000..7d79bedc --- /dev/null +++ b/src/subcommands/qc.rs @@ -0,0 +1,108 @@ +use crate::cli::QcOpts; +use crate::fiber; +use anyhow::Result; +use ordered_float::OrderedFloat; +use std::collections::HashMap; + +pub struct QcStats { + pub fiber_count: i64, + // hashmap that stores lengths of fibers + pub fiber_lengths: HashMap, + // length of msps + pub msp_lengths: HashMap, + // lengths of nucleosomes + pub nuc_lengths: HashMap, + // number of ccs passes per read + pub ccs_passes: HashMap, i64>, + /// m6as per read + pub m6a_count: HashMap, + // m6as over total AT count + pub m6a_ratio: HashMap, i64>, + // add rq to stats + pub rq: HashMap, i64>, +} + +impl QcStats { + pub fn new() -> Self { + Self { + fiber_count: 0, + fiber_lengths: HashMap::new(), + msp_lengths: HashMap::new(), + nuc_lengths: HashMap::new(), + ccs_passes: HashMap::new(), + m6a_count: HashMap::new(), + m6a_ratio: HashMap::new(), + rq: HashMap::new(), + } + } + + pub fn add_read_to_stats(&mut self, fiber: &fiber::FiberseqData) { + self.full_read_stats(fiber); + self.add_m6a_stats(fiber); + Self::add_range_lengths(&mut self.msp_lengths, &fiber.msp); + Self::add_range_lengths(&mut self.nuc_lengths, &fiber.nuc); + } + + fn full_read_stats(&mut self, fiber: &fiber::FiberseqData) { + self.fiber_count += 1; + self.fiber_lengths + .entry(fiber.record.seq_len() as i64) + .and_modify(|e| *e += 1) + .or_insert(1); + self.ccs_passes + .entry(OrderedFloat(fiber.ec)) + .and_modify(|e| *e += 1) + .or_insert(1); + if let Some(rq) = fiber.get_rq() { + self.rq + .entry(OrderedFloat(rq)) + .and_modify(|e| *e += 1) + .or_insert(1); + } + } + + fn add_m6a_stats(&mut self, fiber: &fiber::FiberseqData) { + let m6a_count = fiber.m6a.starts.len() as i64; + self.m6a_count + .entry(m6a_count) + .and_modify(|e| *e += 1) + .or_insert(1); + + // now get the ratio + let total_at = fiber + .record + .seq() + .as_bytes() + .iter() + .filter(|&b| *b == b'A' || *b == b'T') + .count(); + + let ratio = m6a_count as f32 / total_at as f32; + self.m6a_ratio + .entry(OrderedFloat(ratio)) + .and_modify(|e| *e += 1) + .or_insert(1); + } + + fn add_range_lengths(hashmap: &mut HashMap, range: &crate::utils::bamranges::Ranges) { + for r in range.lengths.iter().flatten() { + hashmap.entry(*r).and_modify(|e| *e += 1).or_insert(1); + } + } +} + +impl Default for QcStats { + fn default() -> Self { + Self::new() + } +} + +pub fn run_qc(opts: &mut QcOpts) -> Result<(), anyhow::Error> { + let mut bam = opts.input.bam_reader(); + let mut stats = QcStats::default(); + + for fiber in opts.input.fibers(&mut bam) { + stats.add_read_to_stats(&fiber); + } + Ok(()) +} diff --git a/tests/basemods.rs b/tests/basemods.rs index 7b84d3ca..37e34959 100644 --- a/tests/basemods.rs +++ b/tests/basemods.rs @@ -1,6 +1,5 @@ use env_logger::{Builder, Target}; use fibertools_rs::utils::basemods::*; -use log; use rust_htslib::{bam, bam::Read}; #[test] @@ -11,7 +10,7 @@ fn test_mods_do_not_change() { .target(Target::Stderr) .filter(None, log::LevelFilter::Debug) .init(); - let mut bam = bam::Reader::from_path(&"tests/data/all.bam").unwrap(); + let mut bam = bam::Reader::from_path("tests/data/all.bam").unwrap(); for rec in bam.records() { let mut rec = rec.unwrap(); let mods = BaseMods::new(&rec, 0); diff --git a/tests/bio_io.rs b/tests/bio_io.rs index 01a80d19..ca54ae57 100644 --- a/tests/bio_io.rs +++ b/tests/bio_io.rs @@ -1,10 +1,9 @@ use fibertools_rs::utils::bio_io; -use log; use rust_htslib::{bam, bam::Read}; #[test] pub fn test_msp_nuc_tags() { - let mut bam = bam::Reader::from_path(&"tests/data/all.bam").unwrap(); + let mut bam = bam::Reader::from_path("tests/data/all.bam").unwrap(); for record in bam.records() { let record = record.unwrap(); let _n_s = bio_io::get_u32_tag(&record, b"ns"); diff --git a/tests/extract_test.rs b/tests/extract_test.rs index 75536e77..9b02b946 100644 --- a/tests/extract_test.rs +++ b/tests/extract_test.rs @@ -7,9 +7,8 @@ fn get_fiber_data_from_test_bam(bam_file: &str) -> Vec>() } @@ -44,8 +43,7 @@ fn test_many_msps() { .msp .starts .iter() - .flatten() - .map(|&x| x) + .flatten().copied() .chain(fiber_data.msp.ends.iter().flatten().map(|&x| x - 1)) .collect::>(); eprintln!("m6a: {:?}", m6a); diff --git a/tests/m6a_prediction_test.rs b/tests/m6a_prediction_test.rs index 3ceaffd7..25d2f826 100644 --- a/tests/m6a_prediction_test.rs +++ b/tests/m6a_prediction_test.rs @@ -1,4 +1,3 @@ -use fibertools_rs; use fibertools_rs::utils::bio_io; use fibertools_rs::utils::input_bam::FiberFilters; use rust_htslib::bam::Reader; @@ -25,7 +24,7 @@ fn run_prediction_and_count_qual(inbam: String) -> usize { let named_tmp_bam_out = NamedTempFile::new().unwrap(); let out_str = named_tmp_bam_out.path().to_str().unwrap(); let mut predict_options = fibertools_rs::cli::PredictM6AOptions::default(); - predict_options.input.bam = inbam.clone(); + predict_options.input.bam.clone_from(&inbam); predict_options.input.global.threads = 1; predict_options.out = out_str.to_string();