From 43863923285d2e6bc8a0339572f0f3e6642ae745 Mon Sep 17 00:00:00 2001 From: Ben Woodcroft Date: Thu, 23 May 2024 16:22:44 +1000 Subject: [PATCH 1/6] actions: Attempt to fix env file path. --- .github/workflows/test-coverm.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-coverm.yml b/.github/workflows/test-coverm.yml index be7fab4..5c4a7f3 100644 --- a/.github/workflows/test-coverm.yml +++ b/.github/workflows/test-coverm.yml @@ -15,7 +15,7 @@ jobs: with: auto-update-conda: true python-version: ${{ matrix.python-version }} - environment-file: coverm.yml + environment-file: ../coverm.yml channels: conda-forge,defaults,bioconda - name: Conda info shell: bash -el {0} @@ -39,7 +39,7 @@ jobs: with: auto-update-conda: true python-version: ${{ matrix.python-version }} - environment-file: coverm-osx.yml + environment-file: ../coverm-osx.yml channels: conda-forge,defaults,bioconda - name: Conda info shell: bash -el {0} From aeadc5c8b3cf3ba8ff8b14266d069d36edc7b92b Mon Sep 17 00:00:00 2001 From: Ben Woodcroft Date: Thu, 23 May 2024 16:26:58 +1000 Subject: [PATCH 2/6] actions --- .github/workflows/test-coverm.yml | 48 +++++++++++++++---------------- coverm.yml => environment.yml | 0 2 files changed, 24 insertions(+), 24 deletions(-) rename coverm.yml => environment.yml (100%) diff --git a/.github/workflows/test-coverm.yml b/.github/workflows/test-coverm.yml index 5c4a7f3..ef33480 100644 --- a/.github/workflows/test-coverm.yml +++ b/.github/workflows/test-coverm.yml @@ -26,27 +26,27 @@ jobs: - name: Run test run: | cargo test - miniconda_osx: - name: miniconda_osx (${{ matrix.python-version }}, ${{ matrix.os }}) - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: ["macos-latest"] - python-version: ["3.11"] - steps: - - uses: conda-incubator/setup-miniconda@v3 - with: - auto-update-conda: true - python-version: ${{ matrix.python-version }} - environment-file: ../coverm-osx.yml - channels: conda-forge,defaults,bioconda - - name: Conda info - shell: bash -el {0} - run: conda info - - name: Conda list - shell: pwsh - run: conda list - - name: Run test - run: | - cargo test -- --skip bwa_mem2 + # miniconda_osx: + # name: miniconda_osx (${{ matrix.python-version }}, ${{ matrix.os }}) + # runs-on: ${{ matrix.os }} + # strategy: + # fail-fast: false + # matrix: + # os: ["macos-latest"] + # python-version: ["3.11"] + # steps: + # - uses: conda-incubator/setup-miniconda@v3 + # with: + # auto-update-conda: true + # python-version: ${{ matrix.python-version }} + # environment-file: ../coverm-osx.yml + # channels: conda-forge,defaults,bioconda + # - name: Conda info + # shell: bash -el {0} + # run: conda info + # - name: Conda list + # shell: pwsh + # run: conda list + # - name: Run test + # run: | + # cargo test -- --skip bwa_mem2 diff --git a/coverm.yml b/environment.yml similarity index 100% rename from coverm.yml rename to environment.yml From b2829fb32c01248ce4673e9cc7e027a785fba9d3 Mon Sep 17 00:00:00 2001 From: Ben Woodcroft Date: Thu, 23 May 2024 16:27:43 +1000 Subject: [PATCH 3/6] actions --- .github/workflows/test-coverm.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-coverm.yml b/.github/workflows/test-coverm.yml index ef33480..ed9324e 100644 --- a/.github/workflows/test-coverm.yml +++ b/.github/workflows/test-coverm.yml @@ -15,7 +15,7 @@ jobs: with: auto-update-conda: true python-version: ${{ matrix.python-version }} - environment-file: ../coverm.yml + environment-file: environment.yml channels: conda-forge,defaults,bioconda - name: Conda info shell: bash -el {0} From 4b2093c73307faf8d3bedc8a8045aef25e485230 Mon Sep 17 00:00:00 2001 From: Ben Woodcroft Date: Thu, 23 May 2024 16:29:55 +1000 Subject: [PATCH 4/6] actions --- .github/workflows/test-coverm.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test-coverm.yml b/.github/workflows/test-coverm.yml index ed9324e..2d22e01 100644 --- a/.github/workflows/test-coverm.yml +++ b/.github/workflows/test-coverm.yml @@ -9,12 +9,12 @@ jobs: fail-fast: false matrix: os: ["ubuntu-latest"] - python-version: ["3.11"] + # python-version: ["3.11"] steps: - uses: conda-incubator/setup-miniconda@v3 with: - auto-update-conda: true - python-version: ${{ matrix.python-version }} + # auto-update-conda: true + # python-version: ${{ matrix.python-version }} environment-file: environment.yml channels: conda-forge,defaults,bioconda - name: Conda info From c1e225f5121d7b81fd4fce732b8177b552d92bb3 Mon Sep 17 00:00:00 2001 From: Ben Woodcroft Date: Tue, 17 Sep 2024 08:43:08 +1000 Subject: [PATCH 5/6] filter: Towards MAPQ filtering. --- src/filter.rs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/filter.rs b/src/filter.rs index de3607d..08d3b02 100644 --- a/src/filter.rs +++ b/src/filter.rs @@ -19,10 +19,12 @@ pub struct ReferenceSortedBamFilter { min_aligned_length_single: u32, min_percent_identity_single: f32, min_aligned_percent_single: f32, + min_mapq_single: u8, filter_pairs: bool, min_aligned_length_pair: u32, min_percent_identity_pair: f32, min_aligned_percent_pair: f32, + min_mapq_pair: u8, pub num_detected_primary_alignments: u64, flag_filters: FlagFilter, filter_out: bool, // true if we are filtering out reads @@ -92,6 +94,7 @@ impl ReferenceSortedBamFilter { self.min_aligned_length_single, self.min_percent_identity_single, self.min_aligned_percent_single, + self.min_mapq_single, ); if passes_filter2 == self.filter_out { return res; @@ -179,11 +182,13 @@ impl ReferenceSortedBamFilter { self.min_aligned_length_single, self.min_percent_identity_single, self.min_aligned_percent_single, + self.min_mapq_single, ) && single_read_passes_filter( record, self.min_aligned_length_single, self.min_percent_identity_single, self.min_aligned_percent_single, + self.min_mapq_single, ))) && read_pair_passes_filter( record, @@ -191,6 +196,7 @@ impl ReferenceSortedBamFilter { self.min_aligned_length_pair, self.min_percent_identity_pair, self.min_aligned_percent_pair, + self.min_mapq_pair, ); if passes_filter == self.filter_out { debug!("Read pair passed QC"); @@ -228,7 +234,12 @@ fn single_read_passes_filter( min_aligned_length_single: u32, min_percent_identity_single: f32, min_aligned_percent_single: f32, + min_mapq_single: u8, ) -> bool { + if record.mapq() < min_mapq_single { + return false; + } + let edit_distance1 = nm(record); let mut aligned: u32 = 0; @@ -260,7 +271,12 @@ fn read_pair_passes_filter( min_aligned_length_pair: u32, min_percent_identity_pair: f32, min_aligned_percent_pair: f32, + min_mapq_pair: u8, ) -> bool { + if record1.mapq() < min_mapq_pair || record2.mapq() < min_mapq_pair { + return false; + } + let edit_distance1 = nm(record1); let edit_distance2 = nm(record2); From 9cf7a008dc8ac1bdbe341f575bf16c3e524de1cb Mon Sep 17 00:00:00 2001 From: Ben Woodcroft Date: Tue, 17 Sep 2024 15:33:01 +1000 Subject: [PATCH 6/6] mapq filtering: Implement it. Fixes #219. Suggested by: @SDmetagenomics. --- Cargo.toml | 1 + src/bam_generator.rs | 6 ++ src/bin/coverm.rs | 7 ++ src/cli.rs | 25 ++++++ src/contig.rs | 1 - src/coverage_printer.rs | 1 - src/filter.rs | 147 ++++++++++++++++++++++++++++--- src/genome.rs | 1 - src/mapping_index_maintenance.rs | 2 +- tests/data/mapq_test.sam | 14 +++ tests/test_cmdline.rs | 121 +++++++++++++++++++++++++ 11 files changed, 312 insertions(+), 14 deletions(-) create mode 100644 tests/data/mapq_test.sam diff --git a/Cargo.toml b/Cargo.toml index 7c1f26a..42fffde 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,6 +12,7 @@ exclude = [ # Max upload is 10MB, as of writing test data was 15MB "tests/*", ] default-run = "coverm" +edition = "2015" [dependencies] bio = "1.1.0" diff --git a/src/bam_generator.rs b/src/bam_generator.rs index 1d04bc4..5ded5a0 100644 --- a/src/bam_generator.rs +++ b/src/bam_generator.rs @@ -486,6 +486,7 @@ pub fn generate_filtered_bam_readers_from_bam_files( min_aligned_length_single: u32, min_percent_identity_single: f32, min_aligned_percent_single: f32, + min_mapq: u8, min_aligned_length_pair: u32, min_percent_identity_pair: f32, min_aligned_percent_pair: f32, @@ -511,6 +512,7 @@ pub fn generate_filtered_bam_readers_from_bam_files( min_aligned_length_single, min_percent_identity_single, min_aligned_percent_single, + min_mapq, min_aligned_length_pair, min_percent_identity_pair, min_aligned_percent_pair, @@ -544,6 +546,7 @@ pub struct StreamingFilteredNamedBamReaderGenerator { min_aligned_length_single: u32, min_percent_identity_single: f32, min_aligned_percent_single: f32, + min_mapq: u8, min_aligned_length_pair: u32, min_percent_identity_pair: f32, min_aligned_percent_pair: f32, @@ -584,6 +587,7 @@ impl NamedBamReaderGenerator self.min_aligned_length_single, self.min_percent_identity_single, self.min_aligned_percent_single, + self.min_mapq, self.min_aligned_length_pair, self.min_percent_identity_pair, self.min_aligned_percent_pair, @@ -651,6 +655,7 @@ pub fn generate_filtered_named_bam_readers_from_reads( min_aligned_length_single: u32, min_percent_identity_single: f32, min_aligned_percent_single: f32, + min_mapq: u8, min_aligned_length_pair: u32, min_percent_identity_pair: f32, min_aligned_percent_pair: f32, @@ -682,6 +687,7 @@ pub fn generate_filtered_named_bam_readers_from_reads( min_aligned_length_single, min_percent_identity_single, min_aligned_percent_single, + min_mapq, min_aligned_length_pair, min_percent_identity_pair, min_aligned_percent_pair, diff --git a/src/bin/coverm.rs b/src/bin/coverm.rs index c84b1a3..7761d49 100644 --- a/src/bin/coverm.rs +++ b/src/bin/coverm.rs @@ -165,6 +165,7 @@ fn main() { filter_params.min_aligned_length_single, filter_params.min_percent_identity_single, filter_params.min_aligned_percent_single, + filter_params.min_mapq, filter_params.min_aligned_length_pair, filter_params.min_percent_identity_pair, filter_params.min_aligned_percent_pair, @@ -463,6 +464,7 @@ fn main() { filter_params.min_aligned_length_single, filter_params.min_percent_identity_single, filter_params.min_aligned_percent_single, + filter_params.min_mapq, filter_params.min_aligned_length_pair, filter_params.min_percent_identity_pair, filter_params.min_aligned_percent_pair, @@ -519,6 +521,7 @@ fn main() { filter_params.min_aligned_length_single, filter_params.min_percent_identity_single, filter_params.min_aligned_percent_single, + filter_params.min_mapq, filter_params.min_aligned_length_pair, filter_params.min_percent_identity_pair, filter_params.min_aligned_percent_pair, @@ -1209,6 +1212,7 @@ struct FilterParameters { min_aligned_length_single: u32, min_percent_identity_single: f32, min_aligned_percent_single: f32, + min_mapq: u8, // 255 indicates no filtering min_aligned_length_pair: u32, min_percent_identity_pair: f32, min_aligned_percent_pair: f32, @@ -1224,6 +1228,7 @@ impl FilterParameters { min_aligned_length_single: *m.get_one::("min-read-aligned-length").unwrap_or(&0), min_percent_identity_single: parse_percentage(m, "min-read-percent-identity"), min_aligned_percent_single: parse_percentage(m, "min-read-aligned-percent"), + min_mapq: *m.get_one::("min-mapq").unwrap_or(&255), min_aligned_length_pair: *m .get_one::("min-read-aligned-length-pair") .unwrap_or(&0), @@ -1253,6 +1258,7 @@ impl FilterParameters { self.min_percent_identity_single > 0.0 || self.min_percent_identity_pair > 0.0 || self.min_aligned_percent_single > 0.0 + || self.min_mapq < 255 || self.min_aligned_percent_pair > 0.0 || self.min_aligned_length_single > 0 || self.min_aligned_length_pair > 0 @@ -1578,6 +1584,7 @@ fn get_streamed_filtered_bam_readers( filter_params.min_aligned_length_single, filter_params.min_percent_identity_single, filter_params.min_aligned_percent_single, + filter_params.min_mapq, filter_params.min_aligned_length_pair, filter_params.min_percent_identity_pair, filter_params.min_aligned_percent_pair, diff --git a/src/cli.rs b/src/cli.rs index ddd547a..7903e8e 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -175,6 +175,16 @@ fn add_thresholding_options(manual: Manual) -> Manual { default_roff("0") )), ) + .option(Opt::new("INT").long("--min-mapq").help(&format!( + "Exclude reads with a mapping quality \ + below this value. An INT between 0 and 254. \ + When thresholding pairs, both reads \ + are excluded if either has a MAPQ below this value. \ + If 0, \ + reads with MAPQ of 255 (mapping quality unavailable) \ + are excluded. {}", + default_roff("not set") + ))) .flag( Flag::new() .long("--proper-pairs-only") @@ -1322,6 +1332,11 @@ Ben J. Woodcroft .long("min-read-aligned-percent") .value_parser(clap::value_parser!(f32)), ) + .arg( + Arg::new("min-mapq") + .long("min-mapq") + .value_parser(clap::value_parser!(u8)), + ) .arg( Arg::new("min-read-aligned-length-pair") .long("min-read-aligned-length-pair") @@ -1707,6 +1722,11 @@ Ben J. Woodcroft .long("min-read-aligned-percent") .value_parser(clap::value_parser!(f32)), ) + .arg( + Arg::new("min-mapq") + .long("min-mapq") + .value_parser(clap::value_parser!(u8)), + ) .arg( Arg::new("min-read-aligned-length-pair") .long("min-read-aligned-length-pair") @@ -1845,6 +1865,11 @@ Ben J. Woodcroft .long("min-read-percent-identity") .value_parser(clap::value_parser!(f32)), ) + .arg( + Arg::new("min-mapq") + .long("min-mapq") + .value_parser(clap::value_parser!(u8)), + ) .arg( Arg::new("min-read-aligned-percent") .long("min-read-aligned-percent") diff --git a/src/contig.rs b/src/contig.rs index 2031366..470f7cb 100644 --- a/src/contig.rs +++ b/src/contig.rs @@ -273,7 +273,6 @@ mod tests { use mapping_index_maintenance::VanillaIndexStruct; use mapping_parameters::*; use shard_bam_reader::*; - use std::io::Read; use std::str; use OutputWriter; diff --git a/src/coverage_printer.rs b/src/coverage_printer.rs index 93c7ab6..b15f0c8 100644 --- a/src/coverage_printer.rs +++ b/src/coverage_printer.rs @@ -544,7 +544,6 @@ pub fn print_dense_cached_coverage_taker( mod tests { use super::*; use std::io::Cursor; - use std::io::Read; use std::str; use OutputWriter; diff --git a/src/filter.rs b/src/filter.rs index 08d3b02..71f767f 100644 --- a/src/filter.rs +++ b/src/filter.rs @@ -10,6 +10,8 @@ use rust_htslib::bam::record::Cigar; use rust_htslib::bam::Read; use rust_htslib::errors::Result as HtslibResult; +const MAPQ_UNAVAILABLE: u8 = 255; + pub struct ReferenceSortedBamFilter { first_set: BTreeMap, Rc>, current_reference: i32, @@ -19,12 +21,11 @@ pub struct ReferenceSortedBamFilter { min_aligned_length_single: u32, min_percent_identity_single: f32, min_aligned_percent_single: f32, - min_mapq_single: u8, + min_mapq_single: u8, // 255 indicates no filtering filter_pairs: bool, min_aligned_length_pair: u32, min_percent_identity_pair: f32, min_aligned_percent_pair: f32, - min_mapq_pair: u8, pub num_detected_primary_alignments: u64, flag_filters: FlagFilter, filter_out: bool, // true if we are filtering out reads @@ -38,17 +39,30 @@ impl ReferenceSortedBamFilter { min_aligned_length_single: u32, min_percent_identity_single: f32, min_aligned_percent_single: f32, + min_mapq_single: u8, min_aligned_length_pair: u32, min_percent_identity_pair: f32, min_aligned_percent_pair: f32, filter_out: bool, ) -> ReferenceSortedBamFilter { - let filtering_single = min_aligned_length_single > 0 + let filtering_single_initial = min_aligned_length_single > 0 || min_percent_identity_single > 0.0 || min_aligned_percent_single > 0.0; - let filtering_pairs = min_aligned_length_pair > 0 + let filtering_pairs_initial = min_aligned_length_pair > 0 || min_percent_identity_pair > 0.0 || min_aligned_percent_pair > 0.0; + // MAPQ filtering is only applied if the MAPQ is not 255. + // MAPQ filtering doesn't require both single and pair filtering to be enabled, but at least one of them + let filtering_single = filtering_single_initial + || (!filtering_pairs_initial && min_mapq_single != MAPQ_UNAVAILABLE); + // When MAPQ filtering is enabled, filter pairs if we are filtering out proper pairs + let filtering_pairs = filtering_pairs_initial + || ((!filtering_single || !flag_filters.include_improper_pairs) + && min_mapq_single != MAPQ_UNAVAILABLE); + debug!( + "filtering_single: {}, filtering_pairs: {}", + filtering_single, filtering_pairs + ); ReferenceSortedBamFilter { first_set: BTreeMap::new(), @@ -59,6 +73,7 @@ impl ReferenceSortedBamFilter { min_aligned_length_single, min_percent_identity_single, min_aligned_percent_single, + min_mapq_single, filter_pairs: filtering_pairs, min_aligned_length_pair, min_percent_identity_pair, @@ -196,7 +211,7 @@ impl ReferenceSortedBamFilter { self.min_aligned_length_pair, self.min_percent_identity_pair, self.min_aligned_percent_pair, - self.min_mapq_pair, + self.min_mapq_single, ); if passes_filter == self.filter_out { debug!("Read pair passed QC"); @@ -236,7 +251,9 @@ fn single_read_passes_filter( min_aligned_percent_single: f32, min_mapq_single: u8, ) -> bool { - if record.mapq() < min_mapq_single { + if min_mapq_single != MAPQ_UNAVAILABLE + && (record.mapq() < min_mapq_single || record.mapq() == MAPQ_UNAVAILABLE) + { return false; } @@ -271,9 +288,14 @@ fn read_pair_passes_filter( min_aligned_length_pair: u32, min_percent_identity_pair: f32, min_aligned_percent_pair: f32, - min_mapq_pair: u8, + min_mapq_single: u8, ) -> bool { - if record1.mapq() < min_mapq_pair || record2.mapq() < min_mapq_pair { + if min_mapq_single != MAPQ_UNAVAILABLE + && (record1.mapq() < min_mapq_single + || record2.mapq() < min_mapq_single + || record1.mapq() == MAPQ_UNAVAILABLE + || record2.mapq() == MAPQ_UNAVAILABLE) + { return false; } @@ -335,6 +357,7 @@ mod tests { 0, 0.0, 0.0, + 0, 90, 0.99, 0.0, @@ -367,6 +390,7 @@ mod tests { 0, 0.0, 0.0, + 0, 90, 0.99, 0.0, @@ -395,6 +419,7 @@ mod tests { 0, 0.0, 0.0, + 0, 250, 0.99, 0.0, @@ -419,6 +444,7 @@ mod tests { 0, 0.0, 0.0, + 0, 300, 0.98, 0.0, @@ -443,6 +469,7 @@ mod tests { 0.0, 0.0, 0, + 0, 0.98, 0.94, true, @@ -465,6 +492,7 @@ mod tests { 0, 0.0, 0.0, + 0, 299, 0.98, 0.0, @@ -491,6 +519,7 @@ mod tests { 0, 0.0, 0.0, + 0, 250, 0.99, 0.0, @@ -515,6 +544,7 @@ mod tests { 0, 0.0, 0.0, + 0, 300, 0.98, 0.0, @@ -539,6 +569,7 @@ mod tests { 0.0, 0.0, 0, + 0, 0.98, 0.94, false, @@ -561,6 +592,7 @@ mod tests { 0, 0.0, 0.0, + 0, 299, 0.98, 0.0, @@ -580,7 +612,7 @@ mod tests { let mut sorted = ReferenceSortedBamFilter::new( reader, FlagFilter { - include_improper_pairs: false, + include_improper_pairs: true, include_secondary: false, include_supplementary: false, }, @@ -588,6 +620,7 @@ mod tests { 0.99, 0.0, 0, + 0, 0.0, 0.0, true, @@ -609,7 +642,7 @@ mod tests { let mut sorted = ReferenceSortedBamFilter::new( reader, FlagFilter { - include_improper_pairs: false, + include_improper_pairs: true, include_secondary: false, include_supplementary: false, }, @@ -617,6 +650,7 @@ mod tests { 0.99, 0.0, 0, + 0, 0.0, 0.0, false, @@ -645,6 +679,7 @@ mod tests { 0, 0.95, 0.0, + 0, 300, 0.0, 0.0, @@ -674,6 +709,7 @@ mod tests { 0, 0.95, 0.0, + 0, 300, 0.0, 0.0, @@ -705,6 +741,7 @@ mod tests { 0, 0.0, 0.0, + 0, 1, 0.0, 0.0, @@ -719,4 +756,94 @@ mod tests { } assert_eq!(11192, num_passing); } + + #[test] + fn test_mapq_filtering_single_reads_no_bads() { + let reader = bam::Reader::from_path("tests/data/mapq_test.sam").unwrap(); + let mut sorted = ReferenceSortedBamFilter::new( + reader, + FlagFilter { + include_improper_pairs: true, + include_secondary: false, + include_supplementary: false, + }, + // 1 base required from reads mapped in proper pair, all pass. + 0, + 0.0, + 0.0, + 1, + 0, + 0.0, + 0.0, + true, + ); + assert!(sorted.filter_single_reads); + assert!(!sorted.filter_pairs); + let queries = vec!["1", "1", "2", "2"]; + let mut record = bam::record::Record::new(); + for i in queries { + sorted.read(&mut record).expect("").expect(""); + assert_eq!(i, str::from_utf8(record.qname()).unwrap()); + } + } + + #[test] + fn test_mapq_filtering_single_reads_single_bad() { + let reader = bam::Reader::from_path("tests/data/mapq_test.sam").unwrap(); + let mut sorted = ReferenceSortedBamFilter::new( + reader, + FlagFilter { + include_improper_pairs: true, + include_secondary: false, + include_supplementary: false, + }, + // 1 base required from reads mapped in proper pair, all pass. + 0, + 0.0, + 0.0, + 51, + 0, + 0.0, + 0.0, + true, + ); + assert!(sorted.filter_single_reads); + assert!(!sorted.filter_pairs); + let queries = vec!["1", "2", "2"]; + let mut record = bam::record::Record::new(); + for i in queries { + sorted.read(&mut record).expect("").expect(""); + assert_eq!(i, str::from_utf8(record.qname()).unwrap()); + } + } + + #[test] + fn test_mapq_filtering_pairs_one_bad() { + let reader = bam::Reader::from_path("tests/data/mapq_test.sam").unwrap(); + let mut sorted = ReferenceSortedBamFilter::new( + reader, + FlagFilter { + include_improper_pairs: true, + include_secondary: false, + include_supplementary: false, + }, + // 1 base required from reads mapped in proper pair, all pass. + 0, + 0.0, + 0.0, + 51, + 1, + 0.0, + 0.0, + true, + ); + assert!(!sorted.filter_single_reads); + assert!(sorted.filter_pairs); + let queries = vec!["2", "2"]; // Only the first of the first pair passes, so both removed + let mut record = bam::record::Record::new(); + for i in queries { + sorted.read(&mut record).expect("").expect(""); + assert_eq!(i, str::from_utf8(record.qname()).unwrap()); + } + } } diff --git a/src/genome.rs b/src/genome.rs index b2d5291..dcd20e6 100644 --- a/src/genome.rs +++ b/src/genome.rs @@ -923,7 +923,6 @@ mod tests { use rust_htslib::bam::Read; use shard_bam_reader::*; use std::collections::HashSet; - use std::io::Read as _; use OutputWriter; fn test_streaming_with_stream>( diff --git a/src/mapping_index_maintenance.rs b/src/mapping_index_maintenance.rs index f3b427e..3bf96ca 100644 --- a/src/mapping_index_maintenance.rs +++ b/src/mapping_index_maintenance.rs @@ -102,7 +102,7 @@ impl TemporaryIndexStruct { | MappingProgram::STROBEALIGN => {} }; if let Some(t) = num_threads { - cmd.arg("-t").arg(&format!("{}", t)); + cmd.arg("-t").arg(format!("{}", t)); } cmd.arg("-d").arg(&index_path).arg(reference_path); } diff --git a/tests/data/mapq_test.sam b/tests/data/mapq_test.sam new file mode 100644 index 0000000..c0d188e --- /dev/null +++ b/tests/data/mapq_test.sam @@ -0,0 +1,14 @@ +@HD VN:1.6 SO:queryname +@SQ SN:genome1~random_sequence_length_11000 LN:11000 +@SQ SN:genome1~random_sequence_length_11010 LN:11010 +@SQ SN:genome2~seq1 LN:1000 +@SQ SN:genome3~random_sequence_length_11001 LN:11001 +@SQ SN:genome4~random_sequence_length_11002 LN:11002 +@SQ SN:genome5~seq2 LN:1000 +@SQ SN:genome6~random_sequence_length_11003 LN:11003 +@PG ID:bwa PN:bwa VN:0.7.17-r1188 CL:bwa mem 7seqs.fna bad_read.1.fq bad_read.2.fq +@PG ID:samtools PN:samtools PP:bwa VN:1.19.2 CL:samtools view -h 7seqs.fnaVbad_read.bam +1 99 genome2~seq1 1 50 4M1D145M = 351 500 GAGTCCTAAAAGCATGACCTGCATCTCCGAGAAGGGTCCCAGGGGCGGTGCTAGATCTCAGCAAGGAGGCAATGTGGTTGGGTTTTATAGTCACCCTTTCGAATACTCGAAGAAGGCGCCTGCCCGTTCTAGAGGGAGACATAAGCCTC AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA NM:i:3 MD:Z:4^C4G5G134 MC:Z:150M AS:i:135 XS:i:0 +1 147 genome2~seq1 351 51 150M = 1 -500 ATAAACTATCAGGCCGGCGGAGTATGGTGGAGTAGAGGCGGGCATTAAGTTTATTGCTCTAAGAAGAAAGGCAACTACATGGAGAGTAGAGCCCCAGAATGGCCAGTGAAATCGTTAAATTGGAATTGTAAATCAGCAGTTGTACTTTGC AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA NM:i:0 MD:Z:150 MC:Z:4M1D145M AS:i:150 XS:i:0 +2 99 genome2~seq1 101 58 150M = 451 500 CGAATACTCGAAGAAGGCGCCTGCCCGTTCTAGAGGGAGACATAAGCCTCTGCATCTGCAACGCGGATGGCTAATCAATAATGACCTGCGGTGGATCAGATACCGCATCGCAAGGACCACGTAGGTTGGCTGTCCAACGCGCAAGCGCGC AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA NM:i:0 MD:Z:150 MC:Z:150M AS:i:150 XS:i:0 +2 147 genome2~seq1 451 60 150M = 101 -500 GGCCAGTGAAATCGTTAAATTGGAATTGTAAATCAGCAGTTGTACTTTGCCGCTGGTATGACATGACTCCCAGGATGAAATATAACCGCCAAGTATGCGCAAATAGAGTTTTAACCCTGCATTAGACTGTTATCACGGTTGAGCCTTACC AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA NM:i:0 MD:Z:150 MC:Z:150M AS:i:150 XS:i:0 \ No newline at end of file diff --git a/tests/test_cmdline.rs b/tests/test_cmdline.rs index 9c8bf3a..0e5416d 100644 --- a/tests/test_cmdline.rs +++ b/tests/test_cmdline.rs @@ -2900,6 +2900,127 @@ genome6~random_sequence_length_11003 0 0 0 ) .unwrap(); } + + #[test] + fn test_cmdline_mapq_filtering_all_out() { + Assert::main_binary() + .with_args(&[ + "genome", + "-m", + "mean", + "covered_fraction", + "-b", + "tests/data/mapq_test.sam", + "--single-genome", + "--min-covered-fraction", + "0", + // "--min-mapq", + // "10", + ]) + .succeeds() + .stdout() + .is("Genome\tmapq_test Mean\tmapq_test Covered Fraction\n\ + genome1\t0.009380695\t0.00875193\n") + .unwrap(); + + Assert::main_binary() + .with_args(&[ + "genome", + "-m", + "mean", + "covered_fraction", + "-b", + "tests/data/mapq_test.sam", + "--single-genome", + "--min-covered-fraction", + "0", + "--min-mapq", + "100", + ]) + .succeeds() + .stdout() + .is("Genome\tmapq_test Mean\tmapq_test Covered Fraction\n\ + genome1\t0\t0\n") + .unwrap(); + } + + // test mapq filter 51 with single read filtering + #[test] + fn test_cmdline_mapq_filtering_single_read() { + Assert::main_binary() + .with_args(&[ + "contig", + "-m", + "mean", + "covered_fraction", + "-b", + "tests/data/mapq_test.sam", + // "--min-mapq", + // "51", + ]) + .succeeds() + .stdout() + .is("Contig\tmapq_test Mean\tmapq_test Covered Fraction\n\ + genome1~random_sequence_length_11000\t0\t0\n\ + genome1~random_sequence_length_11010\t0\t0\n\ + genome2~seq1\t0.61764705\t0.499\n\ + genome3~random_sequence_length_11001\t0\t0\n\ + genome4~random_sequence_length_11002\t0\t0\n\ + genome5~seq2\t0\t0\n\ + genome6~random_sequence_length_11003\t0\t0\n") + .unwrap(); + + Assert::main_binary() + .with_args(&[ + "contig", + "-m", + "mean", + "covered_fraction", + "-b", + "tests/data/mapq_test.sam", + "--min-mapq", + "51", + ]) + .succeeds() + .stdout() + .is("Contig\tmapq_test Mean\tmapq_test Covered Fraction\n\ + genome1~random_sequence_length_11000\t0\t0\n\ + genome1~random_sequence_length_11010\t0\t0\n\ + genome2~seq1\t0.5294118\t0.4\n\ + genome3~random_sequence_length_11001\t0\t0\n\ + genome4~random_sequence_length_11002\t0\t0\n\ + genome5~seq2\t0\t0\n\ + genome6~random_sequence_length_11003\t0\t0\n") + .unwrap(); + } + + // test mapq filter 51 with paired read filtering + #[test] + fn test_cmdline_mapq_filtering_single_read_fail_proper_pairs() { + Assert::main_binary() + .with_args(&[ + "contig", + "-m", + "mean", + "covered_fraction", + "-b", + "tests/data/mapq_test.sam", + "--min-mapq", + "51", + "--proper-pairs-only", + ]) + .succeeds() + .stdout() + .is("Contig\tmapq_test Mean\tmapq_test Covered Fraction\n\ + genome1~random_sequence_length_11000\t0\t0\n\ + genome1~random_sequence_length_11010\t0\t0\n\ + genome2~seq1\t0.3529412\t0.3\n\ + genome3~random_sequence_length_11001\t0\t0\n\ + genome4~random_sequence_length_11002\t0\t0\n\ + genome5~seq2\t0\t0\n\ + genome6~random_sequence_length_11003\t0\t0\n") + .unwrap(); + } } // TODO: Add mismatching bases test