From c76eec3e5ed0bce9d9adf1b5d1df317405de06eb Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Sun, 17 Sep 2023 13:41:36 -0700 Subject: [PATCH] cleanup template, replace with selection --- src/core/src/collection.rs | 4 + src/core/src/ffi/index/revindex.rs | 25 +++- src/core/src/index/revindex/disk_revindex.rs | 18 +-- src/core/src/index/revindex/mem_revindex.rs | 94 ++++--------- src/core/src/index/revindex/mod.rs | 136 +++++++------------ src/core/src/selection.rs | 56 ++++---- src/core/src/signature.rs | 2 + src/core/src/sketch/minhash.rs | 2 + 8 files changed, 142 insertions(+), 195 deletions(-) diff --git a/src/core/src/collection.rs b/src/core/src/collection.rs index 4e5a81a6ba..ca340af011 100644 --- a/src/core/src/collection.rs +++ b/src/core/src/collection.rs @@ -60,6 +60,10 @@ impl CollectionSet { pub fn into_inner(self) -> Collection { self.collection } + + pub fn selection(&self) -> Selection { + todo!("Extract selection from first sig") + } } impl Collection { diff --git a/src/core/src/ffi/index/revindex.rs b/src/core/src/ffi/index/revindex.rs index ab24e2ac3e..e38bdef7fb 100644 --- a/src/core/src/ffi/index/revindex.rs +++ b/src/core/src/ffi/index/revindex.rs @@ -8,6 +8,7 @@ use crate::ffi::signature::SourmashSignature; use crate::ffi::utils::{ForeignObject, SourmashStr}; use crate::index::revindex::mem_revindex::RevIndex; use crate::index::Index; +use crate::prelude::*; use crate::signature::{Signature, SigsTrait}; use crate::sketch::minhash::KmerMinHash; use crate::sketch::Sketch; @@ -18,6 +19,21 @@ impl ForeignObject for SourmashRevIndex { type RustObject = RevIndex; } +// TODO: remove this when it is possible to pass Selection thru the FFI +fn from_template(template: &Sketch) -> Selection { + let (num, scaled) = match template { + Sketch::MinHash(mh) => (mh.num(), mh.scaled() as u32), + Sketch::LargeMinHash(mh) => (mh.num(), mh.scaled() as u32), + _ => unimplemented!(), + }; + + Selection::builder() + .ksize(template.ksize() as u32) + .num(num) + .scaled(scaled) + .build() +} + ffi_fn! { unsafe fn revindex_new_with_paths( search_sigs_ptr: *const *const SourmashStr, @@ -58,9 +74,12 @@ unsafe fn revindex_new_with_paths( .collect(); Some(queries_vec.as_ref()) }; + + let selection = from_template(&template); + let revindex = RevIndex::new( search_sigs.as_ref(), - &template, + &selection, threshold, queries, keep_sigs, @@ -105,7 +124,9 @@ unsafe fn revindex_new_with_sigs( .collect(); Some(queries_vec.as_ref()) }; - let revindex = RevIndex::new_with_sigs(search_sigs, &template, threshold, queries)?; + + let selection = from_template(&template); + let revindex = RevIndex::new_with_sigs(search_sigs, &selection, threshold, queries)?; Ok(SourmashRevIndex::from_rust(revindex)) } } diff --git a/src/core/src/index/revindex/disk_revindex.rs b/src/core/src/index/revindex/disk_revindex.rs index 8bf98cde09..d63abf58cd 100644 --- a/src/core/src/index/revindex/disk_revindex.rs +++ b/src/core/src/index/revindex/disk_revindex.rs @@ -17,6 +17,7 @@ use crate::index::revindex::{ }; use crate::index::{GatherResult, SigCounter}; use crate::manifest::Manifest; +use crate::prelude::*; use crate::signature::SigsTrait; use crate::sketch::minhash::KmerMinHash; use crate::sketch::Sketch; @@ -283,11 +284,12 @@ impl RevIndexOps for RevIndex { hash_to_color: HashToColor, threshold: usize, orig_query: &KmerMinHash, - template: &Sketch, + selection: Option, ) -> Result> { let mut match_size = usize::max_value(); let mut matches = vec![]; //let mut query: KmerMinHashBTree = orig_query.clone().into(); + let selection = selection.unwrap_or_else(|| self.collection.selection()); while match_size > threshold && !counter.is_empty() { trace!("counter len: {}", counter.len()); @@ -298,22 +300,20 @@ impl RevIndexOps for RevIndex { let match_sig = self.collection.sig_for_dataset(dataset_id)?; - let match_mh = - prepare_query(&match_sig, template).expect("Couldn't find a compatible MinHash"); - // Calculate stats let f_orig_query = match_size as f64 / orig_query.size() as f64; - let f_match = match_size as f64 / match_mh.size() as f64; let name = match_sig.name(); - let unique_intersect_bp = match_mh.scaled() as usize * match_size; let gather_result_rank = matches.len(); + let match_ = match_sig.clone(); + let md5 = match_sig.md5sum(); + let match_mh = prepare_query(match_sig.into(), &selection) + .expect("Couldn't find a compatible MinHash"); + let f_match = match_size as f64 / match_mh.size() as f64; + let unique_intersect_bp = match_mh.scaled() as usize * match_size; let (intersect_orig, _) = match_mh.intersection_size(orig_query)?; let intersect_bp = (match_mh.scaled() * intersect_orig) as usize; - let f_unique_to_query = intersect_orig as f64 / orig_query.size() as f64; - let match_ = match_sig.clone(); - let md5 = match_sig.md5sum(); // TODO: all of these let filename = "".into(); diff --git a/src/core/src/index/revindex/mem_revindex.rs b/src/core/src/index/revindex/mem_revindex.rs index e3efb7146a..2d37d4c274 100644 --- a/src/core/src/index/revindex/mem_revindex.rs +++ b/src/core/src/index/revindex/mem_revindex.rs @@ -84,7 +84,7 @@ impl LinearIndex { impl RevIndex { pub fn new( search_sigs: &[PathBuf], - template: &Sketch, + selection: &Selection, threshold: usize, queries: Option<&[KmerMinHash]>, _keep_sigs: bool, @@ -92,8 +92,7 @@ impl RevIndex { // If threshold is zero, let's merge all queries and save time later let merged_query = queries.and_then(|qs| Self::merge_queries(qs, threshold)); - let collection = - Collection::from_paths(search_sigs)?.select(&Selection::from_template(template))?; + let collection = Collection::from_paths(search_sigs)?.select(&selection)?; let linear = LinearIndex::from_collection(collection.try_into()?); Ok(linear.index(threshold, merged_query, queries)) @@ -101,7 +100,7 @@ impl RevIndex { pub fn from_zipfile>( zipfile: P, - template: &Sketch, + selection: &Selection, threshold: usize, queries: Option<&[KmerMinHash]>, _keep_sigs: bool, @@ -109,8 +108,7 @@ impl RevIndex { // If threshold is zero, let's merge all queries and save time later let merged_query = queries.and_then(|qs| Self::merge_queries(qs, threshold)); - let collection = - Collection::from_zipfile(zipfile)?.select(&Selection::from_template(template))?; + let collection = Collection::from_zipfile(zipfile)?.select(&selection)?; let linear = LinearIndex::from_collection(collection.try_into()?); Ok(linear.index(threshold, merged_query, queries)) @@ -130,15 +128,14 @@ impl RevIndex { pub fn new_with_sigs( search_sigs: Vec, - template: &Sketch, + selection: &Selection, threshold: usize, queries: Option<&[KmerMinHash]>, ) -> Result { // If threshold is zero, let's merge all queries and save time later let merged_query = queries.and_then(|qs| Self::merge_queries(qs, threshold)); - let collection = - Collection::from_sigs(search_sigs)?.select(&Selection::from_template(template))?; + let collection = Collection::from_sigs(search_sigs)?.select(selection)?; let linear = LinearIndex::from_collection(collection.try_into()?); let idx = linear.index(threshold, merged_query, queries); @@ -338,24 +335,18 @@ impl<'a> Index<'a> for RevIndex { mod test { use super::*; + use crate::index::revindex::prepare_query; use crate::sketch::minhash::max_hash_for_scaled; use crate::Result; #[test] fn revindex_new() -> Result<()> { - let max_hash = max_hash_for_scaled(10000); - let template = Sketch::MinHash( - KmerMinHash::builder() - .num(0u32) - .ksize(31) - .max_hash(max_hash) - .build(), - ); + let selection = Selection::builder().ksize(31).scaled(10000).build(); let search_sigs = [ "../../tests/test-data/gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig".into(), "../../tests/test-data/gather/GCF_000007545.1_ASM754v1_genomic.fna.gz.sig".into(), ]; - let index = RevIndex::new(&search_sigs, &template, 0, None, false)?; + let index = RevIndex::new(&search_sigs, &selection, 0, None, false)?; assert_eq!(index.colors.len(), 3); Ok(()) @@ -363,21 +354,14 @@ mod test { #[test] fn revindex_many() -> Result<()> { - let max_hash = max_hash_for_scaled(10000); - let template = Sketch::MinHash( - KmerMinHash::builder() - .num(0u32) - .ksize(31) - .max_hash(max_hash) - .build(), - ); + let selection = Selection::builder().ksize(31).scaled(10000).build(); let search_sigs = [ "../../tests/test-data/gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig".into(), "../../tests/test-data/gather/GCF_000007545.1_ASM754v1_genomic.fna.gz.sig".into(), "../../tests/test-data/gather/GCF_000008105.1_ASM810v1_genomic.fna.gz.sig".into(), ]; - let index = RevIndex::new(&search_sigs, &template, 0, None, false)?; + let index = RevIndex::new(&search_sigs, &selection, 0, None, false)?; //dbg!(&index.linear.collection().manifest); /* dbg!(&index.colors.colors); @@ -399,14 +383,7 @@ mod test { #[test] fn revindex_from_sigs() -> Result<()> { - let max_hash = max_hash_for_scaled(10000); - let template = Sketch::MinHash( - KmerMinHash::builder() - .num(0u32) - .ksize(31) - .max_hash(max_hash) - .build(), - ); + let selection = Selection::builder().ksize(31).scaled(10000).build(); let search_sigs: Vec = [ "../../tests/test-data/gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig", "../../tests/test-data/gather/GCF_000007545.1_ASM754v1_genomic.fna.gz.sig", @@ -416,7 +393,7 @@ mod test { .map(|path| Signature::from_path(path).unwrap().swap_remove(0)) .collect(); - let index = RevIndex::new_with_sigs(search_sigs, &template, 0, None)?; + let index = RevIndex::new_with_sigs(search_sigs, &selection, 0, None)?; /* dbg!(&index.colors.colors); 0: 86 @@ -436,18 +413,14 @@ mod test { #[test] fn revindex_from_zipstorage() -> Result<()> { - let max_hash = max_hash_for_scaled(100); - let template = Sketch::MinHash( - KmerMinHash::builder() - .num(0u32) - .ksize(19) - .hash_function(crate::encodings::HashFunctions::murmur64_protein) - .max_hash(max_hash) - .build(), - ); + let selection = Selection::builder() + .ksize(19) + .scaled(100) + .moltype(crate::encodings::HashFunctions::murmur64_protein) + .build(); let index = RevIndex::from_zipfile( "../../tests/test-data/prot/protein.zip", - &template, + &selection, 0, None, false, @@ -460,34 +433,27 @@ mod test { "../../tests/test-data/prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig", ) .expect("Error processing query") - .swap_remove(0); - - let template = Sketch::MinHash( - KmerMinHash::builder() - .num(0u32) - .ksize(57) - .hash_function(crate::encodings::HashFunctions::murmur64_protein) - .max_hash(max_hash) - .build(), - ); + .swap_remove(0) + .select(&selection)?; + let mut query_mh = None; - if let Some(Sketch::MinHash(mh)) = query_sig.select_sketch(&template) { - query_mh = Some(mh); + if let Some(q) = prepare_query(query_sig, &selection) { + query_mh = Some(q); } let query_mh = query_mh.expect("Couldn't find a compatible MinHash"); - let counter_rev = index.counter_for_query(query_mh); - let counter_lin = index.linear.counter_for_query(query_mh); + let counter_rev = index.counter_for_query(&query_mh); + let counter_lin = index.linear.counter_for_query(&query_mh); let results_rev = index.search(counter_rev, false, 0).unwrap(); let results_linear = index.linear.search(counter_lin, false, 0).unwrap(); assert_eq!(results_rev, results_linear); - let counter_rev = index.counter_for_query(query_mh); - let counter_lin = index.linear.counter_for_query(query_mh); + let counter_rev = index.counter_for_query(&query_mh); + let counter_lin = index.linear.counter_for_query(&query_mh); - let results_rev = index.gather(counter_rev, 0, query_mh).unwrap(); - let results_linear = index.linear.gather(counter_lin, 0, query_mh).unwrap(); + let results_rev = index.gather(counter_rev, 0, &query_mh).unwrap(); + let results_linear = index.linear.gather(counter_lin, 0, &query_mh).unwrap(); assert_eq!(results_rev.len(), 1); assert_eq!(results_rev, results_linear); diff --git a/src/core/src/index/revindex/mod.rs b/src/core/src/index/revindex/mod.rs index c1762811a9..30f7630c1b 100644 --- a/src/core/src/index/revindex/mod.rs +++ b/src/core/src/index/revindex/mod.rs @@ -15,8 +15,9 @@ use serde::{Deserialize, Serialize}; use crate::collection::CollectionSet; use crate::encodings::{Color, Colors, Idx}; use crate::index::{GatherResult, SigCounter}; -use crate::signature::{Signature, SigsTrait}; -use crate::sketch::minhash::{max_hash_for_scaled, KmerMinHash}; +use crate::prelude::*; +use crate::signature::Signature; +use crate::sketch::minhash::KmerMinHash; use crate::sketch::Sketch; use crate::HashIntoType; use crate::Result; @@ -75,7 +76,7 @@ pub trait RevIndexOps { hash_to_color: HashToColor, threshold: usize, query: &KmerMinHash, - template: &Sketch, + selection: Option, ) -> Result>; } @@ -215,53 +216,16 @@ impl RevIndex { } } -fn check_compatible_downsample(me: &KmerMinHash, other: &KmerMinHash) -> Result<()> { - /* - if self.num != other.num { - return Err(Error::MismatchNum { - n1: self.num, - n2: other.num, - } - .into()); - } - */ - use crate::Error; - - if me.ksize() != other.ksize() { - return Err(Error::MismatchKSizes); - } - if me.hash_function() != other.hash_function() { - // TODO: fix this error - return Err(Error::MismatchDNAProt); - } - if me.max_hash() < other.max_hash() { - return Err(Error::MismatchScaled); - } - if me.seed() != other.seed() { - return Err(Error::MismatchSeed); - } - Ok(()) -} +pub fn prepare_query(search_sig: Signature, selection: &Selection) -> Option { + let sig = search_sig.select(selection).ok(); -pub fn prepare_query(search_sig: &Signature, template: &Sketch) -> Option { - let mut search_mh = None; - if let Some(Sketch::MinHash(mh)) = search_sig.select_sketch(template) { - search_mh = Some(mh.clone()); - } else { - // try to find one that can be downsampled - if let Sketch::MinHash(template_mh) = template { - for sketch in search_sig.sketches() { - if let Sketch::MinHash(ref_mh) = sketch { - if check_compatible_downsample(&ref_mh, template_mh).is_ok() { - let max_hash = max_hash_for_scaled(template_mh.scaled()); - let mh = ref_mh.downsample_max_hash(max_hash).unwrap(); - search_mh = Some(mh); - } - } - } + sig.and_then(|sig| { + if let Sketch::MinHash(mh) = sig.sketches().swap_remove(0) { + Some(mh) + } else { + None } - } - search_mh + }) } #[derive(Debug, Default, PartialEq, Clone)] @@ -481,22 +445,12 @@ mod test { use crate::collection::Collection; use crate::prelude::*; use crate::selection::Selection; - use crate::sketch::minhash::{max_hash_for_scaled, KmerMinHash}; + use crate::sketch::minhash::KmerMinHash; use crate::sketch::Sketch; use crate::Result; use super::{prepare_query, RevIndex, RevIndexOps}; - fn build_template(ksize: u8, scaled: usize) -> Sketch { - let max_hash = max_hash_for_scaled(scaled as u64); - let template_mh = KmerMinHash::builder() - .num(0u32) - .ksize(ksize as u32) - .max_hash(max_hash) - .build(); - Sketch::MinHash(template_mh) - } - #[test] fn revindex_index() -> Result<()> { let mut basedir = PathBuf::from(env!("CARGO_MANIFEST_DIR")); @@ -510,20 +464,19 @@ mod test { }) .collect(); - let template = build_template(31, 10000); + let selection = Selection::builder().ksize(31).scaled(10000).build(); let output = TempDir::new()?; - let query_sig = Signature::from_path(&siglist[0])?; let mut query = None; - for sig in &query_sig { - if let Some(q) = prepare_query(sig, &template) { - query = Some(q); - } + let query_sig = Signature::from_path(&siglist[0])? + .swap_remove(0) + .select(&selection)?; + if let Some(q) = prepare_query(query_sig, &selection) { + query = Some(q); } let query = query.unwrap(); - let collection = - Collection::from_paths(&siglist)?.select(&Selection::from_template(&template))?; + let collection = Collection::from_paths(&siglist)?.select(&selection)?; let index = RevIndex::create(output.path(), collection.try_into()?, false)?; let counter = index.counter_for_query(&query); @@ -547,13 +500,12 @@ mod test { }) .collect(); - let template = build_template(31, 10000); + let selection = Selection::builder().ksize(31).scaled(10000).build(); let output = TempDir::new()?; let mut new_siglist = siglist.clone(); { - let collection = - Collection::from_paths(&siglist)?.select(&Selection::from_template(&template))?; + let collection = Collection::from_paths(&siglist)?.select(&selection)?; RevIndex::create(output.path(), collection.try_into()?, false)?; } @@ -561,17 +513,16 @@ mod test { filename.push("genome-s12.fa.gz.sig"); new_siglist.push(filename); - let query_sig = Signature::from_path(&new_siglist[2])?; let mut query = None; - for sig in &query_sig { - if let Some(q) = prepare_query(sig, &template) { - query = Some(q); - } + let query_sig = Signature::from_path(&new_siglist[2])? + .swap_remove(0) + .select(&selection)?; + if let Some(q) = prepare_query(query_sig, &selection) { + query = Some(q); } let query = query.unwrap(); - let new_collection = - Collection::from_paths(&new_siglist)?.select(&Selection::from_template(&template))?; + let new_collection = Collection::from_paths(&new_siglist)?.select(&selection)?; let index = RevIndex::open(output.path(), false)?.update(new_collection.try_into()?)?; let counter = index.counter_for_query(&query); @@ -596,30 +547,39 @@ mod test { }) .collect(); - let template = build_template(31, 10000); + let selection = Selection::builder().ksize(31).scaled(10000).build(); let output = TempDir::new()?; - let query_sig = Signature::from_path(&siglist[0])?; let mut query = None; - for sig in &query_sig { - if let Some(q) = prepare_query(sig, &template) { - query = Some(q); - } + let query_sig = Signature::from_path(&siglist[0])? + .swap_remove(0) + .select(&selection)?; + if let Some(q) = prepare_query(query_sig, &selection) { + query = Some(q); } let query = query.unwrap(); { - let collection = - Collection::from_paths(&siglist)?.select(&Selection::from_template(&template))?; + let collection = Collection::from_paths(&siglist)?.select(&selection)?; let _index = RevIndex::create(output.path(), collection.try_into()?, false); } let index = RevIndex::open(output.path(), true)?; - let counter = index.counter_for_query(&query); - let matches = index.matches_from_counter(counter, 0); + let (counter, query_colors, hash_to_color) = index.prepare_gather_counters(&query); - assert_eq!(matches, [("../genome-s10.fa.gz".into(), 48)]); + let matches = index.gather( + counter, + query_colors, + hash_to_color, + 0, + &query, + Some(selection), + )?; + + assert_eq!(matches.len(), 1); + assert_eq!(matches[0].name(), "../genome-s10.fa.gz"); + assert_eq!(matches[0].f_match(), 1.0); Ok(()) } diff --git a/src/core/src/selection.rs b/src/core/src/selection.rs index 3e18f8fb31..86d42273e0 100644 --- a/src/core/src/selection.rs +++ b/src/core/src/selection.rs @@ -3,10 +3,32 @@ use typed_builder::TypedBuilder; use crate::encodings::HashFunctions; use crate::manifest::Record; -use crate::signature::SigsTrait; -use crate::sketch::Sketch; use crate::Result; +#[derive(Default, Debug, TypedBuilder)] +pub struct Selection { + #[builder(default, setter(strip_option))] + ksize: Option, + + #[builder(default, setter(strip_option))] + abund: Option, + + #[builder(default, setter(strip_option))] + num: Option, + + #[builder(default, setter(strip_option))] + scaled: Option, + + #[builder(default, setter(strip_option))] + containment: Option, + + #[builder(default, setter(strip_option))] + moltype: Option, + + #[builder(default, setter(strip_option))] + picklist: Option, +} + #[derive(Default, TypedBuilder, CopyGetters, Getters, Setters, Clone, Debug)] pub struct Picklist { #[getset(get = "pub", set = "pub")] @@ -34,17 +56,6 @@ pub enum PickStyle { Exclude = 2, } -#[derive(Default, Debug)] -pub struct Selection { - ksize: Option, - abund: Option, - num: Option, - scaled: Option, - containment: Option, - moltype: Option, - picklist: Option, -} - pub trait Select { fn select(self, selection: &Selection) -> Result where @@ -108,25 +119,6 @@ impl Selection { self.picklist = Some(value); } - pub fn from_template(template: &Sketch) -> Self { - let (num, scaled) = match template { - Sketch::MinHash(mh) => (Some(mh.num()), Some(mh.scaled() as u32)), - Sketch::LargeMinHash(mh) => (Some(mh.num()), Some(mh.scaled() as u32)), - _ => (None, None), - }; - - Selection { - ksize: Some(template.ksize() as u32), - abund: None, - containment: None, - //moltype: Some(template.hash_function()), - moltype: None, - num, - picklist: None, - scaled, - } - } - pub fn from_record(row: &Record) -> Result { Ok(Self { ksize: Some(*row.ksize()), diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs index 9ac9bfe2aa..dd05f9005d 100644 --- a/src/core/src/signature.rs +++ b/src/core/src/signature.rs @@ -771,6 +771,8 @@ impl Select for Signature { } else { valid }; + // TODO: execute downsample if needed + /* valid = if let Some(abund) = selection.abund() { valid && *s.with_abundance() == abund diff --git a/src/core/src/sketch/minhash.rs b/src/core/src/sketch/minhash.rs index 454a50f7a7..4f61056853 100644 --- a/src/core/src/sketch/minhash.rs +++ b/src/core/src/sketch/minhash.rs @@ -829,6 +829,8 @@ impl SigsTrait for KmerMinHash { // TODO: fix this error return Err(Error::MismatchDNAProt); } + // TODO: if supporting downsampled to be compatible + //if self.max_hash < other.max_hash { if self.max_hash != other.max_hash { return Err(Error::MismatchScaled); }