From 84ce2bfca3b29fc06444108910e46c33be7d3e5e Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Sat, 16 Sep 2023 12:57:53 -0700 Subject: [PATCH] reorg select --- include/sourmash.h | 1 + src/core/src/collection.rs | 14 +- src/core/src/index/linear.rs | 57 ++---- src/core/src/index/mod.rs | 106 +--------- src/core/src/index/revindex/disk_revindex.rs | 6 - src/core/src/index/revindex/mem_revindex.rs | 191 +------------------ src/core/src/index/revindex/mod.rs | 114 +++++++++-- src/core/src/lib.rs | 2 +- src/core/src/manifest.rs | 6 +- src/core/src/picklist.rs | 29 --- src/core/src/prelude.rs | 11 +- src/core/src/selection.rs | 141 ++++++++++++++ src/core/src/signature.rs | 56 +++--- src/core/src/storage.rs | 5 +- 14 files changed, 315 insertions(+), 424 deletions(-) delete mode 100644 src/core/src/picklist.rs create mode 100644 src/core/src/selection.rs diff --git a/include/sourmash.h b/include/sourmash.h index 011aee2925..d647378da7 100644 --- a/include/sourmash.h +++ b/include/sourmash.h @@ -43,6 +43,7 @@ enum SourmashErrorCode { SOURMASH_ERROR_CODE_SERDE_ERROR = 100004, SOURMASH_ERROR_CODE_NIFFLER_ERROR = 100005, SOURMASH_ERROR_CODE_CSV_ERROR = 100006, + SOURMASH_ERROR_CODE_ROCKS_DB_ERROR = 100007, }; typedef uint32_t SourmashErrorCode; diff --git a/src/core/src/collection.rs b/src/core/src/collection.rs index 164db5efe7..8f3b049313 100644 --- a/src/core/src/collection.rs +++ b/src/core/src/collection.rs @@ -3,8 +3,8 @@ use std::ops::{Deref, DerefMut}; use camino::Utf8Path as Path; use crate::encodings::Idx; -use crate::index::Selection; use crate::manifest::{Manifest, Record}; +use crate::prelude::*; use crate::signature::Signature; use crate::storage::{FSStorage, InnerStorage, MemStorage, SigStore, Storage, ZipStorage}; use crate::Result; @@ -53,6 +53,12 @@ impl TryFrom for CollectionSet { } } +impl CollectionSet { + pub fn into_inner(self) -> Collection { + self.collection + } +} + impl Collection { pub fn from_zipfile>(zipfile: P) -> Result { let storage = ZipStorage::from_file(zipfile)?; @@ -127,9 +133,11 @@ impl Collection { assert_eq!(sig.signatures.len(), 1); Ok(sig) } +} - pub fn select(mut self, selection: &Selection) -> Result { - self.manifest = self.manifest.select_to_manifest(selection)?; +impl Select for Collection { + fn select(mut self, selection: &Selection) -> Result { + self.manifest = self.manifest.select(selection)?; Ok(self) } } diff --git a/src/core/src/index/linear.rs b/src/core/src/index/linear.rs index ed12bbd745..1b4cd2f8ec 100644 --- a/src/core/src/index/linear.rs +++ b/src/core/src/index/linear.rs @@ -11,13 +11,13 @@ use crate::collection::CollectionSet; use crate::encodings::Idx; use crate::index::{GatherResult, Index, Selection, SigCounter}; use crate::manifest::Manifest; +use crate::selection::Select; use crate::signature::{Signature, SigsTrait}; use crate::sketch::minhash::KmerMinHash; use crate::sketch::Sketch; use crate::storage::{InnerStorage, SigStore, Storage}; use crate::Result; -//#[derive(Serialize, Deserialize)] pub struct LinearIndex { collection: CollectionSet, template: Sketch, @@ -58,46 +58,6 @@ impl LinearIndex { Some(self.collection.storage.clone()) } - pub fn select(mut self, selection: &Selection) -> Result { - let manifest = self.collection.manifest.select_to_manifest(selection)?; - self.collection.manifest = manifest; - - Ok(self) - /* - # if we have a manifest, run 'select' on the manifest. - manifest = self.manifest - traverse_yield_all = self.traverse_yield_all - - if manifest is not None: - manifest = manifest.select_to_manifest(**kwargs) - return ZipFileLinearIndex(self.storage, - selection_dict=None, - traverse_yield_all=traverse_yield_all, - manifest=manifest, - use_manifest=True) - else: - # no manifest? just pass along all the selection kwargs to - # the new ZipFileLinearIndex. - - assert manifest is None - if self.selection_dict: - # combine selects... - d = dict(self.selection_dict) - for k, v in kwargs.items(): - if k in d: - if d[k] is not None and d[k] != v: - raise ValueError(f"incompatible select on '{k}'") - d[k] = v - kwargs = d - - return ZipFileLinearIndex(self.storage, - selection_dict=kwargs, - traverse_yield_all=traverse_yield_all, - manifest=None, - use_manifest=False) - */ - } - pub fn counter_for_query(&self, query: &KmerMinHash) -> SigCounter { let processed_sigs = AtomicUsize::new(0); @@ -348,6 +308,21 @@ impl LinearIndex { } } +impl Select for LinearIndex { + fn select(self, selection: &Selection) -> Result { + let Self { + collection, + template, + } = self; + let collection = collection.into_inner().select(selection)?.try_into()?; + + Ok(Self { + collection, + template, + }) + } +} + impl<'a> Index<'a> for LinearIndex { type Item = SigStore; diff --git a/src/core/src/index/mod.rs b/src/core/src/index/mod.rs index ad65bf9d08..ec55249b04 100644 --- a/src/core/src/index/mod.rs +++ b/src/core/src/index/mod.rs @@ -18,13 +18,9 @@ use getset::{CopyGetters, Getters, Setters}; use serde::{Deserialize, Serialize}; use typed_builder::TypedBuilder; -use crate::encodings::{HashFunctions, Idx}; +use crate::encodings::Idx; use crate::index::search::{search_minhashes, search_minhashes_containment}; -use crate::manifest::Record; -use crate::picklist::Picklist; use crate::prelude::*; -use crate::signature::SigsTrait; -use crate::sketch::Sketch; use crate::Result; #[derive(TypedBuilder, CopyGetters, Getters, Setters, Serialize, Deserialize, Debug, PartialEq)] @@ -68,108 +64,8 @@ impl GatherResult { } } -#[derive(Default, Debug)] -pub struct Selection { - ksize: Option, - abund: Option, - num: Option, - scaled: Option, - containment: Option, - moltype: Option, - picklist: Option, -} - type SigCounter = counter::Counter; -impl Selection { - pub fn ksize(&self) -> Option { - self.ksize - } - - pub fn set_ksize(&mut self, ksize: u32) { - self.ksize = Some(ksize); - } - - pub fn abund(&self) -> Option { - self.abund - } - - pub fn set_abund(&mut self, value: bool) { - self.abund = Some(value); - } - - pub fn num(&self) -> Option { - self.num - } - - pub fn set_num(&mut self, num: u32) { - self.num = Some(num); - } - - pub fn scaled(&self) -> Option { - self.scaled - } - - pub fn set_scaled(&mut self, scaled: u32) { - self.scaled = Some(scaled); - } - - pub fn containment(&self) -> Option { - self.containment - } - - pub fn set_containment(&mut self, containment: bool) { - self.containment = Some(containment); - } - - pub fn moltype(&self) -> Option { - self.moltype - } - - pub fn set_moltype(&mut self, value: HashFunctions) { - self.moltype = Some(value); - } - - pub fn picklist(&self) -> Option { - self.picklist.clone() - } - - pub fn set_picklist(&mut self, value: Picklist) { - self.picklist = Some(value); - } - - pub fn from_template(template: &Sketch) -> Self { - let (num, scaled) = match template { - Sketch::MinHash(mh) => (Some(mh.num()), Some(mh.scaled() as u32)), - Sketch::LargeMinHash(mh) => (Some(mh.num()), Some(mh.scaled() as u32)), - _ => (None, None), - }; - - Selection { - ksize: Some(template.ksize() as u32), - abund: None, - containment: None, - //moltype: Some(template.hash_function()), - moltype: None, - num, - picklist: None, - scaled, - } - } - - pub fn from_record(row: &Record) -> Result { - Ok(Self { - ksize: Some(*row.ksize()), - abund: Some(*row.with_abundance()), - moltype: Some(row.moltype()), - num: None, - scaled: None, - containment: None, - picklist: None, - }) - } -} - pub trait Index<'a> { type Item: Comparable; //type SignatureIterator: Iterator; diff --git a/src/core/src/index/revindex/disk_revindex.rs b/src/core/src/index/revindex/disk_revindex.rs index 60e33cf1d4..4c4064acc4 100644 --- a/src/core/src/index/revindex/disk_revindex.rs +++ b/src/core/src/index/revindex/disk_revindex.rs @@ -26,11 +26,6 @@ use crate::Result; fn compute_color(idxs: &Datasets) -> Color { let s = BuildHasherDefault::::default(); let mut hasher = s.build_hasher(); - /* - // TODO: remove this... - let mut sorted: Vec<_> = idxs.iter().collect(); - sorted.sort(); - */ idxs.hash(&mut hasher); hasher.finish() } @@ -198,7 +193,6 @@ impl RevIndex { .merge_cf(&cf_hashes, &hash_bytes[..], colors.as_slice()) .expect("error merging"); } - // TODO: save collection to DB? } } diff --git a/src/core/src/index/revindex/mem_revindex.rs b/src/core/src/index/revindex/mem_revindex.rs index 113452ed1b..b951d9513d 100644 --- a/src/core/src/index/revindex/mem_revindex.rs +++ b/src/core/src/index/revindex/mem_revindex.rs @@ -1,4 +1,4 @@ -use std::collections::{HashMap, HashSet}; +use std::collections::HashMap; use std::sync::atomic::{AtomicUsize, Ordering}; use camino::Utf8Path as Path; @@ -13,7 +13,9 @@ use rayon::prelude::*; use crate::collection::Collection; use crate::encodings::{Color, Colors, Idx}; use crate::index::linear::LinearIndex; -use crate::index::{GatherResult, Index, Selection, SigCounter}; +use crate::index::revindex::HashToColor; +use crate::index::{GatherResult, Index, SigCounter}; +use crate::prelude::*; use crate::signature::{Signature, SigsTrait}; use crate::sketch::minhash::KmerMinHash; use crate::sketch::Sketch; @@ -21,90 +23,12 @@ use crate::storage::Storage; use crate::HashIntoType; use crate::Result; -// Use rkyv for serialization? -// https://davidkoloski.me/rkyv/ -//#[derive(Serialize, Deserialize)] pub struct RevIndex { linear: LinearIndex, hash_to_color: HashToColor, colors: Colors, } -#[derive(Serialize, Deserialize)] -struct HashToColor(HashMap>); - -impl HashToColor { - fn new() -> Self { - HashToColor(HashMap::< - HashIntoType, - Color, - BuildNoHashHasher, - >::with_hasher(BuildNoHashHasher::default())) - } - - fn get(&self, hash: &HashIntoType) -> Option<&Color> { - self.0.get(hash) - } - - fn retain(&mut self, hashes: &HashSet) { - self.0.retain(|hash, _| hashes.contains(hash)) - } - - fn len(&self) -> usize { - self.0.len() - } - - fn is_empty(&self) -> bool { - self.0.is_empty() - } - - fn add_to(&mut self, colors: &mut Colors, dataset_id: usize, matched_hashes: Vec) { - let mut color = None; - - matched_hashes.into_iter().for_each(|hash| { - color = Some(colors.update(color, &[dataset_id as Idx]).unwrap()); - self.0.insert(hash, color.unwrap()); - }); - } - - fn reduce_hashes_colors( - a: (HashToColor, Colors), - b: (HashToColor, Colors), - ) -> (HashToColor, Colors) { - let ((small_hashes, small_colors), (mut large_hashes, mut large_colors)) = - if a.0.len() > b.0.len() { - (b, a) - } else { - (a, b) - }; - - small_hashes.0.into_iter().for_each(|(hash, color)| { - large_hashes - .0 - .entry(hash) - .and_modify(|entry| { - // Hash is already present. - // Update the current color by adding the indices from - // small_colors. - let ids = small_colors.indices(&color); - let new_color = large_colors.update(Some(*entry), ids).unwrap(); - *entry = new_color; - }) - .or_insert_with(|| { - // In this case, the hash was not present yet. - // we need to create the same color from small_colors - // into large_colors. - let ids = small_colors.indices(&color); - let new_color = large_colors.update(None, ids).unwrap(); - assert_eq!(new_color, color); - new_color - }); - }); - - (large_hashes, large_colors) - } -} - impl LinearIndex { fn index( self, @@ -171,50 +95,6 @@ impl LinearIndex { } impl RevIndex { - pub fn load>( - _index_path: P, - _queries: Option<&[KmerMinHash]>, - ) -> Result { - unimplemented!() - /* - let (rdr, _) = niffler::from_path(index_path)?; - let revindex = if let Some(qs) = queries { - // TODO: avoid loading full revindex if query != None - /* - struct PartialRevIndex { - hashes_to_keep: Option>, - marker: PhantomData T>, - } - - impl PartialRevIndex { - pub fn new(hashes_to_keep: HashSet) -> Self { - PartialRevIndex { - hashes_to_keep: Some(hashes_to_keep), - marker: PhantomData, - } - } - } - */ - - let mut hashes: HashSet = HashSet::new(); - for q in qs { - hashes.extend(q.iter_mins()); - } - - //let mut revindex: RevIndex = PartialRevIndex::new(hashes).deserialize(&rdr).unwrap(); - - let mut revindex: RevIndex = serde_json::from_reader(rdr)?; - revindex.hash_to_color.retain(&hashes); - revindex - } else { - // Load the full revindex - serde_json::from_reader(rdr)? - }; - - Ok(revindex) - */ - } - pub fn new( search_sigs: &[PathBuf], template: &Sketch, @@ -380,32 +260,6 @@ impl RevIndex { containment: bool, _ignore_scaled: bool, ) -> Result> { - /* - let template_mh = None; - if let Sketch::MinHash(mh) = self.template { - template_mh = Some(mh); - }; - // TODO: throw error - let template_mh = template_mh.unwrap(); - - let tmp_mh; - let mh = if template_mh.scaled() > mh.scaled() { - // TODO: proper error here - tmp_mh = mh.downsample_scaled(self.scaled)?; - &tmp_mh - } else { - mh - }; - - if self.scaled < mh.scaled() && !ignore_scaled { - return Err(LcaDBError::ScaledMismatchError { - db: self.scaled, - query: mh.scaled(), - } - .into()); - } - */ - // TODO: proper threshold calculation let threshold: usize = (threshold * (mh.size() as f64)) as _; @@ -490,43 +344,6 @@ impl<'a> Index<'a> for RevIndex { } } -/* -impl RevIndexOps for RevIndex { - /* TODO: need the repair_cf variant, not available in rocksdb-rust yet - pub fn repair(index: &Path, colors: bool); - */ - - fn matches_from_counter(&self, counter: SigCounter, threshold: usize) -> Vec<(String, usize)>; - - fn prepare_gather_counters( - &self, - query: &KmerMinHash, - ) -> (SigCounter, QueryColors, HashToColor); - - fn index(&self, index_sigs: Vec, template: &Sketch, threshold: f64, save_paths: bool); - - fn update(&self, index_sigs: Vec, template: &Sketch, threshold: f64, save_paths: bool); - - fn compact(&self); - - fn flush(&self) -> Result<()>; - - fn convert(&self, output_db: RevIndex) -> Result<()>; - - fn check(&self, quick: bool); - - fn gather( - &self, - counter: SigCounter, - query_colors: QueryColors, - hash_to_color: HashToColor, - threshold: usize, - query: &KmerMinHash, - template: &Sketch, - ) -> Result>; -} -*/ - #[cfg(test)] mod test { use super::*; diff --git a/src/core/src/index/revindex/mod.rs b/src/core/src/index/revindex/mod.rs index 460a5429af..42a9837d13 100644 --- a/src/core/src/index/revindex/mod.rs +++ b/src/core/src/index/revindex/mod.rs @@ -8,11 +8,12 @@ use std::sync::Arc; use byteorder::{LittleEndian, WriteBytesExt}; use enum_dispatch::enum_dispatch; - +use nohash_hasher::BuildNoHashHasher; use roaring::RoaringBitmap; +use serde::{Deserialize, Serialize}; use crate::collection::CollectionSet; -use crate::encodings::{Color, Idx}; +use crate::encodings::{Color, Colors, Idx}; use crate::index::{GatherResult, SigCounter}; use crate::signature::{Signature, SigsTrait}; use crate::sketch::minhash::{max_hash_for_scaled, KmerMinHash}; @@ -20,11 +21,12 @@ use crate::sketch::Sketch; use crate::HashIntoType; use crate::Result; -//type DB = rocksdb::DBWithThreadMode; type DB = rocksdb::DBWithThreadMode; type QueryColors = HashMap; -type HashToColor = HashMap; +type HashToColorT = HashMap>; +#[derive(Serialize, Deserialize)] +pub struct HashToColor(HashToColorT); const HASHES: &str = "hashes"; const COLORS: &str = "colors"; @@ -75,6 +77,83 @@ pub trait RevIndexOps { ) -> Result>; } +impl HashToColor { + fn new() -> Self { + HashToColor(HashMap::< + HashIntoType, + Color, + BuildNoHashHasher, + >::with_hasher(BuildNoHashHasher::default())) + } + + fn get(&self, hash: &HashIntoType) -> Option<&Color> { + self.0.get(hash) + } + + fn len(&self) -> usize { + self.0.len() + } + + fn is_empty(&self) -> bool { + self.0.is_empty() + } + + fn add_to(&mut self, colors: &mut Colors, dataset_id: usize, matched_hashes: Vec) { + let mut color = None; + + matched_hashes.into_iter().for_each(|hash| { + color = Some(colors.update(color, &[dataset_id as Idx]).unwrap()); + self.0.insert(hash, color.unwrap()); + }); + } + + fn reduce_hashes_colors( + a: (HashToColor, Colors), + b: (HashToColor, Colors), + ) -> (HashToColor, Colors) { + let ((small_hashes, small_colors), (mut large_hashes, mut large_colors)) = + if a.0.len() > b.0.len() { + (b, a) + } else { + (a, b) + }; + + small_hashes.0.into_iter().for_each(|(hash, color)| { + large_hashes + .0 + .entry(hash) + .and_modify(|entry| { + // Hash is already present. + // Update the current color by adding the indices from + // small_colors. + let ids = small_colors.indices(&color); + let new_color = large_colors.update(Some(*entry), ids).unwrap(); + *entry = new_color; + }) + .or_insert_with(|| { + // In this case, the hash was not present yet. + // we need to create the same color from small_colors + // into large_colors. + let ids = small_colors.indices(&color); + let new_color = large_colors.update(None, ids).unwrap(); + assert_eq!(new_color, color); + new_color + }); + }); + + (large_hashes, large_colors) + } +} + +impl FromIterator<(HashIntoType, Color)> for HashToColor { + fn from_iter(iter: T) -> Self + where + T: IntoIterator, + { + HashToColor(HashToColorT::from_iter(iter)) + } +} + impl RevIndex { /* TODO: need the repair_cf variant, not available in rocksdb-rust yet pub fn repair(index: &Path, colors: bool) { @@ -387,27 +466,30 @@ fn stats_for_cf(db: Arc, cf_name: &str, deep_check: bool, quick: bool) { } } -fn build_template(ksize: u8, scaled: usize) -> Sketch { - let max_hash = max_hash_for_scaled(scaled as u64); - let template_mh = KmerMinHash::builder() - .num(0u32) - .ksize(ksize as u32) - .max_hash(max_hash) - .build(); - Sketch::MinHash(template_mh) -} - #[cfg(test)] mod test { use camino::Utf8PathBuf as PathBuf; use tempfile::TempDir; + use crate::collection::Collection; use crate::prelude::*; + use crate::selection::Selection; + use crate::sketch::minhash::{max_hash_for_scaled, KmerMinHash}; + use crate::sketch::Sketch; use crate::Result; - use crate::{collection::Collection, index::Selection}; - use super::{build_template, prepare_query, RevIndex, RevIndexOps}; + use super::{prepare_query, RevIndex, RevIndexOps}; + + fn build_template(ksize: u8, scaled: usize) -> Sketch { + let max_hash = max_hash_for_scaled(scaled as u64); + let template_mh = KmerMinHash::builder() + .num(0u32) + .ksize(ksize as u32) + .max_hash(max_hash) + .build(); + Sketch::MinHash(template_mh) + } #[test] fn revindex_index() -> Result<()> { diff --git a/src/core/src/lib.rs b/src/core/src/lib.rs index dc88d34363..da383372a0 100644 --- a/src/core/src/lib.rs +++ b/src/core/src/lib.rs @@ -30,7 +30,7 @@ pub mod cmd; pub mod collection; pub mod index; pub mod manifest; -pub mod picklist; +pub mod selection; pub mod signature; pub mod sketch; pub mod storage; diff --git a/src/core/src/manifest.rs b/src/core/src/manifest.rs index ca2602b269..39647480f2 100644 --- a/src/core/src/manifest.rs +++ b/src/core/src/manifest.rs @@ -8,7 +8,7 @@ use serde::de; use serde::{Deserialize, Serialize}; use crate::encodings::HashFunctions; -use crate::index::Selection; +use crate::prelude::*; use crate::signature::{Signature, SigsTrait}; use crate::sketch::Sketch; use crate::Result; @@ -163,8 +163,10 @@ impl Manifest { pub fn iter(&self) -> impl Iterator { self.records.iter() } +} - pub fn select_to_manifest(&self, selection: &Selection) -> Result { +impl Select for Manifest { + fn select(self, selection: &Selection) -> Result { let rows = self.records.iter().filter(|row| { let mut valid = true; valid = if let Some(ksize) = selection.ksize() { diff --git a/src/core/src/picklist.rs b/src/core/src/picklist.rs deleted file mode 100644 index 943d3f051a..0000000000 --- a/src/core/src/picklist.rs +++ /dev/null @@ -1,29 +0,0 @@ -use getset::{CopyGetters, Getters, Setters}; -use typed_builder::TypedBuilder; - -#[derive(Default, TypedBuilder, CopyGetters, Getters, Setters, Clone, Debug)] -pub struct Picklist { - #[getset(get = "pub", set = "pub")] - #[builder(default = "".into())] - coltype: String, - - #[getset(get = "pub", set = "pub")] - #[builder(default = "".into())] - pickfile: String, - - #[getset(get = "pub", set = "pub")] - #[builder(default = "".into())] - column_name: String, - - #[getset(get = "pub", set = "pub")] - #[builder] - pickstyle: PickStyle, -} - -#[derive(Clone, Default, Debug)] -#[repr(u32)] -pub enum PickStyle { - #[default] - Include = 1, - Exclude = 2, -} diff --git a/src/core/src/prelude.rs b/src/core/src/prelude.rs index ef7d4aa27b..90598186c4 100644 --- a/src/core/src/prelude.rs +++ b/src/core/src/prelude.rs @@ -1,27 +1,28 @@ use std::io::Write; -use crate::Error; +use crate::Result; +pub use crate::selection::{Select, Selection}; pub use crate::signature::Signature; pub use crate::storage::Storage; pub trait ToWriter { - fn to_writer(&self, writer: &mut W) -> Result<(), Error> + fn to_writer(&self, writer: &mut W) -> Result<()> where W: Write; } pub trait Update { - fn update(&self, other: &mut O) -> Result<(), Error>; + fn update(&self, other: &mut O) -> Result<()>; } pub trait FromFactory { - fn factory(&self, name: &str) -> Result; + fn factory(&self, name: &str) -> Result; } /// Implemented by anything that wants to read specific data from a storage. pub trait ReadData { - fn data(&self) -> Result<&D, Error>; + fn data(&self) -> Result<&D>; } // TODO: split into two traits, Similarity and Containment? diff --git a/src/core/src/selection.rs b/src/core/src/selection.rs new file mode 100644 index 0000000000..3e18f8fb31 --- /dev/null +++ b/src/core/src/selection.rs @@ -0,0 +1,141 @@ +use getset::{CopyGetters, Getters, Setters}; +use typed_builder::TypedBuilder; + +use crate::encodings::HashFunctions; +use crate::manifest::Record; +use crate::signature::SigsTrait; +use crate::sketch::Sketch; +use crate::Result; + +#[derive(Default, TypedBuilder, CopyGetters, Getters, Setters, Clone, Debug)] +pub struct Picklist { + #[getset(get = "pub", set = "pub")] + #[builder(default = "".into())] + coltype: String, + + #[getset(get = "pub", set = "pub")] + #[builder(default = "".into())] + pickfile: String, + + #[getset(get = "pub", set = "pub")] + #[builder(default = "".into())] + column_name: String, + + #[getset(get = "pub", set = "pub")] + #[builder] + pickstyle: PickStyle, +} + +#[derive(Clone, Default, Debug)] +#[repr(u32)] +pub enum PickStyle { + #[default] + Include = 1, + Exclude = 2, +} + +#[derive(Default, Debug)] +pub struct Selection { + ksize: Option, + abund: Option, + num: Option, + scaled: Option, + containment: Option, + moltype: Option, + picklist: Option, +} + +pub trait Select { + fn select(self, selection: &Selection) -> Result + where + Self: Sized; +} + +impl Selection { + pub fn ksize(&self) -> Option { + self.ksize + } + + pub fn set_ksize(&mut self, ksize: u32) { + self.ksize = Some(ksize); + } + + pub fn abund(&self) -> Option { + self.abund + } + + pub fn set_abund(&mut self, value: bool) { + self.abund = Some(value); + } + + pub fn num(&self) -> Option { + self.num + } + + pub fn set_num(&mut self, num: u32) { + self.num = Some(num); + } + + pub fn scaled(&self) -> Option { + self.scaled + } + + pub fn set_scaled(&mut self, scaled: u32) { + self.scaled = Some(scaled); + } + + pub fn containment(&self) -> Option { + self.containment + } + + pub fn set_containment(&mut self, containment: bool) { + self.containment = Some(containment); + } + + pub fn moltype(&self) -> Option { + self.moltype + } + + pub fn set_moltype(&mut self, value: HashFunctions) { + self.moltype = Some(value); + } + + pub fn picklist(&self) -> Option { + self.picklist.clone() + } + + pub fn set_picklist(&mut self, value: Picklist) { + self.picklist = Some(value); + } + + pub fn from_template(template: &Sketch) -> Self { + let (num, scaled) = match template { + Sketch::MinHash(mh) => (Some(mh.num()), Some(mh.scaled() as u32)), + Sketch::LargeMinHash(mh) => (Some(mh.num()), Some(mh.scaled() as u32)), + _ => (None, None), + }; + + Selection { + ksize: Some(template.ksize() as u32), + abund: None, + containment: None, + //moltype: Some(template.hash_function()), + moltype: None, + num, + picklist: None, + scaled, + } + } + + pub fn from_record(row: &Record) -> Result { + Ok(Self { + ksize: Some(*row.ksize()), + abund: Some(*row.with_abundance()), + moltype: Some(row.moltype()), + num: None, + scaled: None, + containment: None, + picklist: None, + }) + } +} diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs index 19ec308617..9ac9bfe2aa 100644 --- a/src/core/src/signature.rs +++ b/src/core/src/signature.rs @@ -17,8 +17,8 @@ use serde::{Deserialize, Serialize}; use typed_builder::TypedBuilder; use crate::encodings::{aa_to_dayhoff, aa_to_hp, revcomp, to_aa, HashFunctions, VALID}; -use crate::index::Selection; use crate::prelude::*; +use crate::selection::{Select, Selection}; use crate::sketch::Sketch; use crate::Error; use crate::HashIntoType; @@ -534,32 +534,6 @@ impl Signature { None } - pub fn select(mut self, selection: &Selection) -> Result { - self.signatures.retain(|s| { - let mut valid = true; - valid = if let Some(ksize) = selection.ksize() { - let k = s.ksize() as u32; - k == ksize || k == ksize * 3 - } else { - valid - }; - /* - valid = if let Some(abund) = selection.abund() { - valid && *s.with_abundance() == abund - } else { - valid - }; - valid = if let Some(moltype) = selection.moltype() { - valid && s.moltype() == moltype - } else { - valid - }; - */ - valid - }); - Ok(self) - } - pub fn from_path>(path: P) -> Result, Error> { let mut reader = io::BufReader::new(File::open(path)?); Signature::from_reader(&mut reader) @@ -787,6 +761,34 @@ impl ToWriter for Signature { } } +impl Select for Signature { + fn select(mut self, selection: &Selection) -> Result { + self.signatures.retain(|s| { + let mut valid = true; + valid = if let Some(ksize) = selection.ksize() { + let k = s.ksize() as u32; + k == ksize || k == ksize * 3 + } else { + valid + }; + /* + valid = if let Some(abund) = selection.abund() { + valid && *s.with_abundance() == abund + } else { + valid + }; + valid = if let Some(moltype) = selection.moltype() { + valid && s.moltype() == moltype + } else { + valid + }; + */ + valid + }); + Ok(self) + } +} + impl Default for Signature { fn default() -> Signature { Signature { diff --git a/src/core/src/storage.rs b/src/core/src/storage.rs index 3e2a2cab6e..ad017e65a7 100644 --- a/src/core/src/storage.rs +++ b/src/core/src/storage.rs @@ -13,7 +13,6 @@ use thiserror::Error; use typed_builder::TypedBuilder; use crate::errors::ReadDataError; -use crate::index::Selection; use crate::prelude::*; use crate::signature::SigsTrait; use crate::sketch::Sketch; @@ -431,8 +430,10 @@ impl SigStore { pub fn name(&self) -> String { self.name.clone() } +} - pub fn select(mut self, selection: &Selection) -> Result { +impl Select for SigStore { + fn select(mut self, selection: &Selection) -> Result { // TODO: find better error let sig = self.data.take().ok_or(Error::MismatchKSizes)?; self.data = OnceCell::with_value(sig.select(selection)?);