From 0a857f4f28e34111ab99ffc02fa2e1316349ce78 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 13 Nov 2024 07:01:26 -0800 Subject: [PATCH 01/13] WIP: update code after sourmash fix to `Collection::sig_from_record()` --- Cargo.lock | 82 +++++++++++++++++++++++++++++++++--------------------- Cargo.toml | 3 +- 2 files changed, 53 insertions(+), 32 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 72ed890a..74434028 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -188,7 +188,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b48ee4c818e9d19bbdf75b56e1b2eb9682bb4fbd7ff8e7e7f2cc9956e1aefac7" dependencies = [ "flate2", - "thiserror", + "thiserror 1.0.61", ] [[package]] @@ -215,7 +215,7 @@ dependencies = [ "regex", "rustc-hash", "shlex", - "syn 2.0.85", + "syn 2.0.87", ] [[package]] @@ -262,7 +262,7 @@ dependencies = [ "proc-macro-crate", "proc-macro2", "quote", - "syn 2.0.85", + "syn 2.0.87", "syn_derive", ] @@ -518,7 +518,7 @@ checksum = "67e77553c4162a157adbf834ebae5b415acbecbeafc7a74b0e886657506a7611" dependencies = [ "proc-macro2", "quote", - "syn 2.0.85", + "syn 2.0.87", ] [[package]] @@ -535,7 +535,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.85", + "syn 2.0.87", ] [[package]] @@ -559,7 +559,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.85", + "syn 2.0.87", ] [[package]] @@ -660,7 +660,7 @@ dependencies = [ "proc-macro-error2", "proc-macro2", "quote", - "syn 2.0.85", + "syn 2.0.87", ] [[package]] @@ -707,7 +707,7 @@ version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "58cf6b99a250776d813cdf2f0b478a053a822d078e7a2baf5cb36afc88c41a7c" dependencies = [ - "thiserror", + "thiserror 1.0.61", ] [[package]] @@ -1003,7 +1003,7 @@ checksum = "254a5372af8fc138e36684761d3c0cdb758a4410e938babcff1c860ce14ddbfc" dependencies = [ "proc-macro2", "quote", - "syn 2.0.85", + "syn 2.0.87", 
] [[package]] @@ -1055,7 +1055,7 @@ dependencies = [ "bzip2", "cfg-if", "flate2", - "thiserror", + "thiserror 1.0.61", "xz2", "zstd", ] @@ -1159,7 +1159,7 @@ dependencies = [ "proc-macro2", "proc-macro2-diagnostics", "quote", - "syn 2.0.85", + "syn 2.0.87", ] [[package]] @@ -1197,7 +1197,7 @@ dependencies = [ "flate2", "log", "memchr", - "thiserror", + "thiserror 1.0.61", ] [[package]] @@ -1255,7 +1255,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5f12335488a2f3b0a83b14edad48dca9879ce89b2edd10e80237e4e852dd645e" dependencies = [ "proc-macro2", - "syn 2.0.85", + "syn 2.0.87", ] [[package]] @@ -1329,7 +1329,7 @@ dependencies = [ "proc-macro-error-attr2", "proc-macro2", "quote", - "syn 2.0.85", + "syn 2.0.87", ] [[package]] @@ -1349,7 +1349,7 @@ checksum = "af066a9c399a26e020ada66a034357a868728e72cd426f3adcd35f80d88d88c8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.85", + "syn 2.0.87", "version_check", "yansi", ] @@ -1422,7 +1422,7 @@ dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.85", + "syn 2.0.87", ] [[package]] @@ -1435,7 +1435,7 @@ dependencies = [ "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.85", + "syn 2.0.87", ] [[package]] @@ -1735,7 +1735,7 @@ checksum = "de523f781f095e28fa605cdce0f8307e451cc0fd14e2eb4cd2e98a355b147766" dependencies = [ "proc-macro2", "quote", - "syn 2.0.85", + "syn 2.0.87", ] [[package]] @@ -1795,8 +1795,8 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" -version = "0.17.0" -source = "git+https://github.com/sourmash-bio/sourmash.git?branch=latest#c7363154b546058eb417b78bb77aca6523591cb1" +version = "0.17.1" +source = "git+https://github.com/sourmash-bio/sourmash.git?branch=fix_sig_from_record#fb65e525eace74abce5001a292427fc343ec257a" dependencies = [ "az", "byteorder", @@ -1832,7 +1832,7 @@ dependencies = [ "serde_json", "statrs", "streaming-stats", - "thiserror", + "thiserror 2.0.3", "twox-hash", 
"typed-builder", "vec-collections", @@ -1909,9 +1909,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.85" +version = "2.0.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5023162dfcd14ef8f32034d8bcd4cc5ddc61ef7a247c024a33e24e1f24d21b56" +checksum = "25aa4ce346d03a6dcd68dd8b4010bcb74e54e62c90c573f394c46eae99aba32d" dependencies = [ "proc-macro2", "quote", @@ -1927,7 +1927,7 @@ dependencies = [ "proc-macro-error", "proc-macro2", "quote", - "syn 2.0.85", + "syn 2.0.87", ] [[package]] @@ -1967,7 +1967,16 @@ version = "1.0.61" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c546c80d6be4bc6a00c0f01730c08df82eaa7a7a61f11d656526506112cc1709" dependencies = [ - "thiserror-impl", + "thiserror-impl 1.0.61", +] + +[[package]] +name = "thiserror" +version = "2.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c006c85c7651b3cf2ada4584faa36773bd07bac24acfb39f3c431b36d7e667aa" +dependencies = [ + "thiserror-impl 2.0.3", ] [[package]] @@ -1978,7 +1987,18 @@ checksum = "46c3384250002a6d5af4d114f2845d37b57521033f30d5c3f46c4d70e1197533" dependencies = [ "proc-macro2", "quote", - "syn 2.0.85", + "syn 2.0.87", +] + +[[package]] +name = "thiserror-impl" +version = "2.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f077553d607adc1caf65430528a576c757a71ed73944b66ebb58ef2bbd243568" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", ] [[package]] @@ -2041,7 +2061,7 @@ checksum = "1f718dfaf347dcb5b983bfc87608144b0bad87970aebcbea5ce44d2a30c08e63" dependencies = [ "proc-macro2", "quote", - "syn 2.0.85", + "syn 2.0.87", ] [[package]] @@ -2138,7 +2158,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.85", + "syn 2.0.87", "wasm-bindgen-shared", ] @@ -2160,7 +2180,7 @@ checksum = "26c6ab57572f7a24a4985830b120de1594465e5d500f24afe89e16b4e833ef68" dependencies = [ "proc-macro2", "quote", - "syn 2.0.85", + "syn 2.0.87", 
"wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -2332,7 +2352,7 @@ checksum = "15e934569e47891f7d9411f1a451d947a60e000ab3bd24fbb970f000387d1b3b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.85", + "syn 2.0.87", ] [[package]] @@ -2346,7 +2366,7 @@ dependencies = [ "crossbeam-utils", "displaydoc", "indexmap", - "thiserror", + "thiserror 1.0.61", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 667c8369..cb373b31 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,7 +12,8 @@ crate-type = ["cdylib"] pyo3 = { version = "0.22.6", features = ["extension-module", "anyhow"] } rayon = "1.10.0" serde = { version = "1.0.213", features = ["derive"] } -sourmash = { version = "0.17.0", features = ["branchwater"] } +#sourmash = { version = "0.17.0", features = ["branchwater"] } +sourmash = { git = "https://github.com/sourmash-bio/sourmash.git", branch = "fix_sig_from_record", features = ["branchwater"] } serde_json = "1.0.132" niffler = "2.4.0" log = "0.4.22" From 304645d49cb88a2ac1a69bc45b82e51064d00abf Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Fri, 15 Nov 2024 06:56:51 -0800 Subject: [PATCH 02/13] fix md5 mismatches --- src/fastmultigather.rs | 2 +- src/utils/mod.rs | 2 +- src/utils/multicollection.rs | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/fastmultigather.rs b/src/fastmultigather.rs index 83ce8cf1..065741ed 100644 --- a/src/fastmultigather.rs +++ b/src/fastmultigather.rs @@ -105,7 +105,7 @@ pub fn fastmultigather( let query_filename = query_sig.filename(); let query_name = query_sig.name(); - let query_md5 = query_sig.md5sum(); + let query_md5 = record.md5().clone(); let query_mh: KmerMinHash = query_sig.try_into().expect("cannot get sketch"); diff --git a/src/utils/mod.rs b/src/utils/mod.rs index a039b9cc..a274c9b8 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -450,7 +450,7 @@ pub fn load_sketches_above_threshold( if let Ok(against_sig) = coll.sig_from_record(against_record) { let against_filename = against_sig.filename(); let against_mh: KmerMinHash = against_sig.try_into().expect("cannot get sketch"); - let against_md5 = against_mh.md5sum(); // keep original md5sum + let against_md5 = against_record.md5().clone(); // keep original md5sum let against_mh_ds = against_mh .downsample_scaled(query.scaled()) diff --git a/src/utils/multicollection.rs b/src/utils/multicollection.rs index 12089ff4..e83336e0 100644 --- a/src/utils/multicollection.rs +++ b/src/utils/multicollection.rs @@ -329,7 +329,7 @@ impl MultiCollection { ); let sig_name = sig.name(); - let sig_md5 = sig.md5sum(); + let sig_md5 = record.md5().clone(); let selected_sig = sig.select(selection).ok()?; let mut minhash: KmerMinHash = selected_sig.try_into().expect("cannot extract sketch"); From 7e1a71de04739ffe535f392a95941efdd20caeab Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Fri, 15 Nov 2024 07:01:38 -0800 Subject: [PATCH 03/13] do not set default scaled in index_siglist --- src/python/tests/sourmash_tst_utils.py | 9 ++++++--- src/python/tests/test_fastmultigather.py | 3 +++ 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/python/tests/sourmash_tst_utils.py b/src/python/tests/sourmash_tst_utils.py index dabce721..3075d1f5 100644 --- a/src/python/tests/sourmash_tst_utils.py +++ b/src/python/tests/sourmash_tst_utils.py @@ -36,11 +36,15 @@ def index_siglist( db, *, ksize=31, - scaled=1000, + scaled=None, moltype="DNA", toggle_internal_storage="--internal-storage", ): # build index + extra_args = [] + if scaled is not None: + extra_args = ["--scaled", str(scaled)] + runtmp.sourmash( "scripts", "index", @@ -49,11 +53,10 @@ def index_siglist( db, "-k", str(ksize), - "--scaled", - str(scaled), "--moltype", moltype, toggle_internal_storage, + *extra_args ) return db diff --git a/src/python/tests/test_fastmultigather.py b/src/python/tests/test_fastmultigather.py index d08873b5..9d9c17e7 100644 --- a/src/python/tests/test_fastmultigather.py +++ b/src/python/tests/test_fastmultigather.py @@ -2114,6 +2114,9 @@ def test_simple_query_scaled_indexed(runtmp): def test_equal_matches(runtmp, indexed): # check that equal matches get returned from fastmultigather + # NOTE: the use of a bunch of bottom hashes in the artificial sketches + # below makes some of the numbers weird if you downsample etc. So + # be careful! base = sourmash.MinHash(scaled=1, ksize=31, n=0) a = base.copy_and_clear() From ca0cff9e07bb5a975662f0a2e438bf4aff509ea2 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Fri, 15 Nov 2024 07:12:15 -0800 Subject: [PATCH 04/13] remove many now-unnecessary downsamples --- src/fastmultigather.rs | 5 +---- src/fastmultigather_rocksdb.rs | 4 +++- src/manysearch.rs | 4 +--- src/manysearch_rocksdb.rs | 4 +++- src/utils/multicollection.rs | 5 ++++- 5 files changed, 12 insertions(+), 10 deletions(-) diff --git a/src/fastmultigather.rs b/src/fastmultigather.rs index 065741ed..b4c480be 100644 --- a/src/fastmultigather.rs +++ b/src/fastmultigather.rs @@ -80,6 +80,7 @@ pub fn fastmultigather( allow_failed_sigpaths, )?; // load against sketches into memory, downsampling on the way + // @CTB can probably eliminated against_selection here now. let against = against_collection.load_sketches(&against_selection)?; // Iterate over all queries => do prefetch and gather! @@ -109,10 +110,6 @@ pub fn fastmultigather( let query_mh: KmerMinHash = query_sig.try_into().expect("cannot get sketch"); - let query_mh = query_mh - .downsample_scaled(common_scaled) - .expect("cannot downsample query"); - // CTB refactor let query_scaled = query_mh.scaled(); let query_ksize = query_mh.ksize().try_into().unwrap(); diff --git a/src/fastmultigather_rocksdb.rs b/src/fastmultigather_rocksdb.rs index ddedf27d..70b6dd1a 100644 --- a/src/fastmultigather_rocksdb.rs +++ b/src/fastmultigather_rocksdb.rs @@ -95,9 +95,11 @@ pub fn fastmultigather_rocksdb( let mut results = vec![]; if let Ok(query_mh) = >::try_into(query_sig) { + /* @CTB let query_mh = query_mh .downsample_scaled(selection_scaled) - .expect("cannot downsample!?"); + .expect("cannot downsample!?"); + */ let _ = processed_sigs.fetch_add(1, atomic::Ordering::SeqCst); // Gather! 
let (counter, query_colors, hash_to_color) = diff --git a/src/manysearch.rs b/src/manysearch.rs index b18b29ae..d085f7c2 100644 --- a/src/manysearch.rs +++ b/src/manysearch.rs @@ -94,9 +94,6 @@ pub fn manysearch( if let Ok(against_mh) = >::try_into(against_sig) { - let against_mh = against_mh - .downsample_scaled(common_scaled) - .expect("cannot downsample search minhash to requested scaled"); for query in query_sketchlist.iter() { // be paranoid and confirm scaled match. if query.minhash.scaled() != common_scaled { @@ -251,6 +248,7 @@ fn downsample_and_inflate_abundances( let sum_all_abunds: u64; // avoid downsampling if we can + // @CTB maybe this can be removed now, and fn renamed?? if against_scaled != query_scaled { let against_ds = against .clone() diff --git a/src/manysearch_rocksdb.rs b/src/manysearch_rocksdb.rs index d47248b2..6d25fb30 100644 --- a/src/manysearch_rocksdb.rs +++ b/src/manysearch_rocksdb.rs @@ -99,12 +99,14 @@ pub fn manysearch_rocksdb( if let Ok(query_mh) = query_sig.try_into() { let mut query_mh: KmerMinHash = query_mh; + /* @CTB eliminate? if let Some(set_scaled) = set_selection.scaled() { query_mh = query_mh .clone() .downsample_scaled(set_scaled) .expect("cannot downsample query"); - } + } + */ let query_size = query_mh.size(); let counter = db.counter_for_query(&query_mh); let matches = diff --git a/src/utils/multicollection.rs b/src/utils/multicollection.rs index e83336e0..5723ee25 100644 --- a/src/utils/multicollection.rs +++ b/src/utils/multicollection.rs @@ -334,11 +334,14 @@ impl MultiCollection { let mut minhash: KmerMinHash = selected_sig.try_into().expect("cannot extract sketch"); + // @CTB maybe eliminate? + /* if let Some(select_scaled) = selection.scaled() { minhash = minhash .downsample_scaled(select_scaled) .expect("cannot downsample to desired scaled"); - } + } + */ Some(SmallSignature { location: record.internal_location().to_string(), From 7390e7384664c2405ee34f7c243a75c2c6c8b94e Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Fri, 15 Nov 2024 07:45:23 -0800 Subject: [PATCH 05/13] remove many downsample_scaled --- src/fastmultigather_rocksdb.rs | 5 ----- src/manysearch.rs | 7 +++++-- src/manysearch_rocksdb.rs | 12 ++---------- src/utils/mod.rs | 2 +- src/utils/multicollection.rs | 11 +---------- 5 files changed, 9 insertions(+), 28 deletions(-) diff --git a/src/fastmultigather_rocksdb.rs b/src/fastmultigather_rocksdb.rs index 70b6dd1a..5adbdc9b 100644 --- a/src/fastmultigather_rocksdb.rs +++ b/src/fastmultigather_rocksdb.rs @@ -95,11 +95,6 @@ pub fn fastmultigather_rocksdb( let mut results = vec![]; if let Ok(query_mh) = >::try_into(query_sig) { - /* @CTB - let query_mh = query_mh - .downsample_scaled(selection_scaled) - .expect("cannot downsample!?"); - */ let _ = processed_sigs.fetch_add(1, atomic::Ordering::SeqCst); // Gather! let (counter, query_colors, hash_to_color) = diff --git a/src/manysearch.rs b/src/manysearch.rs index d085f7c2..00bc5b24 100644 --- a/src/manysearch.rs +++ b/src/manysearch.rs @@ -248,14 +248,17 @@ fn downsample_and_inflate_abundances( let sum_all_abunds: u64; // avoid downsampling if we can - // @CTB maybe this can be removed now, and fn renamed?? + // @CTB maybe this can be removed now? 
if against_scaled != query_scaled { + panic!("wat"); + /* let against_ds = against .clone() .downsample_scaled(query.scaled()) .expect("cannot downsample sketch"); (abunds, sum_weighted) = query.inflated_abundances(&against_ds)?; - sum_all_abunds = against_ds.sum_abunds(); + sum_all_abunds = against_ds.sum_abunds(); + */ } else { (abunds, sum_weighted) = query.inflated_abundances(against)?; sum_all_abunds = against.sum_abunds(); diff --git a/src/manysearch_rocksdb.rs b/src/manysearch_rocksdb.rs index 6d25fb30..30f75dd5 100644 --- a/src/manysearch_rocksdb.rs +++ b/src/manysearch_rocksdb.rs @@ -11,6 +11,7 @@ use sourmash::index::revindex::{RevIndex, RevIndexOps}; use sourmash::selection::Selection; use sourmash::signature::SigsTrait; use sourmash::sketch::minhash::KmerMinHash; +use sourmash::storage::SigStore; use crate::utils::{ csvwriter_thread, is_revindex_database, load_collection, ReportType, SearchResult, @@ -97,16 +98,7 @@ pub fn manysearch_rocksdb( let query_md5 = query_sig.md5sum().clone(); let query_file = query_sig.filename().clone(); - if let Ok(query_mh) = query_sig.try_into() { - let mut query_mh: KmerMinHash = query_mh; - /* @CTB eliminate? - if let Some(set_scaled) = set_selection.scaled() { - query_mh = query_mh - .clone() - .downsample_scaled(set_scaled) - .expect("cannot downsample query"); - } - */ + if let Ok(query_mh) = >::try_into(query_sig) { let query_size = query_mh.size(); let counter = db.counter_for_query(&query_mh); let matches = diff --git a/src/utils/mod.rs b/src/utils/mod.rs index a274c9b8..98a260a9 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -76,7 +76,7 @@ pub fn prefetch( .filter_map(|result| { let mut mm = None; let searchsig = &result.minhash; - // downsample within count_common + // downsample within count_common @CTB needed? 
let overlap = searchsig.count_common(query_mh, true); if let Ok(overlap) = overlap { if overlap >= threshold_hashes { diff --git a/src/utils/multicollection.rs b/src/utils/multicollection.rs index 5723ee25..530d692c 100644 --- a/src/utils/multicollection.rs +++ b/src/utils/multicollection.rs @@ -331,18 +331,9 @@ impl MultiCollection { let sig_name = sig.name(); let sig_md5 = record.md5().clone(); let selected_sig = sig.select(selection).ok()?; - let mut minhash: KmerMinHash = + let minhash: KmerMinHash = selected_sig.try_into().expect("cannot extract sketch"); - // @CTB maybe eliminate? - /* - if let Some(select_scaled) = selection.scaled() { - minhash = minhash - .downsample_scaled(select_scaled) - .expect("cannot downsample to desired scaled"); - } - */ - Some(SmallSignature { location: record.internal_location().to_string(), name: sig_name, From fef2f585a0339c00019771355c1dafc7a0d9de6c Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 15 Nov 2024 07:58:36 -0800 Subject: [PATCH 06/13] eliminate select in load_sketches --- src/fastmultigather.rs | 2 +- src/manysearch.rs | 2 +- src/multisearch.rs | 8 ++++++-- src/pairwise.rs | 2 +- src/utils/multicollection.rs | 5 ++--- 5 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/fastmultigather.rs b/src/fastmultigather.rs index b4c480be..1722ba50 100644 --- a/src/fastmultigather.rs +++ b/src/fastmultigather.rs @@ -81,7 +81,7 @@ pub fn fastmultigather( )?; // load against sketches into memory, downsampling on the way // @CTB can probably eliminated against_selection here now. - let against = against_collection.load_sketches(&against_selection)?; + let against = against_collection.load_sketches()?; // Iterate over all queries => do prefetch and gather! 
let processed_queries = AtomicUsize::new(0); diff --git a/src/manysearch.rs b/src/manysearch.rs index 00bc5b24..18174bfd 100644 --- a/src/manysearch.rs +++ b/src/manysearch.rs @@ -50,7 +50,7 @@ pub fn manysearch( selection.set_scaled(common_scaled); // load all query sketches into memory, downsampling on the way - let query_sketchlist = query_collection.load_sketches(&selection)?; + let query_sketchlist = query_collection.load_sketches()?; // Against: Load collection, potentially off disk & not into memory. let against_collection = load_collection( diff --git a/src/multisearch.rs b/src/multisearch.rs index f0befb16..13faeed1 100644 --- a/src/multisearch.rs +++ b/src/multisearch.rs @@ -1,6 +1,7 @@ /// multisearch: massively parallel in-memory sketch search. use anyhow::Result; use rayon::prelude::*; +use sourmash::prelude::Select; use sourmash::selection::Selection; use sourmash::signature::SigsTrait; use sourmash::sketch::minhash::KmerMinHash; @@ -171,7 +172,10 @@ pub fn multisearch( let mut new_selection = selection; new_selection.set_scaled(expected_scaled); - let queries: Vec = query_collection.load_sketches(&new_selection)?; + // update selection with new scaled. + let query_collection = query_collection.select(&new_selection)?; + + let queries: Vec = query_collection.load_sketches()?; // Load all against sketches into memory at once. 
let against_collection = load_collection( @@ -181,7 +185,7 @@ pub fn multisearch( allow_failed_sigpaths, )?; - let againsts: Vec = against_collection.load_sketches(&new_selection)?; + let againsts: Vec = against_collection.load_sketches()?; let ( n_comparisons, diff --git a/src/pairwise.rs b/src/pairwise.rs index ae1f0c44..b9e1a3b3 100644 --- a/src/pairwise.rs +++ b/src/pairwise.rs @@ -51,7 +51,7 @@ pub fn pairwise( let mut selection = selection; selection.set_scaled(common_scaled); - let sketches = collection.load_sketches(&selection)?; + let sketches = collection.load_sketches()?; // set up a multi-producer, single-consumer channel. let (send, recv) = diff --git a/src/utils/multicollection.rs b/src/utils/multicollection.rs index 530d692c..59360975 100644 --- a/src/utils/multicollection.rs +++ b/src/utils/multicollection.rs @@ -313,7 +313,7 @@ impl MultiCollection { // Load all sketches into memory, using SmallSignature to track original // signature metadata. - pub fn load_sketches(self, selection: &Selection) -> Result> { + pub fn load_sketches(self) -> Result> { if self.contains_revindex { eprintln!("WARNING: loading all sketches from a RocksDB into memory!"); } @@ -330,9 +330,8 @@ impl MultiCollection { let sig_name = sig.name(); let sig_md5 = record.md5().clone(); - let selected_sig = sig.select(selection).ok()?; let minhash: KmerMinHash = - selected_sig.try_into().expect("cannot extract sketch"); + sig.try_into().expect("cannot extract sketch"); Some(SmallSignature { location: record.internal_location().to_string(), From a9876362562f8519fbaa8891f02c9491874891d5 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Fri, 15 Nov 2024 08:13:23 -0800 Subject: [PATCH 07/13] refactor around downsampling --- src/fastmultigather.rs | 3 +-- src/manysearch.rs | 25 ++++--------------------- 2 files changed, 5 insertions(+), 23 deletions(-) diff --git a/src/fastmultigather.rs b/src/fastmultigather.rs index 1722ba50..5380704b 100644 --- a/src/fastmultigather.rs +++ b/src/fastmultigather.rs @@ -79,8 +79,7 @@ pub fn fastmultigather( ReportType::Against, allow_failed_sigpaths, )?; - // load against sketches into memory, downsampling on the way - // @CTB can probably eliminated against_selection here now. + // load against sketches into memory let against = against_collection.load_sketches()?; // Iterate over all queries => do prefetch and gather! diff --git a/src/manysearch.rs b/src/manysearch.rs index 18174bfd..1910fc9a 100644 --- a/src/manysearch.rs +++ b/src/manysearch.rs @@ -142,7 +142,7 @@ pub fn manysearch( median_abund, std_abund, ) = if calc_abund_stats { - downsample_and_inflate_abundances(&query.minhash, &against_mh) + inflate_abundances(&query.minhash, &against_mh) .ok()? } else { (None, None, None, None, None) @@ -227,7 +227,7 @@ pub fn manysearch( Ok(()) } -fn downsample_and_inflate_abundances( +fn inflate_abundances( query: &KmerMinHash, against: &KmerMinHash, ) -> Result< @@ -240,29 +240,12 @@ fn downsample_and_inflate_abundances( ), SourmashError, > { - let query_scaled = query.scaled(); - let against_scaled = against.scaled(); - let abunds: Vec; let sum_weighted: u64; let sum_all_abunds: u64; - // avoid downsampling if we can - // @CTB maybe this can be removed now? 
- if against_scaled != query_scaled { - panic!("wat"); - /* - let against_ds = against - .clone() - .downsample_scaled(query.scaled()) - .expect("cannot downsample sketch"); - (abunds, sum_weighted) = query.inflated_abundances(&against_ds)?; - sum_all_abunds = against_ds.sum_abunds(); - */ - } else { - (abunds, sum_weighted) = query.inflated_abundances(against)?; - sum_all_abunds = against.sum_abunds(); - } + (abunds, sum_weighted) = query.inflated_abundances(against)?; + sum_all_abunds = against.sum_abunds(); let average_abund = sum_weighted as f64 / abunds.len() as f64; let median_abund = median(abunds.iter().cloned()).expect("error"); From b5db0f40a26bd22586e1796cc03b4a4b3913845d Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 15 Nov 2024 08:19:08 -0800 Subject: [PATCH 08/13] upd --- Cargo.toml | 2 +- src/utils/mod.rs | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index cb373b31..76961151 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,7 +12,7 @@ crate-type = ["cdylib"] pyo3 = { version = "0.22.6", features = ["extension-module", "anyhow"] } rayon = "1.10.0" serde = { version = "1.0.213", features = ["derive"] } -#sourmash = { version = "0.17.0", features = ["branchwater"] } +#sourmash = { version = "0.17.2", features = ["branchwater"] } sourmash = { git = "https://github.com/sourmash-bio/sourmash.git", branch = "fix_sig_from_record", features = ["branchwater"] } serde_json = "1.0.132" niffler = "2.4.0" diff --git a/src/utils/mod.rs b/src/utils/mod.rs index 98a260a9..0473e3de 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -76,8 +76,7 @@ pub fn prefetch( .filter_map(|result| { let mut mm = None; let searchsig = &result.minhash; - // downsample within count_common @CTB needed? 
- let overlap = searchsig.count_common(query_mh, true); + let overlap = searchsig.count_common(query_mh, false); if let Ok(overlap) = overlap { if overlap >= threshold_hashes { let result = PrefetchResult { overlap, ..result }; From 97a0ecc39440f7e0fd33ed8a2574ac2eb525abe7 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 15 Nov 2024 08:19:32 -0800 Subject: [PATCH 09/13] fix cargo fmt --- src/manysearch.rs | 3 +-- src/utils/multicollection.rs | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/manysearch.rs b/src/manysearch.rs index 1910fc9a..968f1548 100644 --- a/src/manysearch.rs +++ b/src/manysearch.rs @@ -142,8 +142,7 @@ pub fn manysearch( median_abund, std_abund, ) = if calc_abund_stats { - inflate_abundances(&query.minhash, &against_mh) - .ok()? + inflate_abundances(&query.minhash, &against_mh).ok()? } else { (None, None, None, None, None) }; diff --git a/src/utils/multicollection.rs b/src/utils/multicollection.rs index 59360975..89451202 100644 --- a/src/utils/multicollection.rs +++ b/src/utils/multicollection.rs @@ -330,8 +330,7 @@ impl MultiCollection { let sig_name = sig.name(); let sig_md5 = record.md5().clone(); - let minhash: KmerMinHash = - sig.try_into().expect("cannot extract sketch"); + let minhash: KmerMinHash = sig.try_into().expect("cannot extract sketch"); Some(SmallSignature { location: record.internal_location().to_string(), From d37888c322df1848c5eec4bcc1306850c8cd30b6 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Fri, 15 Nov 2024 08:21:00 -0800 Subject: [PATCH 10/13] black --- src/python/tests/sourmash_tst_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/tests/sourmash_tst_utils.py b/src/python/tests/sourmash_tst_utils.py index 3075d1f5..86c97c57 100644 --- a/src/python/tests/sourmash_tst_utils.py +++ b/src/python/tests/sourmash_tst_utils.py @@ -56,7 +56,7 @@ def index_siglist( "--moltype", moltype, toggle_internal_storage, - *extra_args + *extra_args, ) return db From 7b7be364405517007d96dad80d84bdf9cb8193dc Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 15 Nov 2024 10:09:53 -0800 Subject: [PATCH 11/13] not sure why that test succeeded, actually. --- Cargo.lock | 43 ++++++++-------------------------- Cargo.toml | 2 +- src/index.rs | 6 ++--- src/python/tests/test_index.py | 10 +++----- src/utils/multicollection.rs | 3 +-- 5 files changed, 18 insertions(+), 46 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 74434028..d32514a1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -199,17 +199,16 @@ checksum = "597bb81c80a54b6a4381b23faba8d7774b144c94cbd1d6fe3f1329bd776554ab" [[package]] name = "bindgen" -version = "0.65.1" +version = "0.69.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cfdf7b466f9a4903edc73f95d6d2bcd5baf8ae620638762244d3f60143643cc5" +checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088" dependencies = [ - "bitflags 1.3.2", + "bitflags", "cexpr", "clang-sys", + "itertools 0.11.0", "lazy_static", "lazycell", - "peeking_take_while", - "prettyplease", "proc-macro2", "quote", "regex", @@ -218,12 +217,6 @@ dependencies = [ "syn 2.0.87", ] -[[package]] -name = "bitflags" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" - [[package]] name = "bitflags" version = "2.5.0" @@ -858,9 +851,9 @@ checksum = 
"4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" [[package]] name = "librocksdb-sys" -version = "0.11.0+8.1.1" +version = "0.16.0+8.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3386f101bcb4bd252d8e9d2fb41ec3b0862a15a62b478c355b2982efa469e3e" +checksum = "ce3d60bc059831dc1c83903fb45c103f75db65c5a7bf22272764d9cc683e348c" dependencies = [ "bindgen", "bzip2-sys", @@ -1168,12 +1161,6 @@ version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" -[[package]] -name = "peeking_take_while" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" - [[package]] name = "petgraph" version = "0.6.5" @@ -1248,16 +1235,6 @@ dependencies = [ "termtree", ] -[[package]] -name = "prettyplease" -version = "0.2.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f12335488a2f3b0a83b14edad48dca9879ce89b2edd10e80237e4e852dd645e" -dependencies = [ - "proc-macro2", - "syn 2.0.87", -] - [[package]] name = "primal-check" version = "0.3.4" @@ -1618,9 +1595,9 @@ dependencies = [ [[package]] name = "rocksdb" -version = "0.21.0" +version = "0.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb6f170a4041d50a0ce04b0d2e14916d6ca863ea2e422689a5b694395d299ffe" +checksum = "6bd13e55d6d7b8cd0ea569161127567cd587676c99f4472f779a0279aa60a7a7" dependencies = [ "libc", "librocksdb-sys", @@ -1670,7 +1647,7 @@ version = "0.38.40" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "99e4ea3e1cdc4b559b8e5650f9c8e5998e3e5c1343b4eaf034565f32318d63c0" dependencies = [ - "bitflags 2.5.0", + "bitflags", "errno", "libc", "linux-raw-sys", @@ -1796,7 +1773,7 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" 
version = "0.17.1" -source = "git+https://github.com/sourmash-bio/sourmash.git?branch=fix_sig_from_record#fb65e525eace74abce5001a292427fc343ec257a" +source = "git+https://github.com/sourmash-bio/sourmash.git?branch=latest#5cb3df9aa71317059ab6e88000fc89fb6c61ff19" dependencies = [ "az", "byteorder", diff --git a/Cargo.toml b/Cargo.toml index 76961151..3a2c25f8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,7 +13,7 @@ pyo3 = { version = "0.22.6", features = ["extension-module", "anyhow"] } rayon = "1.10.0" serde = { version = "1.0.213", features = ["derive"] } #sourmash = { version = "0.17.2", features = ["branchwater"] } -sourmash = { git = "https://github.com/sourmash-bio/sourmash.git", branch = "fix_sig_from_record", features = ["branchwater"] } +sourmash = { git = "https://github.com/sourmash-bio/sourmash.git", branch = "latest", features = ["branchwater"] } serde_json = "1.0.132" niffler = "2.4.0" log = "0.4.22" diff --git a/src/index.rs b/src/index.rs index 102892fd..31c54a3f 100644 --- a/src/index.rs +++ b/src/index.rs @@ -30,8 +30,8 @@ pub fn index>( // Try to convert it into a Collection and then CollectionSet. let collection = match Collection::try_from(multi.clone()) { // conversion worked! - Ok(c) => { - let cs: CollectionSet = c.select(&selection)?.try_into()?; + Ok(coll) => { + let cs: CollectionSet = coll.try_into()?; Ok(cs) } // conversion failed; can we/should we load it into memory? 
@@ -39,7 +39,7 @@ pub fn index>( if use_internal_storage { eprintln!("WARNING: loading all sketches into memory in order to index."); eprintln!("See 'index' documentation for details."); - let c: Collection = multi.load_all_sigs(&selection)?; + let c: Collection = multi.load_all_sigs()?; let cs: CollectionSet = c.try_into()?; Ok(cs) } else { diff --git a/src/python/tests/test_index.py b/src/python/tests/test_index.py index 9e8a1d4a..72d40bee 100644 --- a/src/python/tests/test_index.py +++ b/src/python/tests/test_index.py @@ -407,6 +407,7 @@ def test_index_zipfile_repeated_md5sums(runtmp, capfd, toggle_internal_storage): def test_index_zipfile_multiparam(runtmp, capfd, toggle_internal_storage): # test index from sourmash zipfile with multiple ksizes / scaled /moltype + # SHOULD FAIL. siglist = runtmp.output("db-sigs.txt") sig2 = get_test_data("2.fa.sig.gz") @@ -424,13 +425,8 @@ def test_index_zipfile_multiparam(runtmp, capfd, toggle_internal_storage): output = runtmp.output("db.rocksdb") - runtmp.sourmash("scripts", "index", zipf, "-o", output, toggle_internal_storage) - assert os.path.exists(output) - print(runtmp.last_result.err) - - assert "index is done" in runtmp.last_result.err - captured = capfd.readouterr() - print(captured.err) + with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash("scripts", "index", zipf, "-o", output, toggle_internal_storage) def test_index_zipfile_bad(runtmp, capfd): diff --git a/src/utils/multicollection.rs b/src/utils/multicollection.rs index 89451202..d0b8796f 100644 --- a/src/utils/multicollection.rs +++ b/src/utils/multicollection.rs @@ -359,12 +359,11 @@ impl MultiCollection { } // Load all sketches into memory, producing an in-memory Collection. 
- pub fn load_all_sigs(self, selection: &Selection) -> Result { + pub fn load_all_sigs(self) -> Result { let all_sigs: Vec = self .par_iter() .filter_map(|(coll, _idx, record)| match coll.sig_from_record(record) { Ok(sig) => { - let sig = sig.select(selection).ok()?; Some(Signature::from(sig)) } Err(_) => { From b0d6a44bed7f456722fe965cca1b67c627922988 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 15 Nov 2024 10:14:01 -0800 Subject: [PATCH 12/13] cargo fmt --- src/utils/multicollection.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/utils/multicollection.rs b/src/utils/multicollection.rs index d0b8796f..4c9329a1 100644 --- a/src/utils/multicollection.rs +++ b/src/utils/multicollection.rs @@ -363,9 +363,7 @@ impl MultiCollection { let all_sigs: Vec = self .par_iter() .filter_map(|(coll, _idx, record)| match coll.sig_from_record(record) { - Ok(sig) => { - Some(Signature::from(sig)) - } + Ok(sig) => Some(Signature::from(sig)), Err(_) => { eprintln!( "FAILED to load sketch from '{}'", From daaf5938a0a5088e5aec677c149e6e8c63b74317 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Fri, 15 Nov 2024 11:12:54 -0800 Subject: [PATCH 13/13] upd to 0.17.2 --- Cargo.lock | 7 ++++--- Cargo.toml | 3 +-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d32514a1..24f4e778 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -206,7 +206,7 @@ dependencies = [ "bitflags", "cexpr", "clang-sys", - "itertools 0.11.0", + "itertools 0.12.1", "lazy_static", "lazycell", "proc-macro2", @@ -1772,8 +1772,9 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" -version = "0.17.1" -source = "git+https://github.com/sourmash-bio/sourmash.git?branch=latest#5cb3df9aa71317059ab6e88000fc89fb6c61ff19" +version = "0.17.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54e30f752d984b1d8456024973f8d89772b4ba248f592b77b57d59ad27a232a0" dependencies = [ "az", "byteorder", diff --git a/Cargo.toml b/Cargo.toml index 3a2c25f8..064fa0c0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,8 +12,7 @@ crate-type = ["cdylib"] pyo3 = { version = "0.22.6", features = ["extension-module", "anyhow"] } rayon = "1.10.0" serde = { version = "1.0.213", features = ["derive"] } -#sourmash = { version = "0.17.2", features = ["branchwater"] } -sourmash = { git = "https://github.com/sourmash-bio/sourmash.git", branch = "latest", features = ["branchwater"] } +sourmash = { version = "0.17.2", features = ["branchwater"] } serde_json = "1.0.132" niffler = "2.4.0" log = "0.4.22"