From c6cadb494beccb0d9dc0d554e70a91e6f893d4a9 Mon Sep 17 00:00:00 2001 From: "Michael R. Crusoe" <1330696+mr-c@users.noreply.github.com> Date: Wed, 13 Nov 2024 18:29:44 +0100 Subject: [PATCH 1/2] Upgrade rocksdb to 0.22.0, bump MSRV to 1.66 (#3383) Allow easier packaging for Debian --- .github/workflows/rust.yml | 4 ++-- Cargo.lock | 33 ++++++++------------------------- src/core/Cargo.toml | 4 ++-- src/core/README.md | 2 +- 4 files changed, 13 insertions(+), 30 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index b2fb07f08..459f29ece 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -273,11 +273,11 @@ jobs: - uses: actions-rs/toolchain@v1 with: - toolchain: "1.65.0" + toolchain: "1.66.0" override: true - name: check if README matches MSRV defined here - run: grep '1.65.0' src/core/README.md + run: grep '1.66.0' src/core/README.md - name: Check if it builds properly uses: actions-rs/cargo@v1 diff --git a/Cargo.lock b/Cargo.lock index 931de74a0..06ab8e9bf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -90,17 +90,16 @@ dependencies = [ [[package]] name = "bindgen" -version = "0.65.1" +version = "0.69.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cfdf7b466f9a4903edc73f95d6d2bcd5baf8ae620638762244d3f60143643cc5" +checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.4.1", "cexpr", "clang-sys", + "itertools 0.12.1", "lazy_static", "lazycell", - "peeking_take_while", - "prettyplease", "proc-macro2", "quote", "regex", @@ -844,9 +843,9 @@ checksum = "348108ab3fba42ec82ff6e9564fc4ca0247bdccdc68dd8af9764bbc79c3c8ffb" [[package]] name = "librocksdb-sys" -version = "0.11.0+8.1.1" +version = "0.16.0+8.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3386f101bcb4bd252d8e9d2fb41ec3b0862a15a62b478c355b2982efa469e3e" +checksum = "ce3d60bc059831dc1c83903fb45c103f75db65c5a7bf22272764d9cc683e348c" dependencies = [ "bindgen", "bzip2-sys", @@ -1163,12 +1162,6 @@ version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c" -[[package]] -name = "peeking_take_while" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" - [[package]] name = "piz" version = "0.5.1" @@ -1225,16 +1218,6 @@ version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872" -[[package]] -name = "prettyplease" -version = "0.2.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c64d9ba0963cdcea2e1b2230fbae2bab30eb25a174be395c41e764bfb65dd62" -dependencies = [ - "proc-macro2", - "syn 2.0.87", -] - [[package]] name = "primal-check" version = "0.3.4" @@ -1485,9 +1468,9 @@ dependencies = [ [[package]] name = "rocksdb" -version = "0.21.0" +version = "0.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb6f170a4041d50a0ce04b0d2e14916d6ca863ea2e422689a5b694395d299ffe" +checksum = "6bd13e55d6d7b8cd0ea569161127567cd587676c99f4472f779a0279aa60a7a7" dependencies = [ "libc", "librocksdb-sys", diff --git a/src/core/Cargo.toml b/src/core/Cargo.toml index 7167f3916..bb309900c 100644 --- a/src/core/Cargo.toml +++ b/src/core/Cargo.toml @@ -11,7 +11,7 @@ edition = "2021" readme = "README.md" autoexamples = false autobins = false -rust-version = "1.65.0" +rust-version = "1.66.0" [lib] name = "sourmash" @@ -109,6 +109,6 @@ wasm-bindgen-test = "0.3.42" ### These crates don't compile on wasm [target.'cfg(not(target_arch = "wasm32"))'.dependencies] -rocksdb = { version = "0.21.0", optional = true } +rocksdb = { version = "0.22.0", optional = true } [target.'cfg(not(target_arch = "wasm32"))'.dev-dependencies] criterion = "0.5.1" diff --git a/src/core/README.md b/src/core/README.md index b71baaabc..0fa56ad29 100644 --- a/src/core/README.md +++ b/src/core/README.md @@ -38,4 +38,4 @@ Development happens on github at ## Minimum supported Rust version -Currently the minimum supported Rust version is 1.65.0. +Currently the minimum supported Rust version is 1.66.0. From 97e7808fbdc1b9f993beb66a1b141c68ddafed32 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 13 Nov 2024 10:22:19 -0800 Subject: [PATCH 2/2] MRG: change `sig_from_record` to use scaled from `Record` to downsample (#3387) Fixes https://github.com/sourmash-bio/sourmash/issues/3384. This PR changes `Manifest::select` so that if `scaled` is set in the selection, all matching `Record`s have their scaled value updated. It also updates `Selection::from_record` to set `scaled` to match the `Record` scaled value. In turn, this allows `Collection::sig_from_record` to respect the specified `scaled` value when loading `Signature`. --------- Co-authored-by: Luiz Irber --- src/core/Cargo.toml | 2 +- src/core/src/collection.rs | 11 +++---- src/core/src/manifest.rs | 63 ++++++++++++++++---------------------- src/core/src/selection.rs | 2 +- 4 files changed, 33 insertions(+), 45 deletions(-) diff --git a/src/core/Cargo.toml b/src/core/Cargo.toml index bb309900c..fc00b075a 100644 --- a/src/core/Cargo.toml +++ b/src/core/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "sourmash" version = "0.17.1" -authors = ["Luiz Irber ", "N. Tessa Pierce-Ward "] +authors = ["Luiz Irber ", "N. Tessa Pierce-Ward ", "C. Titus Brown "] description = "tools for comparing biological sequences with k-mer sketches" repository = "https://github.com/sourmash-bio/sourmash" keywords = ["minhash", "bioinformatics"] diff --git a/src/core/src/collection.rs b/src/core/src/collection.rs index 28659df05..fcfbabbeb 100644 --- a/src/core/src/collection.rs +++ b/src/core/src/collection.rs @@ -409,8 +409,8 @@ mod test { // no sigs should remain assert_eq!(cl.len(), 6); for (_idx, rec) in cl.iter() { - // need to pass select again here so we actually downsample - let this_sig = cl.sig_from_record(rec).unwrap().select(&selection).unwrap(); + dbg!("record scaled is: {}", rec.scaled()); + let this_sig = cl.sig_from_record(rec).unwrap(); let this_mh = this_sig.minhash().unwrap(); assert_eq!(this_mh.scaled(), 2000); } @@ -440,7 +440,7 @@ mod test { filename.push("../../tests/test-data/prot/hp.zip"); // create Selection object let mut selection = Selection::default(); - selection.set_scaled(100); + selection.set_scaled(200); selection.set_moltype(HashFunctions::Murmur64Hp); // load sigs into collection + select compatible signatures let cl = Collection::from_zipfile(&filename) @@ -450,10 +450,9 @@ mod test { // count collection length assert_eq!(cl.len(), 2); for (idx, _rec) in cl.iter() { - // need to pass select again here so we actually downsample - let this_sig = cl.sig_for_dataset(idx).unwrap().select(&selection).unwrap(); + let this_sig = cl.sig_for_dataset(idx).unwrap(); let this_mh = this_sig.minhash().unwrap(); - assert_eq!(this_mh.scaled(), 100); + assert_eq!(this_mh.scaled(), 200); } } diff --git a/src/core/src/manifest.rs b/src/core/src/manifest.rs index a23776dfd..21f8ecbc5 100644 --- a/src/core/src/manifest.rs +++ b/src/core/src/manifest.rs @@ -259,8 +259,13 @@ impl Manifest { } impl Select for Manifest { + // select only records that satisfy selection conditions; also update + // scaled value to match. fn select(self, selection: &Selection) -> Result { - let rows = self.records.iter().filter(|row| { + let Manifest { mut records } = self; + + // TODO: with num as well? + records.retain_mut(|row| { let mut valid = true; valid = if let Some(ksize) = selection.ksize() { row.ksize == ksize @@ -279,7 +284,12 @@ impl Select for Manifest { }; valid = if let Some(scaled) = selection.scaled() { // num sigs have row.scaled = 0, don't include them - valid && row.scaled != 0 && row.scaled <= scaled + let v = valid && row.scaled != 0 && row.scaled <= scaled; + // if scaled is set, update! + if v { + row.scaled = scaled + }; + v } else { valid }; @@ -291,41 +301,7 @@ impl Select for Manifest { valid }); - Ok(Manifest { - records: rows.cloned().collect(), - }) - - /* - matching_rows = self.rows - if ksize: - matching_rows = ( row for row in matching_rows - if row['ksize'] == ksize ) - if moltype: - matching_rows = ( row for row in matching_rows - if row['moltype'] == moltype ) - if scaled or containment: - if containment and not scaled: - raise ValueError("'containment' requires 'scaled' in Index.select'") - - matching_rows = ( row for row in matching_rows - if row['scaled'] and not row['num'] ) - if num: - matching_rows = ( row for row in matching_rows - if row['num'] and not row['scaled'] ) - - if abund: - # only need to concern ourselves if abundance is _required_ - matching_rows = ( row for row in matching_rows - if row['with_abundance'] ) - - if picklist: - matching_rows = ( row for row in matching_rows - if picklist.matches_manifest_row(row) ) - - # return only the internal filenames! - for row in matching_rows: - yield row - */ + Ok(Manifest { records }) } } @@ -567,6 +543,19 @@ mod test { selection.set_scaled(100); let scaled100 = manifest.select(&selection).unwrap(); assert_eq!(scaled100.len(), 6); + + // check that 'scaled' is updated + let manifest = collection.manifest().clone(); + selection = Selection::default(); + selection.set_scaled(400); + let scaled400 = manifest.select(&selection).unwrap(); + assert_eq!(scaled400.len(), 6); + let max_scaled = scaled400 + .iter() + .map(|r| r.scaled()) + .max() + .expect("no records?!"); + assert_eq!(*max_scaled, 400); } #[test] diff --git a/src/core/src/selection.rs b/src/core/src/selection.rs index 3c94c7f0c..59f83c114 100644 --- a/src/core/src/selection.rs +++ b/src/core/src/selection.rs @@ -125,7 +125,7 @@ impl Selection { abund: Some(row.with_abundance()), moltype: Some(row.moltype()), num: None, - scaled: None, + scaled: Some(*row.scaled()), containment: None, picklist: None, })