From 61906099ed83e0b0ea6ec44c92b8b539b182bcb9 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Tue, 17 Dec 2024 13:33:19 -0800 Subject: [PATCH] add skipmers; update as needed for core changes --- Cargo.lock | 121 +++++++++++++++++++++++++--------------- Cargo.toml | 5 +- environment.yml | 4 +- src/directsketch.rs | 26 +++------ src/utils/buildutils.rs | 59 ++++++++++++++++---- tests/test_gbsketch.py | 2 +- tests/test_urlsketch.py | 2 +- 7 files changed, 139 insertions(+), 80 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5ba4cdd..3273d4f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -97,7 +97,7 @@ dependencies = [ "crc32fast", "futures-lite", "pin-project", - "thiserror", + "thiserror 1.0.63", "tokio", "tokio-util", ] @@ -148,7 +148,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b48ee4c818e9d19bbdf75b56e1b2eb9682bb4fbd7ff8e7e7f2cc9956e1aefac7" dependencies = [ "flate2", - "thiserror", + "thiserror 1.0.63", ] [[package]] @@ -333,9 +333,9 @@ checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80" [[package]] name = "csv" -version = "1.3.0" +version = "1.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac574ff4d437a7b5ad237ef331c17ccca63c46479e5b5453eb8e10bb99a759fe" +checksum = "acdc4883a9c96732e4733212c01447ebd805833b7275a73ca3ee080fd77afdaf" dependencies = [ "csv-core", "itoa", @@ -631,11 +631,11 @@ checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" [[package]] name = "histogram" -version = "0.11.0" +version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b62b8d85713ddc62e5e78db13bf9f9305610d0419276faa845076a68b7165872" +checksum = "58cf6b99a250776d813cdf2f0b478a053a822d078e7a2baf5cb36afc88c41a7c" dependencies = [ - "thiserror", + "thiserror 1.0.63", ] [[package]] @@ -992,13 +992,12 @@ checksum = "a2983372caf4480544083767bf2d27defafe32af49ab4df3a0b7fc90793a3664" [[package]] name = "nalgebra" -version = "0.32.6" +version = "0.33.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b5c17de023a86f59ed79891b2e5d5a94c705dbe904a5b5c9c952ea6221b03e4" +checksum = "26aecdf64b707efd1310e3544d709c5c0ac61c13756046aaaba41be5c4f66a3b" dependencies = [ "approx", "matrixmultiply", - "nalgebra-macros", "num-complex", "num-rational", "num-traits", @@ -1008,17 +1007,6 @@ dependencies = [ "typenum", ] -[[package]] -name = "nalgebra-macros" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "254a5372af8fc138e36684761d3c0cdb758a4410e938babcff1c860ce14ddbfc" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "native-tls" version = "0.2.12" @@ -1050,6 +1038,17 @@ dependencies = [ "xz2", ] +[[package]] +name = "needletail" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de3de09e373770238e3d30eb1a9f09f4754134d0ef354d0570bc1203d2517257" +dependencies = [ + "buffer-redux", + "bytecount", + "memchr", +] + [[package]] name = "niffler" version = "2.5.0" @@ -1060,7 +1059,7 @@ dependencies = [ "bzip2", "cfg-if", "flate2", - "thiserror", + "thiserror 1.0.63", "xz2", "zstd 0.12.4", ] @@ -1071,6 +1070,16 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2bf50223579dc7cdcfb3bfcacf7069ff68243f8c363f62ffa99cf000a6b9c451" +[[package]] +name = "num-bigint" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" +dependencies = [ + "num-integer", + "num-traits", +] + [[package]] name = "num-complex" version = "0.4.6" @@ -1106,6 +1115,7 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" dependencies = [ + "num-bigint", "num-integer", "num-traits", ] @@ -1300,7 +1310,7 @@ dependencies = [ "flate2", "log", "memchr", - "thiserror", + "thiserror 1.0.63", ] [[package]] @@ -1357,9 +1367,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.86" +version = "1.0.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" +checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0" dependencies = [ "unicode-ident", ] @@ -1616,9 +1626,9 @@ dependencies = [ [[package]] name = "roaring" -version = "0.10.6" +version = "0.10.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f4b84ba6e838ceb47b41de5194a60244fac43d9fe03b71dbe8c5a201081d6d1" +checksum = "41589aba99537475bf697f2118357cad1c31590c5a1b9f6d9fc4ad6d07503661" dependencies = [ "bytemuck", "byteorder", @@ -1744,18 +1754,18 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.210" +version = "1.0.216" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8e3592472072e6e22e0a54d5904d9febf8508f65fb8552499a1abc7d1078c3a" +checksum = "0b9781016e935a97e8beecf0c933758c97a5520d32930e460142b4cd80c6338e" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.210" +version = "1.0.216" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "243902eda00fad750862fc144cea25caca5e20d615af0a81bee94ca738f1df1f" +checksum = "46f859dbbf73865c6627ed570e78961cd3ac92407a2d117204c49232485da55e" dependencies = [ "proc-macro2", "quote", @@ -1764,9 +1774,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.132" +version = "1.0.133" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d726bfaff4b320266d395898905d0eba0345aae23b54aee3a737e260fd46db03" +checksum = "c7fceb2473b9166b2294ef05efcb65a3db80803f0b03ef86a5fc88a2b85ee377" dependencies = [ "itoa", "memchr", @@ -1803,9 +1813,9 @@ dependencies = [ [[package]] name = "simba" -version = "0.8.1" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "061507c94fc6ab4ba1c9a0305018408e312e17c041eb63bef8aa726fa33aceae" +checksum = "b3a386a501cd104797982c15ae17aafe8b9261315b5d07e3ec803f2ea26be0fa" dependencies = [ "approx", "num-complex", @@ -1853,9 +1863,8 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" -version = "0.16.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "596f20eac8896a06ca65889399ea6f408deeba375aa44c4a2efb3b46e31a02c0" +version = "0.18.0" +source = "git+https://github.com/sourmash-bio/sourmash/?branch=try-skipmers#8cb0beaa53706520e61d5c61e2901665c27b99ea" dependencies = [ "az", "byteorder", @@ -1875,7 +1884,7 @@ dependencies = [ "md5", "memmap2", "murmurhash3", - "needletail", + "needletail 0.6.1", "niffler", "nohash-hasher", "num-iter", @@ -1889,7 +1898,7 @@ dependencies = [ "serde_json", "statrs", "streaming-stats", - "thiserror", + "thiserror 2.0.7", "twox-hash", "typed-builder", "vec-collections", @@ -1910,7 +1919,7 @@ dependencies = [ "getset", "lazy_static", "md5", - "needletail", + "needletail 0.5.1", "niffler", "openssl", "pyo3", @@ -1939,9 +1948,9 @@ checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" [[package]] name = "statrs" -version = "0.17.1" +version = "0.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f697a07e4606a0a25c044de247e583a330dbb1731d11bc7350b81f48ad567255" +checksum = "2a3fe7c28c6512e766b0874335db33c94ad7b8f9054228ae1c2abd47ce7d335e" dependencies = [ "approx", "nalgebra", @@ -1966,9 +1975,9 @@ checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "syn" -version = "2.0.77" +version = "2.0.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f35bcdf61fd8e7be6caf75f429fdca8beb3ed76584befb503b1569faee373ed" +checksum = "919d3b74a5dd0ccd15aeb8f93e7006bd9e14c295087c9896a110f490752bcf31" dependencies = [ "proc-macro2", "quote", @@ -2030,7 +2039,16 @@ version = "1.0.63" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c0342370b38b6a11b6cc11d6a805569958d54cfa061a29969c3b5ce2ea405724" dependencies = [ - "thiserror-impl", + "thiserror-impl 1.0.63", +] + +[[package]] +name = "thiserror" +version = "2.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93605438cbd668185516ab499d589afb7ee1859ea3d5fc8f6b0755e1c7443767" +dependencies = [ + "thiserror-impl 2.0.7", ] [[package]] @@ -2044,6 +2062,17 @@ dependencies = [ "syn", ] +[[package]] +name = "thiserror-impl" +version = "2.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1d8749b4531af2117677a5fcd12b1348a3fe2b81e36e61ffeac5c4aa3273e36" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "tinyvec" version = "1.8.0" diff --git a/Cargo.toml b/Cargo.toml index c780061..4165db1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,7 +12,8 @@ crate-type = ["cdylib"] pyo3 = { version = "0.23.3", features = ["extension-module","anyhow"], default = ["extension-module"] } rayon = "1.10.0" serde = { version = "1.0.204", features = ["derive"] } -sourmash = { version = "0.16.0"} +#sourmash = { version = "0.18.0"} +sourmash = {git = "https://github.com/sourmash-bio/sourmash/", branch = "try-skipmers"} serde_json = "1.0.132" niffler = "2.4.0" needletail = "0.5.1" @@ -20,7 +21,7 @@ async_zip={version="0.0.17", features=["full"]} simple-error = "0.3.1" anyhow = "1.0.90" camino = "1.1.7" -csv = "1.3.0" +csv = "1.3.1" reqwest = { version = "0.12.8", features = ["json", "stream"] } tokio = { version = "1.42.0", features = ["full"] } tokio-util = "0.7.11" diff --git a/environment.yml b/environment.yml index 0e33adb..d3eeac7 100644 --- a/environment.yml +++ b/environment.yml @@ -6,8 +6,8 @@ channels: dependencies: - sourmash>=4.8,<5 - pip - - rust + - rust=1.75 - maturin>=1,<2 - pytest - pandas - - compilers \ No newline at end of file + - compilers diff --git a/src/directsketch.rs b/src/directsketch.rs index 36e8787..b825634 100644 --- a/src/directsketch.rs +++ b/src/directsketch.rs @@ -540,34 +540,24 @@ async fn load_existing_zip_batches(outpath: &PathBuf) -> Result<(MultiCollection if let Some(file_name) = entry_path.file_name() { // Check if the file matches the base zip file or any batched zip file (outpath.zip, outpath.1.zip, etc.) if let Some(captures) = zip_file_pattern.captures(file_name) { - // Wrap the `from_zipfile` call in `catch_unwind` to prevent panic propagation - let result = panic::catch_unwind(|| Collection::from_zipfile(&entry_path)); - match result { - Ok(Ok(collection)) => { - // Successfully loaded the collection, push to `collections` + Collection::from_zipfile(&entry_path) + .and_then(|collection| { collections.push(collection); - // Extract the batch number (if it exists) and update the highest_batch + // Extract batch number if it exists if let Some(batch_str) = captures.get(1) { if let Ok(batch_num) = batch_str.as_str().parse::() { highest_batch = max(highest_batch, batch_num); } } - } - Ok(Err(e)) => { - // Handle the case where `from_zipfile` returned an error + Ok(()) // Return Ok(()) for the closure + }) + .unwrap_or_else(|e| { eprintln!( - "Warning: Failed to load zip file '{}'. Error: {:?}", + "Warning: Failed to load zip file '{}'; skipping. Zipfile Error: {:?}", entry_path, e ); - continue; // Skip the file and continue - } - Err(_) => { - // The code inside `from_zipfile` panicked - eprintln!("Warning: Invalid zip file '{}'; skipping.", entry_path); - continue; // Skip the file and continue - } - } + }); } } } diff --git a/src/utils/buildutils.rs b/src/utils/buildutils.rs index 218a859..5c1af63 100644 --- a/src/utils/buildutils.rs +++ b/src/utils/buildutils.rs @@ -51,7 +51,7 @@ impl MultiSelection { pub fn from_input_moltype(input_moltype: &str) -> Result { // currently we don't allow translation. Will need to change this when we do. // is there a better way to do this? - let mut moltypes = vec!["DNA"]; + let mut moltypes = vec!["DNA", "skipm1n3", "skipm2n3"]; if input_moltype == "protein" { moltypes = vec!["protein", "dayhoff", "hp"]; } @@ -97,7 +97,7 @@ pub struct BuildRecord { num: u32, #[getset(get = "pub")] - scaled: u64, + scaled: u32, #[getset(get = "pub", set = "pub")] n_hashes: Option, @@ -183,6 +183,24 @@ impl BuildRecord { } } + pub fn default_skipm1n3() -> Self { + Self { + moltype: "skipm1n3".to_string(), + ksize: 21, + scaled: 1000, + ..Self::default_dna() + } + } + + pub fn default_skipm2n3() -> Self { + Self { + moltype: "skipm2n3".to_string(), + ksize: 21, + scaled: 1000, + ..Self::default_dna() + } + } + pub fn moltype(&self) -> HashFunctions { self.moltype.as_str().try_into().unwrap() } @@ -215,7 +233,7 @@ impl BuildRecord { if let Some(scaled) = selection.scaled() { // num sigs have self.scaled = 0, don't include them - valid = valid && self.scaled != 0 && self.scaled <= scaled as u64; + valid = valid && self.scaled != 0 && self.scaled <= scaled; } if let Some(num) = selection.num() { @@ -225,7 +243,7 @@ impl BuildRecord { valid } - pub fn params(&self) -> (u32, String, bool, u32, u64) { + pub fn params(&self) -> (u32, String, bool, u32, u32) { ( self.ksize, self.moltype.clone(), @@ -287,7 +305,7 @@ impl BuildManifest { self.records.clear(); } - pub fn summarize_params(&self) -> HashSet<(u32, String, bool, u32, u64)> { + pub fn summarize_params(&self) -> HashSet<(u32, String, bool, u32, u32)> { self.iter().map(|record| record.params()).collect() } @@ -439,19 +457,34 @@ impl BuildCollection { Ok(mf.records.len()) } + pub fn skipm1n3_size(&self) -> Result { + let multiselection = MultiSelection::from_moltypes(vec!["skipm1n3"])?; + let mut mf = self.manifest.clone(); + mf.select(&multiselection)?; + Ok(mf.records.len()) + } + + pub fn skipm2n3_size(&self) -> Result { + let multiselection = MultiSelection::from_moltypes(vec!["skipm2n3"])?; + let mut mf = self.manifest.clone(); + mf.select(&multiselection)?; + Ok(mf.records.len()) + } + pub fn protein_size(&self) -> Result { let multiselection = MultiSelection::from_moltypes(vec!["protein"])?; - let mut mf = self.manifest.clone(); // temporary mutable copy + let mut mf = self.manifest.clone(); mf.select(&multiselection)?; Ok(mf.records.len()) } pub fn anyprotein_size(&self) -> Result { let multiselection = MultiSelection::from_moltypes(vec!["protein", "dayhoff", "hp"])?; - let mut mf = self.manifest.clone(); // temporary mutable copy + let mut mf = self.manifest.clone(); mf.select(&multiselection)?; Ok(mf.records.len()) } + pub fn parse_ksize(value: &str) -> Result { value .parse::() @@ -484,7 +517,7 @@ impl BuildCollection { pub fn parse_moltype(item: &str, current: &mut Option) -> Result { let new_moltype = match item { - "protein" | "dna" | "dayhoff" | "hp" => item.to_string(), + "protein" | "dna" | "dayhoff" | "hp" | "skipm1n3" | "skipm2n3" => item.to_string(), _ => return Err(format!("unknown moltype '{}'", item)), }; @@ -523,7 +556,7 @@ impl BuildCollection { let mut moltype: Option = None; let mut track_abundance: Option = None; let mut num: Option = None; - let mut scaled: Option = None; + let mut scaled: Option = None; let mut seed: Option = None; for item in p_str.split(',') { @@ -534,7 +567,7 @@ impl BuildCollection { "abund" | "noabund" => { Self::parse_abundance(item, &mut track_abundance)?; } - "protein" | "dna" | "DNA" | "dayhoff" | "hp" => { + "protein" | "dna" | "DNA" | "dayhoff" | "hp" | "skipm1n3" | "skipm2n3" => { Self::parse_moltype(item, &mut moltype)?; } _ if item.starts_with("num=") => { @@ -557,6 +590,8 @@ impl BuildCollection { Some("protein") => BuildRecord::default_protein(), Some("dayhoff") => BuildRecord::default_dayhoff(), Some("hp") => BuildRecord::default_hp(), + Some("skipm1n3") => BuildRecord::default_skipm1n3(), + Some("skipm2n3") => BuildRecord::default_skipm2n3(), _ => BuildRecord::default_dna(), // no moltype --> assume DNA }; @@ -644,6 +679,8 @@ impl BuildCollection { .dna(record.moltype == "DNA") .dayhoff(record.moltype == "dayhoff") .hp(record.moltype == "hp") + .skipm1n3(record.moltype == "skipm1n3") + .skipm2n3(record.moltype == "skipm2n3") .num_hashes(record.num) .track_abundance(record.with_abundance) .build(); @@ -732,6 +769,8 @@ impl BuildCollection { } } else if (input_moltype == "DNA" || input_moltype == "dna") && rec.moltype() == HashFunctions::Murmur64Dna + || rec.moltype() == HashFunctions::Murmur64Skipm2n3 + || rec.moltype() == HashFunctions::Murmur64Skipm1n3 { sig.add_sequence(&record.seq(), true) .context("Failed to add sequence")?; diff --git a/tests/test_gbsketch.py b/tests/test_gbsketch.py index 9ad472d..072186f 100644 --- a/tests/test_gbsketch.py +++ b/tests/test_gbsketch.py @@ -782,7 +782,7 @@ def test_gbsketch_simple_batch_restart_with_incomplete_zip(runtmp, capfd): assert not os.path.exists(output) # for now, orig output file should be empty. captured = capfd.readouterr() print(captured.err) - assert f"Warning: Invalid zip file '{out2}'; skipping." in captured.err + assert f"Warning: Failed to load zip file '{out2}'" in captured.err # we created this one with sig cat idx = sourmash.load_file_as_index(out1) diff --git a/tests/test_urlsketch.py b/tests/test_urlsketch.py index a1d354c..0ae7bd5 100644 --- a/tests/test_urlsketch.py +++ b/tests/test_urlsketch.py @@ -636,7 +636,7 @@ def test_urlsketch_simple_batch_restart_with_incomplete_zip(runtmp, capfd): assert not os.path.exists(output) # for now, orig output file should be empty. captured = capfd.readouterr() print(captured.err) - assert f"Warning: Invalid zip file '{out2}'; skipping." in captured.err + assert f"Warning: Failed to load zip file '{out2}'" in captured.err expected_siginfo = { (ss2.name, ss2.md5sum(), ss2.minhash.moltype),