From 713243362b725dca04a637d7e6bfdbd89ec26660 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Fri, 16 Feb 2024 13:51:01 -0800 Subject: [PATCH 1/9] use core manifest utils --- Cargo.lock | 63 ++++++++++++++------------- Cargo.toml | 3 +- src/python/tests/test_gather.py | 4 +- src/python/tests/test_sketch.py | 51 +++++++++++++++++++++- src/utils.rs | 75 +++++---------------------------- 5 files changed, 95 insertions(+), 101 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 51cea82e..76c675d3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10,9 +10,9 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" [[package]] name = "ahash" -version = "0.7.7" +version = "0.7.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a824f2aa7e75a0c98c5a504fceb80649e9c35265d44525b5f94de4771a395cd" +checksum = "891477e0c6a8957309ee5c45a6368af3ae14bb510732d2684ffa19af310920f9" dependencies = [ "getrandom", "once_cell", @@ -226,9 +226,9 @@ checksum = "7f30e7476521f6f8af1a1c4c0b8cc94f0bee37d91763d0ca2665f299b6cd8aec" [[package]] name = "bytecheck" -version = "0.6.11" +version = "0.6.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b6372023ac861f6e6dc89c8344a8f398fb42aaba2b5dbc649ca0c0e9dbcb627" +checksum = "23cdc57ce23ac53c931e88a43d06d070a6fd142f2617be5855eb75efc9beb1c2" dependencies = [ "bytecheck_derive", "ptr_meta", @@ -237,9 +237,9 @@ dependencies = [ [[package]] name = "bytecheck_derive" -version = "0.6.11" +version = "0.6.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7ec4c6f261935ad534c0c22dbef2201b45918860eb1c574b972bd213a76af61" +checksum = "3db406d29fbcd95542e92559bed4d8ad92636d1ca8b3b72ede10b4bcc010e659" dependencies = [ "proc-macro2", "quote", @@ -327,9 +327,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "chrono" -version = "0.4.33" +version = "0.4.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f13690e35a5e4ace198e7beea2895d29f3a9cc55015fcebe6336bd2010af9eb" +checksum = "5bc015644b92d5890fab7489e49d21f879d5c990186827d42ec511919404f38b" dependencies = [ "android-tzdata", "iana-time-zone", @@ -583,9 +583,9 @@ checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" [[package]] name = "histogram" -version = "0.9.0" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5ee9487899388cf1a1155759c39e3c156c5d198b6da1734053954a6e40e6d4d" +checksum = "4b634390eb8a63662e127836d4e2f26d7ae930600d4e05ee0fd85a009eeb1175" dependencies = [ "thiserror", ] @@ -660,9 +660,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.67" +version = "0.3.68" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a1d36f1235bc969acba30b7f5990b864423a6068a10f7c90ae8f0112e3a59d1" +checksum = "406cda4b368d531c842222cf9d2600a9a4acce8d29423695379c6868a143a9ee" dependencies = [ "wasm-bindgen", ] @@ -713,9 +713,9 @@ dependencies = [ [[package]] name = "libz-sys" -version = "1.1.14" +version = "1.1.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "295c17e837573c8c821dbaeb3cceb3d745ad082f7572191409e69cbc1b3fd050" +checksum = "037731f5d3aaa87a5675e895b63ddff1a87624bc29f77004ea829809654e48f6" dependencies = [ "cc", "pkg-config", @@ -879,9 +879,9 @@ dependencies = [ [[package]] name = "num-iter" -version = "0.1.43" +version = "0.1.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d03e6c028c5dc5cac6e2dec0efda81fc887605bb3d884578bb6d6bf7514e252" +checksum = "d869c01cc0c455284163fd0092f1f93835385ccab5a98a0dcc497b2f8bf055a9" dependencies = [ "autocfg", "num-integer", @@ -1267,9 +1267,9 @@ checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" [[package]] name = "rend" -version = "0.4.1" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2571463863a6bd50c32f94402933f03457a3fbaf697a707c5be741e459f08fd" +checksum = "71fe3824f5629716b1589be05dacd749f6aa084c87e00e016714a8cdfccc997c" dependencies = [ "bytecheck", ] @@ -1437,8 +1437,7 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" version = "0.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa8187a00787432261dc522b6ebf813251dbbeabc04ed7a47f5cbb9be0d4a508" +source = "git+https://github.com/sourmash-bio/sourmash?rev=375ddb2ed25afbffc2ffa981c627256b195bf794#375ddb2ed25afbffc2ffa981c627256b195bf794" dependencies = [ "az", "byteorder", @@ -1693,9 +1692,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.90" +version = "0.2.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1223296a201415c7fad14792dbefaace9bd52b62d33453ade1c5b5f07555406" +checksum = "c1e124130aee3fb58c5bdd6b639a0509486b0338acaaae0c84a5124b0f588b7f" dependencies = [ "cfg-if", "serde", @@ -1705,9 +1704,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.90" +version = "0.2.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcdc935b63408d58a32f8cc9738a0bffd8f05cc7c002086c6ef20b7312ad9dcd" +checksum = "c9e7e1900c352b609c8488ad12639a311045f40a35491fb69ba8c12f758af70b" dependencies = [ "bumpalo", "log", @@ -1720,9 +1719,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.90" +version = "0.2.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e4c238561b2d428924c49815533a8b9121c664599558a5d9ec51f8a1740a999" +checksum = "b30af9e2d358182b5c7449424f017eba305ed32a7010509ede96cdc4696c46ed" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -1730,9 +1729,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.90" +version = "0.2.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bae1abb6806dc1ad9e560ed242107c0f6c84335f1749dd4e8ddb012ebd5e25a7" +checksum = "642f325be6301eb8107a83d12a8ac6c1e1c54345a7ef1a9261962dfefda09e66" dependencies = [ "proc-macro2", "quote", @@ -1743,15 +1742,15 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.90" +version = "0.2.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4d91413b1c31d7539ba5ef2451af3f0b833a005eb27a631cec32bc0635a8602b" +checksum = "4f186bd2dcf04330886ce82d6f33dd75a7bfcf69ecf5763b89fcde53b6ac9838" [[package]] name = "web-sys" -version = "0.3.67" +version = "0.3.68" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "58cd2333b6e0be7a39605f0e255892fd7418a682d8da8fe042fe25128794d2ed" +checksum = "96565907687f7aceb35bc5fc03770a8a0471d82e479f25832f54a0e3f4b28446" dependencies = [ "js-sys", "wasm-bindgen", diff --git a/Cargo.toml b/Cargo.toml index fca3c5e4..1080abaf 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,7 +12,8 @@ crate-type = ["cdylib"] pyo3 = { version = "0.20.2", features = ["extension-module", "anyhow"] } rayon = "1.8.1" serde = { version = "1.0.196", features = ["derive"] } -sourmash = { version = "0.12.1", features = ["branchwater"] } +#sourmash = { version = "0.12.1", features = ["branchwater"] } +sourmash = { git="https://github.com/sourmash-bio/sourmash", rev="375ddb2ed25afbffc2ffa981c627256b195bf794", features = ["branchwater"] } serde_json = "1.0.113" niffler = "2.4.0" log = "0.4.14" diff --git a/src/python/tests/test_gather.py b/src/python/tests/test_gather.py index c777821a..d4093d3a 100644 --- a/src/python/tests/test_gather.py +++ b/src/python/tests/test_gather.py @@ -669,8 +669,8 @@ def test_simple_with_manifest_loading(runtmp): sig63 = get_test_data('63.fa.sig.gz') make_file_list(against_list, [sig2, sig47, sig63]) - query_manifest = get_test_data("query-manifest.csv") - against_manifest = get_test_data("against-manifest.csv") + query_manifest = runtmp.output("query-manifest.csv") + against_manifest = runtmp.output("against-manifest.csv") runtmp.sourmash("sig", "manifest", query, "-o", query_manifest) runtmp.sourmash("sig", "manifest", against_list, "-o", against_manifest) diff --git a/src/python/tests/test_sketch.py b/src/python/tests/test_sketch.py index d6bbc5fa..c313aa55 100644 --- a/src/python/tests/test_sketch.py +++ b/src/python/tests/test_sketch.py @@ -13,11 +13,14 @@ def get_test_data(filename): def make_file_csv(filename, genome_paths, protein_paths = []): + # equalize path lengths by adding "". names = [os.path.basename(x).split('.fa')[0] for x in genome_paths] - # Check if the number of protein paths is less than genome paths - # and fill in the missing paths with "". if len(protein_paths) < len(genome_paths): protein_paths.extend(["" for _ in range(len(genome_paths) - len(protein_paths))]) + elif len(genome_paths) < len(protein_paths): + genome_paths.extend(["" for _ in range(len(protein_paths) - len(genome_paths))]) + names = [os.path.basename(x).split('.fa')[0] for x in protein_paths] + with open(filename, 'wt') as fp: fp.write("name,genome_filename,protein_filename\n") for name, genome_path, protein_path in zip(names, genome_paths, protein_paths): @@ -405,3 +408,47 @@ def test_zip_manifest(runtmp, capfd): assert sig.minhash.ksize == 31 assert sig.minhash.moltype == 'DNA' assert sig.minhash.scaled == 1 + + +def test_protein_zip_manifest(runtmp, capfd): + # test basic manifest-generating functionality. + fa_csv = runtmp.output('db-fa.csv') + + fa1 = get_test_data('short.fa') + fa2 = get_test_data('short-protein.fa') + + make_file_csv(fa_csv, [fa1], [fa2]) + output = runtmp.output('db.zip') + + runtmp.sourmash('scripts', 'manysketch', fa_csv, '-o', output, + '--param-str', "protein,k=10,scaled=1") + + loader = sourmash.load_file_as_index(output) + + rows = [] + siglist = [] + # make manifest via sourmash python code + for (sig, loc) in loader._signatures_with_internal(): + row = index.CollectionManifest.make_manifest_row(sig, loc) + rows.append(row) + siglist.append(sig) + + manifest = index.CollectionManifest(rows) + + assert len(manifest) == len(rows) + assert len(manifest) == 1 + + md5_list = [ row['md5'] for row in manifest.rows ] + assert 'eb4467d11e0ecd2dbde4193bfc255310' in md5_list + ksize_list = [ row['ksize'] for row in manifest.rows ] + assert 10 in ksize_list + scaled_list = [ row['scaled'] for row in manifest.rows ] + assert 1 in scaled_list + moltype_list = [ row['moltype'] for row in manifest.rows ] + assert "protein" in moltype_list + + for sig in siglist: + assert sig in manifest + assert sig.minhash.ksize == 10 + assert sig.minhash.moltype == 'protein' + assert sig.minhash.scaled == 1 diff --git a/src/utils.rs b/src/utils.rs index a25fd890..75de865a 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -1,7 +1,6 @@ /// Utility functions for sourmash_plugin_branchwater. use rayon::prelude::*; use sourmash::encodings::HashFunctions; -use sourmash::manifest::Manifest; use sourmash::selection::Select; use anyhow::{anyhow, Context, Result}; @@ -19,12 +18,11 @@ use std::sync::atomic; use std::sync::atomic::AtomicUsize; use sourmash::collection::Collection; -use sourmash::manifest::Record; +use sourmash::manifest::{Manifest, Record}; use sourmash::selection::Selection; use sourmash::signature::{Signature, SigsTrait}; use sourmash::sketch::minhash::KmerMinHash; use sourmash::storage::{FSStorage, InnerStorage, SigStore}; - /// Track a name/minhash. pub struct SmallSignature { @@ -758,42 +756,6 @@ impl Serialize for BoolPython { } } -pub fn make_manifest_row( - sig: &Signature, - filename: &Path, - internal_location: &str, - scaled: u64, - num: u32, - abund: bool, - is_dna: bool, - is_protein: bool, -) -> ManifestRow { - if is_dna && is_protein { - panic!("Both is_dna and is_protein cannot be true at the same time."); - } else if !is_dna && !is_protein { - panic!("Either is_dna or is_protein must be true."); - } - let moltype = if is_dna { - "DNA".to_string() - } else { - "protein".to_string() - }; - let sketch = &sig.sketches()[0]; - ManifestRow { - internal_location: internal_location.to_string(), - md5: sig.md5sum(), - md5short: sig.md5sum()[0..8].to_string(), - ksize: sketch.ksize() as u32, - moltype, - num, - scaled, - n_hashes: sketch.size(), - with_abundance: BoolPython(abund), - name: sig.name().to_string(), - filename: filename.to_string(), - } -} - pub fn open_stdout_or_file(output: Option) -> Box { // if output is a file, use open_output_file if let Some(path) = output { @@ -856,18 +818,20 @@ pub fn sigwriter( .compression_method(zip::CompressionMethod::Stored) .large_file(true); let mut zip = zip::ZipWriter::new(file_writer); - let mut manifest_rows: Vec = Vec::new(); + let mut manifest_rows: Vec = Vec::new(); // keep track of md5sum occurrences to prevent overwriting duplicates let mut md5sum_occurrences: std::collections::HashMap = std::collections::HashMap::new(); while let Ok(message) = recv.recv() { match message { - ZipMessage::SignatureData(sigs, params, filename) => { + // TODO: can remove params and filename from here now + ZipMessage::SignatureData(sigs, params, _filename) => { if sigs.len() != params.len() { bail!("Mismatched lengths of signatures and parameters"); } - for (sig, param) in sigs.iter().zip(params.iter()) { + // TODO: can remove _param here now. + for (sig, _param) in sigs.iter().zip(params.iter()) { let md5sum_str = sig.md5sum(); let count = md5sum_occurrences.entry(md5sum_str.clone()).or_insert(0); *count += 1; @@ -877,16 +841,9 @@ pub fn sigwriter( format!("signatures/{}.sig.gz", md5sum_str) }; write_signature(sig, &mut zip, options, &sig_filename); - manifest_rows.push(make_manifest_row( - sig, - &filename, - &sig_filename, - param.scaled, - param.num, - param.track_abundance, - param.is_dna, - param.is_protein, - )); + let records: Vec = Record::from_sig(&sig, &sig_filename.as_str()); + eprintln!("{:?}", &records); + manifest_rows.extend(records); } } ZipMessage::WriteManifest => { @@ -894,20 +851,10 @@ pub fn sigwriter( // Start the CSV file inside the zip zip.start_file("SOURMASH-MANIFEST.csv", options).unwrap(); // write manifest version line - writeln!(&mut zip, "# SOURMASH-MANIFEST-VERSION: 1.0").unwrap(); // scoped block for csv writing { - let mut csv_writer = Writer::from_writer(&mut zip); - - for row in &manifest_rows { - if let Err(e) = csv_writer.serialize(row) { - eprintln!("Error writing item: {:?}", e); - } - } - // CSV writer must be manually flushed to ensure all data is written - if let Err(e) = csv_writer.flush() { - eprintln!("Error flushing CSV writer: {:?}", e); - } + let manifest: Manifest = manifest_rows.clone().into(); + manifest.to_writer(&mut zip)?; } // drop csv writer here // Properly finish writing to the ZIP file From 28ef1ee8aec06f4dfaaac0840c7bcf234dad5aa8 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Fri, 16 Feb 2024 17:19:47 -0800 Subject: [PATCH 2/9] upd smash br --- Cargo.lock | 2 +- Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 76c675d3..8e7da625 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1437,7 +1437,7 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" version = "0.12.1" -source = "git+https://github.com/sourmash-bio/sourmash?rev=375ddb2ed25afbffc2ffa981c627256b195bf794#375ddb2ed25afbffc2ffa981c627256b195bf794" +source = "git+https://github.com/sourmash-bio/sourmash?rev=296e4b49ba73e4afaafbb2612e952464fdafdb5b#296e4b49ba73e4afaafbb2612e952464fdafdb5b" dependencies = [ "az", "byteorder", diff --git a/Cargo.toml b/Cargo.toml index 1080abaf..9d21680a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,7 +13,7 @@ pyo3 = { version = "0.20.2", features = ["extension-module", "anyhow"] } rayon = "1.8.1" serde = { version = "1.0.196", features = ["derive"] } #sourmash = { version = "0.12.1", features = ["branchwater"] } -sourmash = { git="https://github.com/sourmash-bio/sourmash", rev="375ddb2ed25afbffc2ffa981c627256b195bf794", features = ["branchwater"] } +sourmash = { git="https://github.com/sourmash-bio/sourmash", rev="296e4b49ba73e4afaafbb2612e952464fdafdb5b", features = ["branchwater"] } serde_json = "1.0.113" niffler = "2.4.0" log = "0.4.14" From 0ae5d2a936b5093d9900a072635c35b7b56490a3 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Sun, 18 Feb 2024 07:19:02 -0800 Subject: [PATCH 3/9] no need to keep track of params independently now! --- src/manysketch.rs | 28 ++++++++-------------------- src/utils.rs | 19 +++++-------------- 2 files changed, 13 insertions(+), 34 deletions(-) diff --git a/src/manysketch.rs b/src/manysketch.rs index a4eefc7a..a0e5b000 100644 --- a/src/manysketch.rs +++ b/src/manysketch.rs @@ -79,14 +79,8 @@ fn parse_params_str(params_strs: String) -> Result, String> { Ok(unique_params.into_iter().collect()) } -fn build_siginfo( - params: &[Params], - moltype: &str, - name: &str, - filename: &Path, -) -> (Vec, Vec) { +fn build_siginfo(params: &[Params], moltype: &str, name: &str, filename: &Path) -> Vec { let mut sigs = Vec::new(); - let mut params_vec = Vec::new(); for param in params.iter().cloned() { match moltype { @@ -121,11 +115,10 @@ fn build_siginfo( .signatures(template) .build(); sigs.push(sig); - - params_vec.push(param); } - (sigs, params_vec) + sigs + // (sigs, params_vec) } pub fn manysketch( @@ -144,7 +137,7 @@ pub fn manysketch( bail!("No files to load, exiting."); } - // if output doesnt end in zip, bail + // if output doesn't end in zip, bail if Path::new(&output) .extension() .map_or(true, |ext| ext != "zip") @@ -195,7 +188,7 @@ pub fn manysketch( } // build sig templates from params - let (mut sigs, sig_params) = build_siginfo(¶ms_vec, moltype, name, filename); + let mut sigs = build_siginfo(¶ms_vec, moltype, name, filename); // if no sigs to build, skip if sigs.is_empty() { let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); @@ -231,17 +224,12 @@ pub fn manysketch( } } } - Some((sigs, sig_params, filename)) + Some((sigs)) }) .try_for_each_with( send.clone(), - |s: &mut std::sync::Arc>, - (sigs, sig_params, filename)| { - if let Err(e) = s.send(ZipMessage::SignatureData( - sigs, - sig_params, - filename.clone(), - )) { + |s: &mut std::sync::Arc>, (sigs)| { + if let Err(e) = s.send(ZipMessage::SignatureData(sigs)) { Err(format!("Unable to send internal data: {:?}", e)) } else { Ok(()) diff --git a/src/utils.rs b/src/utils.rs index 75de865a..34bf5583 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -800,7 +800,7 @@ impl Hash for Params { } pub enum ZipMessage { - SignatureData(Vec, Vec, PathBuf), + SignatureData(Vec), WriteManifest, } @@ -825,13 +825,8 @@ pub fn sigwriter( while let Ok(message) = recv.recv() { match message { - // TODO: can remove params and filename from here now - ZipMessage::SignatureData(sigs, params, _filename) => { - if sigs.len() != params.len() { - bail!("Mismatched lengths of signatures and parameters"); - } - // TODO: can remove _param here now. - for (sig, _param) in sigs.iter().zip(params.iter()) { + ZipMessage::SignatureData(sigs) => { + for sig in sigs.iter() { let md5sum_str = sig.md5sum(); let count = md5sum_occurrences.entry(md5sum_str.clone()).or_insert(0); *count += 1; @@ -850,12 +845,8 @@ pub fn sigwriter( println!("Writing manifest"); // Start the CSV file inside the zip zip.start_file("SOURMASH-MANIFEST.csv", options).unwrap(); - // write manifest version line - // scoped block for csv writing - { - let manifest: Manifest = manifest_rows.clone().into(); - manifest.to_writer(&mut zip)?; - } // drop csv writer here + let manifest: Manifest = manifest_rows.clone().into(); + manifest.to_writer(&mut zip)?; // Properly finish writing to the ZIP file if let Err(e) = zip.finish() { From abaf03dba618a895ca43098c600f4724a86d5a1c Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Sun, 18 Feb 2024 07:42:30 -0800 Subject: [PATCH 4/9] clean up sig building --- src/manysketch.rs | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/src/manysketch.rs b/src/manysketch.rs index a0e5b000..7859436d 100644 --- a/src/manysketch.rs +++ b/src/manysketch.rs @@ -79,7 +79,7 @@ fn parse_params_str(params_strs: String) -> Result, String> { Ok(unique_params.into_iter().collect()) } -fn build_siginfo(params: &[Params], moltype: &str, name: &str, filename: &Path) -> Vec { +fn build_siginfo(params: &[Params], moltype: &str) -> Vec { let mut sigs = Vec::new(); for param in params.iter().cloned() { @@ -106,19 +106,11 @@ fn build_siginfo(params: &[Params], moltype: &str, name: &str, filename: &Path) .track_abundance(param.track_abundance) .build(); - // let sig = Signature::from_params(&cp); // cant set name with this - let template = sourmash::cmd::build_template(&cp); - let sig = Signature::builder() - .hash_function("0.murmur64") - .name(Some(name.to_string())) - .filename(Some(filename.to_string())) - .signatures(template) - .build(); + let sig = Signature::from_params(&cp); sigs.push(sig); } sigs - // (sigs, params_vec) } pub fn manysketch( @@ -188,7 +180,7 @@ pub fn manysketch( } // build sig templates from params - let mut sigs = build_siginfo(¶ms_vec, moltype, name, filename); + let mut sigs = build_siginfo(¶ms_vec, moltype); // if no sigs to build, skip if sigs.is_empty() { let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); @@ -210,20 +202,26 @@ pub fn manysketch( Ok(record) => { // do we need to normalize to make sure all the bases are consistently capitalized? // let norm_seq = record.normalize(false); - for sig in &mut sigs { + sigs.iter_mut().for_each(|sig| { if moltype == "protein" { - sig.add_protein(&record.seq()).unwrap(); + sig.add_protein(&record.seq()) + .expect("Failed to add protein"); } else { - sig.add_sequence(&record.seq(), true).unwrap(); + sig.add_sequence(&record.seq(), true) + .expect("Failed to add sequence"); // if not force, panics with 'N' in dna sequence } - } - } - Err(err) => { - eprintln!("Error while processing record: {:?}", err); + }); } + Err(err) => eprintln!("Error while processing record: {:?}", err), } } + + // Set name and filename for each signature after processing all records + sigs.iter_mut().for_each(|sig| { + sig.set_name(name); + sig.set_filename(filename.as_str()); + }); Some((sigs)) }) .try_for_each_with( From a01f3e95f121aa1432ef9cc1063515cd295cf2a1 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Sun, 18 Feb 2024 07:52:06 -0800 Subject: [PATCH 5/9] remove errant eprintln --- src/manysketch.rs | 1 + src/utils.rs | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/src/manysketch.rs b/src/manysketch.rs index 7859436d..e6d9d6ef 100644 --- a/src/manysketch.rs +++ b/src/manysketch.rs @@ -99,6 +99,7 @@ fn build_siginfo(params: &[Params], moltype: &str) -> Vec { let cp = ComputeParameters::builder() .ksizes(vec![adjusted_ksize]) + // .ksizes(vec![param.ksize]) .scaled(param.scaled) .protein(param.is_protein) .dna(param.is_dna) diff --git a/src/utils.rs b/src/utils.rs index 34bf5583..93cf0d3b 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -837,7 +837,6 @@ pub fn sigwriter( }; write_signature(sig, &mut zip, options, &sig_filename); let records: Vec = Record::from_sig(&sig, &sig_filename.as_str()); - eprintln!("{:?}", &records); manifest_rows.extend(records); } } From 86e074b752216f920b6ca36c8943b3d043c64f4a Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Wed, 21 Feb 2024 11:30:02 -0800 Subject: [PATCH 6/9] upd core, clippy fixes --- Cargo.lock | 19 ++++++------------- Cargo.toml | 2 +- src/check.rs | 2 +- src/fastgather.rs | 1 + src/lib.rs | 1 + src/manysketch.rs | 4 ++-- src/mastiff_manygather.rs | 2 +- src/mastiff_manysearch.rs | 2 +- src/utils.rs | 6 +++--- 9 files changed, 17 insertions(+), 22 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8e7da625..466f61ad 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -254,9 +254,9 @@ checksum = "e1e5f035d16fc623ae5f74981db80a439803888314e3a555fd6f04acd51a3205" [[package]] name = "bytemuck" -version = "1.14.0" +version = "1.14.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "374d28ec25809ee0e23827c2ab573d729e293f281dfe393500e7ad618baa61c6" +checksum = "a2ef034f05691a48569bd920a96c81b9d91bbad1ab5ac7c4616c1f6ef36cb79f" [[package]] name = "byteorder" @@ -1274,12 +1274,6 @@ dependencies = [ "bytecheck", ] -[[package]] -name = "retain_mut" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c31b5c4033f8fdde8700e4657be2c497e7288f01515be52168c631e2e4d4086" - [[package]] name = "rkyv" version = "0.7.44" @@ -1311,13 +1305,12 @@ dependencies = [ [[package]] name = "roaring" -version = "0.10.2" +version = "0.10.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6106b5cf8587f5834158895e9715a3c6c9716c8aefab57f1f7680917191c7873" +checksum = "a1c77081a55300e016cb86f2864415b7518741879db925b8d488a0ee0d2da6bf" dependencies = [ "bytemuck", "byteorder", - "retain_mut", ] [[package]] @@ -1436,8 +1429,8 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" -version = "0.12.1" -source = "git+https://github.com/sourmash-bio/sourmash?rev=296e4b49ba73e4afaafbb2612e952464fdafdb5b#296e4b49ba73e4afaafbb2612e952464fdafdb5b" +version = "0.13.0" +source = "git+https://github.com/sourmash-bio/sourmash?rev=45a7ad843c76cdcdcf0ae0347f4711481654cbd0#45a7ad843c76cdcdcf0ae0347f4711481654cbd0" dependencies = [ "az", "byteorder", diff --git a/Cargo.toml b/Cargo.toml index 9d21680a..21d1ac4c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,7 +13,7 @@ pyo3 = { version = "0.20.2", features = ["extension-module", "anyhow"] } rayon = "1.8.1" serde = { version = "1.0.196", features = ["derive"] } #sourmash = { version = "0.12.1", features = ["branchwater"] } -sourmash = { git="https://github.com/sourmash-bio/sourmash", rev="296e4b49ba73e4afaafbb2612e952464fdafdb5b", features = ["branchwater"] } +sourmash = { git="https://github.com/sourmash-bio/sourmash", rev="45a7ad843c76cdcdcf0ae0347f4711481654cbd0", features = ["branchwater"] } serde_json = "1.0.113" niffler = "2.4.0" log = "0.4.14" diff --git a/src/check.rs b/src/check.rs index 2995284b..5ffc0ff5 100644 --- a/src/check.rs +++ b/src/check.rs @@ -8,7 +8,7 @@ pub fn check(index: camino::Utf8PathBuf, quick: bool) -> Result<(), Box>, (sigs)| { + |s: &mut std::sync::Arc>, sigs| { if let Err(e) = s.send(ZipMessage::SignatureData(sigs)) { Err(format!("Unable to send internal data: {:?}", e)) } else { diff --git a/src/mastiff_manygather.rs b/src/mastiff_manygather.rs index cb794735..f5a029c0 100644 --- a/src/mastiff_manygather.rs +++ b/src/mastiff_manygather.rs @@ -23,7 +23,7 @@ pub fn mastiff_manygather( bail!("'{}' is not a valid RevIndex database", index); } // Open database once - let db = RevIndex::open(index, true)?; + let db = RevIndex::open(index, true, None)?; println!("Loaded DB"); let query_collection = load_collection( diff --git a/src/mastiff_manysearch.rs b/src/mastiff_manysearch.rs index 7d793b2c..4f4be0c5 100644 --- a/src/mastiff_manysearch.rs +++ b/src/mastiff_manysearch.rs @@ -24,7 +24,7 @@ pub fn mastiff_manysearch( bail!("'{}' is not a valid RevIndex database", index); } // Open database once - let db = RevIndex::open(index, true)?; + let db = RevIndex::open(index, true, None)?; println!("Loaded DB"); diff --git a/src/utils.rs b/src/utils.rs index 23d0ba74..82d47302 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -375,11 +375,11 @@ fn collection_from_pathlist( let n_failed = AtomicUsize::new(0); let records: Vec = lines .par_iter() - .filter_map(|path| match Signature::from_path(&path) { + .filter_map(|path| match Signature::from_path(path) { Ok(signatures) => { let recs: Vec = signatures .into_iter() - .flat_map(|v| Record::from_sig(&v, &path)) + .flat_map(|v| Record::from_sig(&v, path)) .collect(); Some(recs) } @@ -844,7 +844,7 @@ pub fn sigwriter( format!("signatures/{}.sig.gz", md5sum_str) }; write_signature(sig, &mut zip, options, &sig_filename); - let records: Vec = Record::from_sig(&sig, &sig_filename.as_str()); + let records: Vec = Record::from_sig(sig, sig_filename.as_str()); manifest_rows.extend(records); } } From 12f496c73874c50f1527423e0b831ee70e0ec59e Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Wed, 21 Feb 2024 14:02:42 -0800 Subject: [PATCH 7/9] rm errant comment line --- Cargo.lock | 169 +++++++++++++++++++++++++++++++++++++++++++++- Cargo.toml | 2 +- src/manysketch.rs | 1 - 3 files changed, 169 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 466f61ad..fbb5e63a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -103,6 +103,15 @@ version = "1.0.79" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "080e9890a082662b09c1ad45f567faeeb47f22b5fb23895fbe1e651e718e25ca" +[[package]] +name = "approx" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cab112f0a86d568ea0e627cc1d6be74a1e9cd55214684db5561995f6dad897c6" +dependencies = [ + "num-traits", +] + [[package]] name = "assert_cmd" version = "2.0.13" @@ -695,6 +704,12 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "libm" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" + [[package]] name = "librocksdb-sys" version = "0.11.0+8.1.1" @@ -765,6 +780,16 @@ dependencies = [ "pkg-config", ] +[[package]] +name = "matrixmultiply" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7574c1cf36da4798ab73da5b215bbf444f50718207754cb522201d78d1cd0ff2" +dependencies = [ + "autocfg", + "rawpointer", +] + [[package]] name = "md5" version = "0.7.0" @@ -816,6 +841,35 @@ version = "0.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2983372caf4480544083767bf2d27defafe32af49ab4df3a0b7fc90793a3664" +[[package]] +name = "nalgebra" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d506eb7e08d6329505faa8a3a00a5dcc6de9f76e0c77e4b75763ae3c770831ff" +dependencies = [ + "approx", + "matrixmultiply", + "nalgebra-macros", + "num-complex", + "num-rational", + "num-traits", + "rand", + "rand_distr", + "simba", + "typenum", +] + +[[package]] +name = "nalgebra-macros" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01fcc0b8149b4632adc89ac3b7b31a12fb6099a0317a4eb2ebff574ef7de7218" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "needletail" version = "0.5.1" @@ -867,6 +921,15 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "61807f77802ff30975e01f4f071c8ba10c022052f98b3294119f3e615d13e5be" +[[package]] +name = "num-complex" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23c6602fda94a57c990fe0df199a035d83576b496aa29f4e634a8ac6004e68a6" +dependencies = [ + "num-traits", +] + [[package]] name = "num-integer" version = "0.1.45" @@ -888,6 +951,17 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-rational" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0638a1c9d0a3c0914158145bc76cff373a75a627e6ecbfb71cbe6f453a5a19b0" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + [[package]] name = "num-traits" version = "0.2.17" @@ -895,6 +969,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c" dependencies = [ "autocfg", + "libm", ] [[package]] @@ -951,6 +1026,12 @@ dependencies = [ "windows-targets 0.48.5", ] +[[package]] +name = "paste" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c" + [[package]] name = "peeking_take_while" version = "0.1.2" @@ -1207,6 +1288,22 @@ dependencies = [ "getrandom", ] +[[package]] +name = "rand_distr" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31" +dependencies = [ + "num-traits", + "rand", +] + +[[package]] +name = "rawpointer" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" + [[package]] name = "rayon" version = "1.8.1" @@ -1323,6 +1420,12 @@ dependencies = [ "librocksdb-sys", ] +[[package]] +name = "roots" +version = "0.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "082f11ffa03bbef6c2c6ea6bea1acafaade2fd9050ae0234ab44a2153742b058" + [[package]] name = "rustc-hash" version = "1.1.0" @@ -1348,6 +1451,15 @@ version = "1.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f98d2aa92eebf49b69786be48e4477826b256916e84a57ff2a4f21923b48eb4c" +[[package]] +name = "safe_arch" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f398075ce1e6a179b46f51bd88d0598b92b00d3551f1a2d4ac49e771b56ac354" +dependencies = [ + "bytemuck", +] + [[package]] name = "safemem" version = "0.3.3" @@ -1403,6 +1515,19 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" +[[package]] +name = "simba" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0b7840f121a46d63066ee7a99fc81dcabbc6105e437cae43528cea199b5a05f" +dependencies = [ + "approx", + "num-complex", + "num-traits", + "paste", + "wide", +] + [[package]] name = "simdutf8" version = "0.1.4" @@ -1430,7 +1555,7 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" version = "0.13.0" -source = "git+https://github.com/sourmash-bio/sourmash?rev=45a7ad843c76cdcdcf0ae0347f4711481654cbd0#45a7ad843c76cdcdcf0ae0347f4711481654cbd0" +source = "git+https://github.com/sourmash-bio/sourmash?rev=297ff0b963adbed7462508ea1247fe192be02b98#297ff0b963adbed7462508ea1247fe192be02b98" dependencies = [ "az", "byteorder", @@ -1444,6 +1569,7 @@ dependencies = [ "getrandom", "getset", "histogram", + "itertools", "log", "md5", "memmap2", @@ -1459,8 +1585,11 @@ dependencies = [ "rkyv", "roaring", "rocksdb", + "roots", "serde", "serde_json", + "statrs", + "streaming-stats", "thiserror", "twox-hash", "typed-builder", @@ -1499,6 +1628,28 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" +[[package]] +name = "statrs" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d08e5e1748192713cc281da8b16924fb46be7b0c2431854eadc785823e5696e" +dependencies = [ + "approx", + "lazy_static", + "nalgebra", + "num-traits", + "rand", +] + +[[package]] +name = "streaming-stats" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0d670ce4e348a2081843569e0f79b21c99c91bb9028b3b3ecb0f050306de547" +dependencies = [ + "num-traits", +] + [[package]] name = "syn" version = "1.0.109" @@ -1617,6 +1768,12 @@ dependencies = [ "syn 2.0.48", ] +[[package]] +name = "typenum" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" + [[package]] name = "unicode-ident" version = "1.0.12" @@ -1749,6 +1906,16 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "wide" +version = "0.7.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89beec544f246e679fc25490e3f8e08003bc4bf612068f325120dad4cea02c1c" +dependencies = [ + "bytemuck", + "safe_arch", +] + [[package]] name = "windows-core" version = "0.52.0" diff --git a/Cargo.toml b/Cargo.toml index 21d1ac4c..fac4f6cc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,7 +13,7 @@ pyo3 = { version = "0.20.2", features = ["extension-module", "anyhow"] } rayon = "1.8.1" serde = { version = "1.0.196", features = ["derive"] } #sourmash = { version = "0.12.1", features = ["branchwater"] } -sourmash = { git="https://github.com/sourmash-bio/sourmash", rev="45a7ad843c76cdcdcf0ae0347f4711481654cbd0", features = ["branchwater"] } +sourmash = { git="https://github.com/sourmash-bio/sourmash", rev="297ff0b963adbed7462508ea1247fe192be02b98", features = ["branchwater"] } serde_json = "1.0.113" niffler = "2.4.0" log = "0.4.14" diff --git a/src/manysketch.rs b/src/manysketch.rs index 650ea628..ec1bbe9d 100644 --- a/src/manysketch.rs +++ b/src/manysketch.rs @@ -99,7 +99,6 @@ fn build_siginfo(params: &[Params], moltype: &str) -> Vec { let cp = ComputeParameters::builder() .ksizes(vec![adjusted_ksize]) - // .ksizes(vec![param.ksize]) .scaled(param.scaled) .protein(param.is_protein) .dna(param.is_dna) From b1a56ba75ee750bff69fe4099dd76e27d05b22c0 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Wed, 21 Feb 2024 14:09:46 -0800 Subject: [PATCH 8/9] avoid additional iter_mut to set name/filename --- src/manysketch.rs | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/manysketch.rs b/src/manysketch.rs index ec1bbe9d..485ed2b3 100644 --- a/src/manysketch.rs +++ b/src/manysketch.rs @@ -197,12 +197,18 @@ pub fn manysketch( } }; // parse fasta and add to signature + let mut set_name = false; while let Some(record_result) = reader.next() { match record_result { Ok(record) => { // do we need to normalize to make sure all the bases are consistently capitalized? // let norm_seq = record.normalize(false); sigs.iter_mut().for_each(|sig| { + if !set_name { + sig.set_name(name); + sig.set_filename(filename.as_str()); + set_name = true; + }; if moltype == "protein" { sig.add_protein(&record.seq()) .expect("Failed to add protein"); @@ -217,11 +223,6 @@ pub fn manysketch( } } - // Set name and filename for each signature after processing all records - sigs.iter_mut().for_each(|sig| { - sig.set_name(name); - sig.set_filename(filename.as_str()); - }); Some(sigs) }) .try_for_each_with( From 020a5140740cf0c15457909daad6f0b2ff6c9adc Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Fri, 23 Feb 2024 11:33:38 -0800 Subject: [PATCH 9/9] use updated sourmash core (0.13.0)! --- Cargo.lock | 3 ++- Cargo.toml | 3 +-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c9d6c126..ed14c3fc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1503,7 +1503,8 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" version = "0.13.0" -source = "git+https://github.com/sourmash-bio/sourmash?rev=297ff0b963adbed7462508ea1247fe192be02b98#297ff0b963adbed7462508ea1247fe192be02b98" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae9e413cb7387bbb4405e960920e5d8c5f255ec4a86f021a18a455014565e749" dependencies = [ "az", "byteorder", diff --git a/Cargo.toml b/Cargo.toml index ee7cbbae..5a607980 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,8 +12,7 @@ crate-type = ["cdylib"] pyo3 = { version = "0.20.2", features = ["extension-module", "anyhow"] } rayon = "1.8.1" serde = { version = "1.0.196", features = ["derive"] } -#sourmash = { version = "0.12.1", features = ["branchwater"] } -sourmash = { git="https://github.com/sourmash-bio/sourmash", rev="297ff0b963adbed7462508ea1247fe192be02b98", features = ["branchwater"] } +sourmash = { version = "0.13.0", features = ["branchwater"] } serde_json = "1.0.113" niffler = "2.4.0" log = "0.4.14"