From 999683d1f2d13b5f74a0ca7772dd26527ac16647 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Mon, 26 Feb 2024 14:13:56 -0800 Subject: [PATCH 01/11] upd manysketch for multiple files --- src/lib.rs | 9 +- src/manysketch.rs | 122 +++++++++-------- .../sourmash_plugin_branchwater/__init__.py | 5 +- src/utils.rs | 127 ++++++++++++------ 4 files changed, 162 insertions(+), 101 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 99264b87..5b7da2de 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -259,8 +259,13 @@ fn do_pairwise( } #[pyfunction] -fn do_manysketch(filelist: String, param_str: String, output: String) -> anyhow::Result { - match manysketch::manysketch(filelist, param_str, output) { +fn do_manysketch( + filelist: String, + param_str: String, + output: String, + singleton: bool, +) -> anyhow::Result { + match manysketch::manysketch(filelist, param_str, output, singleton) { Ok(_) => Ok(0), Err(e) => { eprintln!("Error: {e}"); diff --git a/src/manysketch.rs b/src/manysketch.rs index 485ed2b3..eaff89e6 100644 --- a/src/manysketch.rs +++ b/src/manysketch.rs @@ -117,6 +117,7 @@ pub fn manysketch( filelist: String, param_str: String, output: String, + singleton: bool, ) -> Result<(), Box> { let fileinfo = match load_fasta_fromfile(filelist) { Ok(result) => result, @@ -165,70 +166,81 @@ pub fn manysketch( let send_result = fileinfo .par_iter() - .filter_map(|(name, filename, moltype)| { - // increment processed_fastas counter; make 1-based for % reporting - let i = processed_fastas.fetch_add(1, atomic::Ordering::SeqCst); - // progress report at threshold - if (i + 1) % reporting_threshold == 0 { - let percent_processed = (((i + 1) as f64 / n_fastas as f64) * 100.0).round(); - eprintln!( - "Starting file {}/{} ({}%)", - (i + 1), - n_fastas, - percent_processed - ); - } - - // build sig templates from params - let mut sigs = build_siginfo(¶ms_vec, moltype); - // if no sigs to build, skip - if sigs.is_empty() { - let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); - return None; - } + .filter_map(|(name, filenames, moltype)| { + let mut allsigs = Vec::new(); + for filename in filenames { + // increment processed_fastas counter; make 1-based for % reporting + let i = processed_fastas.fetch_add(1, atomic::Ordering::SeqCst); + // progress report at threshold + if (i + 1) % reporting_threshold == 0 { + let percent_processed = (((i + 1) as f64 / n_fastas as f64) * 100.0).round(); + eprintln!( + "Starting file {}/{} ({}%)", + (i + 1), + n_fastas, + percent_processed + ); + } - // Open fasta file reader - let mut reader = match parse_fastx_file(filename) { - Ok(r) => r, - Err(err) => { - eprintln!("Error opening file {}: {:?}", filename, err); - let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); + // build sig templates from params + let sig_templates = build_siginfo(¶ms_vec, moltype); + let mut sigs = sig_templates.clone(); + // if no sigs to build, skip + if sigs.is_empty() { + let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); return None; } - }; - // parse fasta and add to signature - let mut set_name = false; - while let Some(record_result) = reader.next() { - match record_result { - Ok(record) => { - // do we need to normalize to make sure all the bases are consistently capitalized? - // let norm_seq = record.normalize(false); - sigs.iter_mut().for_each(|sig| { - if !set_name { - sig.set_name(name); - sig.set_filename(filename.as_str()); - set_name = true; - }; - if moltype == "protein" { - sig.add_protein(&record.seq()) - .expect("Failed to add protein"); - } else { - sig.add_sequence(&record.seq(), true) - .expect("Failed to add sequence"); - // if not force, panics with 'N' in dna sequence - } - }); + + // Open fasta file reader + let mut reader = match parse_fastx_file(filename) { + Ok(r) => r, + Err(err) => { + eprintln!("Error opening file {}: {:?}", filename, err); + let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); + return None; + } + }; + + // parse fasta and add to signature + let mut set_name = false; + while let Some(record_result) = reader.next() { + match record_result { + Ok(record) => { + // do we need to normalize to make sure all the bases are consistently capitalized? + // let norm_seq = record.normalize(false); + sigs.iter_mut().for_each(|sig| { + if !set_name { + sig.set_name(name); + sig.set_filename(filename.as_str()); + set_name = true; + }; + if moltype == "protein" { + sig.add_protein(&record.seq()) + .expect("Failed to add protein"); + } else { + sig.add_sequence(&record.seq(), true) + .expect("Failed to add sequence"); + // if not force, panics with 'N' in dna sequence + } + }); + } + Err(err) => eprintln!("Error while processing record: {:?}", err), + } + if singleton { + allsigs.append(&mut sigs); + sigs = sig_templates.clone(); } - Err(err) => eprintln!("Error while processing record: {:?}", err), + } + if !singleton { + allsigs.append(&mut sigs); } } - - Some(sigs) + Some(allsigs) }) .try_for_each_with( send.clone(), - |s: &mut std::sync::Arc>, sigs| { - if let Err(e) = s.send(ZipMessage::SignatureData(sigs)) { + |s: &mut std::sync::Arc>, filled_sigs| { + if let Err(e) = s.send(ZipMessage::SignatureData(filled_sigs)) { Err(format!("Unable to send internal data: {:?}", e)) } else { Ok(()) diff --git a/src/python/sourmash_plugin_branchwater/__init__.py b/src/python/sourmash_plugin_branchwater/__init__.py index e1912817..fef4c3ba 100755 --- a/src/python/sourmash_plugin_branchwater/__init__.py +++ b/src/python/sourmash_plugin_branchwater/__init__.py @@ -336,6 +336,8 @@ def __init__(self, p): help='parameter string for sketching (default: k=31,scaled=1000)') p.add_argument('-c', '--cores', default=0, type=int, help='number of cores to use (default is all available)') + p.add_argument('-s', '--singleton', action="store_true", + help='build one sketch per fasta record') def main(self, args): print_version() @@ -355,7 +357,8 @@ def main(self, args): super().main(args) status = sourmash_plugin_branchwater.do_manysketch(args.fromfile_csv, args.param_string, - args.output) + args.output, + args.singleton) if status == 0: notify(f"...manysketch is done! results in '{args.output}'") return status diff --git a/src/utils.rs b/src/utils.rs index e95f865f..71179a3b 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -7,7 +7,7 @@ use anyhow::{anyhow, Context, Result}; use camino::Utf8Path as Path; use camino::Utf8PathBuf as PathBuf; use csv::Writer; -use serde::ser::Serializer; +// use serde::ser::Serializer; use serde::{Deserialize, Serialize}; use std::cmp::{Ordering, PartialOrd}; use std::collections::BinaryHeap; @@ -131,22 +131,51 @@ pub fn write_prefetch( Ok(()) } -pub fn load_fasta_fromfile(sketchlist_filename: String) -> Result> { +enum CSVType { + Assembly, + Reads, + Unknown, +} + +fn detect_csv_type(headers: &csv::StringRecord) -> CSVType { + if headers.len() == 3 + && headers.get(0).unwrap() == "name" + && headers.get(1).unwrap() == "genome_filename" + && headers.get(2).unwrap() == "protein_filename" + { + CSVType::Assembly + } else if headers.len() == 3 + && headers.get(0).unwrap() == "name" + && headers.get(1).unwrap() == "read1" + && headers.get(2).unwrap() == "read2" + { + CSVType::Reads + } else { + CSVType::Unknown + } +} + +pub fn load_fasta_fromfile( + sketchlist_filename: String, +) -> Result, String)>> { let mut rdr = csv::Reader::from_path(sketchlist_filename)?; // Check for right header let headers = rdr.headers()?; - if headers.len() != 3 - || headers.get(0).unwrap() != "name" - || headers.get(1).unwrap() != "genome_filename" - || headers.get(2).unwrap() != "protein_filename" - { - return Err(anyhow!( - "Invalid header. Expected 'name,genome_filename,protein_filename', but got '{}'", + + match detect_csv_type(&headers) { + CSVType::Assembly => process_assembly_csv(rdr), + CSVType::Reads => process_reads_csv(rdr), + CSVType::Unknown => Err(anyhow!( + "Invalid header. Expected 'name,genome_filename,protein_filename' or 'name,read1,read2', but got '{}'", headers.iter().collect::>().join(",") - )); + )), } +} +fn process_assembly_csv( + mut rdr: csv::Reader, +) -> Result, String)>> { let mut results = Vec::new(); let mut row_count = 0; @@ -178,17 +207,21 @@ pub fn load_fasta_fromfile(sketchlist_filename: String) -> Result Result, +) -> Result, String)>> { + let mut results = Vec::new(); + let mut processed_rows = std::collections::HashSet::new(); + let mut duplicate_count = 0; + + for result in rdr.records() { + let record = result?; + let row_string = record.iter().collect::>().join(","); + if processed_rows.contains(&row_string) { + duplicate_count += 1; + continue; + } + processed_rows.insert(row_string.clone()); + + let name = record + .get(0) + .ok_or_else(|| anyhow!("Missing 'name' field"))? + .to_string(); + let read1 = record + .get(1) + .ok_or_else(|| anyhow!("Missing 'read1' field"))?; + let read2 = record + .get(2) + .ok_or_else(|| anyhow!("Missing 'read2' field"))?; + let paths = vec![PathBuf::from(read1), PathBuf::from(read2)]; + results.push((name, paths, "alternate".to_string())); + } + + if duplicate_count > 0 { + println!("Warning: {} duplicated rows were skipped.", duplicate_count); + } + println!("Loaded alternate CSV variant."); + + Ok(results) +} + // Load all compatible minhashes from a collection into memory // also store sig name and md5 alongside, as we usually need those pub fn load_sketches( @@ -771,36 +842,6 @@ pub struct MultiSearchResult { pub max_containment_ani: Option, } -#[derive(Serialize)] -pub struct ManifestRow { - pub md5: String, - pub md5short: String, - pub ksize: u32, - pub moltype: String, - pub num: u32, - pub scaled: u64, - pub n_hashes: usize, - pub with_abundance: BoolPython, - pub name: String, - pub filename: String, - pub internal_location: String, -} - -// A wrapper type for booleans to customize serialization -pub struct BoolPython(bool); - -impl Serialize for BoolPython { - fn serialize(&self, serializer: S) -> Result - where - S: Serializer, - { - match self.0 { - true => serializer.serialize_str("True"), - false => serializer.serialize_str("False"), - } - } -} - pub fn open_stdout_or_file(output: Option) -> Box { // if output is a file, use open_output_file if let Some(path) = output { From cdb08991dd24547f39f5a0496a2d450db40ae090 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Mon, 26 Feb 2024 14:22:08 -0800 Subject: [PATCH 02/11] set singleton name properly --- src/manysketch.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/manysketch.rs b/src/manysketch.rs index eaff89e6..4e169c1a 100644 --- a/src/manysketch.rs +++ b/src/manysketch.rs @@ -209,7 +209,12 @@ pub fn manysketch( // do we need to normalize to make sure all the bases are consistently capitalized? // let norm_seq = record.normalize(false); sigs.iter_mut().for_each(|sig| { - if !set_name { + if singleton { + let record_name = std::str::from_utf8(record.id()) + .expect("could not get record id"); + sig.set_name(record_name); + sig.set_filename(filename.as_str()); + } else if !set_name { sig.set_name(name); sig.set_filename(filename.as_str()); set_name = true; From 3ccf625da79a4b18c301dd05017e3ac85559dec1 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Mon, 26 Feb 2024 15:02:28 -0800 Subject: [PATCH 03/11] test singleton --- src/manysketch.rs | 37 +++++++++++++++++------------ src/python/tests/test_sketch.py | 41 ++++++++++++++++++++++++++++++--- 2 files changed, 60 insertions(+), 18 deletions(-) diff --git a/src/manysketch.rs b/src/manysketch.rs index 4e169c1a..70a4a936 100644 --- a/src/manysketch.rs +++ b/src/manysketch.rs @@ -168,6 +168,22 @@ pub fn manysketch( .par_iter() .filter_map(|(name, filenames, moltype)| { let mut allsigs = Vec::new(); + // build sig templates for these sketches from params, check if there are sigs to build + let sig_templates = build_siginfo(¶ms_vec, moltype); + // if no sigs to build, skip this iteration + if sig_templates.is_empty() { + skipped_paths.fetch_add(filenames.len(), atomic::Ordering::SeqCst); + processed_fastas.fetch_add(1, atomic::Ordering::SeqCst); + return None; + } + + let mut sigs = sig_templates.clone(); + // have name / filename been set for each sig yet? + let mut set_name = false; + // if merging multiple files, sourmash sets filename as last filename + let last_filename = filenames.last().unwrap(); + + // to do: consider changing reporting to per-sig, no matter how many fastas? but singleton... for filename in filenames { // increment processed_fastas counter; make 1-based for % reporting let i = processed_fastas.fetch_add(1, atomic::Ordering::SeqCst); @@ -182,27 +198,17 @@ pub fn manysketch( ); } - // build sig templates from params - let sig_templates = build_siginfo(¶ms_vec, moltype); - let mut sigs = sig_templates.clone(); - // if no sigs to build, skip - if sigs.is_empty() { - let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); - return None; - } - // Open fasta file reader let mut reader = match parse_fastx_file(filename) { Ok(r) => r, Err(err) => { eprintln!("Error opening file {}: {:?}", filename, err); - let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); + failed_paths.fetch_add(1, atomic::Ordering::SeqCst); return None; } }; // parse fasta and add to signature - let mut set_name = false; while let Some(record_result) = reader.next() { match record_result { Ok(record) => { @@ -216,7 +222,8 @@ pub fn manysketch( sig.set_filename(filename.as_str()); } else if !set_name { sig.set_name(name); - sig.set_filename(filename.as_str()); + // sourmash sets filename to last filename if merging fastas + sig.set_filename(last_filename.as_str()); set_name = true; }; if moltype == "protein" { @@ -236,9 +243,9 @@ pub fn manysketch( sigs = sig_templates.clone(); } } - if !singleton { - allsigs.append(&mut sigs); - } + } + if !singleton { + allsigs.append(&mut sigs); } Some(allsigs) }) diff --git a/src/python/tests/test_sketch.py b/src/python/tests/test_sketch.py index 88c99dcf..b03b25ef 100644 --- a/src/python/tests/test_sketch.py +++ b/src/python/tests/test_sketch.py @@ -208,9 +208,6 @@ def test_manysketch_skip_incompatible_fastas(runtmp, capfd): assert sig.minhash.ksize == 10 assert sig.minhash.scaled == 1 assert sig.md5sum() == "eb4467d11e0ecd2dbde4193bfc255310" - assert 'Starting file 2/4 (50%)' in captured.err - assert 'Starting file 3/4 (75%)' in captured.err - assert 'Starting file 4/4 (100%)' in captured.err assert 'DONE. Processed 4 fasta files' in captured.err assert 'WARNING: 3 fasta files skipped - no compatible signatures.' in captured.err @@ -452,3 +449,41 @@ def test_protein_zip_manifest(runtmp, capfd): assert sig.minhash.ksize == 10 # minhash stores k*3, but does the conversion back for us assert sig.minhash.moltype == 'protein' assert sig.minhash.scaled == 1 + + +def test_manysketch_singleton(runtmp): + fa_csv = runtmp.output('db-fa.txt') + + fa1 = get_test_data('short.fa') + fa2 = get_test_data('short2.fa') + fa3 = get_test_data('short3.fa') + + make_file_csv(fa_csv, [fa1, fa2, fa3]) + + output = runtmp.output('db.zip') + + runtmp.sourmash('scripts', 'manysketch', fa_csv, '-o', output, + '--param-str', "dna,k=31,scaled=1", "--singleton") + + assert os.path.exists(output) + assert not runtmp.last_result.out # stdout should be empty + + idx = sourmash.load_file_as_index(output) + sigs = list(idx.signatures()) + print(sigs) + + assert len(sigs) == 4 + singleton_sketch = runtmp.output('short3.sig') + runtmp.sourmash('sketch', 'dna', fa3, '-o', singleton_sketch, + '--param-str', "dna,k=31,scaled=1", "--singleton") + ss_sketch = sourmash.load_signatures(singleton_sketch) + ss_sketch1 = next(ss_sketch) + ss_sketch2 = next(ss_sketch) + + expected_signames = ['shortName', 'tr1 4', 'firstname', 'other'] + for sig in sigs: + assert sig.name in expected_signames + if sig.name == 'firstname': + assert sig == ss_sketch1 + if sig.name == 'other': + assert sig == ss_sketch2 From 4c9edd3f2c0efe5928cebd12fa38e8124bb05be0 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Mon, 26 Feb 2024 16:10:16 -0800 Subject: [PATCH 04/11] fix fasta reporting --- src/manysketch.rs | 5 +- src/python/tests/test_sketch.py | 84 +++++++++++++++++++++++++++------ src/utils.rs | 82 +++++++++++++++++++------------- 3 files changed, 119 insertions(+), 52 deletions(-) diff --git a/src/manysketch.rs b/src/manysketch.rs index 70a4a936..3c5e8067 100644 --- a/src/manysketch.rs +++ b/src/manysketch.rs @@ -119,13 +119,12 @@ pub fn manysketch( output: String, singleton: bool, ) -> Result<(), Box> { - let fileinfo = match load_fasta_fromfile(filelist) { - Ok(result) => result, + let (fileinfo, n_fastas) = match load_fasta_fromfile(filelist) { + Ok((file_info, n_fastas)) => (file_info, n_fastas), Err(e) => bail!("Could not load fromfile csv. Underlying error: {}", e), }; // if no files to process, exit with error - let n_fastas = fileinfo.len(); if n_fastas == 0 { bail!("No files to load, exiting."); } diff --git a/src/python/tests/test_sketch.py b/src/python/tests/test_sketch.py index b03b25ef..c2ff1742 100644 --- a/src/python/tests/test_sketch.py +++ b/src/python/tests/test_sketch.py @@ -12,7 +12,7 @@ def get_test_data(filename): return os.path.join(thisdir, 'test-data', filename) -def make_file_csv(filename, genome_paths, protein_paths = []): +def make_assembly_csv(filename, genome_paths, protein_paths = []): # equalize path lengths by adding "". names = [os.path.basename(x).split('.fa')[0] for x in genome_paths] if len(protein_paths) < len(genome_paths): @@ -26,6 +26,14 @@ def make_file_csv(filename, genome_paths, protein_paths = []): for name, genome_path, protein_path in zip(names, genome_paths, protein_paths): fp.write("{},{},{}\n".format(name, genome_path, protein_path)) +def make_reads_csv(filename, reads_tuples = []): + # reads tuples should be (name,read1,read2) + with open(filename, 'wt') as fp: + fp.write("name,read1,read2\n") + for (name, read1, read2) in reads_tuples: + print(f"{name},{read1},{read2}") + fp.write("{},{},{}\n".format(name, read1, read2)) + def test_installed(runtmp): with pytest.raises(utils.SourmashCommandFailed): @@ -41,7 +49,7 @@ def test_manysketch_simple(runtmp): fa2 = get_test_data('short2.fa') fa3 = get_test_data('short3.fa') - make_file_csv(fa_csv, [fa1, fa2, fa3]) + make_assembly_csv(fa_csv, [fa1, fa2, fa3]) output = runtmp.output('db.zip') @@ -65,7 +73,7 @@ def test_manysketch_mult_k(runtmp): fa2 = get_test_data('short2.fa') fa3 = get_test_data('short3.fa') - make_file_csv(fa_csv, [fa1, fa2, fa3]) + make_assembly_csv(fa_csv, [fa1, fa2, fa3]) output = runtmp.output('db.zip') @@ -89,7 +97,7 @@ def test_manysketch_mult_k_2(runtmp): fa2 = get_test_data('short2.fa') fa3 = get_test_data('short3.fa') - make_file_csv(fa_csv, [fa1, fa2, fa3]) + make_assembly_csv(fa_csv, [fa1, fa2, fa3]) output = runtmp.output('db.zip') @@ -116,7 +124,7 @@ def test_manysketch_mult_moltype(runtmp): fa3 = get_test_data('short3.fa') protfa1 = get_test_data('short-protein.fa') - make_file_csv(fa_csv, [fa1, fa2, fa3], [protfa1]) + make_assembly_csv(fa_csv, [fa1, fa2, fa3], [protfa1]) output = runtmp.output('db.zip') @@ -158,7 +166,7 @@ def test_manysketch_only_incompatible_fastas(runtmp, capfd): fa2 = get_test_data('short2.fa') fa3 = get_test_data('short3.fa') - make_file_csv(fa_csv, [fa1, fa2, fa3]) + make_assembly_csv(fa_csv, [fa1, fa2, fa3]) output = runtmp.output('db.zip') @@ -185,7 +193,7 @@ def test_manysketch_skip_incompatible_fastas(runtmp, capfd): fa3 = get_test_data('short3.fa') protfa1 = get_test_data('short-protein.fa') - make_file_csv(fa_csv, [fa1, fa2, fa3], [protfa1]) + make_assembly_csv(fa_csv, [fa1, fa2, fa3], [protfa1]) output = runtmp.output('db.zip') @@ -235,7 +243,7 @@ def test_manysketch_bad_fa_csv(runtmp, capfd): sig47 = get_test_data('47.fa.sig.gz') sig63 = get_test_data('63.fa.sig.gz') - make_file_csv(siglist, [sig2, sig47, sig63]) + make_assembly_csv(siglist, [sig2, sig47, sig63]) output = runtmp.output('db.zip') @@ -273,7 +281,7 @@ def test_manysketch_bad_fa_csv_3(runtmp, capfd): protfa1 = get_test_data('short-protein.fa') # make file csv but don't fill empty protein rows with ,"" - make_file_csv(fa_csv, [fa1, fa2, fa3], [protfa1]) + make_assembly_csv(fa_csv, [fa1, fa2, fa3], [protfa1]) g_fa = [fa1, fa2, fa3] p_fa = [protfa1] with open(fa_csv, 'wt') as fp: @@ -302,7 +310,7 @@ def test_manysketch_empty_fa_csv(runtmp, capfd): # test empty fa_csv file fa_csv = runtmp.output('fa.txt') output = runtmp.output('out.zip') - make_file_csv(fa_csv, []) # empty + make_assembly_csv(fa_csv, []) # empty with pytest.raises(utils.SourmashCommandFailed): runtmp.sourmash('scripts', 'manysketch', fa_csv, @@ -321,7 +329,7 @@ def test_manysketch_duplicated_rows(runtmp, capfd): fa3 = get_test_data('short3.fa') protfa1 = get_test_data('short-protein.fa') - make_file_csv(fa_csv, [fa1, fa1, fa1, fa3]) + make_assembly_csv(fa_csv, [fa1, fa1, fa1, fa3]) output = runtmp.output('db.zip') @@ -350,7 +358,7 @@ def test_manysketch_N_in_dna(runtmp): fp.write(">bad\n") fp.write("ACAGTN\n") - make_file_csv(fa_csv, [fa1]) + make_assembly_csv(fa_csv, [fa1]) output = runtmp.output('db.zip') @@ -375,7 +383,7 @@ def test_zip_manifest(runtmp, capfd): fa2 = get_test_data('short2.fa') fa3 = get_test_data('short3.fa') - make_file_csv(fa_csv, [fa1, fa2, fa3]) + make_assembly_csv(fa_csv, [fa1, fa2, fa3]) output = runtmp.output('db.zip') runtmp.sourmash('scripts', 'manysketch', fa_csv, '-o', output, @@ -414,7 +422,7 @@ def test_protein_zip_manifest(runtmp, capfd): fa1 = get_test_data('short.fa') fa2 = get_test_data('short-protein.fa') - make_file_csv(fa_csv, [fa1], [fa2]) + make_assembly_csv(fa_csv, [fa1], [fa2]) output = runtmp.output('db.zip') runtmp.sourmash('scripts', 'manysketch', fa_csv, '-o', output, @@ -458,7 +466,7 @@ def test_manysketch_singleton(runtmp): fa2 = get_test_data('short2.fa') fa3 = get_test_data('short3.fa') - make_file_csv(fa_csv, [fa1, fa2, fa3]) + make_assembly_csv(fa_csv, [fa1, fa2, fa3]) output = runtmp.output('db.zip') @@ -487,3 +495,49 @@ def test_manysketch_singleton(runtmp): assert sig == ss_sketch1 if sig.name == 'other': assert sig == ss_sketch2 + + +def test_manysketch_reads(runtmp, capfd): + fa_csv = runtmp.output('db-fa.csv') + + fa1 = get_test_data('short.fa') + fa2 = get_test_data('short2.fa') + fa3 = get_test_data('short3.fa') + + make_reads_csv(fa_csv, [("short", fa1, fa2), ('short3', fa3, '')]) # make sure we can just do read1 alone + + output = runtmp.output('db.zip') + + runtmp.sourmash('scripts', 'manysketch', fa_csv, '-o', output, + '--param-str', "dna,k=31,scaled=1") + + assert os.path.exists(output) + assert not runtmp.last_result.out # stdout should be empty + captured = capfd.readouterr() + print(captured.out) + print(captured.err) + assert "Found 'reads' CSV, assuming all files are DNA." in captured.out + assert "Starting file 3/3 (100%)" in captured.err + assert "DONE. Processed 3 fasta files" in captured.err + + idx = sourmash.load_file_as_index(output) + sigs = list(idx.signatures()) + print(sigs) + + assert len(sigs) == 2 + s1 = runtmp.output('short.sig') + runtmp.sourmash('sketch', 'dna', fa1, fa2, '-o', s1, + '--param-str', "k=31,scaled=1", '--name', 'short') + sig1 = sourmash.load_one_signature(s1) + s3 = runtmp.output('short3.sig') + runtmp.sourmash('sketch', 'dna', fa3, '-o', s3, + '--param-str', "k=31,scaled=1", '--name', 'short3') + sig2 = sourmash.load_one_signature(s3) + + expected_signames = ['short', 'short3'] + for sig in sigs: + assert sig.name in expected_signames + if sig.name == 'short': + assert sig == sig1 + if sig.name == 'short3': + assert sig == sig2 diff --git a/src/utils.rs b/src/utils.rs index 71179a3b..9a21f50c 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -7,7 +7,6 @@ use anyhow::{anyhow, Context, Result}; use camino::Utf8Path as Path; use camino::Utf8PathBuf as PathBuf; use csv::Writer; -// use serde::ser::Serializer; use serde::{Deserialize, Serialize}; use std::cmp::{Ordering, PartialOrd}; use std::collections::BinaryHeap; @@ -157,7 +156,7 @@ fn detect_csv_type(headers: &csv::StringRecord) -> CSVType { pub fn load_fasta_fromfile( sketchlist_filename: String, -) -> Result, String)>> { +) -> Result<(Vec<(String, Vec, String)>, usize)> { let mut rdr = csv::Reader::from_path(sketchlist_filename)?; // Check for right header @@ -175,7 +174,7 @@ pub fn load_fasta_fromfile( fn process_assembly_csv( mut rdr: csv::Reader, -) -> Result, String)>> { +) -> Result<(Vec<(String, Vec, String)>, usize)> { let mut results = Vec::new(); let mut row_count = 0; @@ -201,28 +200,27 @@ fn process_assembly_csv( .ok_or_else(|| anyhow!("Missing 'name' field"))? .to_string(); - let genome_filename = record - .get(1) - .ok_or_else(|| anyhow!("Missing 'genome_filename' field"))?; - if !genome_filename.is_empty() { - results.push(( - name.clone(), - // PathBuf::from(genome_filename), - vec![PathBuf::from(genome_filename)], - "dna".to_string(), - )); - genome_count += 1; + // Handle optional genome_filename + if let Some(genome_filename) = record.get(1) { + if !genome_filename.is_empty() { + results.push(( + name.clone(), + vec![PathBuf::from(genome_filename)], + "dna".to_string(), + )); + genome_count += 1; + } } - let protein_filename = record - .get(2) - .ok_or_else(|| anyhow!("Missing 'protein_filename' field"))?; - if !protein_filename.is_empty() { - results.push(( - name, - vec![PathBuf::from(protein_filename)], - "protein".to_string(), - )); - protein_count += 1; + // Handle optional protein_filename + if let Some(protein_filename) = record.get(2) { + if !protein_filename.is_empty() { + results.push(( + name, + vec![PathBuf::from(protein_filename)], + "protein".to_string(), + )); + protein_count += 1; + } } } // Print warning if there were duplicated rows. @@ -233,14 +231,18 @@ fn process_assembly_csv( "Loaded {} rows in total ({} genome and {} protein files)", row_count, genome_count, protein_count ); - Ok(results) + let n_fastas = genome_count + protein_count; + Ok((results, n_fastas)) } fn process_reads_csv( mut rdr: csv::Reader, -) -> Result, String)>> { + // ) -> Result, String)>> { +) -> Result<(Vec<(String, Vec, String)>, usize)> { let mut results = Vec::new(); let mut processed_rows = std::collections::HashSet::new(); + let mut read1_count = 0; + let mut read2_count = 0; let mut duplicate_count = 0; for result in rdr.records() { @@ -259,19 +261,31 @@ fn process_reads_csv( let read1 = record .get(1) .ok_or_else(|| anyhow!("Missing 'read1' field"))?; + read1_count += 1; + let read2 = record.get(2); // No error if missing + let mut paths = vec![PathBuf::from(read1)]; let read2 = record .get(2) - .ok_or_else(|| anyhow!("Missing 'read2' field"))?; - let paths = vec![PathBuf::from(read1), PathBuf::from(read2)]; - results.push((name, paths, "alternate".to_string())); + .and_then(|r2| if r2.is_empty() { None } else { Some(r2) }); + if let Some(r2) = read2 { + paths.push(PathBuf::from(r2)); + read2_count += 1; + } + results.push((name, paths, "dna".to_string())); } - if duplicate_count > 0 { - println!("Warning: {} duplicated rows were skipped.", duplicate_count); - } - println!("Loaded alternate CSV variant."); + println!("Found 'reads' CSV, assuming all files are DNA."); + println!( + "Loaded {} rows in total ({} with read1 and {} with read2), {} duplicates skipped.", + processed_rows.len(), + read1_count, + read2_count, + duplicate_count + ); + + let n_fastas = read1_count + read2_count; - Ok(results) + Ok((results, n_fastas)) } // Load all compatible minhashes from a collection into memory From 42a6ddfc4bbc28be9dc108d62ca98adf02c225a0 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Mon, 26 Feb 2024 17:22:19 -0800 Subject: [PATCH 05/11] cleanup --- src/utils.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/utils.rs b/src/utils.rs index 9a21f50c..76d34f8e 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -262,8 +262,8 @@ fn process_reads_csv( .get(1) .ok_or_else(|| anyhow!("Missing 'read1' field"))?; read1_count += 1; - let read2 = record.get(2); // No error if missing let mut paths = vec![PathBuf::from(read1)]; + // allow missing read2 let read2 = record .get(2) .and_then(|r2| if r2.is_empty() { None } else { Some(r2) }); From 8801772529e6f1d2756a64183fce4644f9630430 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Mon, 26 Feb 2024 17:44:36 -0800 Subject: [PATCH 06/11] test singleton with reads csv --- src/python/tests/test_sketch.py | 48 +++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/src/python/tests/test_sketch.py b/src/python/tests/test_sketch.py index c2ff1742..b05703b6 100644 --- a/src/python/tests/test_sketch.py +++ b/src/python/tests/test_sketch.py @@ -541,3 +541,51 @@ def test_manysketch_reads(runtmp, capfd): assert sig == sig1 if sig.name == 'short3': assert sig == sig2 + + +def test_manysketch_reads_singleton(runtmp, capfd): + fa_csv = runtmp.output('db-fa.csv') + + fa1 = get_test_data('short.fa') + fa2 = get_test_data('short2.fa') + fa3 = get_test_data('short3.fa') + + make_reads_csv(fa_csv, [("short", fa2, fa3), ]) + + output = runtmp.output('db.zip') + + runtmp.sourmash('scripts', 'manysketch', fa_csv, '-o', output, + '--param-str', "dna,k=31,scaled=1", '--singleton') + + assert os.path.exists(output) + assert not runtmp.last_result.out # stdout should be empty + captured = capfd.readouterr() + print(captured.out) + print(captured.err) + assert "Found 'reads' CSV, assuming all files are DNA." in captured.out + assert "Starting file 2/2 (100%)" in captured.err + assert "DONE. Processed 2 fasta files" in captured.err + + idx = sourmash.load_file_as_index(output) + sigs = list(idx.signatures()) + print(sigs) + + assert len(sigs) == 3 + s1 = runtmp.output('singleton.sig') + runtmp.sourmash('sketch', 'dna', fa2, fa3, '-o', s1, + '--param-str', "k=31,scaled=1", '--singleton') + ss = sourmash.load_signatures(s1) + + ss_sketch1 = next(ss) + ss_sketch2 = next(ss) + ss_sketch3 = next(ss) + + expected_signames = ['tr1 4', 'firstname', 'other'] + for sig in sigs: + assert sig.name in expected_signames + if sig.name == 'tr1 4': + assert sig == ss_sketch1 + elif sig.name == 'firstname': + assert sig == ss_sketch2 + elif sig.name == 'other': + assert sig == ss_sketch3 From b974118b4a6d7efa698755b5354c2cdee0435dda Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Mon, 26 Feb 2024 17:51:43 -0800 Subject: [PATCH 07/11] clean up comment --- src/manysketch.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/manysketch.rs b/src/manysketch.rs index 3c5e8067..3846caf4 100644 --- a/src/manysketch.rs +++ b/src/manysketch.rs @@ -182,7 +182,6 @@ pub fn manysketch( // if merging multiple files, sourmash sets filename as last filename let last_filename = filenames.last().unwrap(); - // to do: consider changing reporting to per-sig, no matter how many fastas? but singleton... for filename in filenames { // increment processed_fastas counter; make 1-based for % reporting let i = processed_fastas.fetch_add(1, atomic::Ordering::SeqCst); From b11d91231f87fd54f224c835be215729a3654832 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Tue, 27 Feb 2024 09:00:17 -0800 Subject: [PATCH 08/11] add some documentation! --- doc/README.md | 26 +++++++++++++++++-- .../sourmash_plugin_branchwater/__init__.py | 6 ++--- 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/doc/README.md b/doc/README.md index d86b452d..f33cbdbd 100644 --- a/doc/README.md +++ b/doc/README.md @@ -88,9 +88,15 @@ When using a fromfile for search, we load all signatures into memory at the star ### Running `manysketch` -The `manysketch` command sketches one or more FASTA/FASTQ files into a zipped sourmash signature collection (`zip`). `manysketch` uses one thread per input file, so it can (very) efficiently sketch many files at once; and, because sequence file parsing is entirely implemented in Rust, it is much, _much_ faster than `sourmash sketch` for large FASTQ files. +The `manysketch` command sketches one or more FASTA/FASTQ files into a zipped sourmash signature collection (`zip`). `manysketch` uses one thread per input file, so it can (very) efficiently sketch many files at once; and, because sequence file parsing is entirely implemented in Rust, it is much, _much_ faster than `sourmash sketch` for large FASTQ files. However, it does not currently support translation, i.e. protein signature generation from DNA FASTA. -To run `manysketch`, you need to build a text file list of FASTA/FASTQ files, with one on each line (`manysketch.csv`, below). A simple way to do this for a directory is this command snippet: +To run `manysketch`, you need to build a text file list of FASTA/FASTQ files (see `manysketch.csv` example, below). + +The following formats are accepted: +- 3 columns: `name,genome_filename,protein_filename`. `genome_filename` entries considered DNA FASTA, `protein_filename` entries are considered protein FASTA. +- 3 columns: `name,read1,read2`. All entries considered DNA FASTA, and both `read1` and `read2` files are used as input for a single sketch with name `name`. + +A simple way to build a manysketch input file for a directory is this command snippet: ``` echo name,genome_filename,protein_filename > manysketch.csv for i in *.fa.gz @@ -117,6 +123,22 @@ sourmash scripts manysketch fa.csv -o fa.zip -p k=21,k=31,k=51,scaled=1000,abund ``` See [the sourmash sketch docs](https://sourmash.readthedocs.io/en/latest/command-line.html#sourmash-sketch-make-sourmash-signatures-from-sequence-data) for more information on param strings. +`manysketch` also supports building sketches for each record in a FASTA file independently (`--singleton`). + +You can run: + +``` +sourmash scripts manysketch manysketch.csv -o fa.zip --singleton +``` +The output will be written to `fa.zip` + +You can check if all signatures were written properly with +``` +sourmash sig summarize fa.zip +``` +> Here, the number of sketches per parameter combination should equal the total number of records in all input FASTA. +> The `name` column will not be used. Instead, each sketch will be named from the FASTA record name. + ### Running `multisearch` The `multisearch` command compares one or more query genomes, and one or more subject genomes. It differs from `manysearch` by loading all genomes into memory. diff --git a/src/python/sourmash_plugin_branchwater/__init__.py b/src/python/sourmash_plugin_branchwater/__init__.py index fef4c3ba..ff566fcb 100755 --- a/src/python/sourmash_plugin_branchwater/__init__.py +++ b/src/python/sourmash_plugin_branchwater/__init__.py @@ -328,8 +328,8 @@ class Branchwater_Manysketch(CommandLinePlugin): def __init__(self, p): super().__init__(p) - p.add_argument('fromfile_csv', help="a csv file containing paths to fasta files. \ - Columns must be: 'name,genome_filename,protein_filename'") + p.add_argument('fromfile_csv', help="a csv file containing paths to FASTA files. \ + Columns must be: 'name,genome_filename,protein_filename' or 'name,read1,read2'") p.add_argument('-o', '--output', required=True, help='output zip file for the signatures') p.add_argument('-p', '--param-string', action='append', type=str, default=[], @@ -337,7 +337,7 @@ def __init__(self, p): p.add_argument('-c', '--cores', default=0, type=int, help='number of cores to use (default is all available)') p.add_argument('-s', '--singleton', action="store_true", - help='build one sketch per fasta record') + help='build one sketch per FASTA record, i.e. multiple sketches per FASTA file') def main(self, args): print_version() From 444ec84a9dc3e6a537bd6ce8cffa39fe8d34fd5c Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Tue, 27 Feb 2024 09:04:17 -0800 Subject: [PATCH 09/11] upd --- doc/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/README.md b/doc/README.md index f33cbdbd..6668b013 100644 --- a/doc/README.md +++ b/doc/README.md @@ -136,8 +136,8 @@ You can check if all signatures were written properly with ``` sourmash sig summarize fa.zip ``` -> Here, the number of sketches per parameter combination should equal the total number of records in all input FASTA. -> The `name` column will not be used. Instead, each sketch will be named from the FASTA record name. +The number of sketches per parameter combination should equal the total number of records in all input FASTA. +The `name` column will not be used. Instead, each sketch will be named from the FASTA record name. ### Running `multisearch` From c815d964838593cca0c575ea17a8c5f11c7e1bfa Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Tue, 27 Feb 2024 09:08:37 -0800 Subject: [PATCH 10/11] upd doc --- doc/README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/doc/README.md b/doc/README.md index 6668b013..bb8aacd9 100644 --- a/doc/README.md +++ b/doc/README.md @@ -90,10 +90,12 @@ When using a fromfile for search, we load all signatures into memory at the star The `manysketch` command sketches one or more FASTA/FASTQ files into a zipped sourmash signature collection (`zip`). `manysketch` uses one thread per input file, so it can (very) efficiently sketch many files at once; and, because sequence file parsing is entirely implemented in Rust, it is much, _much_ faster than `sourmash sketch` for large FASTQ files. However, it does not currently support translation, i.e. protein signature generation from DNA FASTA. +#### specifying input FASTA + To run `manysketch`, you need to build a text file list of FASTA/FASTQ files (see `manysketch.csv` example, below). The following formats are accepted: -- 3 columns: `name,genome_filename,protein_filename`. `genome_filename` entries considered DNA FASTA, `protein_filename` entries are considered protein FASTA. +- 3 columns: `name,genome_filename,protein_filename`. `genome_filename` entries are considered DNA FASTA, `protein_filename` entries are considered protein FASTA. - 3 columns: `name,read1,read2`. All entries considered DNA FASTA, and both `read1` and `read2` files are used as input for a single sketch with name `name`. A simple way to build a manysketch input file for a directory is this command snippet: @@ -123,6 +125,8 @@ sourmash scripts manysketch fa.csv -o fa.zip -p k=21,k=31,k=51,scaled=1000,abund ``` See [the sourmash sketch docs](https://sourmash.readthedocs.io/en/latest/command-line.html#sourmash-sketch-make-sourmash-signatures-from-sequence-data) for more information on param strings. +#### singleton sketching + `manysketch` also supports building sketches for each record in a FASTA file independently (`--singleton`). You can run: From f16ad222085339381c7854586fe39d1facca48a6 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Tue, 27 Feb 2024 09:19:31 -0800 Subject: [PATCH 11/11] upd doc formatting --- doc/README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/doc/README.md b/doc/README.md index bb8aacd9..9767048d 100644 --- a/doc/README.md +++ b/doc/README.md @@ -95,8 +95,10 @@ The `manysketch` command sketches one or more FASTA/FASTQ files into a zipped so To run `manysketch`, you need to build a text file list of FASTA/FASTQ files (see `manysketch.csv` example, below). The following formats are accepted: -- 3 columns: `name,genome_filename,protein_filename`. `genome_filename` entries are considered DNA FASTA, `protein_filename` entries are considered protein FASTA. -- 3 columns: `name,read1,read2`. All entries considered DNA FASTA, and both `read1` and `read2` files are used as input for a single sketch with name `name`. +- 3 columns: `name,genome_filename,protein_filename` + >`genome_filename` entries are considered DNA FASTA, `protein_filename` entries are considered protein FASTA. +- 3 columns: `name,read1,read2` + > All entries considered DNA FASTA, and both `read1` and `read2` files are used as input for a single sketch with name `name`. A simple way to build a manysketch input file for a directory is this command snippet: ``` @@ -127,7 +129,7 @@ See [the sourmash sketch docs](https://sourmash.readthedocs.io/en/latest/command #### singleton sketching -`manysketch` also supports building sketches for each record in a FASTA file independently (`--singleton`). +`manysketch` also supports building independent sketches for each record in a FASTA file (`--singleton`). You can run: