diff --git a/doc/README.md b/doc/README.md index d86b452d..9767048d 100644 --- a/doc/README.md +++ b/doc/README.md @@ -88,9 +88,19 @@ When using a fromfile for search, we load all signatures into memory at the star ### Running `manysketch` -The `manysketch` command sketches one or more FASTA/FASTQ files into a zipped sourmash signature collection (`zip`). `manysketch` uses one thread per input file, so it can (very) efficiently sketch many files at once; and, because sequence file parsing is entirely implemented in Rust, it is much, _much_ faster than `sourmash sketch` for large FASTQ files. +The `manysketch` command sketches one or more FASTA/FASTQ files into a zipped sourmash signature collection (`zip`). `manysketch` uses one thread per input file, so it can (very) efficiently sketch many files at once; and, because sequence file parsing is entirely implemented in Rust, it is much, _much_ faster than `sourmash sketch` for large FASTQ files. However, it does not currently support translation, i.e. protein signature generation from DNA FASTA. -To run `manysketch`, you need to build a text file list of FASTA/FASTQ files, with one on each line (`manysketch.csv`, below). A simple way to do this for a directory is this command snippet: +#### specifying input FASTA + +To run `manysketch`, you need to build a CSV file listing your FASTA/FASTQ files (see the `manysketch.csv` example, below). + +The following formats are accepted: +- 3 columns: `name,genome_filename,protein_filename` + >`genome_filename` entries are considered DNA FASTA, `protein_filename` entries are considered protein FASTA. +- 3 columns: `name,read1,read2` + > All entries are considered DNA FASTA, and both `read1` and `read2` files are used as input for a single sketch with name `name`. 
+ +A simple way to build a manysketch input file for a directory is this command snippet: ``` echo name,genome_filename,protein_filename > manysketch.csv for i in *.fa.gz @@ -117,6 +127,24 @@ sourmash scripts manysketch fa.csv -o fa.zip -p k=21,k=31,k=51,scaled=1000,abund ``` See [the sourmash sketch docs](https://sourmash.readthedocs.io/en/latest/command-line.html#sourmash-sketch-make-sourmash-signatures-from-sequence-data) for more information on param strings. +#### singleton sketching + +`manysketch` also supports building independent sketches for each record in a FASTA file (`--singleton`). + +You can run: + +``` +sourmash scripts manysketch manysketch.csv -o fa.zip --singleton +``` +The output will be written to `fa.zip`. + +You can check if all signatures were written properly with: +``` +sourmash sig summarize fa.zip +``` +The number of sketches per parameter combination should equal the total number of records across all input FASTA files. +The `name` column will not be used. Instead, each sketch will be named after its FASTA record name. + ### Running `multisearch` The `multisearch` command compares one or more query genomes, and one or more subject genomes. It differs from `manysearch` by loading all genomes into memory. 
diff --git a/src/lib.rs b/src/lib.rs index 99264b87..5b7da2de 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -259,8 +259,13 @@ fn do_pairwise( } #[pyfunction] -fn do_manysketch(filelist: String, param_str: String, output: String) -> anyhow::Result { - match manysketch::manysketch(filelist, param_str, output) { +fn do_manysketch( + filelist: String, + param_str: String, + output: String, + singleton: bool, +) -> anyhow::Result { + match manysketch::manysketch(filelist, param_str, output, singleton) { Ok(_) => Ok(0), Err(e) => { eprintln!("Error: {e}"); diff --git a/src/manysketch.rs b/src/manysketch.rs index 485ed2b3..3846caf4 100644 --- a/src/manysketch.rs +++ b/src/manysketch.rs @@ -117,14 +117,14 @@ pub fn manysketch( filelist: String, param_str: String, output: String, + singleton: bool, ) -> Result<(), Box> { - let fileinfo = match load_fasta_fromfile(filelist) { - Ok(result) => result, + let (fileinfo, n_fastas) = match load_fasta_fromfile(filelist) { + Ok((file_info, n_fastas)) => (file_info, n_fastas), Err(e) => bail!("Could not load fromfile csv. 
Underlying error: {}", e), }; // if no files to process, exit with error - let n_fastas = fileinfo.len(); if n_fastas == 0 { bail!("No files to load, exiting."); } @@ -165,70 +165,92 @@ pub fn manysketch( let send_result = fileinfo .par_iter() - .filter_map(|(name, filename, moltype)| { - // increment processed_fastas counter; make 1-based for % reporting - let i = processed_fastas.fetch_add(1, atomic::Ordering::SeqCst); - // progress report at threshold - if (i + 1) % reporting_threshold == 0 { - let percent_processed = (((i + 1) as f64 / n_fastas as f64) * 100.0).round(); - eprintln!( - "Starting file {}/{} ({}%)", - (i + 1), - n_fastas, - percent_processed - ); - } - - // build sig templates from params - let mut sigs = build_siginfo(&params_vec, moltype); - // if no sigs to build, skip - if sigs.is_empty() { - let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); + .filter_map(|(name, filenames, moltype)| { + let mut allsigs = Vec::new(); + // build sig templates for these sketches from params, check if there are sigs to build + let sig_templates = build_siginfo(&params_vec, moltype); + // if no sigs to build, skip this iteration + if sig_templates.is_empty() { + skipped_paths.fetch_add(filenames.len(), atomic::Ordering::SeqCst); + processed_fastas.fetch_add(1, atomic::Ordering::SeqCst); return None; } - // Open fasta file reader - let mut reader = match parse_fastx_file(filename) { - Ok(r) => r, - Err(err) => { - eprintln!("Error opening file {}: {:?}", filename, err); - let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); - return None; - } - }; - // parse fasta and add to signature + let mut sigs = sig_templates.clone(); + // have name / filename been set for each sig yet? let mut set_name = false; - while let Some(record_result) = reader.next() { - match record_result { - Ok(record) => { - // do we need to normalize to make sure all the bases are consistently capitalized? 
- // let norm_seq = record.normalize(false); - sigs.iter_mut().for_each(|sig| { - if !set_name { - sig.set_name(name); - sig.set_filename(filename.as_str()); - set_name = true; - }; - if moltype == "protein" { - sig.add_protein(&record.seq()) - .expect("Failed to add protein"); - } else { - sig.add_sequence(&record.seq(), true) - .expect("Failed to add sequence"); - // if not force, panics with 'N' in dna sequence - } - }); + // if merging multiple files, sourmash sets filename as last filename + let last_filename = filenames.last().unwrap(); + + for filename in filenames { + // increment processed_fastas counter; make 1-based for % reporting + let i = processed_fastas.fetch_add(1, atomic::Ordering::SeqCst); + // progress report at threshold + if (i + 1) % reporting_threshold == 0 { + let percent_processed = (((i + 1) as f64 / n_fastas as f64) * 100.0).round(); + eprintln!( + "Starting file {}/{} ({}%)", + (i + 1), + n_fastas, + percent_processed + ); + } + + // Open fasta file reader + let mut reader = match parse_fastx_file(filename) { + Ok(r) => r, + Err(err) => { + eprintln!("Error opening file {}: {:?}", filename, err); + failed_paths.fetch_add(1, atomic::Ordering::SeqCst); + return None; + } + }; + + // parse fasta and add to signature + while let Some(record_result) = reader.next() { + match record_result { + Ok(record) => { + // do we need to normalize to make sure all the bases are consistently capitalized? 
+ // let norm_seq = record.normalize(false); + sigs.iter_mut().for_each(|sig| { + if singleton { + let record_name = std::str::from_utf8(record.id()) + .expect("could not get record id"); + sig.set_name(record_name); + sig.set_filename(filename.as_str()); + } else if !set_name { + sig.set_name(name); + // sourmash sets filename to last filename if merging fastas + sig.set_filename(last_filename.as_str()); + set_name = true; + }; + if moltype == "protein" { + sig.add_protein(&record.seq()) + .expect("Failed to add protein"); + } else { + sig.add_sequence(&record.seq(), true) + .expect("Failed to add sequence"); + // if not force, panics with 'N' in dna sequence + } + }); + } + Err(err) => eprintln!("Error while processing record: {:?}", err), + } + if singleton { + allsigs.append(&mut sigs); + sigs = sig_templates.clone(); } - Err(err) => eprintln!("Error while processing record: {:?}", err), } } - - Some(sigs) + if !singleton { + allsigs.append(&mut sigs); + } + Some(allsigs) }) .try_for_each_with( send.clone(), - |s: &mut std::sync::Arc>, sigs| { - if let Err(e) = s.send(ZipMessage::SignatureData(sigs)) { + |s: &mut std::sync::Arc>, filled_sigs| { + if let Err(e) = s.send(ZipMessage::SignatureData(filled_sigs)) { Err(format!("Unable to send internal data: {:?}", e)) } else { Ok(()) diff --git a/src/python/sourmash_plugin_branchwater/__init__.py b/src/python/sourmash_plugin_branchwater/__init__.py index e1912817..ff566fcb 100755 --- a/src/python/sourmash_plugin_branchwater/__init__.py +++ b/src/python/sourmash_plugin_branchwater/__init__.py @@ -328,14 +328,16 @@ class Branchwater_Manysketch(CommandLinePlugin): def __init__(self, p): super().__init__(p) - p.add_argument('fromfile_csv', help="a csv file containing paths to fasta files. \ - Columns must be: 'name,genome_filename,protein_filename'") + p.add_argument('fromfile_csv', help="a csv file containing paths to FASTA files. 
\ + Columns must be: 'name,genome_filename,protein_filename' or 'name,read1,read2'") p.add_argument('-o', '--output', required=True, help='output zip file for the signatures') p.add_argument('-p', '--param-string', action='append', type=str, default=[], help='parameter string for sketching (default: k=31,scaled=1000)') p.add_argument('-c', '--cores', default=0, type=int, help='number of cores to use (default is all available)') + p.add_argument('-s', '--singleton', action="store_true", + help='build one sketch per FASTA record, i.e. multiple sketches per FASTA file') def main(self, args): print_version() @@ -355,7 +357,8 @@ def main(self, args): super().main(args) status = sourmash_plugin_branchwater.do_manysketch(args.fromfile_csv, args.param_string, - args.output) + args.output, + args.singleton) if status == 0: notify(f"...manysketch is done! results in '{args.output}'") return status diff --git a/src/python/tests/test_sketch.py b/src/python/tests/test_sketch.py index 88c99dcf..b05703b6 100644 --- a/src/python/tests/test_sketch.py +++ b/src/python/tests/test_sketch.py @@ -12,7 +12,7 @@ def get_test_data(filename): return os.path.join(thisdir, 'test-data', filename) -def make_file_csv(filename, genome_paths, protein_paths = []): +def make_assembly_csv(filename, genome_paths, protein_paths = []): # equalize path lengths by adding "". 
names = [os.path.basename(x).split('.fa')[0] for x in genome_paths] if len(protein_paths) < len(genome_paths): @@ -26,6 +26,14 @@ def make_file_csv(filename, genome_paths, protein_paths = []): for name, genome_path, protein_path in zip(names, genome_paths, protein_paths): fp.write("{},{},{}\n".format(name, genome_path, protein_path)) +def make_reads_csv(filename, reads_tuples = []): + # reads tuples should be (name,read1,read2) + with open(filename, 'wt') as fp: + fp.write("name,read1,read2\n") + for (name, read1, read2) in reads_tuples: + print(f"{name},{read1},{read2}") + fp.write("{},{},{}\n".format(name, read1, read2)) + def test_installed(runtmp): with pytest.raises(utils.SourmashCommandFailed): @@ -41,7 +49,7 @@ def test_manysketch_simple(runtmp): fa2 = get_test_data('short2.fa') fa3 = get_test_data('short3.fa') - make_file_csv(fa_csv, [fa1, fa2, fa3]) + make_assembly_csv(fa_csv, [fa1, fa2, fa3]) output = runtmp.output('db.zip') @@ -65,7 +73,7 @@ def test_manysketch_mult_k(runtmp): fa2 = get_test_data('short2.fa') fa3 = get_test_data('short3.fa') - make_file_csv(fa_csv, [fa1, fa2, fa3]) + make_assembly_csv(fa_csv, [fa1, fa2, fa3]) output = runtmp.output('db.zip') @@ -89,7 +97,7 @@ def test_manysketch_mult_k_2(runtmp): fa2 = get_test_data('short2.fa') fa3 = get_test_data('short3.fa') - make_file_csv(fa_csv, [fa1, fa2, fa3]) + make_assembly_csv(fa_csv, [fa1, fa2, fa3]) output = runtmp.output('db.zip') @@ -116,7 +124,7 @@ def test_manysketch_mult_moltype(runtmp): fa3 = get_test_data('short3.fa') protfa1 = get_test_data('short-protein.fa') - make_file_csv(fa_csv, [fa1, fa2, fa3], [protfa1]) + make_assembly_csv(fa_csv, [fa1, fa2, fa3], [protfa1]) output = runtmp.output('db.zip') @@ -158,7 +166,7 @@ def test_manysketch_only_incompatible_fastas(runtmp, capfd): fa2 = get_test_data('short2.fa') fa3 = get_test_data('short3.fa') - make_file_csv(fa_csv, [fa1, fa2, fa3]) + make_assembly_csv(fa_csv, [fa1, fa2, fa3]) output = runtmp.output('db.zip') @@ -185,7 +193,7 @@ def 
test_manysketch_skip_incompatible_fastas(runtmp, capfd): fa3 = get_test_data('short3.fa') protfa1 = get_test_data('short-protein.fa') - make_file_csv(fa_csv, [fa1, fa2, fa3], [protfa1]) + make_assembly_csv(fa_csv, [fa1, fa2, fa3], [protfa1]) output = runtmp.output('db.zip') @@ -208,9 +216,6 @@ def test_manysketch_skip_incompatible_fastas(runtmp, capfd): assert sig.minhash.ksize == 10 assert sig.minhash.scaled == 1 assert sig.md5sum() == "eb4467d11e0ecd2dbde4193bfc255310" - assert 'Starting file 2/4 (50%)' in captured.err - assert 'Starting file 3/4 (75%)' in captured.err - assert 'Starting file 4/4 (100%)' in captured.err assert 'DONE. Processed 4 fasta files' in captured.err assert 'WARNING: 3 fasta files skipped - no compatible signatures.' in captured.err @@ -238,7 +243,7 @@ def test_manysketch_bad_fa_csv(runtmp, capfd): sig47 = get_test_data('47.fa.sig.gz') sig63 = get_test_data('63.fa.sig.gz') - make_file_csv(siglist, [sig2, sig47, sig63]) + make_assembly_csv(siglist, [sig2, sig47, sig63]) output = runtmp.output('db.zip') @@ -276,7 +281,7 @@ def test_manysketch_bad_fa_csv_3(runtmp, capfd): protfa1 = get_test_data('short-protein.fa') # make file csv but don't fill empty protein rows with ,"" - make_file_csv(fa_csv, [fa1, fa2, fa3], [protfa1]) + make_assembly_csv(fa_csv, [fa1, fa2, fa3], [protfa1]) g_fa = [fa1, fa2, fa3] p_fa = [protfa1] with open(fa_csv, 'wt') as fp: @@ -305,7 +310,7 @@ def test_manysketch_empty_fa_csv(runtmp, capfd): # test empty fa_csv file fa_csv = runtmp.output('fa.txt') output = runtmp.output('out.zip') - make_file_csv(fa_csv, []) # empty + make_assembly_csv(fa_csv, []) # empty with pytest.raises(utils.SourmashCommandFailed): runtmp.sourmash('scripts', 'manysketch', fa_csv, @@ -324,7 +329,7 @@ def test_manysketch_duplicated_rows(runtmp, capfd): fa3 = get_test_data('short3.fa') protfa1 = get_test_data('short-protein.fa') - make_file_csv(fa_csv, [fa1, fa1, fa1, fa3]) + make_assembly_csv(fa_csv, [fa1, fa1, fa1, fa3]) output = 
runtmp.output('db.zip') @@ -353,7 +358,7 @@ def test_manysketch_N_in_dna(runtmp): fp.write(">bad\n") fp.write("ACAGTN\n") - make_file_csv(fa_csv, [fa1]) + make_assembly_csv(fa_csv, [fa1]) output = runtmp.output('db.zip') @@ -378,7 +383,7 @@ def test_zip_manifest(runtmp, capfd): fa2 = get_test_data('short2.fa') fa3 = get_test_data('short3.fa') - make_file_csv(fa_csv, [fa1, fa2, fa3]) + make_assembly_csv(fa_csv, [fa1, fa2, fa3]) output = runtmp.output('db.zip') runtmp.sourmash('scripts', 'manysketch', fa_csv, '-o', output, @@ -417,7 +422,7 @@ def test_protein_zip_manifest(runtmp, capfd): fa1 = get_test_data('short.fa') fa2 = get_test_data('short-protein.fa') - make_file_csv(fa_csv, [fa1], [fa2]) + make_assembly_csv(fa_csv, [fa1], [fa2]) output = runtmp.output('db.zip') runtmp.sourmash('scripts', 'manysketch', fa_csv, '-o', output, @@ -452,3 +457,135 @@ def test_protein_zip_manifest(runtmp, capfd): assert sig.minhash.ksize == 10 # minhash stores k*3, but does the conversion back for us assert sig.minhash.moltype == 'protein' assert sig.minhash.scaled == 1 + + +def test_manysketch_singleton(runtmp): + fa_csv = runtmp.output('db-fa.txt') + + fa1 = get_test_data('short.fa') + fa2 = get_test_data('short2.fa') + fa3 = get_test_data('short3.fa') + + make_assembly_csv(fa_csv, [fa1, fa2, fa3]) + + output = runtmp.output('db.zip') + + runtmp.sourmash('scripts', 'manysketch', fa_csv, '-o', output, + '--param-str', "dna,k=31,scaled=1", "--singleton") + + assert os.path.exists(output) + assert not runtmp.last_result.out # stdout should be empty + + idx = sourmash.load_file_as_index(output) + sigs = list(idx.signatures()) + print(sigs) + + assert len(sigs) == 4 + singleton_sketch = runtmp.output('short3.sig') + runtmp.sourmash('sketch', 'dna', fa3, '-o', singleton_sketch, + '--param-str', "dna,k=31,scaled=1", "--singleton") + ss_sketch = sourmash.load_signatures(singleton_sketch) + ss_sketch1 = next(ss_sketch) + ss_sketch2 = next(ss_sketch) + + expected_signames = ['shortName', 
'tr1 4', 'firstname', 'other'] + for sig in sigs: + assert sig.name in expected_signames + if sig.name == 'firstname': + assert sig == ss_sketch1 + if sig.name == 'other': + assert sig == ss_sketch2 + + +def test_manysketch_reads(runtmp, capfd): + fa_csv = runtmp.output('db-fa.csv') + + fa1 = get_test_data('short.fa') + fa2 = get_test_data('short2.fa') + fa3 = get_test_data('short3.fa') + + make_reads_csv(fa_csv, [("short", fa1, fa2), ('short3', fa3, '')]) # make sure we can just do read1 alone + + output = runtmp.output('db.zip') + + runtmp.sourmash('scripts', 'manysketch', fa_csv, '-o', output, + '--param-str', "dna,k=31,scaled=1") + + assert os.path.exists(output) + assert not runtmp.last_result.out # stdout should be empty + captured = capfd.readouterr() + print(captured.out) + print(captured.err) + assert "Found 'reads' CSV, assuming all files are DNA." in captured.out + assert "Starting file 3/3 (100%)" in captured.err + assert "DONE. Processed 3 fasta files" in captured.err + + idx = sourmash.load_file_as_index(output) + sigs = list(idx.signatures()) + print(sigs) + + assert len(sigs) == 2 + s1 = runtmp.output('short.sig') + runtmp.sourmash('sketch', 'dna', fa1, fa2, '-o', s1, + '--param-str', "k=31,scaled=1", '--name', 'short') + sig1 = sourmash.load_one_signature(s1) + s3 = runtmp.output('short3.sig') + runtmp.sourmash('sketch', 'dna', fa3, '-o', s3, + '--param-str', "k=31,scaled=1", '--name', 'short3') + sig2 = sourmash.load_one_signature(s3) + + expected_signames = ['short', 'short3'] + for sig in sigs: + assert sig.name in expected_signames + if sig.name == 'short': + assert sig == sig1 + if sig.name == 'short3': + assert sig == sig2 + + +def test_manysketch_reads_singleton(runtmp, capfd): + fa_csv = runtmp.output('db-fa.csv') + + fa1 = get_test_data('short.fa') + fa2 = get_test_data('short2.fa') + fa3 = get_test_data('short3.fa') + + make_reads_csv(fa_csv, [("short", fa2, fa3), ]) + + output = runtmp.output('db.zip') + + runtmp.sourmash('scripts', 
'manysketch', fa_csv, '-o', output, + '--param-str', "dna,k=31,scaled=1", '--singleton') + + assert os.path.exists(output) + assert not runtmp.last_result.out # stdout should be empty + captured = capfd.readouterr() + print(captured.out) + print(captured.err) + assert "Found 'reads' CSV, assuming all files are DNA." in captured.out + assert "Starting file 2/2 (100%)" in captured.err + assert "DONE. Processed 2 fasta files" in captured.err + + idx = sourmash.load_file_as_index(output) + sigs = list(idx.signatures()) + print(sigs) + + assert len(sigs) == 3 + s1 = runtmp.output('singleton.sig') + runtmp.sourmash('sketch', 'dna', fa2, fa3, '-o', s1, + '--param-str', "k=31,scaled=1", '--singleton') + ss = sourmash.load_signatures(s1) + + ss_sketch1 = next(ss) + ss_sketch2 = next(ss) + ss_sketch3 = next(ss) + + expected_signames = ['tr1 4', 'firstname', 'other'] + for sig in sigs: + assert sig.name in expected_signames + if sig.name == 'tr1 4': + assert sig == ss_sketch1 + elif sig.name == 'firstname': + assert sig == ss_sketch2 + elif sig.name == 'other': + assert sig == ss_sketch3 diff --git a/src/utils.rs b/src/utils.rs index e95f865f..76d34f8e 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -7,7 +7,6 @@ use anyhow::{anyhow, Context, Result}; use camino::Utf8Path as Path; use camino::Utf8PathBuf as PathBuf; use csv::Writer; -use serde::ser::Serializer; use serde::{Deserialize, Serialize}; use std::cmp::{Ordering, PartialOrd}; use std::collections::BinaryHeap; @@ -131,22 +130,51 @@ pub fn write_prefetch( Ok(()) } -pub fn load_fasta_fromfile(sketchlist_filename: String) -> Result> { +enum CSVType { + Assembly, + Reads, + Unknown, +} + +fn detect_csv_type(headers: &csv::StringRecord) -> CSVType { + if headers.len() == 3 + && headers.get(0).unwrap() == "name" + && headers.get(1).unwrap() == "genome_filename" + && headers.get(2).unwrap() == "protein_filename" + { + CSVType::Assembly + } else if headers.len() == 3 + && headers.get(0).unwrap() == "name" + && 
headers.get(1).unwrap() == "read1" + && headers.get(2).unwrap() == "read2" + { + CSVType::Reads + } else { + CSVType::Unknown + } +} + +pub fn load_fasta_fromfile( + sketchlist_filename: String, +) -> Result<(Vec<(String, Vec, String)>, usize)> { let mut rdr = csv::Reader::from_path(sketchlist_filename)?; // Check for right header let headers = rdr.headers()?; - if headers.len() != 3 - || headers.get(0).unwrap() != "name" - || headers.get(1).unwrap() != "genome_filename" - || headers.get(2).unwrap() != "protein_filename" - { - return Err(anyhow!( - "Invalid header. Expected 'name,genome_filename,protein_filename', but got '{}'", + + match detect_csv_type(&headers) { + CSVType::Assembly => process_assembly_csv(rdr), + CSVType::Reads => process_reads_csv(rdr), + CSVType::Unknown => Err(anyhow!( + "Invalid header. Expected 'name,genome_filename,protein_filename' or 'name,read1,read2', but got '{}'", headers.iter().collect::>().join(",") - )); + )), } +} +fn process_assembly_csv( + mut rdr: csv::Reader, +) -> Result<(Vec<(String, Vec, String)>, usize)> { let mut results = Vec::new(); let mut row_count = 0; @@ -172,24 +200,27 @@ pub fn load_fasta_fromfile(sketchlist_filename: String) -> Result Result, + // ) -> Result, String)>> { +) -> Result<(Vec<(String, Vec, String)>, usize)> { + let mut results = Vec::new(); + let mut processed_rows = std::collections::HashSet::new(); + let mut read1_count = 0; + let mut read2_count = 0; + let mut duplicate_count = 0; + + for result in rdr.records() { + let record = result?; + let row_string = record.iter().collect::>().join(","); + if processed_rows.contains(&row_string) { + duplicate_count += 1; + continue; + } + processed_rows.insert(row_string.clone()); + + let name = record + .get(0) + .ok_or_else(|| anyhow!("Missing 'name' field"))? 
+ .to_string(); + let read1 = record + .get(1) + .ok_or_else(|| anyhow!("Missing 'read1' field"))?; + read1_count += 1; + let mut paths = vec![PathBuf::from(read1)]; + // allow missing read2 + let read2 = record + .get(2) + .and_then(|r2| if r2.is_empty() { None } else { Some(r2) }); + if let Some(r2) = read2 { + paths.push(PathBuf::from(r2)); + read2_count += 1; + } + results.push((name, paths, "dna".to_string())); + } + + println!("Found 'reads' CSV, assuming all files are DNA."); + println!( + "Loaded {} rows in total ({} with read1 and {} with read2), {} duplicates skipped.", + processed_rows.len(), + read1_count, + read2_count, + duplicate_count + ); + + let n_fastas = read1_count + read2_count; + + Ok((results, n_fastas)) } // Load all compatible minhashes from a collection into memory @@ -771,36 +856,6 @@ pub struct MultiSearchResult { pub max_containment_ani: Option, } -#[derive(Serialize)] -pub struct ManifestRow { - pub md5: String, - pub md5short: String, - pub ksize: u32, - pub moltype: String, - pub num: u32, - pub scaled: u64, - pub n_hashes: usize, - pub with_abundance: BoolPython, - pub name: String, - pub filename: String, - pub internal_location: String, -} - -// A wrapper type for booleans to customize serialization -pub struct BoolPython(bool); - -impl Serialize for BoolPython { - fn serialize(&self, serializer: S) -> Result - where - S: Serializer, - { - match self.0 { - true => serializer.serialize_str("True"), - false => serializer.serialize_str("False"), - } - } -} - pub fn open_stdout_or_file(output: Option) -> Box { // if output is a file, use open_output_file if let Some(path) = output {