sourmash-bio · bluegenes · Feb 27, 2024 · Feb 26, 2024 · Feb 26, 2024 · Feb 26, 2024
diff --git a/doc/README.md b/doc/README.md
@@ -88,9 +88,19 @@ When using a fromfile for search, we load all signatures into memory at the star
 
 ### Running `manysketch`
 
-The `manysketch` command sketches one or more FASTA/FASTQ files into a zipped sourmash signature collection (`zip`). `manysketch` uses one thread per input file, so it can (very) efficiently sketch many files at once; and, because sequence file parsing is entirely implemented in Rust, it is much, _much_ faster than `sourmash sketch` for large FASTQ files.
+The `manysketch` command sketches one or more FASTA/FASTQ files into a zipped sourmash signature collection (`zip`). `manysketch` uses one thread per input file, so it can (very) efficiently sketch many files at once; and, because sequence file parsing is entirely implemented in Rust, it is much, _much_ faster than `sourmash sketch` for large FASTQ files. However, it does not currently support translation, i.e. protein signature generation from DNA FASTA.
 
-To run `manysketch`, you need to build a text file list of FASTA/FASTQ files, with one on each line (`manysketch.csv`, below).  A simple way to do this for a directory is this command snippet:
+#### specifying input FASTA
+
+To run `manysketch`, you need to build a text file list of FASTA/FASTQ files (see `manysketch.csv` example, below).
+
+The following formats are accepted:
+- 3 columns: `name,genome_filename,protein_filename`
+  >`genome_filename` entries are considered DNA FASTA, `protein_filename` entries are considered protein FASTA.
+- 3 columns: `name,read1,read2`
+  > All entries considered DNA FASTA, and both `read1` and `read2` files are used as input for a single sketch with name `name`.
+
+A simple way to build a manysketch input file for a directory is this command snippet:
 ```
 echo name,genome_filename,protein_filename > manysketch.csv
 for i in *.fa.gz
@@ -117,6 +127,24 @@ sourmash scripts manysketch fa.csv -o fa.zip -p k=21,k=31,k=51,scaled=1000,abund
 ```
 See [the sourmash sketch docs](https://sourmash.readthedocs.io/en/latest/command-line.html#sourmash-sketch-make-sourmash-signatures-from-sequence-data) for more information on param strings.
 
+#### singleton sketching
+
+`manysketch` also supports building independent sketches for each record in a FASTA file (`--singleton`).
+
+You can run:
+
+```
+sourmash scripts manysketch manysketch.csv -o fa.zip --singleton
+```
+The output will be written to `fa.zip`
+
+You can check if all signatures were written properly with
+```
+sourmash sig summarize fa.zip
+```
+The number of sketches per parameter combination should equal the total number of records in all input FASTA.
+The `name` column will not be used. Instead, each sketch will be named from the FASTA record name.
+
 ### Running `multisearch`
 
 The `multisearch` command compares one or more query genomes, and one or more subject genomes. It differs from `manysearch` by loading all genomes into memory.

diff --git a/src/lib.rs b/src/lib.rs
@@ -259,8 +259,13 @@ fn do_pairwise(
 }
 
 #[pyfunction]
-fn do_manysketch(filelist: String, param_str: String, output: String) -> anyhow::Result<u8> {
-    match manysketch::manysketch(filelist, param_str, output) {
+fn do_manysketch(
+    filelist: String,
+    param_str: String,
+    output: String,
+    singleton: bool,
+) -> anyhow::Result<u8> {
+    match manysketch::manysketch(filelist, param_str, output, singleton) {
         Ok(_) => Ok(0),
         Err(e) => {
             eprintln!("Error: {e}");

diff --git a/src/manysketch.rs b/src/manysketch.rs
@@ -117,14 +117,14 @@ pub fn manysketch(
     filelist: String,
     param_str: String,
     output: String,
+    singleton: bool,
 ) -> Result<(), Box<dyn std::error::Error>> {
-    let fileinfo = match load_fasta_fromfile(filelist) {
-        Ok(result) => result,
+    let (fileinfo, n_fastas) = match load_fasta_fromfile(filelist) {
+        Ok((file_info, n_fastas)) => (file_info, n_fastas),
         Err(e) => bail!("Could not load fromfile csv. Underlying error: {}", e),
     };
 
     // if no files to process, exit with error
-    let n_fastas = fileinfo.len();
     if n_fastas == 0 {
         bail!("No files to load, exiting.");
     }
@@ -165,70 +165,92 @@ pub fn manysketch(
 
     let send_result = fileinfo
         .par_iter()
-        .filter_map(|(name, filename, moltype)| {
-            // increment processed_fastas counter; make 1-based for % reporting
-            let i = processed_fastas.fetch_add(1, atomic::Ordering::SeqCst);
-            // progress report at threshold
-            if (i + 1) % reporting_threshold == 0 {
-                let percent_processed = (((i + 1) as f64 / n_fastas as f64) * 100.0).round();
-                eprintln!(
-                    "Starting file {}/{} ({}%)",
-                    (i + 1),
-                    n_fastas,
-                    percent_processed
-                );
-            }
-
-            // build sig templates from params
-            let mut sigs = build_siginfo(&params_vec, moltype);
-            // if no sigs to build, skip
-            if sigs.is_empty() {
-                let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst);
+        .filter_map(|(name, filenames, moltype)| {
+            let mut allsigs = Vec::new();
+            // build sig templates for these sketches from params, check if there are sigs to build
+            let sig_templates = build_siginfo(&params_vec, moltype);
+            // if no sigs to build, skip this iteration
+            if sig_templates.is_empty() {
+                skipped_paths.fetch_add(filenames.len(), atomic::Ordering::SeqCst);
+                processed_fastas.fetch_add(1, atomic::Ordering::SeqCst);
                 return None;
             }
 
-            // Open fasta file reader
-            let mut reader = match parse_fastx_file(filename) {
-                Ok(r) => r,
-                Err(err) => {
-                    eprintln!("Error opening file {}: {:?}", filename, err);
-                    let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst);
-                    return None;
-                }
-            };
-            // parse fasta and add to signature
+            let mut sigs = sig_templates.clone();
+            // have name / filename been set for each sig yet?
             let mut set_name = false;
-            while let Some(record_result) = reader.next() {
-                match record_result {
-                    Ok(record) => {
-                        // do we need to normalize to make sure all the bases are consistently capitalized?
-                        // let norm_seq = record.normalize(false);
-                        sigs.iter_mut().for_each(|sig| {
-                            if !set_name {
-                                sig.set_name(name);
-                                sig.set_filename(filename.as_str());
-                                set_name = true;
-                            };
-                            if moltype == "protein" {
-                                sig.add_protein(&record.seq())
-                                    .expect("Failed to add protein");
-                            } else {
-                                sig.add_sequence(&record.seq(), true)
-                                    .expect("Failed to add sequence");
-                                // if not force, panics with 'N' in dna sequence
-                            }
-                        });
+            // if merging multiple files, sourmash sets filename as last filename
+            let last_filename = filenames.last().unwrap();
+
+            for filename in filenames {
+                // increment processed_fastas counter; make 1-based for % reporting
+                let i = processed_fastas.fetch_add(1, atomic::Ordering::SeqCst);
+                // progress report at threshold
+                if (i + 1) % reporting_threshold == 0 {
+                    let percent_processed = (((i + 1) as f64 / n_fastas as f64) * 100.0).round();
+                    eprintln!(
+                        "Starting file {}/{} ({}%)",
+                        (i + 1),
+                        n_fastas,
+                        percent_processed
+                    );
+                }
+
+                // Open fasta file reader
+                let mut reader = match parse_fastx_file(filename) {
+                    Ok(r) => r,
+                    Err(err) => {
+                        eprintln!("Error opening file {}: {:?}", filename, err);
+                        failed_paths.fetch_add(1, atomic::Ordering::SeqCst);
+                        return None;
+                    }
+                };
+
+                // parse fasta and add to signature
+                while let Some(record_result) = reader.next() {
+                    match record_result {
+                        Ok(record) => {
+                            // do we need to normalize to make sure all the bases are consistently capitalized?
+                            // let norm_seq = record.normalize(false);
+                            sigs.iter_mut().for_each(|sig| {
+                                if singleton {
+                                    let record_name = std::str::from_utf8(record.id())
+                                        .expect("could not get record id");
+                                    sig.set_name(record_name);
+                                    sig.set_filename(filename.as_str());
+                                } else if !set_name {
+                                    sig.set_name(name);
+                                    // sourmash sets filename to last filename if merging fastas
+                                    sig.set_filename(last_filename.as_str());
+                                    set_name = true;
+                                };
+                                if moltype == "protein" {
+                                    sig.add_protein(&record.seq())
+                                        .expect("Failed to add protein");
+                                } else {
+                                    sig.add_sequence(&record.seq(), true)
+                                        .expect("Failed to add sequence");
+                                    // if not force, panics with 'N' in dna sequence
+                                }
+                            });
+                        }
+                        Err(err) => eprintln!("Error while processing record: {:?}", err),
+                    }
+                    if singleton {
+                        allsigs.append(&mut sigs);
+                        sigs = sig_templates.clone();
                     }
-                    Err(err) => eprintln!("Error while processing record: {:?}", err),
                 }
             }
-
-            Some(sigs)
+            if !singleton {
+                allsigs.append(&mut sigs);
+            }
+            Some(allsigs)
         })
         .try_for_each_with(
             send.clone(),
-            |s: &mut std::sync::Arc<std::sync::mpsc::SyncSender<ZipMessage>>, sigs| {
-                if let Err(e) = s.send(ZipMessage::SignatureData(sigs)) {
+            |s: &mut std::sync::Arc<std::sync::mpsc::SyncSender<ZipMessage>>, filled_sigs| {
+                if let Err(e) = s.send(ZipMessage::SignatureData(filled_sigs)) {
                     Err(format!("Unable to send internal data: {:?}", e))
                 } else {
                     Ok(())

diff --git a/src/python/sourmash_plugin_branchwater/__init__.py b/src/python/sourmash_plugin_branchwater/__init__.py
@@ -328,14 +328,16 @@ class Branchwater_Manysketch(CommandLinePlugin):
 
     def __init__(self, p):
         super().__init__(p)
-        p.add_argument('fromfile_csv', help="a csv file containing paths to fasta files. \
-                        Columns must be: 'name,genome_filename,protein_filename'")
+        p.add_argument('fromfile_csv', help="a csv file containing paths to FASTA files. \
+                        Columns must be: 'name,genome_filename,protein_filename' or 'name,read1,read2'")
         p.add_argument('-o', '--output', required=True,
                        help='output zip file for the signatures')
         p.add_argument('-p', '--param-string', action='append', type=str, default=[],
                           help='parameter string for sketching (default: k=31,scaled=1000)')
         p.add_argument('-c', '--cores', default=0, type=int,
                        help='number of cores to use (default is all available)')
+        p.add_argument('-s', '--singleton', action="store_true",
+                       help='build one sketch per FASTA record, i.e. multiple sketches per FASTA file')
 
     def main(self, args):
         print_version()
@@ -355,7 +357,8 @@ def main(self, args):
         super().main(args)
         status = sourmash_plugin_branchwater.do_manysketch(args.fromfile_csv,
                                                            args.param_string,
-                                                           args.output)
+                                                           args.output,
+                                                           args.singleton)
         if status == 0:
             notify(f"...manysketch is done! results in '{args.output}'")
         return status