From 999683d1f2d13b5f74a0ca7772dd26527ac16647 Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Mon, 26 Feb 2024 14:13:56 -0800
Subject: [PATCH 01/11] upd manysketch for multiple files

---
 src/lib.rs                                    |   9 +-
 src/manysketch.rs                             | 122 +++++++++--------
 .../sourmash_plugin_branchwater/__init__.py   |   5 +-
 src/utils.rs                                  | 127 ++++++++++++------
 4 files changed, 162 insertions(+), 101 deletions(-)
diff --git a/src/lib.rs b/src/lib.rs
index 99264b87..5b7da2de 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -259,8 +259,13 @@ fn do_pairwise(
 }
 
 #[pyfunction]
-fn do_manysketch(filelist: String, param_str: String, output: String) -> anyhow::Result<u8> {
-    match manysketch::manysketch(filelist, param_str, output) {
+fn do_manysketch(
+    filelist: String,
+    param_str: String,
+    output: String,
+    singleton: bool,
+) -> anyhow::Result<u8> {
+    match manysketch::manysketch(filelist, param_str, output, singleton) {
         Ok(_) => Ok(0),
         Err(e) => {
             eprintln!("Error: {e}");
diff --git a/src/manysketch.rs b/src/manysketch.rs
index 485ed2b3..eaff89e6 100644
--- a/src/manysketch.rs
+++ b/src/manysketch.rs
@@ -117,6 +117,7 @@ pub fn manysketch(
     filelist: String,
     param_str: String,
     output: String,
+    singleton: bool,
 ) -> Result<(), Box<dyn std::error::Error>> {
     let fileinfo = match load_fasta_fromfile(filelist) {
         Ok(result) => result,
@@ -165,70 +166,81 @@ pub fn manysketch(
 
     let send_result = fileinfo
         .par_iter()
-        .filter_map(|(name, filename, moltype)| {
-            // increment processed_fastas counter; make 1-based for % reporting
-            let i = processed_fastas.fetch_add(1, atomic::Ordering::SeqCst);
-            // progress report at threshold
-            if (i + 1) % reporting_threshold == 0 {
-                let percent_processed = (((i + 1) as f64 / n_fastas as f64) * 100.0).round();
-                eprintln!(
-                    "Starting file {}/{} ({}%)",
-                    (i + 1),
-                    n_fastas,
-                    percent_processed
-                );
-            }
-
-            // build sig templates from params
-            let mut sigs = build_siginfo(&params_vec, moltype);
-            // if no sigs to build, skip
-            if sigs.is_empty() {
-                let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst);
-                return None;
-            }
+        .filter_map(|(name, filenames, moltype)| {
+            let mut allsigs = Vec::new();
+            for filename in filenames {
+                // increment processed_fastas counter; make 1-based for % reporting
+                let i = processed_fastas.fetch_add(1, atomic::Ordering::SeqCst);
+                // progress report at threshold
+                if (i + 1) % reporting_threshold == 0 {
+                    let percent_processed = (((i + 1) as f64 / n_fastas as f64) * 100.0).round();
+                    eprintln!(
+                        "Starting file {}/{} ({}%)",
+                        (i + 1),
+                        n_fastas,
+                        percent_processed
+                    );
+                }
 
-            // Open fasta file reader
-            let mut reader = match parse_fastx_file(filename) {
-                Ok(r) => r,
-                Err(err) => {
-                    eprintln!("Error opening file {}: {:?}", filename, err);
-                    let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst);
+                // build sig templates from params
+                let sig_templates = build_siginfo(&params_vec, moltype);
+                let mut sigs = sig_templates.clone();
+                // if no sigs to build, skip
+                if sigs.is_empty() {
+                    let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst);
                     return None;
                 }
-            };
-            // parse fasta and add to signature
-            let mut set_name = false;
-            while let Some(record_result) = reader.next() {
-                match record_result {
-                    Ok(record) => {
-                        // do we need to normalize to make sure all the bases are consistently capitalized?
-                        // let norm_seq = record.normalize(false);
-                        sigs.iter_mut().for_each(|sig| {
-                            if !set_name {
-                                sig.set_name(name);
-                                sig.set_filename(filename.as_str());
-                                set_name = true;
-                            };
-                            if moltype == "protein" {
-                                sig.add_protein(&record.seq())
-                                    .expect("Failed to add protein");
-                            } else {
-                                sig.add_sequence(&record.seq(), true)
-                                    .expect("Failed to add sequence");
-                                // if not force, panics with 'N' in dna sequence
-                            }
-                        });
+
+                // Open fasta file reader
+                let mut reader = match parse_fastx_file(filename) {
+                    Ok(r) => r,
+                    Err(err) => {
+                        eprintln!("Error opening file {}: {:?}", filename, err);
+                        let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst);
+                        return None;
+                    }
+                };
+
+                // parse fasta and add to signature
+                let mut set_name = false;
+                while let Some(record_result) = reader.next() {
+                    match record_result {
+                        Ok(record) => {
+                            // do we need to normalize to make sure all the bases are consistently capitalized?
+                            // let norm_seq = record.normalize(false);
+                            sigs.iter_mut().for_each(|sig| {
+                                if !set_name {
+                                    sig.set_name(name);
+                                    sig.set_filename(filename.as_str());
+                                    set_name = true;
+                                };
+                                if moltype == "protein" {
+                                    sig.add_protein(&record.seq())
+                                        .expect("Failed to add protein");
+                                } else {
+                                    sig.add_sequence(&record.seq(), true)
+                                        .expect("Failed to add sequence");
+                                    // if not force, panics with 'N' in dna sequence
+                                }
+                            });
+                        }
+                        Err(err) => eprintln!("Error while processing record: {:?}", err),
+                    }
+                    if singleton {
+                        allsigs.append(&mut sigs);
+                        sigs = sig_templates.clone();
                     }
-                    Err(err) => eprintln!("Error while processing record: {:?}", err),
+                }
+                if !singleton {
+                    allsigs.append(&mut sigs);
                 }
             }
-
-            Some(sigs)
+            Some(allsigs)
         })
         .try_for_each_with(
             send.clone(),
-            |s: &mut std::sync::Arc<std::sync::mpsc::SyncSender<ZipMessage>>, sigs| {
-                if let Err(e) = s.send(ZipMessage::SignatureData(sigs)) {
+            |s: &mut std::sync::Arc<std::sync::mpsc::SyncSender<ZipMessage>>, filled_sigs| {
+                if let Err(e) = s.send(ZipMessage::SignatureData(filled_sigs)) {
                     Err(format!("Unable to send internal data: {:?}", e))
                 } else {
                     Ok(())
diff --git a/src/python/sourmash_plugin_branchwater/__init__.py b/src/python/sourmash_plugin_branchwater/__init__.py
index e1912817..fef4c3ba 100755
--- a/src/python/sourmash_plugin_branchwater/__init__.py
+++ b/src/python/sourmash_plugin_branchwater/__init__.py
@@ -336,6 +336,8 @@ def __init__(self, p):
                           help='parameter string for sketching (default: k=31,scaled=1000)')
         p.add_argument('-c', '--cores', default=0, type=int,
                        help='number of cores to use (default is all available)')
+        p.add_argument('-s', '--singleton', action="store_true",
+                       help='build one sketch per fasta record')
 
     def main(self, args):
         print_version()
@@ -355,7 +357,8 @@ def main(self, args):
         super().main(args)
         status = sourmash_plugin_branchwater.do_manysketch(args.fromfile_csv,
                                                            args.param_string,
-                                                           args.output)
+                                                           args.output,
+                                                           args.singleton)
         if status == 0:
             notify(f"...manysketch is done! results in '{args.output}'")
         return status
diff --git a/src/utils.rs b/src/utils.rs
index e95f865f..71179a3b 100644
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -7,7 +7,7 @@ use anyhow::{anyhow, Context, Result};
 use camino::Utf8Path as Path;
 use camino::Utf8PathBuf as PathBuf;
 use csv::Writer;
-use serde::ser::Serializer;
+// use serde::ser::Serializer;
 use serde::{Deserialize, Serialize};
 use std::cmp::{Ordering, PartialOrd};
 use std::collections::BinaryHeap;
@@ -131,22 +131,51 @@ pub fn write_prefetch(
     Ok(())
 }
 
-pub fn load_fasta_fromfile(sketchlist_filename: String) -> Result<Vec<(String, PathBuf, String)>> {
+enum CSVType {
+    Assembly,
+    Reads,
+    Unknown,
+}
+
+fn detect_csv_type(headers: &csv::StringRecord) -> CSVType {
+    if headers.len() == 3
+        && headers.get(0).unwrap() == "name"
+        && headers.get(1).unwrap() == "genome_filename"
+        && headers.get(2).unwrap() == "protein_filename"
+    {
+        CSVType::Assembly
+    } else if headers.len() == 3
+        && headers.get(0).unwrap() == "name"
+        && headers.get(1).unwrap() == "read1"
+        && headers.get(2).unwrap() == "read2"
+    {
+        CSVType::Reads
+    } else {
+        CSVType::Unknown
+    }
+}
+
+pub fn load_fasta_fromfile(
+    sketchlist_filename: String,
+) -> Result<Vec<(String, Vec<PathBuf>, String)>> {
     let mut rdr = csv::Reader::from_path(sketchlist_filename)?;
 
     // Check for right header
     let headers = rdr.headers()?;
-    if headers.len() != 3
-        || headers.get(0).unwrap() != "name"
-        || headers.get(1).unwrap() != "genome_filename"
-        || headers.get(2).unwrap() != "protein_filename"
-    {
-        return Err(anyhow!(
-            "Invalid header. Expected 'name,genome_filename,protein_filename', but got '{}'",
+
+    match detect_csv_type(&headers) {
+        CSVType::Assembly => process_assembly_csv(rdr),
+        CSVType::Reads => process_reads_csv(rdr),
+        CSVType::Unknown => Err(anyhow!(
+            "Invalid header. Expected 'name,genome_filename,protein_filename' or 'name,read1,read2', but got '{}'",
             headers.iter().collect::<Vec<_>>().join(",")
-        ));
+        )),
     }
+}
 
+fn process_assembly_csv(
+    mut rdr: csv::Reader<std::fs::File>,
+) -> Result<Vec<(String, Vec<PathBuf>, String)>> {
     let mut results = Vec::new();
 
     let mut row_count = 0;
@@ -178,17 +207,21 @@ pub fn load_fasta_fromfile(sketchlist_filename: String) -> Result<Vec<(String, P
         if !genome_filename.is_empty() {
             results.push((
                 name.clone(),
-                PathBuf::from(genome_filename),
+                // PathBuf::from(genome_filename),
+                vec![PathBuf::from(genome_filename)],
                 "dna".to_string(),
             ));
             genome_count += 1;
         }
-
         let protein_filename = record
             .get(2)
             .ok_or_else(|| anyhow!("Missing 'protein_filename' field"))?;
         if !protein_filename.is_empty() {
-            results.push((name, PathBuf::from(protein_filename), "protein".to_string()));
+            results.push((
+                name,
+                vec![PathBuf::from(protein_filename)],
+                "protein".to_string(),
+            ));
             protein_count += 1;
         }
     }
@@ -203,6 +236,44 @@ pub fn load_fasta_fromfile(sketchlist_filename: String) -> Result<Vec<(String, P
     Ok(results)
 }
 
+fn process_reads_csv(
+    mut rdr: csv::Reader<std::fs::File>,
+) -> Result<Vec<(String, Vec<PathBuf>, String)>> {
+    let mut results = Vec::new();
+    let mut processed_rows = std::collections::HashSet::new();
+    let mut duplicate_count = 0;
+
+    for result in rdr.records() {
+        let record = result?;
+        let row_string = record.iter().collect::<Vec<_>>().join(",");
+        if processed_rows.contains(&row_string) {
+            duplicate_count += 1;
+            continue;
+        }
+        processed_rows.insert(row_string.clone());
+
+        let name = record
+            .get(0)
+            .ok_or_else(|| anyhow!("Missing 'name' field"))?
+            .to_string();
+        let read1 = record
+            .get(1)
+            .ok_or_else(|| anyhow!("Missing 'read1' field"))?;
+        let read2 = record
+            .get(2)
+            .ok_or_else(|| anyhow!("Missing 'read2' field"))?;
+        let paths = vec![PathBuf::from(read1), PathBuf::from(read2)];
+        results.push((name, paths, "alternate".to_string()));
+    }
+
+    if duplicate_count > 0 {
+        println!("Warning: {} duplicated rows were skipped.", duplicate_count);
+    }
+    println!("Loaded alternate CSV variant.");
+
+    Ok(results)
+}
+
 // Load all compatible minhashes from a collection into memory
 // also store sig name and md5 alongside, as we usually need those
 pub fn load_sketches(
@@ -771,36 +842,6 @@ pub struct MultiSearchResult {
     pub max_containment_ani: Option<f64>,
 }
 
-#[derive(Serialize)]
-pub struct ManifestRow {
-    pub md5: String,
-    pub md5short: String,
-    pub ksize: u32,
-    pub moltype: String,
-    pub num: u32,
-    pub scaled: u64,
-    pub n_hashes: usize,
-    pub with_abundance: BoolPython,
-    pub name: String,
-    pub filename: String,
-    pub internal_location: String,
-}
-
-// A wrapper type for booleans to customize serialization
-pub struct BoolPython(bool);
-
-impl Serialize for BoolPython {
-    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-    where
-        S: Serializer,
-    {
-        match self.0 {
-            true => serializer.serialize_str("True"),
-            false => serializer.serialize_str("False"),
-        }
-    }
-}
-
 pub fn open_stdout_or_file(output: Option<String>) -> Box<dyn Write + Send + 'static> {
     // if output is a file, use open_output_file
     if let Some(path) = output {

From cdb08991dd24547f39f5a0496a2d450db40ae090 Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Mon, 26 Feb 2024 14:22:08 -0800
Subject: [PATCH 02/11] set singleton name properly

---
 src/manysketch.rs | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/manysketch.rs b/src/manysketch.rs
index eaff89e6..4e169c1a 100644
--- a/src/manysketch.rs
+++ b/src/manysketch.rs
@@ -209,7 +209,12 @@ pub fn manysketch(
                             // do we need to normalize to make sure all the bases are consistently capitalized?
                             // let norm_seq = record.normalize(false);
                             sigs.iter_mut().for_each(|sig| {
-                                if !set_name {
+                                if singleton {
+                                    let record_name = std::str::from_utf8(record.id())
+                                        .expect("could not get record id");
+                                    sig.set_name(record_name);
+                                    sig.set_filename(filename.as_str());
+                                } else if !set_name {
                                     sig.set_name(name);
                                     sig.set_filename(filename.as_str());
                                     set_name = true;

From 3ccf625da79a4b18c301dd05017e3ac85559dec1 Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Mon, 26 Feb 2024 15:02:28 -0800
Subject: [PATCH 03/11] test singleton

---
 src/manysketch.rs               | 37 +++++++++++++++++------------
 src/python/tests/test_sketch.py | 41 ++++++++++++++++++++++++++++++---
 2 files changed, 60 insertions(+), 18 deletions(-)

diff --git a/src/manysketch.rs b/src/manysketch.rs
index 4e169c1a..70a4a936 100644
--- a/src/manysketch.rs
+++ b/src/manysketch.rs
@@ -168,6 +168,22 @@ pub fn manysketch(
         .par_iter()
         .filter_map(|(name, filenames, moltype)| {
             let mut allsigs = Vec::new();
+            // build sig templates for these sketches from params, check if there are sigs to build
+            let sig_templates = build_siginfo(&params_vec, moltype);
+            // if no sigs to build, skip this iteration
+            if sig_templates.is_empty() {
+                skipped_paths.fetch_add(filenames.len(), atomic::Ordering::SeqCst);
+                processed_fastas.fetch_add(1, atomic::Ordering::SeqCst);
+                return None;
+            }
+
+            let mut sigs = sig_templates.clone();
+            // have name / filename been set for each sig yet?
+            let mut set_name = false;
+            // if merging multiple files, sourmash sets filename as last filename
+            let last_filename = filenames.last().unwrap();
+
+            // to do: consider changing reporting to per-sig, no matter how many fastas? but singleton...
             for filename in filenames {
                 // increment processed_fastas counter; make 1-based for % reporting
                 let i = processed_fastas.fetch_add(1, atomic::Ordering::SeqCst);
@@ -182,27 +198,17 @@ pub fn manysketch(
                     );
                 }
 
-                // build sig templates from params
-                let sig_templates = build_siginfo(&params_vec, moltype);
-                let mut sigs = sig_templates.clone();
-                // if no sigs to build, skip
-                if sigs.is_empty() {
-                    let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst);
-                    return None;
-                }
-
                 // Open fasta file reader
                 let mut reader = match parse_fastx_file(filename) {
                     Ok(r) => r,
                     Err(err) => {
                         eprintln!("Error opening file {}: {:?}", filename, err);
-                        let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst);
+                        failed_paths.fetch_add(1, atomic::Ordering::SeqCst);
                         return None;
                     }
                 };
 
                 // parse fasta and add to signature
-                let mut set_name = false;
                 while let Some(record_result) = reader.next() {
                     match record_result {
                         Ok(record) => {
@@ -216,7 +222,8 @@ pub fn manysketch(
                                     sig.set_filename(filename.as_str());
                                 } else if !set_name {
                                     sig.set_name(name);
-                                    sig.set_filename(filename.as_str());
+                                    // sourmash sets filename to last filename if merging fastas
+                                    sig.set_filename(last_filename.as_str());
                                     set_name = true;
                                 };
                                 if moltype == "protein" {
@@ -236,9 +243,9 @@ pub fn manysketch(
                         sigs = sig_templates.clone();
                     }
                 }
-                if !singleton {
-                    allsigs.append(&mut sigs);
-                }
+            }
+            if !singleton {
+                allsigs.append(&mut sigs);
             }
             Some(allsigs)
         })
diff --git a/src/python/tests/test_sketch.py b/src/python/tests/test_sketch.py
index 88c99dcf..b03b25ef 100644
--- a/src/python/tests/test_sketch.py
+++ b/src/python/tests/test_sketch.py
@@ -208,9 +208,6 @@ def test_manysketch_skip_incompatible_fastas(runtmp, capfd):
         assert sig.minhash.ksize == 10
         assert sig.minhash.scaled == 1
         assert sig.md5sum() == "eb4467d11e0ecd2dbde4193bfc255310"
-    assert 'Starting file 2/4 (50%)' in captured.err
-    assert 'Starting file 3/4 (75%)' in captured.err
-    assert 'Starting file 4/4 (100%)' in captured.err
     assert 'DONE. Processed 4 fasta files' in captured.err
     assert 'WARNING: 3 fasta files skipped - no compatible signatures.' in captured.err
 
@@ -452,3 +449,41 @@ def test_protein_zip_manifest(runtmp, capfd):
         assert sig.minhash.ksize == 10 # minhash stores k*3, but does the conversion back for us
         assert sig.minhash.moltype == 'protein'
         assert sig.minhash.scaled == 1
+
+
+def test_manysketch_singleton(runtmp):
+    fa_csv = runtmp.output('db-fa.txt')
+
+    fa1 = get_test_data('short.fa')
+    fa2 = get_test_data('short2.fa')
+    fa3 = get_test_data('short3.fa')
+
+    make_file_csv(fa_csv, [fa1, fa2, fa3])
+
+    output = runtmp.output('db.zip')
+
+    runtmp.sourmash('scripts', 'manysketch', fa_csv, '-o', output,
+                    '--param-str', "dna,k=31,scaled=1", "--singleton")
+
+    assert os.path.exists(output)
+    assert not runtmp.last_result.out # stdout should be empty
+
+    idx = sourmash.load_file_as_index(output)
+    sigs = list(idx.signatures())
+    print(sigs)
+
+    assert len(sigs) == 4
+    singleton_sketch = runtmp.output('short3.sig')
+    runtmp.sourmash('sketch', 'dna', fa3, '-o', singleton_sketch,
+                    '--param-str', "dna,k=31,scaled=1", "--singleton")
+    ss_sketch = sourmash.load_signatures(singleton_sketch)
+    ss_sketch1 = next(ss_sketch)
+    ss_sketch2 = next(ss_sketch)
+
+    expected_signames = ['shortName', 'tr1 4', 'firstname', 'other']
+    for sig in sigs:
+        assert sig.name in expected_signames
+        if sig.name == 'firstname':
+            assert sig == ss_sketch1
+        if sig.name == 'other':
+            assert sig == ss_sketch2

From 4c9edd3f2c0efe5928cebd12fa38e8124bb05be0 Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Mon, 26 Feb 2024 16:10:16 -0800
Subject: [PATCH 04/11] fix fasta reporting

---
 src/manysketch.rs               |  5 +-
 src/python/tests/test_sketch.py | 84 +++++++++++++++++++++++++++------
 src/utils.rs                    | 82 +++++++++++++++++++-------------
 3 files changed, 119 insertions(+), 52 deletions(-)

diff --git a/src/manysketch.rs b/src/manysketch.rs
index 70a4a936..3c5e8067 100644
--- a/src/manysketch.rs
+++ b/src/manysketch.rs
@@ -119,13 +119,12 @@ pub fn manysketch(
     output: String,
     singleton: bool,
 ) -> Result<(), Box<dyn std::error::Error>> {
-    let fileinfo = match load_fasta_fromfile(filelist) {
-        Ok(result) => result,
+    let (fileinfo, n_fastas) = match load_fasta_fromfile(filelist) {
+        Ok((file_info, n_fastas)) => (file_info, n_fastas),
         Err(e) => bail!("Could not load fromfile csv. Underlying error: {}", e),
     };
 
     // if no files to process, exit with error
-    let n_fastas = fileinfo.len();
     if n_fastas == 0 {
         bail!("No files to load, exiting.");
     }
diff --git a/src/python/tests/test_sketch.py b/src/python/tests/test_sketch.py
index b03b25ef..c2ff1742 100644
--- a/src/python/tests/test_sketch.py
+++ b/src/python/tests/test_sketch.py
@@ -12,7 +12,7 @@ def get_test_data(filename):
     return os.path.join(thisdir, 'test-data', filename)
 
 
-def make_file_csv(filename, genome_paths, protein_paths = []):
+def make_assembly_csv(filename, genome_paths, protein_paths = []):
     # equalize path lengths by adding "".
     names = [os.path.basename(x).split('.fa')[0] for x in genome_paths]
     if len(protein_paths) < len(genome_paths):
@@ -26,6 +26,14 @@ def make_file_csv(filename, genome_paths, protein_paths = []):
         for name, genome_path, protein_path in zip(names, genome_paths, protein_paths):
             fp.write("{},{},{}\n".format(name, genome_path, protein_path))
 
+def make_reads_csv(filename, reads_tuples = []):
+    # reads tuples should be (name,read1,read2)
+    with open(filename, 'wt') as fp:
+        fp.write("name,read1,read2\n")
+        for (name, read1, read2) in reads_tuples:
+            print(f"{name},{read1},{read2}")
+            fp.write("{},{},{}\n".format(name, read1, read2))
+
 
 def test_installed(runtmp):
     with pytest.raises(utils.SourmashCommandFailed):
@@ -41,7 +49,7 @@ def test_manysketch_simple(runtmp):
     fa2 = get_test_data('short2.fa')
     fa3 = get_test_data('short3.fa')
 
-    make_file_csv(fa_csv, [fa1, fa2, fa3])
+    make_assembly_csv(fa_csv, [fa1, fa2, fa3])
 
     output = runtmp.output('db.zip')
 
@@ -65,7 +73,7 @@ def test_manysketch_mult_k(runtmp):
     fa2 = get_test_data('short2.fa')
     fa3 = get_test_data('short3.fa')
 
-    make_file_csv(fa_csv, [fa1, fa2, fa3])
+    make_assembly_csv(fa_csv, [fa1, fa2, fa3])
 
     output = runtmp.output('db.zip')
 
@@ -89,7 +97,7 @@ def test_manysketch_mult_k_2(runtmp):
     fa2 = get_test_data('short2.fa')
     fa3 = get_test_data('short3.fa')
 
-    make_file_csv(fa_csv, [fa1, fa2, fa3])
+    make_assembly_csv(fa_csv, [fa1, fa2, fa3])
 
     output = runtmp.output('db.zip')
 
@@ -116,7 +124,7 @@ def test_manysketch_mult_moltype(runtmp):
     fa3 = get_test_data('short3.fa')
     protfa1 = get_test_data('short-protein.fa')
 
-    make_file_csv(fa_csv, [fa1, fa2, fa3], [protfa1])
+    make_assembly_csv(fa_csv, [fa1, fa2, fa3], [protfa1])
 
     output = runtmp.output('db.zip')
 
@@ -158,7 +166,7 @@ def test_manysketch_only_incompatible_fastas(runtmp, capfd):
     fa2 = get_test_data('short2.fa')
     fa3 = get_test_data('short3.fa')
 
-    make_file_csv(fa_csv, [fa1, fa2, fa3])
+    make_assembly_csv(fa_csv, [fa1, fa2, fa3])
 
     output = runtmp.output('db.zip')
 
@@ -185,7 +193,7 @@ def test_manysketch_skip_incompatible_fastas(runtmp, capfd):
     fa3 = get_test_data('short3.fa')
     protfa1 = get_test_data('short-protein.fa')
 
-    make_file_csv(fa_csv, [fa1, fa2, fa3], [protfa1])
+    make_assembly_csv(fa_csv, [fa1, fa2, fa3], [protfa1])
 
     output = runtmp.output('db.zip')
 
@@ -235,7 +243,7 @@ def test_manysketch_bad_fa_csv(runtmp, capfd):
     sig47 = get_test_data('47.fa.sig.gz')
     sig63 = get_test_data('63.fa.sig.gz')
 
-    make_file_csv(siglist, [sig2, sig47, sig63])
+    make_assembly_csv(siglist, [sig2, sig47, sig63])
 
     output = runtmp.output('db.zip')
 
@@ -273,7 +281,7 @@ def test_manysketch_bad_fa_csv_3(runtmp, capfd):
     protfa1 = get_test_data('short-protein.fa')
 
     # make file csv but don't fill empty protein rows with ,""
-    make_file_csv(fa_csv, [fa1, fa2, fa3], [protfa1])
+    make_assembly_csv(fa_csv, [fa1, fa2, fa3], [protfa1])
     g_fa = [fa1, fa2, fa3]
     p_fa = [protfa1]
     with open(fa_csv, 'wt') as fp:
@@ -302,7 +310,7 @@ def test_manysketch_empty_fa_csv(runtmp, capfd):
     # test empty fa_csv file
     fa_csv = runtmp.output('fa.txt')
     output = runtmp.output('out.zip')
-    make_file_csv(fa_csv, []) # empty
+    make_assembly_csv(fa_csv, []) # empty
 
     with pytest.raises(utils.SourmashCommandFailed):
         runtmp.sourmash('scripts', 'manysketch', fa_csv,
@@ -321,7 +329,7 @@ def test_manysketch_duplicated_rows(runtmp, capfd):
     fa3 = get_test_data('short3.fa')
     protfa1 = get_test_data('short-protein.fa')
 
-    make_file_csv(fa_csv, [fa1, fa1, fa1, fa3])
+    make_assembly_csv(fa_csv, [fa1, fa1, fa1, fa3])
 
     output = runtmp.output('db.zip')
 
@@ -350,7 +358,7 @@ def test_manysketch_N_in_dna(runtmp):
         fp.write(">bad\n")
         fp.write("ACAGTN\n")
 
-    make_file_csv(fa_csv, [fa1])
+    make_assembly_csv(fa_csv, [fa1])
 
     output = runtmp.output('db.zip')
 
@@ -375,7 +383,7 @@ def test_zip_manifest(runtmp, capfd):
     fa2 = get_test_data('short2.fa')
     fa3 = get_test_data('short3.fa')
 
-    make_file_csv(fa_csv, [fa1, fa2, fa3])
+    make_assembly_csv(fa_csv, [fa1, fa2, fa3])
     output = runtmp.output('db.zip')
 
     runtmp.sourmash('scripts', 'manysketch', fa_csv, '-o', output,
@@ -414,7 +422,7 @@ def test_protein_zip_manifest(runtmp, capfd):
     fa1 = get_test_data('short.fa')
     fa2 = get_test_data('short-protein.fa')
 
-    make_file_csv(fa_csv, [fa1], [fa2])
+    make_assembly_csv(fa_csv, [fa1], [fa2])
     output = runtmp.output('db.zip')
 
     runtmp.sourmash('scripts', 'manysketch', fa_csv, '-o', output,
@@ -458,7 +466,7 @@ def test_manysketch_singleton(runtmp):
     fa2 = get_test_data('short2.fa')
     fa3 = get_test_data('short3.fa')
 
-    make_file_csv(fa_csv, [fa1, fa2, fa3])
+    make_assembly_csv(fa_csv, [fa1, fa2, fa3])
 
     output = runtmp.output('db.zip')
 
@@ -487,3 +495,49 @@ def test_manysketch_singleton(runtmp):
             assert sig == ss_sketch1
         if sig.name == 'other':
             assert sig == ss_sketch2
+
+
+def test_manysketch_reads(runtmp, capfd):
+    fa_csv = runtmp.output('db-fa.csv')
+
+    fa1 = get_test_data('short.fa')
+    fa2 = get_test_data('short2.fa')
+    fa3 = get_test_data('short3.fa')
+
+    make_reads_csv(fa_csv, [("short", fa1, fa2), ('short3', fa3, '')]) # make sure we can just do read1 alone
+
+    output = runtmp.output('db.zip')
+
+    runtmp.sourmash('scripts', 'manysketch', fa_csv, '-o', output,
+                    '--param-str', "dna,k=31,scaled=1")
+
+    assert os.path.exists(output)
+    assert not runtmp.last_result.out # stdout should be empty
+    captured = capfd.readouterr()
+    print(captured.out)
+    print(captured.err)
+    assert "Found 'reads' CSV, assuming all files are DNA." in captured.out
+    assert "Starting file 3/3 (100%)" in captured.err
+    assert "DONE. Processed 3 fasta files" in captured.err
+
+    idx = sourmash.load_file_as_index(output)
+    sigs = list(idx.signatures())
+    print(sigs)
+
+    assert len(sigs) == 2
+    s1 = runtmp.output('short.sig')
+    runtmp.sourmash('sketch', 'dna', fa1, fa2, '-o', s1,
+                    '--param-str', "k=31,scaled=1", '--name', 'short')
+    sig1 = sourmash.load_one_signature(s1)
+    s3 = runtmp.output('short3.sig')
+    runtmp.sourmash('sketch', 'dna', fa3, '-o', s3,
+                    '--param-str', "k=31,scaled=1", '--name', 'short3')
+    sig2 = sourmash.load_one_signature(s3)
+
+    expected_signames = ['short', 'short3']
+    for sig in sigs:
+        assert sig.name in expected_signames
+        if sig.name == 'short':
+            assert sig == sig1
+        if sig.name == 'short3':
+            assert sig == sig2
diff --git a/src/utils.rs b/src/utils.rs
index 71179a3b..9a21f50c 100644
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -7,7 +7,6 @@ use anyhow::{anyhow, Context, Result};
 use camino::Utf8Path as Path;
 use camino::Utf8PathBuf as PathBuf;
 use csv::Writer;
-// use serde::ser::Serializer;
 use serde::{Deserialize, Serialize};
 use std::cmp::{Ordering, PartialOrd};
 use std::collections::BinaryHeap;
@@ -157,7 +156,7 @@ fn detect_csv_type(headers: &csv::StringRecord) -> CSVType {
 
 pub fn load_fasta_fromfile(
     sketchlist_filename: String,
-) -> Result<Vec<(String, Vec<PathBuf>, String)>> {
+) -> Result<(Vec<(String, Vec<PathBuf>, String)>, usize)> {
     let mut rdr = csv::Reader::from_path(sketchlist_filename)?;
 
     // Check for right header
@@ -175,7 +174,7 @@ pub fn load_fasta_fromfile(
 
 fn process_assembly_csv(
     mut rdr: csv::Reader<std::fs::File>,
-) -> Result<Vec<(String, Vec<PathBuf>, String)>> {
+) -> Result<(Vec<(String, Vec<PathBuf>, String)>, usize)> {
     let mut results = Vec::new();
 
     let mut row_count = 0;
@@ -201,28 +200,27 @@ fn process_assembly_csv(
             .ok_or_else(|| anyhow!("Missing 'name' field"))?
             .to_string();
 
-        let genome_filename = record
-            .get(1)
-            .ok_or_else(|| anyhow!("Missing 'genome_filename' field"))?;
-        if !genome_filename.is_empty() {
-            results.push((
-                name.clone(),
-                // PathBuf::from(genome_filename),
-                vec![PathBuf::from(genome_filename)],
-                "dna".to_string(),
-            ));
-            genome_count += 1;
+        // Handle optional genome_filename
+        if let Some(genome_filename) = record.get(1) {
+            if !genome_filename.is_empty() {
+                results.push((
+                    name.clone(),
+                    vec![PathBuf::from(genome_filename)],
+                    "dna".to_string(),
+                ));
+                genome_count += 1;
+            }
         }
-        let protein_filename = record
-            .get(2)
-            .ok_or_else(|| anyhow!("Missing 'protein_filename' field"))?;
-        if !protein_filename.is_empty() {
-            results.push((
-                name,
-                vec![PathBuf::from(protein_filename)],
-                "protein".to_string(),
-            ));
-            protein_count += 1;
+        // Handle optional protein_filename
+        if let Some(protein_filename) = record.get(2) {
+            if !protein_filename.is_empty() {
+                results.push((
+                    name,
+                    vec![PathBuf::from(protein_filename)],
+                    "protein".to_string(),
+                ));
+                protein_count += 1;
+            }
         }
     }
     // Print warning if there were duplicated rows.
@@ -233,14 +231,18 @@ fn process_assembly_csv(
         "Loaded {} rows in total ({} genome and {} protein files)",
         row_count, genome_count, protein_count
     );
-    Ok(results)
+    let n_fastas = genome_count + protein_count;
+    Ok((results, n_fastas))
 }
 
 fn process_reads_csv(
     mut rdr: csv::Reader<std::fs::File>,
-) -> Result<Vec<(String, Vec<PathBuf>, String)>> {
+    // ) -> Result<Vec<(String, Vec<PathBuf>, String)>> {
+) -> Result<(Vec<(String, Vec<PathBuf>, String)>, usize)> {
     let mut results = Vec::new();
     let mut processed_rows = std::collections::HashSet::new();
+    let mut read1_count = 0;
+    let mut read2_count = 0;
     let mut duplicate_count = 0;
 
     for result in rdr.records() {
@@ -259,19 +261,31 @@ fn process_reads_csv(
         let read1 = record
             .get(1)
             .ok_or_else(|| anyhow!("Missing 'read1' field"))?;
+        read1_count += 1;
+        let read2 = record.get(2); // No error if missing
+        let mut paths = vec![PathBuf::from(read1)];
         let read2 = record
             .get(2)
-            .ok_or_else(|| anyhow!("Missing 'read2' field"))?;
-        let paths = vec![PathBuf::from(read1), PathBuf::from(read2)];
-        results.push((name, paths, "alternate".to_string()));
+            .and_then(|r2| if r2.is_empty() { None } else { Some(r2) });
+        if let Some(r2) = read2 {
+            paths.push(PathBuf::from(r2));
+            read2_count += 1;
+        }
+        results.push((name, paths, "dna".to_string()));
     }
 
-    if duplicate_count > 0 {
-        println!("Warning: {} duplicated rows were skipped.", duplicate_count);
-    }
-    println!("Loaded alternate CSV variant.");
+    println!("Found 'reads' CSV, assuming all files are DNA.");
+    println!(
+        "Loaded {} rows in total ({} with read1 and {} with read2), {} duplicates skipped.",
+        processed_rows.len(),
+        read1_count,
+        read2_count,
+        duplicate_count
+    );
+
+    let n_fastas = read1_count + read2_count;
 
-    Ok(results)
+    Ok((results, n_fastas))
 }
 
 // Load all compatible minhashes from a collection into memory

From 42a6ddfc4bbc28be9dc108d62ca98adf02c225a0 Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Mon, 26 Feb 2024 17:22:19 -0800
Subject: [PATCH 05/11] cleanup

---
 src/utils.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/utils.rs b/src/utils.rs
index 9a21f50c..76d34f8e 100644
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -262,8 +262,8 @@ fn process_reads_csv(
             .get(1)
             .ok_or_else(|| anyhow!("Missing 'read1' field"))?;
         read1_count += 1;
-        let read2 = record.get(2); // No error if missing
         let mut paths = vec![PathBuf::from(read1)];
+        // allow missing read2
         let read2 = record
             .get(2)
             .and_then(|r2| if r2.is_empty() { None } else { Some(r2) });

From 8801772529e6f1d2756a64183fce4644f9630430 Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Mon, 26 Feb 2024 17:44:36 -0800
Subject: [PATCH 06/11] test singleton with reads csv

---
 src/python/tests/test_sketch.py | 48 +++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/src/python/tests/test_sketch.py b/src/python/tests/test_sketch.py
index c2ff1742..b05703b6 100644
--- a/src/python/tests/test_sketch.py
+++ b/src/python/tests/test_sketch.py
@@ -541,3 +541,51 @@ def test_manysketch_reads(runtmp, capfd):
             assert sig == sig1
         if sig.name == 'short3':
             assert sig == sig2
+
+
+def test_manysketch_reads_singleton(runtmp, capfd):
+    fa_csv = runtmp.output('db-fa.csv')
+
+    fa1 = get_test_data('short.fa')
+    fa2 = get_test_data('short2.fa')
+    fa3 = get_test_data('short3.fa')
+
+    make_reads_csv(fa_csv, [("short", fa2, fa3), ])
+
+    output = runtmp.output('db.zip')
+
+    runtmp.sourmash('scripts', 'manysketch', fa_csv, '-o', output,
+                    '--param-str', "dna,k=31,scaled=1", '--singleton')
+
+    assert os.path.exists(output)
+    assert not runtmp.last_result.out # stdout should be empty
+    captured = capfd.readouterr()
+    print(captured.out)
+    print(captured.err)
+    assert "Found 'reads' CSV, assuming all files are DNA." in captured.out
+    assert "Starting file 2/2 (100%)" in captured.err
+    assert "DONE. Processed 2 fasta files" in captured.err
+
+    idx = sourmash.load_file_as_index(output)
+    sigs = list(idx.signatures())
+    print(sigs)
+
+    assert len(sigs) == 3
+    s1 = runtmp.output('singleton.sig')
+    runtmp.sourmash('sketch', 'dna', fa2, fa3, '-o', s1,
+                    '--param-str', "k=31,scaled=1", '--singleton')
+    ss = sourmash.load_signatures(s1)
+
+    ss_sketch1 = next(ss)
+    ss_sketch2 = next(ss)
+    ss_sketch3 = next(ss)
+
+    expected_signames = ['tr1 4', 'firstname', 'other']
+    for sig in sigs:
+        assert sig.name in expected_signames
+        if sig.name == 'tr1 4':
+            assert sig == ss_sketch1
+        elif sig.name == 'firstname':
+            assert sig == ss_sketch2
+        elif sig.name == 'other':
+            assert sig == ss_sketch3

From b974118b4a6d7efa698755b5354c2cdee0435dda Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Mon, 26 Feb 2024 17:51:43 -0800
Subject: [PATCH 07/11] clean up comment

---
 src/manysketch.rs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/manysketch.rs b/src/manysketch.rs
index 3c5e8067..3846caf4 100644
--- a/src/manysketch.rs
+++ b/src/manysketch.rs
@@ -182,7 +182,6 @@ pub fn manysketch(
             // if merging multiple files, sourmash sets filename as last filename
             let last_filename = filenames.last().unwrap();
 
-            // to do: consider changing reporting to per-sig, no matter how many fastas? but singleton...
             for filename in filenames {
                 // increment processed_fastas counter; make 1-based for % reporting
                 let i = processed_fastas.fetch_add(1, atomic::Ordering::SeqCst);

From b11d91231f87fd54f224c835be215729a3654832 Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Tue, 27 Feb 2024 09:00:17 -0800
Subject: [PATCH 08/11] add some documentation!

---
 doc/README.md                                 | 26 +++++++++++++++++--
 .../sourmash_plugin_branchwater/__init__.py   |  6 ++---
 2 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/doc/README.md b/doc/README.md
index d86b452d..f33cbdbd 100644
--- a/doc/README.md
+++ b/doc/README.md
@@ -88,9 +88,15 @@ When using a fromfile for search, we load all signatures into memory at the star
 
 ### Running `manysketch`
 
-The `manysketch` command sketches one or more FASTA/FASTQ files into a zipped sourmash signature collection (`zip`). `manysketch` uses one thread per input file, so it can (very) efficiently sketch many files at once; and, because sequence file parsing is entirely implemented in Rust, it is much, _much_ faster than `sourmash sketch` for large FASTQ files.
+The `manysketch` command sketches one or more FASTA/FASTQ files into a zipped sourmash signature collection (`zip`). `manysketch` uses one thread per input file, so it can (very) efficiently sketch many files at once; and, because sequence file parsing is entirely implemented in Rust, it is much, _much_ faster than `sourmash sketch` for large FASTQ files. However, it does not currently support translation, i.e. protein signature generation from DNA FASTA.
 
-To run `manysketch`, you need to build a text file list of FASTA/FASTQ files, with one on each line (`manysketch.csv`, below).  A simple way to do this for a directory is this command snippet:
+To run `manysketch`, you need to build a text file list of FASTA/FASTQ files (see `manysketch.csv` example, below).
+
+The following formats are accepted:
+- 3 columns: `name,genome_filename,protein_filename`. `genome_filename` entries considered DNA FASTA, `protein_filename` entries are considered protein FASTA.
+- 3 columns: `name,read1,read2`. All entries considered DNA FASTA, and both `read1` and `read2` files are used as input for a single sketch with name `name`.
+
+A simple way to build a manysketch input file for a directory is this command snippet:
 ```
 echo name,genome_filename,protein_filename > manysketch.csv
 for i in *.fa.gz
@@ -117,6 +123,22 @@ sourmash scripts manysketch fa.csv -o fa.zip -p k=21,k=31,k=51,scaled=1000,abund
 ```
 See [the sourmash sketch docs](https://sourmash.readthedocs.io/en/latest/command-line.html#sourmash-sketch-make-sourmash-signatures-from-sequence-data) for more information on param strings.
 
+`manysketch` also supports building sketches for each record in a FASTA file independently (`--singleton`).
+
+You can run:
+
+```
+sourmash scripts manysketch manysketch.csv -o fa.zip --singleton
+```
+The output will be written to `fa.zip`
+
+You can check if all signatures were written properly with
+```
+sourmash sig summarize fa.zip
+```
+> Here, the number of sketches per parameter combination should equal the total number of records in all input FASTA.
+> The `name` column will not be used. Instead, each sketch will be named from the FASTA record name.
+
 ### Running `multisearch`
 
 The `multisearch` command compares one or more query genomes, and one or more subject genomes. It differs from `manysearch` by loading all genomes into memory.
diff --git a/src/python/sourmash_plugin_branchwater/__init__.py b/src/python/sourmash_plugin_branchwater/__init__.py
index fef4c3ba..ff566fcb 100755
--- a/src/python/sourmash_plugin_branchwater/__init__.py
+++ b/src/python/sourmash_plugin_branchwater/__init__.py
@@ -328,8 +328,8 @@ class Branchwater_Manysketch(CommandLinePlugin):
 
     def __init__(self, p):
         super().__init__(p)
-        p.add_argument('fromfile_csv', help="a csv file containing paths to fasta files. \
-                        Columns must be: 'name,genome_filename,protein_filename'")
+        p.add_argument('fromfile_csv', help="a csv file containing paths to FASTA files. \
+                        Columns must be: 'name,genome_filename,protein_filename' or 'name,read1,read2'")
         p.add_argument('-o', '--output', required=True,
                        help='output zip file for the signatures')
         p.add_argument('-p', '--param-string', action='append', type=str, default=[],
@@ -337,7 +337,7 @@ def __init__(self, p):
         p.add_argument('-c', '--cores', default=0, type=int,
                        help='number of cores to use (default is all available)')
         p.add_argument('-s', '--singleton', action="store_true",
-                       help='build one sketch per fasta record')
+                       help='build one sketch per FASTA record, i.e. multiple sketches per FASTA file')
 
     def main(self, args):
         print_version()

From 444ec84a9dc3e6a537bd6ce8cffa39fe8d34fd5c Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Tue, 27 Feb 2024 09:04:17 -0800
Subject: [PATCH 09/11] upd

---
 doc/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/README.md b/doc/README.md
index f33cbdbd..6668b013 100644
--- a/doc/README.md
+++ b/doc/README.md
@@ -136,8 +136,8 @@ You can check if all signatures were written properly with
 ```
 sourmash sig summarize fa.zip
 ```
-> Here, the number of sketches per parameter combination should equal the total number of records in all input FASTA.
-> The `name` column will not be used. Instead, each sketch will be named from the FASTA record name.
+The number of sketches per parameter combination should equal the total number of records in all input FASTA.
+The `name` column will not be used. Instead, each sketch will be named from the FASTA record name.
 
 ### Running `multisearch`
 

From c815d964838593cca0c575ea17a8c5f11c7e1bfa Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Tue, 27 Feb 2024 09:08:37 -0800
Subject: [PATCH 10/11] upd doc

---
 doc/README.md | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/doc/README.md b/doc/README.md
index 6668b013..bb8aacd9 100644
--- a/doc/README.md
+++ b/doc/README.md
@@ -90,10 +90,12 @@ When using a fromfile for search, we load all signatures into memory at the star
 
 The `manysketch` command sketches one or more FASTA/FASTQ files into a zipped sourmash signature collection (`zip`). `manysketch` uses one thread per input file, so it can (very) efficiently sketch many files at once; and, because sequence file parsing is entirely implemented in Rust, it is much, _much_ faster than `sourmash sketch` for large FASTQ files. However, it does not currently support translation, i.e. protein signature generation from DNA FASTA.
 
+#### specifying input FASTA
+
 To run `manysketch`, you need to build a text file list of FASTA/FASTQ files (see `manysketch.csv` example, below).
 
 The following formats are accepted:
-- 3 columns: `name,genome_filename,protein_filename`. `genome_filename` entries considered DNA FASTA, `protein_filename` entries are considered protein FASTA.
+- 3 columns: `name,genome_filename,protein_filename`. `genome_filename` entries are considered DNA FASTA, `protein_filename` entries are considered protein FASTA.
 - 3 columns: `name,read1,read2`. All entries considered DNA FASTA, and both `read1` and `read2` files are used as input for a single sketch with name `name`.
 
 A simple way to build a manysketch input file for a directory is this command snippet:
@@ -123,6 +125,8 @@ sourmash scripts manysketch fa.csv -o fa.zip -p k=21,k=31,k=51,scaled=1000,abund
 ```
 See [the sourmash sketch docs](https://sourmash.readthedocs.io/en/latest/command-line.html#sourmash-sketch-make-sourmash-signatures-from-sequence-data) for more information on param strings.
 
+#### singleton sketching
+
 `manysketch` also supports building sketches for each record in a FASTA file independently (`--singleton`).
 
 You can run:

From f16ad222085339381c7854586fe39d1facca48a6 Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Tue, 27 Feb 2024 09:19:31 -0800
Subject: [PATCH 11/11] upd doc formatting

---
 doc/README.md | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/doc/README.md b/doc/README.md
index bb8aacd9..9767048d 100644
--- a/doc/README.md
+++ b/doc/README.md
@@ -95,8 +95,10 @@ The `manysketch` command sketches one or more FASTA/FASTQ files into a zipped so
 To run `manysketch`, you need to build a text file list of FASTA/FASTQ files (see `manysketch.csv` example, below).
 
 The following formats are accepted:
-- 3 columns: `name,genome_filename,protein_filename`. `genome_filename` entries are considered DNA FASTA, `protein_filename` entries are considered protein FASTA.
-- 3 columns: `name,read1,read2`. All entries considered DNA FASTA, and both `read1` and `read2` files are used as input for a single sketch with name `name`.
+- 3 columns: `name,genome_filename,protein_filename`
+  >`genome_filename` entries are considered DNA FASTA, `protein_filename` entries are considered protein FASTA.
+- 3 columns: `name,read1,read2`
+  > All entries considered DNA FASTA, and both `read1` and `read2` files are used as input for a single sketch with name `name`.
 
 A simple way to build a manysketch input file for a directory is this command snippet:
 ```
@@ -127,7 +129,7 @@ See [the sourmash sketch docs](https://sourmash.readthedocs.io/en/latest/command
 
 #### singleton sketching
 
-`manysketch` also supports building sketches for each record in a FASTA file independently (`--singleton`).
+`manysketch` also supports building independent sketches for each record in a FASTA file (`--singleton`).
 
 You can run: