From e5572fdb2d22223839c5282e1f710d9584c6c676 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Tue, 13 Feb 2024 15:53:35 -0800
Subject: [PATCH 01/12] update docs

---
 doc/README.md | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)
diff --git a/doc/README.md b/doc/README.md
index 3544df9e..25f98088 100644
--- a/doc/README.md
+++ b/doc/README.md
@@ -1,18 +1,23 @@
 # manysketch, fastgather, fastmultigather, multisearch, and manysearch - an introduction
 
-This repository implements five sourmash plugins, `manysketch`, `fastgather`, `fastmultigather`, `multisearch`, and `manysearch`. These plugins make use of multithreading in Rust to provide very fast implementations of `sketch`, `search`, and `gather`. With large databases, these commands can be hundreds to thousands of times faster, and 10-50x lower memory, than sourmash.
+This repository implements six sourmash plugins, `manysketch`, `fastgather`, `fastmultigather`, `multisearch`, `pairwise`, and `manysearch`. These plugins make use of multithreading in Rust to provide very fast implementations of `sketch`, `search`, and `gather`. With large databases, these commands can be hundreds to thousands of times faster, and 10-50x lower memory, than sourmash.
 
 The main *drawback* to these plugin commands is that their inputs and outputs are not as rich as the native sourmash commands. This may mean that your input files need to be prepared differently. The output may currently be most useful as a prefilter in conjunction with regular sourmash commands - see the instructions below for using `fastgather` to create picklists for sourmash.
 
 ## Input file formats
 
 
-All four search/gather commands accept zip files, manifest files, or _text files containing lists of signature files_ ("fromfiles") for the search database. `multisearch`, `manysearch` and `fastmultigather` also use either zips, manifests, or "fromfiles" for queries, too. All commands now accept single signature files as well, though this is only useful for single-query input.
+All five search/gather commands accept zip files, manifest files, or _text files containing lists of signature files_ ("fromfiles") for the search database. `multisearch`, `manysearch` and `fastmultigather` also use either zipfiles, manifests, or "fromfiles" for queries, too. All commands now accept single signature files as well, though this is only useful for single-query input.
+
+**As of v0.9.0, we strongly recommend using zipfiles or manifests over
+"fromfiles" due to internal changes in sketch loading; this reverses
+earlier recommendations to use fromfiles!**
 
 `manysketch` takes as input a CSV file with columns `name,genome_filename,protein_filename`. If you don't have `protein_filename` entries, be sure to include the trailing comma so the CSV reader can process the file correctly.
 
 ### Using zip files or manifest files
 
+Zip files are compressed collections of sourmash sketches. When created with sourmash, they also contain manifests. They are generally the most efficient option for loading and storing sourmash signatures.
 
 Manifest files are csv files with all information about sourmash signature parameters. Having a manifest allows us to select sketches relevant to the search (e.g. by k-mer size, scaled factor, etc) and perform checks without loading the sketches themselves into memory. We then only load the actual sketches (and optionally, downsample to a lower scaled value) when we're ready to use them.
 
@@ -21,12 +26,14 @@ If you have a `sourmash` zip file of signatures, it already contains a manifest
 If you'd like to generate a standalone `manifest` file from your signatures, you can do it like so:
 
 ```
-sourmash sig manifest <sigs> -o sigs.manifest.csv
+sourmash sig manifest <sigfile> -o sigfile.manifest.csv
 ```
-> Here, `sigs` can be any type of sourmash input, including a signature file or `pathlist`
+> Here, `sigfile` can be any type of sourmash input, including a signature file or `pathlist`
 
 ### Using "fromfiles"
 
+**Note: We no longer recommend using "fromfiles". Use zip files instead.**
+
 To prepare a **signature** fromfile from a database, first you need to split the database into individual files:
 ```
 mkdir gtdb-reps-rs214-k21/
@@ -179,13 +186,15 @@ Each command does things slightly differently, with implications for CPU and dis
 
 `manysketch` loads one sequence file from disk per thread and sketches it using all signature params simultaneously.
 
-`manysearch` loads all the queries at the beginning, and then loads one database sketch from disk per thread. The compute-per-database-sketch is dominated by I/O. So your number of threads should be chosen with care for disk load. We typically limit it to `-c 32` for shared disks.
+`manysearch` loads all the queries at the beginning, and then loads one database sketch from disk per thread. The compute-per-database-sketch is dominated by I/O. So your number of threads should be chosen with care for disk load. We typically limit it to `-c 32` for shared disks. We suggest using a manifest CSV file for the database sketches.
+
+`multisearch` loads all the queries and database sketches once, at the beginning, and then uses multithreading to search across all matching sequences. For large databases it is extremely efficient at using all available cores. So 128 threads or more should work fine! Zipfiles and manifests should work well.
 
-`multisearch` loads all the queries and database sketches once, at the beginning, and then uses multithreading to search across all matching sequences. For large databases it is extremely efficient at using all available cores. So 128 threads or more should work fine!
+`pairwise` acts just like `multisearch`, but only loads one file (and then does all comparisons between all pairs within that file).
 
-Like `multisearch`, `fastgather` loads everything at the beginning, and then uses multithreading to search across all matching sequences. For large databases it is extremely efficient at using all available cores. So 128 threads or more should work fine!
+Like `multisearch` and `pairwise`, `fastgather` loads everything at the beginning, and then uses multithreading to search across all matching sequences. For large databases it is extremely efficient at using all available cores. So 128 threads or more should work fine! We suggest using zipfile or manifests for the database.
 
-`fastmultigather` loads the entire database once, and then loads one query from disk per thread. The compute-per-query can be significant, though, so multithreading efficiency here is less dependent on I/O and the disk is less likely to be saturated with many threads. We suggest limiting threads to between 32 and 64 to decrease shared disk load.
+`fastmultigather` loads the entire database once, and then loads one query from disk per thread. The compute-per-query can be significant, though, so multithreading efficiency here is less dependent on I/O and the disk is less likely to be saturated with many threads. We suggest limiting threads to between 32 and 64 to decrease shared disk load. We also suggest using a zipfile or manifest for the database rather than a pathlist.
 
 ## Appendix 1 - `index` to create a low-memory index
 
@@ -193,4 +202,4 @@ The command `sourmash scripts index` makes an on-disk inverted index
 for low memory fast search. Indexing takes a while, but then search
 takes fewer resources.
 
-Currently only fastmultigather and manysearch can use this kind of index.
+Currently only fastmultigather and manysearch can use this kind of inde.

From 6adca21264cae22cc3d0718775b66cdb3af70f64 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Tue, 13 Feb 2024 16:01:55 -0800
Subject: [PATCH 02/12] more

---
 doc/README.md | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/doc/README.md b/doc/README.md
index 25f98088..422a56d9 100644
--- a/doc/README.md
+++ b/doc/README.md
@@ -105,6 +105,11 @@ sourmash scripts multisearch list.query.txt list.gtdb-rs214-k21.txt  -o query.x.
 
 The results file here, `query.x.gtdb-reps.csv`, will have 8 columns: `query` and `query_md5`, `match` and `match_md5`, and `containment`, `jaccard`, `max_containment`, and `intersect_hashes`.
 
+The `pairwise` command does the same thing as `multisearch` but takes
+only a single file, which it uses to calculate all pairwise
+comparisons. Since the comparisons are symmetric, it is a bit more than
+twice as fast as `multisearch`.
+
 ### Running `fastgather`
 
 The `fastgather` command is a much faster version of `sourmash gather`.
@@ -202,4 +207,8 @@ The command `sourmash scripts index` makes an on-disk inverted index
 for low memory fast search. Indexing takes a while, but then search
 takes fewer resources.
 
-Currently only fastmultigather and manysearch can use this kind of inde.
+Currently only `fastmultigather` and `manysearch` can use this kind of index.
+
+We suggest using the extension `.rocksdb` for these databases, as we
+use [RocksDB](https://rocksdb.org/) for the underlying database storage
+mechanism.

From 87839638060ac80136d71b3df1d8f8ffac6ba571 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Wed, 14 Feb 2024 07:30:33 -0800
Subject: [PATCH 03/12] remove errant dbg messages for now

---
 src/utils.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/utils.rs b/src/utils.rs
index a25fd890..7f2b6b77 100644
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -458,7 +458,7 @@ pub fn load_collection(
     };
 
     let collection = collection.or_else(|| {
-        dbg!("attempting to load as manifest");
+        // dbg!("attempting to load as manifest");
         match collection_from_manifest(&sigpath, &report_type) {
             Ok(coll) => Some((coll, 0)),
             Err(e) => {
@@ -469,7 +469,7 @@ pub fn load_collection(
     });
 
     let collection = collection.or_else(|| {
-        dbg!("attempting to load as signature");
+        // dbg!("attempting to load as signature");
         match collection_from_signature(&sigpath, &report_type) {
             Ok(coll) => Some((coll, 0)),
             Err(e) => {
@@ -480,7 +480,7 @@ pub fn load_collection(
     });
 
     let collection = collection.or_else(|| {
-        dbg!("attempting to load as pathlist");
+        // dbg!("attempting to load as pathlist");
         match collection_from_pathlist(&sigpath, &report_type) {
             Ok((coll, n_failed)) => Some((coll, n_failed)),
             Err(e) => {

From 9a995b9c9d4b188be8794738038ff6816adaaac5 Mon Sep 17 00:00:00 2001
From: HackMD <37423+hackmd-hub[bot]@users.noreply.github.com>
Date: Thu, 15 Feb 2024 14:37:23 +0000
Subject: [PATCH 04/12] last changed at Feb 15, 2024 6:36 AM, pushed by C.
 Titus Brown

---
 doc/README.md | 103 ++++++++++++++++++++++++--------------------------
 1 file changed, 49 insertions(+), 54 deletions(-)

diff --git a/doc/README.md b/doc/README.md
index 422a56d9..f4d223ce 100644
--- a/doc/README.md
+++ b/doc/README.md
@@ -1,19 +1,33 @@
-# manysketch, fastgather, fastmultigather, multisearch, and manysearch - an introduction
+# The branchwater plugin for sourmash
 
-This repository implements six sourmash plugins, `manysketch`, `fastgather`, `fastmultigather`, `multisearch`, `pairwise`, and `manysearch`. These plugins make use of multithreading in Rust to provide very fast implementations of `sketch`, `search`, and `gather`. With large databases, these commands can be hundreds to thousands of times faster, and 10-50x lower memory, than sourmash.
+@CTB TODO: add [link] stuff
 
-The main *drawback* to these plugin commands is that their inputs and outputs are not as rich as the native sourmash commands. This may mean that your input files need to be prepared differently. The output may currently be most useful as a prefilter in conjunction with regular sourmash commands - see the instructions below for using `fastgather` to create picklists for sourmash.
+| command | functionality | docs |
+| -------- | -------- | -------- |
+| `manysketch` | Rapidly build sketches for many input files     | [link]     |
+| `fastgather` | Multithreaded `gather` of **one** metagenome against a database| [link]
+| `fastmultigather` | Multithreaded `gather` of **multiple** metagenomes against a database | [link]
+| `manysearch` | Multithreaded containment search for many queries in many large metagenomes | [link]
+| `multisearch` | Multithreaded comparison of multiple sketches, in memory | [link]
+| `pairwise` | Multithreaded pairwise comparison of multiple sketches, in memory | [link]
 
-## Input file formats
+This repository implements multithreaded plugins for [sourmash](https://sourmash.readthedocs.io/) that provide very fast implementations of `sketch`, `search`, and `gather`. These commands are typically hundreds to thousands of times faster, and 10-50x lower memory, than the current sourmash code.
+
+The main *drawback* to these plugin commands is that their inputs and outputs are not as rich as the native sourmash commands. This means that your input files may need to be prepared differently, and the output may in some cases be most useful as a prefilter in conjunction with regular sourmash commands - see the instructions below for using `fastgather` to create picklists for sourmash.
 
+## Input file formats
 
-All five search/gather commands accept zip files, manifest files, or _text files containing lists of signature files_ ("fromfiles") for the search database. `multisearch`, `manysearch` and `fastmultigather` also use either zipfiles, manifests, or "fromfiles" for queries, too. All commands now accept single signature files as well, though this is only useful for single-query input.
+sourmash supports a variety of different storage formats for sketches (see [sourmash docs](https://sourmash.readthedocs.io/en/latest/command-line.html#choosing-signature-output-formats)), and the branchwater plugin works some (but not all) of them. Branchwater _also_ supports an additional database type, a RocksDB-based inverted index, that is not yet supported by sourmash (through v4.8.6).
 
-**As of v0.9.0, we strongly recommend using zipfiles or manifests over
-"fromfiles" due to internal changes in sketch loading; this reverses
-earlier recommendations to use fromfiles!**
+**As of v0.9.0, we recommend using zip files or manifest CSVs whenever you need to provide multiple sketches.** Prior to v0.9.0, we suggested pathlists, but these now incur substantial overhead.
 
-`manysketch` takes as input a CSV file with columns `name,genome_filename,protein_filename`. If you don't have `protein_filename` entries, be sure to include the trailing comma so the CSV reader can process the file correctly.
+| command | query input | database format |
+| -------- | -------- | -------- |
+| `fastgather` | Single metagenome in sig, zip, manifest CSV, or fromfile     | Zip, manifest CSV, or fromfile |
+| `fastmultigather` | Multiple metagenomes in sig, zip, manifest CSV, or fromfile | Zip, manifest CSV, fromfile, or rocksdb index |
+| `manysearch` | Multiple genomes in sig, zip, manifest CSV, or fromfile | Zip, manifest CSV, fromfile, or rocksdb index |
+| `multisearch` | Multiple sketches in sig, zip, manifest CSV, or fromfile | Multiple sketches in sig, zip, manifest CSV, or fromfile |
+| `pairwise` | Multiple sketches in sig, zip, manifest CSV, or fromfile | N/A |
 
 ### Using zip files or manifest files
 
@@ -28,7 +42,9 @@ If you'd like to generate a standalone `manifest` file from your signatures, you
 ```
 sourmash sig manifest <sigfile> -o sigfile.manifest.csv
 ```
-> Here, `sigfile` can be any type of sourmash input, including a signature file or `pathlist`
+> Here, `sigfile` can be any type of sourmash input, including a signature file or fromfile.
+
+@CTB: fix fromfile vs pathlist stuff, explain manifests more/diff/better.
 
 ### Using "fromfiles"
 
@@ -53,7 +69,7 @@ When using these files for search, we have no a priori information about the par
 
 ### Running `manysketch`
 
-The `manysketch` command sketches one or more FASTA/FASTQ files into a zipped sourmash signature collection (`zip`). `manysketch` uses one thread per input file, so it can (very) efficiently sketch many files at once; and, because sequence file parsing is entirely implemented in Rust, it is much faster than `sourmash sketch` for large FASTQ files.
+The `manysketch` command sketches one or more FASTA/FASTQ files into a zipped sourmash signature collection (`zip`). `manysketch` uses one thread per input file, so it can (very) efficiently sketch many files at once; and, because sequence file parsing is entirely implemented in Rust, it is much, _much_ faster than `sourmash sketch` for large FASTQ files.
 
 To run `manysketch`, you need to build a text file list of FASTA/FASTQ files, with one on each line (`manysketch.csv`, below).  A simple way to do this for a directory is this command snippet:
 ```
@@ -64,7 +80,6 @@ echo $i,$i,
 done >> manysketch.csv
 ```
 
-
 You can then run:
 
 ```
@@ -81,78 +96,68 @@ To modify sketching parameters, use `--param-str` or `-p` and provide valid para
 ```
 sourmash scripts manysketch fa.csv -o fa.zip -p k=21,k=31,k=51,scaled=1000,abund -p protein,k=10,scaled=200
 ```
-
+See [the sourmash sketch docs](https://sourmash.readthedocs.io/en/latest/command-line.html#sourmash-sketch-make-sourmash-signatures-from-sequence-data) for more information on param strings.
 
 ### Running `multisearch`
 
 The `multisearch` command compares one or more query genomes, and one or more subject genomes. It differs from `manysearch` by loading all genomes into memory.
 
-`multisearch` takes two input collections (zip or "fromfiles"), and outputs a CSV:
+`multisearch` takes two input collections and outputs a CSV:
 ```
-sourmash scripts multisearch query-list.txt podar-ref-list.txt -o results.csv
+sourmash scripts multisearch query.sig.gz database.zip -o results.csv
 ```
 
-To run it, you need to provide two collections of signature files. If you create a fromfile as above with GTDB reps, you can generate a query fromfile like so:
-
-```
-head -10 list.gtdb-reps-rs214-k21.txt > list.query.txt
-```
-and then run `multisearch` like so:
-
-```
-sourmash scripts multisearch list.query.txt list.gtdb-rs214-k21.txt  -o query.x.gtdb-reps.csv -k 21 --cores 4
-```
 
-The results file here, `query.x.gtdb-reps.csv`, will have 8 columns: `query` and `query_md5`, `match` and `match_md5`, and `containment`, `jaccard`, `max_containment`, and `intersect_hashes`.
+The results file, `results.csv`, will have 8 columns: `query` and `query_md5`, `match` and `match_md5`, and `containment`, `jaccard`, `max_containment`, and `intersect_hashes`.
 
-The `pairwise` command does the same thing as `multisearch` but takes
-only a single file, which it uses to calculate all pairwise
-comparisons. Since the comparisons are symmetric, it is a bit more than
+The `pairwise` command does the same comparisons as `multisearch` but takes
+only a single collection of sketches, for which it calculates all the pairwise comparisons. Since the comparisons are symmetric, it is approximately
 twice as fast as `multisearch`.
 
 ### Running `fastgather`
 
 The `fastgather` command is a much faster version of `sourmash gather`.
 
-`fastgather` takes a single query metagenome (in any file format) and an input collection (zip or "fromfile") as database, and outputs a CSV:
+`fastgather` takes a single query metagenome and a database, and outputs a CSV:
 ```
-sourmash scripts fastgather query.sig.gz podar-ref-list.txt -o results.csv --cores 4
+sourmash scripts fastgather query.sig.gz database.zip -o results.csv --cores 4
 ```
 
 #### Using `fastgather` to create a picklist for `sourmash gather`
 
 One handy use case for `fastgather` is to create a picklist that can be used by `sourmash gather`. This makes full use of the speed of `fastgather` while producing a complete set of `gather` outputs.
 
-For example, if `list.gtdb-rs214-k21.txt` contains the paths to all GTDB RS214 genomes in `sig.gz` files, as above, then the following command will do a complete gather against GTDB:
+For example, you can run a complete gather against GTDB rs214 like so:
 
 ```
 sourmash scripts fastgather SRR606249.trim.sig.gz \
-    list.gtdb-rs214-k21.txt -o SRR606249.fastgather.csv -k 21
+    gtdb-rs214-k21.zip -o SRR606249.fastgather.csv -k 21
 ```
 
-This CSV file can then be used as a picklist for `sourmash gather` like so:
+The resulting CSV file can then be used as a picklist for `sourmash gather` like so:
 
 ```
-sourmash gather SRR606249.trim.sig.gz /group/ctbrowngrp/sourmash-db/gtdb-rs214/gtdb-rs214-k21.zip \
+sourmash gather SRR606249.trim.sig.gz gtdb-rs214-k21.zip \
     --picklist SRR606249.fastgather.csv:match_name:ident \
     -o SRR606249.gather.csv
 ```
 
-Here the picklist should be used on a sourmash collection that contains a manifest - this will prevent sourmash from loading any sketches other than the ones in the fastgather CSV file. We recommend using zip file databases - manifests are produced automatically when `-o filename.zip` is used with `sketch dna`, and they also be prepared with `sourmash sig cat`. (If you are using a GTDB database, as above, then you already have a manifest!)
-
 #### Example of picklist usage
 
-A complete example Snakefile implementing the above workflow is available [in the 2023-swine-usda](https://github.com/ctb/2023-swine-usda/blob/main/Snakefile) repository. Note, it is slightly out of date at the moment!
+A complete example Snakefile implementing the above workflow is available [in the sourmash-slainte Snakefile](https://github.com/dib-lab/sourmash-slainte/blob/main/Snakefile).
 
 ### Running `fastmultigather`
 
 `fastmultigather` takes a collection of query metagenomes and a collection of sketches as a database, and outputs many CSVs:
 ```
-sourmash scripts fastmultigather query-list.txt podar-ref-lists.txt --cores 4
+sourmash scripts fastmultigather queries.manifest.csv database.zip --cores 4
 ```
+We suggest using a manifest CSV for the queries.
 
 The main advantage that `fastmultigather` has over running `fastgather` on multiple queries is that you only load the database files once, which can be a significant time savings for large databases!
 
+@CTB: NTP, is this comment on loading still true?
+
 #### Output files for `fastmultigather`
 
 `fastmultigather` will output two CSV files for each query, a `prefetch` file containing all overlapping matches between that query and the database, and a `gather` file containing the minimum metagenome cover for that query in the database.
@@ -167,19 +172,9 @@ The `manysearch` command compares one or more collections of query sketches, and
 
 `manysearch` takes two collections as input, and outputs a CSV:
 ```
-sourmash scripts manysearch query-list.txt podar-ref-list.txt -o results.csv
-```
-
-To run it, you need to provide two "fromfiles" containing lists of paths to signature files (`.sig` or `.sig.gz`). If you create a fromfile as above with GTDB reps, you can generate a query fromfile like so:
-
-```
-head -10 list.gtdb-reps-rs214-k21.txt > list.query.txt
-```
-and then run `manysearch` like so:
-
-```
-sourmash scripts manysearch list.query.txt list.gtdb-rs214-k21.txt  -o query.x.gtdb-reps.csv -k 21 --cores 4
+sourmash scripts manysearch queries.zip metagenomes.manifest.csv -o results.csv
 ```
+We suggest using a manifest CSV for the metagenome collection.
 
 The results file here, `query.x.gtdb-reps.csv`, will have 8 columns: `query` and `query_md5`, `match` and `match_md5`, and `containment`, `jaccard`, `max_containment`, and `intersect_hashes`.
 
@@ -187,8 +182,6 @@ The results file here, `query.x.gtdb-reps.csv`, will have 8 columns: `query` and
 
 Each command does things slightly differently, with implications for CPU and disk load. You can measure threading efficiency with `/usr/bin/time -v` on Linux systems, and disk load by number of complaints received when running.
 
-(The below info is for fromfile lists. If you are using mastiff indexes, very different performance parameters apply. We will update here as we benchmark and improve!)
-
 `manysketch` loads one sequence file from disk per thread and sketches it using all signature params simultaneously.
 
 `manysearch` loads all the queries at the beginning, and then loads one database sketch from disk per thread. The compute-per-database-sketch is dominated by I/O. So your number of threads should be chosen with care for disk load. We typically limit it to `-c 32` for shared disks. We suggest using a manifest CSV file for the database sketches.
@@ -199,7 +192,9 @@ Each command does things slightly differently, with implications for CPU and dis
 
 Like `multisearch` and `pairwise`, `fastgather` loads everything at the beginning, and then uses multithreading to search across all matching sequences. For large databases it is extremely efficient at using all available cores. So 128 threads or more should work fine! We suggest using zipfile or manifests for the database.
 
-`fastmultigather` loads the entire database once, and then loads one query from disk per thread. The compute-per-query can be significant, though, so multithreading efficiency here is less dependent on I/O and the disk is less likely to be saturated with many threads. We suggest limiting threads to between 32 and 64 to decrease shared disk load. We also suggest using a zipfile or manifest for the database rather than a pathlist.
+@CTB: NTP, is this "loads everything at the beginning" comment still true?
+
+`fastmultigather` loads the entire database once, and then loads one query from disk per thread. The compute-per-query can be significant, though, so multithreading efficiency here is less dependent on I/O and the disk is less likely to be saturated with many threads. We suggest limiting threads to between 32 and 64 to decrease shared disk load.
 
 ## Appendix 1 - `index` to create a low-memory index
 

From 5b7e907a58fb192a255e798e20f394a3f40b37c2 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Thu, 15 Feb 2024 06:45:59 -0800
Subject: [PATCH 05/12] try revert #209

---
 Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Cargo.toml b/Cargo.toml
index fca3c5e4..410dab6e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -16,7 +16,7 @@ sourmash = { version = "0.12.1", features = ["branchwater"] }
 serde_json = "1.0.113"
 niffler = "2.4.0"
 log = "0.4.14"
-env_logger = "0.11.1"
+env_logger = "0.10.2"
 simple-error = "0.3.0"
 anyhow = "1.0.79"
 zip = { version = "0.6", default-features = false, features = ["deflate"] }

From 472b3b8d7980caecfd20db9c9bd524c8122e330d Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Thu, 15 Feb 2024 06:46:29 -0800
Subject: [PATCH 06/12] upd Cargo.lock

---
 Cargo.lock | 131 +++++++++++++++++++++++++----------------------------
 1 file changed, 62 insertions(+), 69 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 51cea82e..a7fe7a49 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -49,54 +49,12 @@ dependencies = [
  "libc",
 ]
 
-[[package]]
-name = "anstream"
-version = "0.6.11"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6e2e1ebcb11de5c03c67de28a7df593d32191b44939c482e97702baaaa6ab6a5"
-dependencies = [
- "anstyle",
- "anstyle-parse",
- "anstyle-query",
- "anstyle-wincon",
- "colorchoice",
- "utf8parse",
-]
-
 [[package]]
 name = "anstyle"
 version = "1.0.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7079075b41f533b8c61d2a4d073c4676e1f8b249ff94a393b0595db304e0dd87"
 
-[[package]]
-name = "anstyle-parse"
-version = "0.2.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c75ac65da39e5fe5ab759307499ddad880d724eed2f6ce5b5e8a26f4f387928c"
-dependencies = [
- "utf8parse",
-]
-
-[[package]]
-name = "anstyle-query"
-version = "1.0.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e28923312444cdd728e4738b3f9c9cac739500909bb3d3c94b43551b16517648"
-dependencies = [
- "windows-sys 0.52.0",
-]
-
-[[package]]
-name = "anstyle-wincon"
-version = "3.0.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1cd54b81ec8d6180e24654d0b371ad22fc3dd083b6ff8ba325b72e00c87660a7"
-dependencies = [
- "anstyle",
- "windows-sys 0.52.0",
-]
-
 [[package]]
 name = "anyhow"
 version = "1.0.79"
@@ -359,12 +317,6 @@ dependencies = [
  "csv",
 ]
 
-[[package]]
-name = "colorchoice"
-version = "1.0.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7"
-
 [[package]]
 name = "core-foundation-sys"
 version = "0.8.6"
@@ -465,27 +417,17 @@ dependencies = [
  "syn 2.0.48",
 ]
 
-[[package]]
-name = "env_filter"
-version = "0.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a009aa4810eb158359dda09d0c87378e4bbb89b5a801f016885a4707ba24f7ea"
-dependencies = [
- "log",
- "regex",
-]
-
 [[package]]
 name = "env_logger"
-version = "0.11.1"
+version = "0.10.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "05e7cf40684ae96ade6232ed84582f40ce0a66efcd43a5117aef610534f8e0b8"
+checksum = "4cd405aab171cb85d6735e5c8d9db038c17d3ca007a4d2c25f337935c3d90580"
 dependencies = [
- "anstream",
- "anstyle",
- "env_filter",
  "humantime",
+ "is-terminal",
  "log",
+ "regex",
+ "termcolor",
 ]
 
 [[package]]
@@ -581,6 +523,12 @@ version = "0.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
 
+[[package]]
+name = "hermit-abi"
+version = "0.3.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bd5256b483761cd23699d0da46cc6fd2ee3be420bbe6d020ae4a091e70b7e9fd"
+
 [[package]]
 name = "histogram"
 version = "0.9.0"
@@ -634,6 +582,17 @@ dependencies = [
  "smallvec",
 ]
 
+[[package]]
+name = "is-terminal"
+version = "0.4.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f23ff5ef2b80d608d61efee834934d862cd92461afc0560dedf493e4c033738b"
+dependencies = [
+ "hermit-abi",
+ "libc",
+ "windows-sys 0.52.0",
+]
+
 [[package]]
 name = "itertools"
 version = "0.12.0"
@@ -1553,6 +1512,15 @@ dependencies = [
  "windows-sys 0.52.0",
 ]
 
+[[package]]
+name = "termcolor"
+version = "1.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "06794f8f6c5c898b3275aebefa6b8a1cb24cd2c6c79397ab15774837a0bc5755"
+dependencies = [
+ "winapi-util",
+]
+
 [[package]]
 name = "termtree"
 version = "0.4.1"
@@ -1637,12 +1605,6 @@ version = "0.2.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c7de7d73e1754487cb58364ee906a499937a0dfabd86bcb980fa99ec8c8fa2ce"
 
-[[package]]
-name = "utf8parse"
-version = "0.2.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a"
-
 [[package]]
 name = "uuid"
 version = "1.7.0"
@@ -1757,6 +1719,37 @@ dependencies = [
  "wasm-bindgen",
 ]
 
+[[package]]
+name = "winapi"
+version = "0.3.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
+dependencies = [
+ "winapi-i686-pc-windows-gnu",
+ "winapi-x86_64-pc-windows-gnu",
+]
+
+[[package]]
+name = "winapi-i686-pc-windows-gnu"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
+
+[[package]]
+name = "winapi-util"
+version = "0.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596"
+dependencies = [
+ "winapi",
+]
+
+[[package]]
+name = "winapi-x86_64-pc-windows-gnu"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
+
 [[package]]
 name = "windows-core"
 version = "0.52.0"

From 13c5d13a4b89dd56f0736e05b9a11da6f1234d8a Mon Sep 17 00:00:00 2001
From: HackMD <37423+hackmd-hub[bot]@users.noreply.github.com>
Date: Sat, 17 Feb 2024 14:50:20 +0000
Subject: [PATCH 07/12] last changed at Feb 17, 2024 6:50 AM, pushed by C.
 Titus Brown

---
 doc/README.md | 80 ++++++++++++++++++++++++++++++---------------------
 1 file changed, 47 insertions(+), 33 deletions(-)

diff --git a/doc/README.md b/doc/README.md
index f4d223ce..903a6786 100644
--- a/doc/README.md
+++ b/doc/README.md
@@ -1,15 +1,13 @@
 # The branchwater plugin for sourmash
 
-@CTB TODO: add [link] stuff
-
 | command | functionality | docs |
 | -------- | -------- | -------- |
-| `manysketch` | Rapidly build sketches for many input files     | [link]     |
-| `fastgather` | Multithreaded `gather` of **one** metagenome against a database| [link]
-| `fastmultigather` | Multithreaded `gather` of **multiple** metagenomes against a database | [link]
-| `manysearch` | Multithreaded containment search for many queries in many large metagenomes | [link]
-| `multisearch` | Multithreaded comparison of multiple sketches, in memory | [link]
-| `pairwise` | Multithreaded pairwise comparison of multiple sketches, in memory | [link]
+| `manysketch` | Rapidly build sketches for many input files     | [link](#Running-manysketch)     |
+| `fastgather` | Multithreaded `gather` of **one** metagenome against a database| [link](#Running-fastgather)
+| `fastmultigather` | Multithreaded `gather` of **multiple** metagenomes against a database | [link](#Running-fastmultigather)
+| `manysearch` | Multithreaded containment search for many queries in many large metagenomes | [link](#Running-manysearch)
+| `multisearch` | Multithreaded comparison of multiple sketches, in memory | [link](#Running-multisearch)
+| `pairwise` | Multithreaded pairwise comparison of multiple sketches, in memory | [link](#Running-multisearch)
 
 This repository implements multithreaded plugins for [sourmash](https://sourmash.readthedocs.io/) that provide very fast implementations of `sketch`, `search`, and `gather`. These commands are typically hundreds to thousands of times faster, and 10-50x lower memory, than the current sourmash code.
 
@@ -19,7 +17,7 @@ The main *drawback* to these plugin commands is that their inputs and outputs ar
 
 sourmash supports a variety of different storage formats for sketches (see [sourmash docs](https://sourmash.readthedocs.io/en/latest/command-line.html#choosing-signature-output-formats)), and the branchwater plugin works some (but not all) of them. Branchwater _also_ supports an additional database type, a RocksDB-based inverted index, that is not yet supported by sourmash (through v4.8.6).
 
-**As of v0.9.0, we recommend using zip files or manifest CSVs whenever you need to provide multiple sketches.** Prior to v0.9.0, we suggest pathlists, but these now incur substantial overhead.
+**As of v0.9.0, we recommend using zip files or manifest CSVs whenever you need to provide multiple sketches.** Prior to v0.9.0, we suggest fromfiles, but these now incur substantial overhead.
 
 | command | query input | database format |
 | -------- | -------- | -------- |
@@ -29,41 +27,61 @@ sourmash supports a variety of different storage formats for sketches (see [sour
 | `multisearch` | Multiple sketches in sig, zip, manifest CSV, or fromfile | Multiple sketches in sig, zip, manifest CSV, or fromfile |
 | `pairwise` | Multiple sketches in sig, zip, manifest CSV, or fromfile | N/A 
 
-### Using zip files or manifest files
-
-Zip files are compressed collections of sourmash sketches. When created with sourmash, they also contain manifests. They are generally the most efficient option for loading and storing sourmash signatures.
+### Using zipfiles
 
-Manifest files are csv files with all information about sourmash signature parameters. Having a manifest allows us to select sketches relevant to the search (e.g. by k-mer size, scaled factor, etc) and perform checks without loading the sketches themselves into memory. We then only load the actual sketches (and optionally, downsample to a lower scaled value) when we're ready to use them.
+When working with large collections of small sketches such as genomes, we suggest using zipfiles as produced by sourmash (e.g. using `sourmash sig cat`). Zip files have a few nice features:
 
-If you have a `sourmash` zip file of signatures, it already contains a manifest that we can use internally.
+* sketches are compressed in zip files;
+* zip files can contain many sketches, including incompatible types (e.g. multiple k-mer sizes);
+* zip files contain "manifests" listing their contents;
+* subsets of zip files can be efficiently selected and loaded depending on what is needed;
+* in particular, _single_ sketches can be loaded on demand, supporting lower memory requirements for certain kinds of searches.
 
-If you'd like to generate a standalone `manifest` file from your signatures, you can do it like so:
+For all these reasons, zip files are the most efficient and effective basic storage type for sketches in sourmash, and as of the branchwater plugin v0.9.0, they are fully supported!
 
+You can create zipfiles with sourmash like so:
 ```
-sourmash sig manifest <sigfile> -o sigfile.manifest.csv
+sourmash sig cat <list of sketches> -o sigs.zip
 ```
-> Here, `sigfile` can be any type of sourmash input, including a signature file or fromfile.
 
-@CTB: fix fromfile vs pathlist stuff, explain manifests more/diff/better.
+### Using manifests instead of zip files - why and when?
 
-### Using "fromfiles"
+There are various places where we recommend using manifests instead of zip files. Why?
+
+Well, first, if you are using a zip file created by sourmash, you are already using a manifest! And you will get all of the benefits described above!
 
-**Note: We no longer recommend using "fromfiles". Use zip files instead.**
+But if you want to use a collection of multiple very large metagenomes (as search targets in `manysearch`, or as queries in `fastmultigather`), then standalone manifests might be a good solution for you.
 
-To prepare a **signature** fromfile from a database, first you need to split the database into individual files:
+This is for two specific reasons:
+* first, metagenome sketches are often extremely large (100s of MBs to GBs), and it is not ideal to zip many large sketches into a single zip file;
+* second, both `manysearch` and `fastmultigather` take a single argument that specifies collections of metagenomes which need to be loaded on demand, because they cannot fit into memory.
+
+So the question becomes: how do you provide collections of large metagenomes to `manysearch` and `fastmultigather` in a single filename?
+
+And the answer is: manifests. Manifests are a sourmash filetype that contains information about sketches without containing the actual sketch content, and they can be used as "catalogs" of sketch content.
+
+The branchwater plugin supports manifest CSVs.  These can be created from lists of sketches by using `sourmash sig collect` or `sourmash sig manifest`; for example,
 ```
-mkdir gtdb-reps-rs214-k21/
-cd gtdb-reps-rs214-k21/
-sourmash sig split -k 21 /group/ctbrowngrp/sourmash-db/gtdb-rs214/gtdb-rs214-reps.k21.zip -E .sig.gz
-cd ..
+sourmash sig manifest <fromfile> -o manifest.csv
 ```
+will create a manifest CSV from a list of sketches.
+
+### Using RocksDB inverted indexes
 
-and then build a "fromfile":
+The branchwater plugin also supports a database type that is not yet supported by sourmash: inverted indexes stored in a RocksDB database. These indexes provide fast and low-memory lookups when searching very large datasets, and are used for the branchwater petabase-scale search hosted at [branchwater.sourmash.bio](https://branchwater.sourmash.bio).
+
+Some commands - `fastmultigather` and `manysearch` - support using these RocksDB-based inverted indexes. They can be created by running `sourmash scripts index`.
+
+### Using "fromfiles"
+
+**Note: We no longer recommend using "fromfiles". Use zip files or manifests instead.**
+
+You can make a fromfile by listing a collection of .sig.gz files like so:
 ```
-find gtdb-reps-rs214-k21/ -name "*.sig.gz" -type f > list.gtdb-reps-rs214-k21.txt
+find /path/to/directory/ -name "*.sig.gz" -type f > directory.txt
 ```
 
-When using these files for search, we have no a priori information about the parameters used for each sketch, so we load all signatures into memory at the start in order to generate a manifest. To avoid memory issues, the signatures are not kept in memory, but instead re-loaded as described below for each command (see: Notes on concurrency and efficiency). This makes using `pathlists` less efficient than `zip` files or `manifests`.
+When using a fromfile for search, we load all signatures into memory at the start in order to generate a manifest. To avoid memory issues, the signatures are not kept in memory, but instead re-loaded as described below for each command (see: Notes on concurrency and efficiency). This makes using fromfiles less efficient than `zip` files or manifests (as of v0.9.0).
 
 ## Running the commands
 
@@ -154,9 +172,7 @@ sourmash scripts fastmultigather queries.manifest.csv database.zip --cores 4
 ```
 We suggest using a manifest CSV for the queries.
 
-The main advantage that `fastmultigather` has over running `fastgather` on multiple queries is that you only load the database files once, which can be a significant time savings for large databases!
-
-@CTB: NTP, is this comment on loading still true?
+The main advantage that `fastmultigather` has over running `fastgather` on multiple queries is that you only load the database files once with `fastmultigather`, which can be a significant time savings for large databases!
 
 #### Output files for `fastmultigather`
 
@@ -192,8 +208,6 @@ Each command does things slightly differently, with implications for CPU and dis
 
 Like `multisearch` and `pairwise`, `fastgather` loads everything at the beginning, and then uses multithreading to search across all matching sequences. For large databases it is extremely efficient at using all available cores. So 128 threads or more should work fine! We suggest using zipfile or manifests for the database.
 
-@CTB: NTP, is this "loads everything at the beginning" comment still true?
-
 `fastmultigather` loads the entire database once, and then loads one query from disk per thread. The compute-per-query can be significant, though, so multithreading efficiency here is less dependent on I/O and the disk is less likely to be saturated with many threads. We suggest limiting threads to between 32 and 64 to decrease shared disk load.
 
 ## Appendix 1 - `index` to create a low-memory index

From 46575c2701fda714ecde693d0153f9ab72d349fc Mon Sep 17 00:00:00 2001
From: HackMD <37423+hackmd-hub[bot]@users.noreply.github.com>
Date: Sun, 18 Feb 2024 01:28:13 +0000
Subject: [PATCH 08/12] last changed at Feb 17, 2024 5:28 PM, pushed by C.
 Titus Brown

---
 doc/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/README.md b/doc/README.md
index 903a6786..bf601c1a 100644
--- a/doc/README.md
+++ b/doc/README.md
@@ -9,7 +9,7 @@
 | `multisearch` | Multithreaded comparison of multiple sketches, in memory | [link](#Running-multisearch)
 | `pairwise` | Multithreaded pairwise comparison of multiple sketches, in memory | [link](#Running-multisearch)
 
-This repository implements multithreaded plugins for [sourmash](https://sourmash.readthedocs.io/) that provide very fast implementations of `sketch`, `search`, and `gather`. These commands are typically hundreds to thousands of times faster, and 10-50x lower memory, than the current sourmash code.
+This repository implements multithreaded plugins for [sourmash](https://sourmash.readthedocs.io/) that provide very fast implementations of `sketch`, `search`, and `gather`. These commands are typically hundreds to thousands of times faster, and 10-50x lower memory, than the current sourmash code. For example, a `gather` of SRR606249 with sourmash v4.8.6 against GTDB rs214 takes 40 minutes and 14 GB of RAM, while `fastgather` takes only 2 minutes and 2 GB of RAM.
 
 The main *drawback* to these plugin commands is that their inputs and outputs are not as rich as the native sourmash commands. This means that your input files may need to be prepared differently, and the output may in some cases be most useful as a prefilter in conjunction with regular sourmash commands - see the instructions below for using `fastgather` to create picklists for sourmash.
 
@@ -49,7 +49,7 @@ sourmash sig cat <list of sketches> -o sigs.zip
 There are various places where we recommend using manifests instead of zip files. Why?
 
 Well, first, if you are using a zip file created by sourmash, you are already using a manifest! And you will get all of the benefits described above!
-
+ 
 But if you want to use a collection of multiple very large metagenomes (as search targets in `manysearch`, or as queries in `fastmultigather`), then standalone manifests might be a good solution for you.
 
 This is for two specific reasons:

From 3a1aab73dd1d5e5bbc217eb6aa7a382913734dc9 Mon Sep 17 00:00:00 2001
From: HackMD <37423+hackmd-hub[bot]@users.noreply.github.com>
Date: Sun, 18 Feb 2024 01:29:57 +0000
Subject: [PATCH 09/12] last changed at Feb 17, 2024 5:29 PM, pushed by C.
 Titus Brown

---
 doc/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/README.md b/doc/README.md
index bf601c1a..cd6a6d31 100644
--- a/doc/README.md
+++ b/doc/README.md
@@ -9,7 +9,7 @@
 | `multisearch` | Multithreaded comparison of multiple sketches, in memory | [link](#Running-multisearch)
 | `pairwise` | Multithreaded pairwise comparison of multiple sketches, in memory | [link](#Running-multisearch)
 
-This repository implements multithreaded plugins for [sourmash](https://sourmash.readthedocs.io/) that provide very fast implementations of `sketch`, `search`, and `gather`. These commands are typically hundreds to thousands of times faster, and 10-50x lower memory, than the current sourmash code. For example, a `gather` of SRR606249 with sourmash v4.8.6 against GTDB rs214 takes 40 minutes and 14 GB of RAM, while `fastgather` takes only 2 minutes and 2 GB of RAM.
+This repository implements multithreaded plugins for [sourmash](https://sourmash.readthedocs.io/) that provide very fast implementations of `sketch`, `search`, and `gather`. These commands are typically hundreds to thousands of times faster, and 10-50x lower memory, than the current sourmash code. For example, a `gather` of SRR606249 with sourmash v4.8.6 against GTDB rs214 takes 40 minutes and 14 GB of RAM, while `fastgather` with 64 cores takes only 2 minutes and 2 GB of RAM.
 
 The main *drawback* to these plugin commands is that their inputs and outputs are not as rich as the native sourmash commands. This means that your input files may need to be prepared differently, and the output may in some cases be most useful as a prefilter in conjunction with regular sourmash commands - see the instructions below for using `fastgather` to create picklists for sourmash.
 
@@ -17,7 +17,7 @@ The main *drawback* to these plugin commands is that their inputs and outputs ar
 
 sourmash supports a variety of different storage formats for sketches (see [sourmash docs](https://sourmash.readthedocs.io/en/latest/command-line.html#choosing-signature-output-formats)), and the branchwater plugin works some (but not all) of them. Branchwater _also_ supports an additional database type, a RocksDB-based inverted index, that is not yet supported by sourmash (through v4.8.6).
 
-**As of v0.9.0, we recommend using zip files or manifest CSVs whenever you need to provide multiple sketches.** Prior to v0.9.0, we suggest fromfiles, but these now incur substantial overhead.
+**As of v0.9.0, we recommend using zip files or manifest CSVs whenever you need to provide multiple sketches.**
 
 | command | query input | database format |
 | -------- | -------- | -------- |

From e564f9c68e2130c657c9486de4b6d19d060a0e91 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Sat, 17 Feb 2024 17:32:59 -0800
Subject: [PATCH 10/12] bump to v0.9.0

---
 Cargo.lock | 2 +-
 Cargo.toml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index a7fe7a49..7e5c3b28 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1438,7 +1438,7 @@ dependencies = [
 
 [[package]]
 name = "sourmash_plugin_branchwater"
-version = "0.8.7-dev"
+version = "0.9.0"
 dependencies = [
  "anyhow",
  "assert_cmd",
diff --git a/Cargo.toml b/Cargo.toml
index 410dab6e..cfeb163e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "sourmash_plugin_branchwater"
-version = "0.8.7-dev"
+version = "0.9.0"
 edition = "2021"
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

From 3fce462921e3204cc88d229fe31d9c1630596be4 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Mon, 19 Feb 2024 08:19:13 -0800
Subject: [PATCH 11/12] Apply suggestions from code review

Co-authored-by: Tessa Pierce Ward <bluegenes@users.noreply.github.com>
---
 doc/README.md | 5 +++--
 src/utils.rs  | 3 ---
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/doc/README.md b/doc/README.md
index cd6a6d31..fdfa2178 100644
--- a/doc/README.md
+++ b/doc/README.md
@@ -15,12 +15,13 @@ The main *drawback* to these plugin commands is that their inputs and outputs ar
 
 ## Input file formats
 
-sourmash supports a variety of different storage formats for sketches (see [sourmash docs](https://sourmash.readthedocs.io/en/latest/command-line.html#choosing-signature-output-formats)), and the branchwater plugin works some (but not all) of them. Branchwater _also_ supports an additional database type, a RocksDB-based inverted index, that is not yet supported by sourmash (through v4.8.6).
+sourmash supports a variety of different storage formats for sketches (see [sourmash docs](https://sourmash.readthedocs.io/en/latest/command-line.html#choosing-signature-output-formats)), and the branchwater plugin works with some (but not all) of them. Branchwater _also_ supports an additional database type, a RocksDB-based inverted index, that is not yet supported by sourmash (through v4.8.6).
 
 **As of v0.9.0, we recommend using zip files or manifest CSVs whenever you need to provide multiple sketches.**
 
 | command | query input | database format |
 | -------- | -------- | -------- |
+| `manysketch`     | CSV with input fasta/fastq paths (details below)    | _produces_ Zip database |
 | `gather`     | Single metagenome in sig, zip, manifest CSV, or fromfile     | Zip, manifest CSV, or fromfile |
 | `fastmultigather` | Multiple metagenomes in sig, zip, manifest CSV, or fromfile | Zip, manifest CSV, fromfile, or rocksdb index |
 | `manysearch` | Multiple genomes in sig, zip, manifest CSV, or fromfile | Zip, manifest CSV, fromfile, or rocksdb index |
@@ -29,7 +30,7 @@ sourmash supports a variety of different storage formats for sketches (see [sour
 
 ### Using zipfiles
 
-When working with large collections of small sketches such as genomes, we suggest using zipfiles as produced by sourmash (e.g. using `sourmash sig cat`). Zip files have a few nice features:
+When working with large collections of small sketches such as genomes, we suggest using zipfiles as produced by sourmash (e.g. using `sourmash sig cat` or `manysketch`). Zip files have a few nice features:
 
 * sketches are compressed in zip files;
 * zip files can contain many sketches, including incompatible types (e.g. multiple k-mer sizes);
diff --git a/src/utils.rs b/src/utils.rs
index 94a338e6..e4a81d0c 100644
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -466,7 +466,6 @@ pub fn load_collection(
     };
 
     let collection = collection.or_else(|| {
-        // dbg!("attempting to load as manifest");
         match collection_from_manifest(&sigpath, &report_type) {
             Ok(coll) => Some((coll, 0)),
             Err(e) => {
@@ -477,7 +476,6 @@ pub fn load_collection(
     });
 
     let collection = collection.or_else(|| {
-        // dbg!("attempting to load as signature");
         match collection_from_signature(&sigpath, &report_type) {
             Ok(coll) => Some((coll, 0)),
             Err(e) => {
@@ -488,7 +486,6 @@ pub fn load_collection(
     });
 
     let collection = collection.or_else(|| {
-        // dbg!("attempting to load as pathlist");
         match collection_from_pathlist(&sigpath, &report_type) {
             Ok((coll, n_failed)) => Some((coll, n_failed)),
             Err(e) => {

From 5a4c7e67238cc991161b100951b15eaa2d983e53 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Mon, 19 Feb 2024 08:30:06 -0800
Subject: [PATCH 12/12] cargo fmt

---
 src/utils.rs | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/src/utils.rs b/src/utils.rs
index ba0f9978..f4ed1727 100644
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -465,35 +465,32 @@ pub fn load_collection(
         None
     };
 
-    let collection = collection.or_else(|| {
-        match collection_from_manifest(&sigpath, &report_type) {
+    let collection =
+        collection.or_else(|| match collection_from_manifest(&sigpath, &report_type) {
             Ok(coll) => Some((coll, 0)),
             Err(e) => {
                 last_error = Some(e);
                 None
             }
-        }
-    });
+        });
 
-    let collection = collection.or_else(|| {
-        match collection_from_signature(&sigpath, &report_type) {
+    let collection =
+        collection.or_else(|| match collection_from_signature(&sigpath, &report_type) {
             Ok(coll) => Some((coll, 0)),
             Err(e) => {
                 last_error = Some(e);
                 None
             }
-        }
-    });
+        });
 
-    let collection = collection.or_else(|| {
-        match collection_from_pathlist(&sigpath, &report_type) {
+    let collection =
+        collection.or_else(|| match collection_from_pathlist(&sigpath, &report_type) {
             Ok((coll, n_failed)) => Some((coll, n_failed)),
             Err(e) => {
                 last_error = Some(e);
                 None
             }
-        }
-    });
+        });
 
     match collection {
         Some((coll, n_failed)) => {