From 752a5d5c2d67fa6ae605a2e534d355b51595ef83 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Mon, 19 Feb 2024 08:33:09 -0800 Subject: [PATCH] MRG: update docs for v0.9.0 (#213) * update docs * more * remove errant dbg messages for now * last changed at Feb 15, 2024 6:36 AM, pushed by C. Titus Brown * try revert #209 * upd Cargo.lock * last changed at Feb 17, 2024 6:50 AM, pushed by C. Titus Brown * last changed at Feb 17, 2024 5:28 PM, pushed by C. Titus Brown * last changed at Feb 17, 2024 5:29 PM, pushed by C. Titus Brown * bump to v0.9.0 * Apply suggestions from code review Co-authored-by: Tessa Pierce Ward --------- Co-authored-by: HackMD <37423+hackmd-hub[bot]@users.noreply.github.com> Co-authored-by: Tessa Pierce Ward --- Cargo.lock | 133 ++++++++++++++++++++---------------------- Cargo.toml | 4 +- doc/README.md | 158 +++++++++++++++++++++++++++++--------------------- src/utils.rs | 24 +++----- 4 files changed, 167 insertions(+), 152 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 51cea82e..7e5c3b28 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -49,54 +49,12 @@ dependencies = [ "libc", ] -[[package]] -name = "anstream" -version = "0.6.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e2e1ebcb11de5c03c67de28a7df593d32191b44939c482e97702baaaa6ab6a5" -dependencies = [ - "anstyle", - "anstyle-parse", - "anstyle-query", - "anstyle-wincon", - "colorchoice", - "utf8parse", -] - [[package]] name = "anstyle" version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7079075b41f533b8c61d2a4d073c4676e1f8b249ff94a393b0595db304e0dd87" -[[package]] -name = "anstyle-parse" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c75ac65da39e5fe5ab759307499ddad880d724eed2f6ce5b5e8a26f4f387928c" -dependencies = [ - "utf8parse", -] - -[[package]] -name = "anstyle-query" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "e28923312444cdd728e4738b3f9c9cac739500909bb3d3c94b43551b16517648" -dependencies = [ - "windows-sys 0.52.0", -] - -[[package]] -name = "anstyle-wincon" -version = "3.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cd54b81ec8d6180e24654d0b371ad22fc3dd083b6ff8ba325b72e00c87660a7" -dependencies = [ - "anstyle", - "windows-sys 0.52.0", -] - [[package]] name = "anyhow" version = "1.0.79" @@ -359,12 +317,6 @@ dependencies = [ "csv", ] -[[package]] -name = "colorchoice" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" - [[package]] name = "core-foundation-sys" version = "0.8.6" @@ -465,27 +417,17 @@ dependencies = [ "syn 2.0.48", ] -[[package]] -name = "env_filter" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a009aa4810eb158359dda09d0c87378e4bbb89b5a801f016885a4707ba24f7ea" -dependencies = [ - "log", - "regex", -] - [[package]] name = "env_logger" -version = "0.11.1" +version = "0.10.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05e7cf40684ae96ade6232ed84582f40ce0a66efcd43a5117aef610534f8e0b8" +checksum = "4cd405aab171cb85d6735e5c8d9db038c17d3ca007a4d2c25f337935c3d90580" dependencies = [ - "anstream", - "anstyle", - "env_filter", "humantime", + "is-terminal", "log", + "regex", + "termcolor", ] [[package]] @@ -581,6 +523,12 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" +[[package]] +name = "hermit-abi" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd5256b483761cd23699d0da46cc6fd2ee3be420bbe6d020ae4a091e70b7e9fd" + [[package]] name = "histogram" version = "0.9.0" @@ -634,6 +582,17 @@ dependencies = [ "smallvec", ] +[[package]] +name = "is-terminal" +version = 
"0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f23ff5ef2b80d608d61efee834934d862cd92461afc0560dedf493e4c033738b" +dependencies = [ + "hermit-abi", + "libc", + "windows-sys 0.52.0", +] + [[package]] name = "itertools" version = "0.12.0" @@ -1479,7 +1438,7 @@ dependencies = [ [[package]] name = "sourmash_plugin_branchwater" -version = "0.8.7-dev" +version = "0.9.0" dependencies = [ "anyhow", "assert_cmd", @@ -1553,6 +1512,15 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "termcolor" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06794f8f6c5c898b3275aebefa6b8a1cb24cd2c6c79397ab15774837a0bc5755" +dependencies = [ + "winapi-util", +] + [[package]] name = "termtree" version = "0.4.1" @@ -1637,12 +1605,6 @@ version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c7de7d73e1754487cb58364ee906a499937a0dfabd86bcb980fa99ec8c8fa2ce" -[[package]] -name = "utf8parse" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" - [[package]] name = "uuid" version = "1.7.0" @@ -1757,6 +1719,37 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596" +dependencies = [ + 
"winapi", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + [[package]] name = "windows-core" version = "0.52.0" diff --git a/Cargo.toml b/Cargo.toml index fca3c5e4..cfeb163e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "sourmash_plugin_branchwater" -version = "0.8.7-dev" +version = "0.9.0" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html @@ -16,7 +16,7 @@ sourmash = { version = "0.12.1", features = ["branchwater"] } serde_json = "1.0.113" niffler = "2.4.0" log = "0.4.14" -env_logger = "0.11.1" +env_logger = "0.10.2" simple-error = "0.3.0" anyhow = "1.0.79" zip = { version = "0.6", default-features = false, features = ["deflate"] } diff --git a/doc/README.md b/doc/README.md index 3544df9e..fdfa2178 100644 --- a/doc/README.md +++ b/doc/README.md @@ -1,52 +1,94 @@ -# manysketch, fastgather, fastmultigather, multisearch, and manysearch - an introduction +# The branchwater plugin for sourmash -This repository implements five sourmash plugins, `manysketch`, `fastgather`, `fastmultigather`, `multisearch`, and `manysearch`. These plugins make use of multithreading in Rust to provide very fast implementations of `sketch`, `search`, and `gather`. With large databases, these commands can be hundreds to thousands of times faster, and 10-50x lower memory, than sourmash. 
+| command | functionality | docs | +| -------- | -------- | -------- | +| `manysketch` | Rapidly build sketches for many input files | [link](#Running-manysketch) | +| `fastgather` | Multithreaded `gather` of **one** metagenome against a database| [link](#Running-fastgather) +| `fastmultigather` | Multithreaded `gather` of **multiple** metagenomes against a database | [link](#Running-fastmultigather) +| `manysearch` | Multithreaded containment search for many queries in many large metagenomes | [link](#Running-manysearch) +| `multisearch` | Multithreaded comparison of multiple sketches, in memory | [link](#Running-multisearch) +| `pairwise` | Multithreaded pairwise comparison of multiple sketches, in memory | [link](#Running-multisearch) -The main *drawback* to these plugin commands is that their inputs and outputs are not as rich as the native sourmash commands. This may mean that your input files need to be prepared differently. The output may currently be most useful as a prefilter in conjunction with regular sourmash commands - see the instructions below for using `fastgather` to create picklists for sourmash. +This repository implements multithreaded plugins for [sourmash](https://sourmash.readthedocs.io/) that provide very fast implementations of `sketch`, `search`, and `gather`. These commands are typically hundreds to thousands of times faster, and 10-50x lower memory, than the current sourmash code. For example, a `gather` of SRR606249 with sourmash v4.8.6 against GTDB rs214 takes 40 minutes and 14 GB of RAM, while `fastgather` with 64 cores takes only 2 minutes and 2 GB of RAM. -## Input file formats +The main *drawback* to these plugin commands is that their inputs and outputs are not as rich as the native sourmash commands. 
This means that your input files may need to be prepared differently, and the output may in some cases be most useful as a prefilter in conjunction with regular sourmash commands - see the instructions below for using `fastgather` to create picklists for sourmash. +## Input file formats -All four search/gather commands accept zip files, manifest files, or _text files containing lists of signature files_ ("fromfiles") for the search database. `multisearch`, `manysearch` and `fastmultigather` also use either zips, manifests, or "fromfiles" for queries, too. All commands now accept single signature files as well, though this is only useful for single-query input. +sourmash supports a variety of different storage formats for sketches (see [sourmash docs](https://sourmash.readthedocs.io/en/latest/command-line.html#choosing-signature-output-formats)), and the branchwater plugin works with some (but not all) of them. Branchwater _also_ supports an additional database type, a RocksDB-based inverted index, that is not yet supported by sourmash (through v4.8.6). -`manysketch` takes as input a CSV file with columns `name,genome_filename,protein_filename`. If you don't have `protein_filename` entries, be sure to include the trailing comma so the CSV reader can process the file correctly. 
+**As of v0.9.0, we recommend using zip files or manifest CSVs whenever you need to provide multiple sketches.** -### Using zip files or manifest files +| command | query input | database format | +| -------- | -------- | -------- | +| `manysketch` | CSV with input fasta/fastq paths (details below) | _produces_ Zip database | +| `fastgather` | Single metagenome in sig, zip, manifest CSV, or fromfile | Zip, manifest CSV, or fromfile | +| `fastmultigather` | Multiple metagenomes in sig, zip, manifest CSV, or fromfile | Zip, manifest CSV, fromfile, or rocksdb index | +| `manysearch` | Multiple genomes in sig, zip, manifest CSV, or fromfile | Zip, manifest CSV, fromfile, or rocksdb index | +| `multisearch` | Multiple sketches in sig, zip, manifest CSV, or fromfile | Multiple sketches in sig, zip, manifest CSV, or fromfile | +| `pairwise` | Multiple sketches in sig, zip, manifest CSV, or fromfile | N/A | +### Using zipfiles -Manifest files are csv files with all information about sourmash signature parameters. Having a manifest allows us to select sketches relevant to the search (e.g. by k-mer size, scaled factor, etc) and perform checks without loading the sketches themselves into memory. We then only load the actual sketches (and optionally, downsample to a lower scaled value) when we're ready to use them. +When working with large collections of small sketches such as genomes, we suggest using zipfiles as produced by sourmash (e.g. using `sourmash sig cat` or `manysketch`). Zip files have a few nice features: -If you have a `sourmash` zip file of signatures, it already contains a manifest that we can use internally. +* sketches are compressed in zip files; +* zip files can contain many sketches, including incompatible types (e.g. 
multiple k-mer sizes); +* zip files contain "manifests" listing their contents; +* subsets of zip files can be efficiently selected and loaded depending on what is needed; +* in particular, _single_ sketches can be loaded on demand, supporting lower memory requirements for certain kinds of searches. -If you'd like to generate a standalone `manifest` file from your signatures, you can do it like so: +For all these reasons, zip files are the most efficient and effective basic storage type for sketches in sourmash, and as of the branchwater plugin v0.9.0, they are fully supported! +You can create zipfiles with sourmash like so: ``` -sourmash sig manifest -o sigs.manifest.csv +sourmash sig cat -o sigs.zip ``` -> Here, `sigs` can be any type of sourmash input, including a signature file or `pathlist` -### Using "fromfiles" +### Using manifests instead of zip files - why and when? + +There are various places where we recommend using manifests instead of zip files. Why? + +Well, first, if you are using a zip file created by sourmash, you are already using a manifest! And you will get all of the benefits described above! + +But if you want to use a collection of multiple very large metagenomes (as search targets in `manysearch`, or as queries in `fastmultigather`), then standalone manifests might be a good solution for you. -To prepare a **signature** fromfile from a database, first you need to split the database into individual files: +This is for two specific reasons: +* first, metagenome sketches are often extremely large (100s of MBs to GBs), and it is not ideal to zip many large sketches into a single zip file; +* second, both `manysearch` and `fastmultigather` take a single argument that specifies collections of metagenomes which need to be loaded on demand, because they cannot fit into memory; + +so the question becomes, how do you provide collections of large metagenomes to `manysearch` and `fastmultigather` in a single filename? + +And the answer is: manifests. 
Manifests are a sourmash filetype that contains information about sketches without containing the actual sketch content, and they can be used as "catalogs" of sketch content. + +The branchwater plugin supports manifest CSVs. These can be created from lists of sketches by using `sourmash sig collect` or `sourmash sig manifest`; for example, ``` -mkdir gtdb-reps-rs214-k21/ -cd gtdb-reps-rs214-k21/ -sourmash sig split -k 21 /group/ctbrowngrp/sourmash-db/gtdb-rs214/gtdb-rs214-reps.k21.zip -E .sig.gz -cd .. +sourmash sig manifest -o manifest.csv ``` +will create a manifest CSV from a list of sketches. + +### Using RocksDB inverted indexes -and then build a "fromfile": +The branchwater plugin also supports a database type that is not yet supported by sourmash: inverted indexes stored in a RocksDB database. These indexes provide fast and low-memory lookups when searching very large datasets, and are used for the branchwater petabase scale search hosted at [branchwater.sourmash.bio](https://branchwater.sourmash.bio). + +Some commands - `fastmultigather` and `manysearch` - support using these RocksDB-based inverted indexes. They can be created by running `sourmash scripts index`. + +### Using "fromfiles" + +**Note: We no longer recommend using "fromfiles". Use zip files or manifests instead.** + +You can make a fromfile by listing a collection of .sig.gz files like so: ``` -find gtdb-reps-rs214-k21/ -name "*.sig.gz" -type f > list.gtdb-reps-rs214-k21.txt +find /path/to/directory/ -name "*.sig.gz" -type f > directory.txt ``` -When using these files for search, we have no a priori information about the parameters used for each sketch, so we load all signatures into memory at the start in order to generate a manifest. To avoid memory issues, the signatures are not kept in memory, but instead re-loaded as described below for each command (see: Notes on concurrency and efficiency). This makes using `pathlists` less efficient than `zip` files or `manifests`. 
+When using a fromfile for search, we load all signatures into memory at the start in order to generate a manifest. To avoid memory issues, the signatures are not kept in memory, but instead re-loaded as described below for each command (see: Notes on concurrency and efficiency). This makes using fromfiles less efficient than `zip` files or manifests (as of v0.9.0). ## Running the commands ### Running `manysketch` -The `manysketch` command sketches one or more FASTA/FASTQ files into a zipped sourmash signature collection (`zip`). `manysketch` uses one thread per input file, so it can (very) efficiently sketch many files at once; and, because sequence file parsing is entirely implemented in Rust, it is much faster than `sourmash sketch` for large FASTQ files. +The `manysketch` command sketches one or more FASTA/FASTQ files into a zipped sourmash signature collection (`zip`). `manysketch` uses one thread per input file, so it can (very) efficiently sketch many files at once; and, because sequence file parsing is entirely implemented in Rust, it is much, _much_ faster than `sourmash sketch` for large FASTQ files. To run `manysketch`, you need to build a text file list of FASTA/FASTQ files, with one on each line (`manysketch.csv`, below). A simple way to do this for a directory is this command snippet: ``` @@ -57,7 +99,6 @@ echo $i,$i, done >> manysketch.csv ``` - You can then run: ``` @@ -74,72 +115,65 @@ To modify sketching parameters, use `--param-str` or `-p` and provide valid para ``` sourmash scripts manysketch fa.csv -o fa.zip -p k=21,k=31,k=51,scaled=1000,abund -p protein,k=10,scaled=200 ``` - +See [the sourmash sketch docs](https://sourmash.readthedocs.io/en/latest/command-line.html#sourmash-sketch-make-sourmash-signatures-from-sequence-data) for more information on param strings. ### Running `multisearch` The `multisearch` command compares one or more query genomes, and one or more subject genomes. 
It differs from `manysearch` by loading all genomes into memory. -`multisearch` takes two input collections (zip or "fromfiles"), and outputs a CSV: +`multisearch` takes two input collections and outputs a CSV: ``` -sourmash scripts multisearch query-list.txt podar-ref-list.txt -o results.csv +sourmash scripts multisearch query.sig.gz database.zip -o results.csv ``` -To run it, you need to provide two collections of signature files. If you create a fromfile as above with GTDB reps, you can generate a query fromfile like so: -``` -head -10 list.gtdb-reps-rs214-k21.txt > list.query.txt -``` -and then run `multisearch` like so: - -``` -sourmash scripts multisearch list.query.txt list.gtdb-rs214-k21.txt -o query.x.gtdb-reps.csv -k 21 --cores 4 -``` +The results file, `results.csv`, will have 8 columns: `query` and `query_md5`, `match` and `match_md5`, and `containment`, `jaccard`, `max_containment`, and `intersect_hashes`. -The results file here, `query.x.gtdb-reps.csv`, will have 8 columns: `query` and `query_md5`, `match` and `match_md5`, and `containment`, `jaccard`, `max_containment`, and `intersect_hashes`. +The `pairwise` command does the same comparisons as `multisearch` but takes +only a single collection of sketches, for which it calculates all the pairwise comparisons. Since the comparisons are symmetric, it is approximately +twice as fast as `multisearch`. ### Running `fastgather` The `fastgather` command is a much faster version of `sourmash gather`. 
-`fastgather` takes a single query metagenome (in any file format) and an input collection (zip or "fromfile") as database, and outputs a CSV: +`fastgather` takes a single query metagenome and a database, and outputs a CSV: ``` -sourmash scripts fastgather query.sig.gz podar-ref-list.txt -o results.csv --cores 4 +sourmash scripts fastgather query.sig.gz database.zip -o results.csv --cores 4 ``` #### Using `fastgather` to create a picklist for `sourmash gather` One handy use case for `fastgather` is to create a picklist that can be used by `sourmash gather`. This makes full use of the speed of `fastgather` while producing a complete set of `gather` outputs. -For example, if `list.gtdb-rs214-k21.txt` contains the paths to all GTDB RS214 genomes in `sig.gz` files, as above, then the following command will do a complete gather against GTDB: +For example, you can run a complete `fastgather` against GTDB rs214 like so: ``` sourmash scripts fastgather SRR606249.trim.sig.gz \ - list.gtdb-rs214-k21.txt -o SRR606249.fastgather.csv -k 21 + gtdb-rs214-k21.zip -o SRR606249.fastgather.csv -k 21 ``` -This CSV file can then be used as a picklist for `sourmash gather` like so: +The resulting CSV file can then be used as a picklist for `sourmash gather` like so: ``` -sourmash gather SRR606249.trim.sig.gz /group/ctbrowngrp/sourmash-db/gtdb-rs214/gtdb-rs214-k21.zip \ +sourmash gather SRR606249.trim.sig.gz gtdb-rs214-k21.zip \ --picklist SRR606249.fastgather.csv:match_name:ident \ -o SRR606249.gather.csv ``` -Here the picklist should be used on a sourmash collection that contains a manifest - this will prevent sourmash from loading any sketches other than the ones in the fastgather CSV file. We recommend using zip file databases - manifests are produced automatically when `-o filename.zip` is used with `sketch dna`, and they also be prepared with `sourmash sig cat`. (If you are using a GTDB database, as above, then you already have a manifest!) 
- #### Example of picklist usage -A complete example Snakefile implementing the above workflow is available [in the 2023-swine-usda](https://github.com/ctb/2023-swine-usda/blob/main/Snakefile) repository. Note, it is slightly out of date at the moment! +A complete example Snakefile implementing the above workflow is available [in the sourmash-slainte Snakefile](https://github.com/dib-lab/sourmash-slainte/blob/main/Snakefile). ### Running `fastmultigather` `fastmultigather` takes a collection of query metagenomes and a collection of sketches as a database, and outputs many CSVs: ``` -sourmash scripts fastmultigather query-list.txt podar-ref-lists.txt --cores 4 +sourmash scripts fastmultigather queries.manifest.csv database.zip --cores 4 ``` +We suggest using a manifest CSV for the queries. -The main advantage that `fastmultigather` has over running `fastgather` on multiple queries is that you only load the database files once, which can be a significant time savings for large databases! +The main advantage that `fastmultigather` has over running `fastgather` on multiple queries is that you only load the database files once with `fastmultigather`, which can be a significant time savings for large databases! #### Output files for `fastmultigather` @@ -155,19 +189,9 @@ The `manysearch` command compares one or more collections of query sketches, and `manysearch` takes two collections as input, and outputs a CSV: ``` -sourmash scripts manysearch query-list.txt podar-ref-list.txt -o results.csv -``` - -To run it, you need to provide two "fromfiles" containing lists of paths to signature files (`.sig` or `.sig.gz`). 
If you create a fromfile as above with GTDB reps, you can generate a query fromfile like so: - -``` -head -10 list.gtdb-reps-rs214-k21.txt > list.query.txt -``` -and then run `manysearch` like so: - -``` -sourmash scripts manysearch list.query.txt list.gtdb-rs214-k21.txt -o query.x.gtdb-reps.csv -k 21 --cores 4 +sourmash scripts manysearch queries.zip metagenomes.manifest.csv -o results.csv ``` +We suggest using a manifest CSV for the metagenome collection. The results file here, `query.x.gtdb-reps.csv`, will have 8 columns: `query` and `query_md5`, `match` and `match_md5`, and `containment`, `jaccard`, `max_containment`, and `intersect_hashes`. @@ -175,15 +199,15 @@ The results file here, `query.x.gtdb-reps.csv`, will have 8 columns: `query` and Each command does things slightly differently, with implications for CPU and disk load. You can measure threading efficiency with `/usr/bin/time -v` on Linux systems, and disk load by number of complaints received when running. -(The below info is for fromfile lists. If you are using mastiff indexes, very different performance parameters apply. We will update here as we benchmark and improve!) - `manysketch` loads one sequence file from disk per thread and sketches it using all signature params simultaneously. -`manysearch` loads all the queries at the beginning, and then loads one database sketch from disk per thread. The compute-per-database-sketch is dominated by I/O. So your number of threads should be chosen with care for disk load. We typically limit it to `-c 32` for shared disks. +`manysearch` loads all the queries at the beginning, and then loads one database sketch from disk per thread. The compute-per-database-sketch is dominated by I/O. So your number of threads should be chosen with care for disk load. We typically limit it to `-c 32` for shared disks. We suggest using a manifest CSV file for the database sketches. 
-`multisearch` loads all the queries and database sketches once, at the beginning, and then uses multithreading to search across all matching sequences. For large databases it is extremely efficient at using all available cores. So 128 threads or more should work fine! +`multisearch` loads all the queries and database sketches once, at the beginning, and then uses multithreading to search across all matching sequences. For large databases it is extremely efficient at using all available cores. So 128 threads or more should work fine! Zipfiles and manifests should work well. -Like `multisearch`, `fastgather` loads everything at the beginning, and then uses multithreading to search across all matching sequences. For large databases it is extremely efficient at using all available cores. So 128 threads or more should work fine! +`pairwise` acts just like `multisearch`, but only loads one file (and then does all comparisons between all pairs within that file). + +Like `multisearch` and `pairwise`, `fastgather` loads everything at the beginning, and then uses multithreading to search across all matching sequences. For large databases it is extremely efficient at using all available cores. So 128 threads or more should work fine! We suggest using zipfile or manifests for the database. `fastmultigather` loads the entire database once, and then loads one query from disk per thread. The compute-per-query can be significant, though, so multithreading efficiency here is less dependent on I/O and the disk is less likely to be saturated with many threads. We suggest limiting threads to between 32 and 64 to decrease shared disk load. @@ -193,4 +217,8 @@ The command `sourmash scripts index` makes an on-disk inverted index for low memory fast search. Indexing takes a while, but then search takes fewer resources. -Currently only fastmultigather and manysearch can use this kind of index. +Currently only `fastmultigather` and `manysearch` can use this kind of index. 
+ +We suggest using the extension `.rocksdb` for these databases, as we +use [RocksDB](https://rocksdb.org/) for the underlying database storage +mechanism. diff --git a/src/utils.rs b/src/utils.rs index db2c84e4..f4ed1727 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -465,38 +465,32 @@ pub fn load_collection( None }; - let collection = collection.or_else(|| { - dbg!("attempting to load as manifest"); - match collection_from_manifest(&sigpath, &report_type) { + let collection = + collection.or_else(|| match collection_from_manifest(&sigpath, &report_type) { Ok(coll) => Some((coll, 0)), Err(e) => { last_error = Some(e); None } - } - }); + }); - let collection = collection.or_else(|| { - dbg!("attempting to load as signature"); - match collection_from_signature(&sigpath, &report_type) { + let collection = + collection.or_else(|| match collection_from_signature(&sigpath, &report_type) { Ok(coll) => Some((coll, 0)), Err(e) => { last_error = Some(e); None } - } - }); + }); - let collection = collection.or_else(|| { - dbg!("attempting to load as pathlist"); - match collection_from_pathlist(&sigpath, &report_type) { + let collection = + collection.or_else(|| match collection_from_pathlist(&sigpath, &report_type) { Ok((coll, n_failed)) => Some((coll, n_failed)), Err(e) => { last_error = Some(e); None } - } - }); + }); match collection { Some((coll, n_failed)) => {